From 3b63b66129305c04bfb484d192cf794cb6537b02 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 18 Jun 2018 13:39:29 +0800
Subject: [PATCH 001/318] Initial checkin for BM device support

---
 CMakeLists.txt                                |  21 +
 cmake/compiler_options.cmake                  |  18 +
 cmake/config/anakin_config.h.in               |   2 +
 cmake/gather.cmake                            |   6 +
 framework/core/data_types.h                   |   9 +
 saber/CMakeLists.txt                          |  38 +-
 saber/core/common.h                           |  14 +
 saber/core/impl/bm/bm_device.cpp              |  24 +
 saber/core/impl/bm/bm_impl.cpp                |  89 ++
 saber/core/target_traits.h                    |   7 +
 saber/core/target_wrapper.h                   |  58 ++
 saber/core/tensor_op.cpp                      |  94 ++
 saber/funcs/CMakeLists.txt                    |  12 +
 saber/funcs/impl/bm/base/CMakeLists.txt       |  20 +
 .../impl/bm/base/include/bmdnn/bmdnn_api.h    | 814 ++++++++++++++++++
 .../bm/base/include/bmdnn/bmdnn_ext_api.h     | 438 ++++++++++
 .../bm/base/include/bmdnn/bmdnn_runtime.h     |  20 +
 .../impl/bm/base/include/bmdnn/op_code.h      |  62 ++
 .../bm/base/include/bmlib/bmlib_runtime.h     | 229 +++++
 .../impl/bm/base/include/bmlib/bmlib_utils.h  |  72 ++
 .../impl/bm/base/include/bmruntime/bmblob.h   |  97 +++
 .../impl/bm/base/include/bmruntime/bmcnnctx.h |  58 ++
 .../impl/bm/base/include/bmruntime/bmnet.h    |  78 ++
 .../bm/base/include/bmruntime/bmruntime.h     | 154 ++++
 .../base/include/bmruntime/bmruntime_common.h |  65 ++
 .../include/bmruntime/bmruntime_interface.h   |  11 +
 saber/funcs/impl/bm/vender_activation.h       |  96 +++
 saber/funcs/impl/bm/vender_conv.h             | 195 +++++
 saber/funcs/impl/bm/vender_conv_act.h         | 198 +++++
 saber/funcs/impl/bm/vender_conv_act_pooling.h | 176 ++++
 saber/funcs/impl/bm/vender_fc.h               | 114 +++
 saber/funcs/impl/bm/vender_pooling.h          | 151 ++++
 saber/saber_funcs_param.h                     |  85 ++
 saber/saber_types.h                           |  31 +-
 test/CMakeLists.txt                           |   4 +
 test/saber/bm/test_TargetWrapper_BM.cpp       |  16 +
 test/saber/bm/test_saber_buffer_BM.cpp        | 116 +++
 test/saber/bm/test_saber_buffer_BM.h          |  20 +
 test/saber/bm/test_saber_context_BM.cpp       |  31 +
 test/saber/bm/test_saber_context_BM.h         |  21 +
 test/saber/bm/test_saber_device_BM.cpp        |  20 +
 test/saber/bm/test_saber_device_BM.h          |  21 +
 test/saber/bm/test_saber_func_BM.h            |  38 +
 .../bm/test_saber_func_activation_BM.cpp      | 183 ++++
 test/saber/bm/test_saber_func_conv_BM.cpp     | 725 ++++++++++++++++
 test/saber/bm/test_saber_func_fc_BM.cpp       | 148 ++++
 test/saber/bm/test_saber_func_pooling_BM.cpp  | 311 +++++++
 test/saber/bm/test_saber_shape_BM.cpp         | 126 +++
 test/saber/bm/test_saber_shape_BM.h           |  25 +
 test/saber/bm/test_saber_tensor_BM.cpp        | 642 ++++++++++++++
 test/saber/bm/test_saber_tensor_BM.h          |  21 +
 51 files changed, 6016 insertions(+), 8 deletions(-)
 create mode 100644 saber/core/impl/bm/bm_device.cpp
 create mode 100644 saber/core/impl/bm/bm_impl.cpp
 create mode 100644 saber/funcs/impl/bm/base/CMakeLists.txt
 create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/op_code.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
 create mode 100644 saber/funcs/impl/bm/vender_activation.h
 create mode 100644 saber/funcs/impl/bm/vender_conv.h
 create mode 100644 saber/funcs/impl/bm/vender_conv_act.h
 create mode 100644 saber/funcs/impl/bm/vender_conv_act_pooling.h
 create mode 100644 saber/funcs/impl/bm/vender_fc.h
 create mode 100644 saber/funcs/impl/bm/vender_pooling.h
 create mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp
 create mode 100644 test/saber/bm/test_saber_buffer_BM.cpp
 create mode 100644 test/saber/bm/test_saber_buffer_BM.h
 create mode 100644 test/saber/bm/test_saber_context_BM.cpp
 create mode 100644 test/saber/bm/test_saber_context_BM.h
 create mode 100644 test/saber/bm/test_saber_device_BM.cpp
 create mode 100644 test/saber/bm/test_saber_device_BM.h
 create mode 100644 test/saber/bm/test_saber_func_BM.h
 create mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp
 create mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp
 create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp
 create mode 100644 test/saber/bm/test_saber_func_pooling_BM.cpp
 create mode 100644 test/saber/bm/test_saber_shape_BM.cpp
 create mode 100644 test/saber/bm/test_saber_shape_BM.h
 create mode 100644 test/saber/bm/test_saber_tensor_BM.cpp
 create mode 100644 test/saber/bm/test_saber_tensor_BM.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4dbfc25d..0a81d7c02 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,9 +63,22 @@ anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_CUDA)
 anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_CUDA)
 anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_CUDA)
 anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM)
+
+# compile options for BM place
+anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU)
+anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM)
+anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM)
+anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM)
+anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM)
+anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM)
+
+
 if(USE_CUDA)
     # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well.
     set(SELECTED_SASS_TARGET_ARCH "61")
+elseif(USE_BM)
+    # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well.
+    #set(SELECTED_SASS_TARGET_ARCH "61")
 endif()
 if((NOT BUILD_FAT_BIN) AND (NOT BUILD_CROSS_PLANTFORM) AND USE_CUDA)
     # Select the only nvidia gpu arch you want to be built on
@@ -76,6 +89,10 @@ endif()
 anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_CUDA)
 anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_CUDA)
 
+# build options for BM.
+anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_BM)
+anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_BM)
+
 
 # common build options
 anakin_option(ENABLE_DEBUG "Enable DEBUG(default) mode." NO)
@@ -140,6 +157,10 @@ if(USE_CUDA)
     include(cmake/cuda.cmake)
 endif()
 
+if(USE_BM)
+    #include(cmake/cuda.cmake)
+endif()
+
 if(USE_X86_PLACE)
     set(ANAKIN_TEMP_THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/third-party)
     if(USE_MKLML)
diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake
index 169c042fc..49d133c7f 100644
--- a/cmake/compiler_options.cmake
+++ b/cmake/compiler_options.cmake
@@ -112,3 +112,21 @@ if(USE_CUDA)
     # set default nvidia gpu arch
     set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1")
 endif()
+
+if(USE_BM)
+	if(CMAKE_BUILD_TYPE MATCHES Debug)
+		anakin_add_compile_option("-Xcompiler -fPIC" NVCC)
+		anakin_add_compile_option(-G NVCC)
+		anakin_add_compile_option(-g NVCC)
+		anakin_add_compile_option(-std=c++11 NVCC)
+		anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) # suppress warning by architectures are deprecated (2.0,2.1)
+	else()
+		anakin_add_compile_option("-Xcompiler -fPIC" NVCC)
+		anakin_add_compile_option(-O3 NVCC)
+		anakin_add_compile_option(-std=c++11 NVCC)
+		anakin_add_compile_option("--default-stream per-thread" NVCC)
+		anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC)
+	endif()
+	# set default nvidia gpu arch
+	set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1")
+endif()
diff --git a/cmake/config/anakin_config.h.in b/cmake/config/anakin_config.h.in
index b75990953..0a8560593 100644
--- a/cmake/config/anakin_config.h.in
+++ b/cmake/config/anakin_config.h.in
@@ -35,6 +35,8 @@
 
 #cmakedefine USE_CUDA
 
+#cmakedefine USE_BM
+
 #cmakedefine USE_CUDNN
 
 #cmakedefine USE_PYTHON
diff --git a/cmake/gather.cmake b/cmake/gather.cmake
index cc7b3cc27..5017efff7 100644
--- a/cmake/gather.cmake
+++ b/cmake/gather.cmake
@@ -17,6 +17,12 @@ if(USE_CUDA)
     anakin_find_cuda()
 endif()
 
+if(USE_BM)
+    #set other cuda path
+    #set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_PATH})
+    #anakin_find_cuda()
+endif()
+
 
 # find opencl
 if(USE_OPENCL)
diff --git a/framework/core/data_types.h b/framework/core/data_types.h
index f06db5bdc..16bfccd08 100644
--- a/framework/core/data_types.h
+++ b/framework/core/data_types.h
@@ -17,6 +17,7 @@
 #define ANAKIN_DATA_TYPES_H 
 
 #include "framework/core/parameter.h"
+#include "bmlib_runtime.h"
 #include <cstddef>
 
 namespace anakin {
@@ -45,6 +46,7 @@ SABER_TO_BASE_TYPE(AK_UINT16, uint16_t);
 SABER_TO_BASE_TYPE(AK_UINT32, uint32_t);
 SABER_TO_BASE_TYPE(AK_BOOL, bool);
 SABER_TO_BASE_TYPE(AK_STRING, std::string);
+SABER_TO_BASE_TYPE(AK_BM, bm_device_mem_t);
 
 template<typename T>
 struct DataTypeRecover {
@@ -69,6 +71,7 @@ BASE_TYPE_TO_SABER(uint8_t, AK_UINT8);
 BASE_TYPE_TO_SABER(uint32_t, AK_UINT32);
 BASE_TYPE_TO_SABER(bool, AK_BOOL);
 BASE_TYPE_TO_SABER(std::string, AK_STRING);
+BASE_TYPE_TO_SABER(bm_device_mem_t, AK_BM);
 
 template<typename T>
 struct TypeWarpper {
@@ -96,6 +99,7 @@ ANAKIN_TO_TYPE_ID(long long, anakin_int64)
 ANAKIN_TO_TYPE_ID(unsigned long long, anakin_uint64)
 ANAKIN_TO_TYPE_ID(bool, anakin_bool)
 ANAKIN_TO_TYPE_ID(std::string, anakin_string)
+ANAKIN_TO_TYPE_ID(bm_device_mem_t, anakin_bm)
 
 /// unique type tensor
 /// ANAKIN_TO_TYPE_ID(tensor, anakin_tensor)
@@ -133,6 +137,11 @@ ANAKIN_TO_TYPE_ID(Enum, anakin_tuple_enum)
 	ANAKIN_PBLOCK_TO_TYPE_ID(float, ARM, anakin_block_float)
 #endif
 
+#ifdef USE_BM
+	ANAKIN_PBLOCK_TO_TYPE_ID(bm_device_mem_t, BM, anakin_block_float)
+#endif
+
+
 template<typename T>
 struct type_id {
     typedef T type;
diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 415497b0c..440d1de07 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -56,7 +56,7 @@ if(USE_CUDA)
 	# set select arch for cuda
 	add_subdirectory(${ANAKIN_SABER}/funcs/impl/cuda/base)
 
-	set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) 
+	set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
 	set(CMAKE_CXX_FLAGS "")
 	if(BUILD_SHARED)
     		CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
@@ -66,15 +66,41 @@ if(USE_CUDA)
 	endif()
     	set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
 
-	set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} 
-				      ${BEGIN_WHOLE_ARCHIVE} 
-				      ${ANAKIN_SABER_SASS_STATIC_LIB} 
-				      ${WHOLE_ARCHIVE_END})	
+	set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY}
+				      ${BEGIN_WHOLE_ARCHIVE}
+				      ${ANAKIN_SABER_SASS_STATIC_LIB}
+				      ${WHOLE_ARCHIVE_END})
 endif()
 
+if(USE_BM)
+    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/bm "cpp" ANAKIN_SABER_BASE_SRC)
+    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/bm "cpp" ANAKIN_SABER_BASE_SRC)
+
+    # set root
+    set(BM_BASE_CODE_ROOT ${ANAKIN_SABER}/funcs/impl/bm/base)
+    # set select arch for cuda
+    add_subdirectory(${ANAKIN_SABER}/funcs/impl/bm/base)
+
+    set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
+    set(CMAKE_CXX_FLAGS "")
+    if(BUILD_SHARED)
+        CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+    endif()
+    if(BUILD_STATIC)
+        CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+    endif()
+    set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
+
+    set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY}
+            ${BEGIN_WHOLE_ARCHIVE}
+            ${ANAKIN_SABER_BM_STATIC_LIB}
+            ${WHOLE_ARCHIVE_END})
+endif()
+
+
 # add saber library to static
 if(UNIX OR APPLE) 
-    ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC})
+    ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BM_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC})
 							#$<TARGET_OBJECTS:ANAKIN_SABER_BASE_OBJS>) 
     if(USE_X86_PLACE)
 		message(STATUS ${ANAKIN_SABER_DEPENDENCIES})
diff --git a/saber/core/common.h b/saber/core/common.h
index 9e1bdd89d..2e7cd2650 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -146,3 +146,17 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 
 #endif //ANAKIN_SABER_CORE_COMMON_H
 
+#ifdef USE_BM
+
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+
+#define BMDNN_CHECK(condition) \
+  do { \
+    bm_status_t error = condition; \
+    CHECK_EQ(error, BM_SUCCESS) << " Failed with error code:" << error; \
+  } while (0)
+
+#endif // USE_BM
+
diff --git a/saber/core/impl/bm/bm_device.cpp b/saber/core/impl/bm/bm_device.cpp
new file mode 100644
index 000000000..c89045dcf
--- /dev/null
+++ b/saber/core/impl/bm/bm_device.cpp
@@ -0,0 +1,24 @@
+#include "core/device.h"
+namespace anakin{
+
+namespace saber{
+
+template <>
+void Device<BM>::create_stream() {
+    // todo
+    LOG(WARNING) << "BM create_stream is not implemented";
+}
+
+template <>
+void Device<BM>::get_info() {
+    // todo
+    LOG(WARNING) << "BM get_info is not implemented";
+}
+
+template void Device<BM>::get_info();
+template void Device<BM>::create_stream();
+
+
+} //namespace saber
+
+} //namespace anakin
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
new file mode 100644
index 000000000..3ff30773a
--- /dev/null
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -0,0 +1,89 @@
+#include "core/tensor.h"
+#include "env.h"
+
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+
+#ifdef USE_BM
+const char* bmdnn_get_errorstring(bm_status_t error) {
+    switch (error) {
+        case BM_SUCCESS:
+            return "BM API call correct";
+        case BM_ERR_FAILURE:
+            return "BM API fail to return";
+        case BM_ERR_TIMEOUT:
+            return "BM API time out";
+        case BM_ERR_PARAM:
+            return "BM API invalid parameter";
+        case BM_ERR_NOMEM:
+            return "BM API insufficient memory";
+        case BM_ERR_DATA:
+            return "BM API invalid data";
+        case BM_ERR_BUSY:
+            return "BM device is busy";
+        case BM_NOT_SUPPORTED:
+            return "BM unsupported operate";
+    }
+    return "Unknown bmdnn status";
+}
+#endif
+
+namespace anakin{
+
+namespace saber{
+
+#ifdef USE_BM
+
+typedef TargetWrapper<BM, __device_target> BM_API;
+
+static bm_handle_t handle;
+
+void BM_API::get_device_count(int &count) {
+    BMDNN_CHECK(bm_dev_getcount(&count));
+}
+
+void BM_API::set_device(int id){
+    //(bm_handle_t &handle, bool bmkernel_used, int id){
+    BMDNN_CHECK(bm_dev_request(&handle, 0, id));
+}
+
+//TODO: Do we have this functionality?
+int BM_API::get_device_id(){
+    return 0;
+}
+        
+void BM_API::mem_alloc(void** ptr, size_t n){
+    //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
+    bm_device_mem_t mem = bm_mem_from_system(ptr);
+    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
+}
+        
+void BM_API::mem_free(void* ptr){
+    //(bm_handle_t handle, bm_device_mem_t mem){
+    if(ptr != nullptr){
+        bm_free_device(handle, bm_mem_from_system(ptr));
+    }
+}
+        
+void BM_API::mem_set(void* ptr, int value, size_t n){
+    //(bm_handle_t handle, const int value, bm_device_mem_t mem){
+    BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr)));
+}
+
+//! target wrapper
+template struct TargetWrapper<BM, __device_target>;
+
+//! BM Buffer
+template class Buffer<BM>;
+
+//! BM Tensor
+INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
+
+template struct Env<BM>;
+
+#endif //USE_BM
+
+} //namespace saber
+
+} //namespace anakin
diff --git a/saber/core/target_traits.h b/saber/core/target_traits.h
index 9c1d06d95..b4eb38ff0 100644
--- a/saber/core/target_traits.h
+++ b/saber/core/target_traits.h
@@ -27,6 +27,7 @@ struct __cuda_device{};
 struct __arm_device{};
 struct __amd_device{};
 struct __x86_device{};
+struct __bm_device{};
 
 struct __HtoD{};
 struct __HtoH{};
@@ -69,6 +70,12 @@ struct TargetTypeTraits<AMD> {
   typedef __amd_device target_type;
 };
 
+template <>
+struct TargetTypeTraits<BM> {
+  typedef __device_target target_category;
+  typedef __bm_device target_type;
+};
+
 } //namespace saber
 
 } //namespace anakin
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 778491505..6d5d6a8d1 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -360,6 +360,64 @@ struct TargetWrapper<NV, __device_target> {
 
 #endif //USE_CUDA
 
+#ifdef USE_BM
+        /**
+ * \brief for Bitmain sophon device target only, device target is BM tpu
+ * use bitmain api to manage memory
+ * support device to device, device to host, host to device memcpy
+*/
+template <>
+struct TargetWrapper<BM, __device_target> {
+    typedef void* event_t;
+    typedef void* stream_t;
+
+    static void get_device_count(int& count);
+
+    static void set_device(int id);
+
+    //We should add strategy to avoid malloc directly
+    static void mem_alloc(void** ptr, size_t n);
+
+    //template <typename void>
+    static void mem_free(void * ptr);
+
+    //template <typename void>
+    static void mem_set(void* ptr, int value, size_t n);
+
+    // brief create event, empty function for bitmain target
+    static void create_event(event_t& event, bool flag = false) {}
+    static void destroy_event(event_t& event) {}
+    static void create_stream(stream_t& stream) {}
+    static void create_stream_with_flag(stream_t& stream, unsigned int flag) {}
+    static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority) {}
+    static void destroy_stream(stream_t& stream) {}
+    static void record_event(event_t& event, stream_t stream) {}
+    static void query_event(event_t& event) {}
+    static void sync_event(event_t& event) {}
+    static void sync_stream(event_t& event, stream_t& stream) {}
+    // brief create event, empty function for bitmain target
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+        size_t count, __DtoD);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+        size_t count, __HtoD);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+        size_t count, __DtoH);
+
+    static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+        int src_dev, size_t count);
+
+    /**
+     * \brief device target return currently used device id
+     * @return          currently activated device id
+     */
+    static int get_device_id();
+};
+
+#endif //USE_BM
+
 } //namespace saber
 
 } //namespace anakin
diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 2e64dcdec..046fef53c 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -362,6 +362,100 @@ Context<NV> ctx) {
 }
 #endif
 
+
+/*#ifdef USE_BM
+
+template<>
+SaberStatus
+DataTensorTransformHelper::convert_weights<Tensor<X86, AK_INT8, NCHW_C4>,
+                          Tensor<X86, AK_FLOAT, NCHW> >(Tensor<X86, AK_INT8, NCHW_C4>& out_tensor,
+                                  const Tensor<X86, AK_FLOAT, NCHW>& in_tensor,
+Context<BM> ctx) {
+    int input_channel = in_tensor.channel();
+    int output_channel = out_tensor.shape()[1];
+    //            LOG(INFO)<<"input_channel = "<<input_channel<<" output_channel = "<<output_channel;
+    _vector_weight_scale.resize(input_channel);
+
+    int weight_inner_dim = in_tensor.channel()
+                           * in_tensor.height()
+                           * in_tensor.width();
+    const float* in_weight_data = in_tensor.data();
+
+    for (int c = 0; c < input_channel; ++c) {
+        float max_val = -1.f;
+
+        for (int i = 0; i < weight_inner_dim; ++i) {
+            float read_data = fabs(in_weight_data[i]);
+            max_val = (read_data > max_val) ? read_data : max_val;
+        }
+
+        _vector_weight_scale[c] = max_val / 127.f;
+        in_weight_data += weight_inner_dim;
+        //                LOG(INFO)<<"max_val = "<<max_val<<" vector: "<<max_val / 127.f;
+    }
+
+    int o_num = out_tensor.num();
+    int o_channel = output_channel;
+    int o_height = out_tensor.height();
+    int o_width = out_tensor.width();
+
+    int out_n_stride = o_channel * o_height * o_width;
+    int out_c_stride = o_height * o_width;
+    int out_h_stride = o_width;
+
+    Shape in_stride = in_tensor.get_stride();
+
+    in_weight_data = in_tensor.data();
+    char* out_weight_data = out_tensor.mutable_data();
+
+    for (int idx = 0; idx < o_num * o_channel * o_height * o_width; ++idx) {
+
+        int n = (idx / (out_n_stride)) % o_num;
+        int in_offset = ((idx / (out_n_stride)) % o_num) * in_stride[0]
+                        + ((idx / (out_c_stride)) % o_channel) * (in_stride[1] * 4)
+                        + ((idx / (out_h_stride)) % o_height) * in_stride[2]
+                        + (idx % o_width) * in_stride[3];
+
+        int out_offset = ((idx / (out_n_stride)) % o_num) * out_n_stride
+                         + ((idx / (out_c_stride)) % o_channel) * out_c_stride
+                         + ((idx / (out_h_stride)) % o_height) * out_h_stride
+                         + (idx % o_width);
+        out_weight_data[out_offset * 4 + 0] = (char)(round(
+                in_weight_data[in_offset + 0 * in_stride[1]] / _vector_weight_scale[n]));
+        out_weight_data[out_offset * 4 + 1] = (char)(round(
+                in_weight_data[in_offset + 1 * in_stride[1]] / _vector_weight_scale[n]));
+        out_weight_data[out_offset * 4 + 2] = (char)(round(
+                in_weight_data[in_offset + 2 * in_stride[1]] / _vector_weight_scale[n]));
+        out_weight_data[out_offset * 4 + 3] = (char)(round(
+                in_weight_data[in_offset + 3 * in_stride[1]] / _vector_weight_scale[n]));
+
+    }
+
+    return SaberSuccess;
+}
+template<>
+SaberStatus
+DataTensorTransformHelper::convert_bias<Tensor<X86, AK_FLOAT, NCHW>,
+                          Tensor<X86, AK_FLOAT, NCHW> >(Tensor<X86, AK_FLOAT, NCHW>& out_tensor,
+                                  const Tensor<X86, AK_FLOAT, NCHW>& in_tensor,
+Context<BM> ctx) {
+    unsigned long weight_size = _vector_weight_scale.size();
+    unsigned long bias_size = in_tensor.size();
+    CHECK_GT(_in_scale, 0);
+    CHECK_GT(weight_size, 0);
+    CHECK_EQ(bias_size, weight_size);
+
+    const float* in_data = in_tensor.data();
+    float* out_data = out_tensor.mutable_data();
+
+    for (int i = 0; i < bias_size; ++i) {
+        out_data[i] = in_data[i] / _in_scale / _vector_weight_scale[i];
+    }
+
+    return SaberSuccess;
+}
+#endif*/
+
 } //namespace saber
 
 } //namespace anakin
diff --git a/saber/funcs/CMakeLists.txt b/saber/funcs/CMakeLists.txt
index deaf76eab..bdd319f13 100644
--- a/saber/funcs/CMakeLists.txt
+++ b/saber/funcs/CMakeLists.txt
@@ -10,6 +10,10 @@ if(USE_CUDA)
     #FILE(GLOB CUDA_BASE_SRCS "cuda/*.cpp" "cuda/*.cu")
     aux_source_directory(impl/cuda CUDA_BASE_SRCS)
 endif()
+if(USE_BM)
+    #FILE(GLOB BM_BASE_SRCS "cuda/*.cpp" "cuda/*.cu")
+    aux_source_directory(impl/bm BM_BASE_SRCS)
+endif()
 if(USE_AMD)
     #FILE(GLOB CUDA_BASE_SRCS "cuda/*.cpp" "cuda/*.cu")
     aux_source_directory(impl/amd AMD_BASE_SRCS)
@@ -48,6 +52,14 @@ foreach(SRC_NAME ${CUDA_BASE_SRCS})
     list(APPEND DIR_SRCS_CUR "${CMAKE_CURRENT_SOURCE_DIR}/${FILE_NAME}")
 endforeach()
 
+foreach(SRC_NAME ${BM_BASE_SRCS})
+    #unpack the dir "/"
+    string(REPLACE "./" "" FILE_NAME ${SRC_NAME})
+    string(REPLACE " " "" FILE_NAME ${FILE_NAME})
+    #string(REPLACE ".cpp" ".cpp;" FILE_NAME ${FILE_NAME})
+    list(APPEND DIR_SRCS_CUR "${CMAKE_CURRENT_SOURCE_DIR}/${FILE_NAME}")
+endforeach()
+
 foreach(SRC_NAME ${X86_BASE_SRCS})
     #unpack the dir "/"
     string(REPLACE "./" "" FILE_NAME ${SRC_NAME})
diff --git a/saber/funcs/impl/bm/base/CMakeLists.txt b/saber/funcs/impl/bm/base/CMakeLists.txt
new file mode 100644
index 000000000..fd4b3d680
--- /dev/null
+++ b/saber/funcs/impl/bm/base/CMakeLists.txt
@@ -0,0 +1,20 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved
+# @file     CMakeLists files in the saber  subdirectory for nvidia gpu code
+# @auther   cuichaowen
+# @date     2017-11-29
+# ----------------------------------------------------------------------------
+
+if(USE_BM)
+    anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/include "h" ANAKIN_SABER_BM_C_SRC)
+    anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "o" ANAKIN_SABER_BM_STATIC_LIB)
+endif()
+
+macro(anakin_set_upscope src)
+    set(${src} ${${src}} PARENT_SCOPE)
+endmacro()
+
+if(USE_BM)
+    anakin_set_upscope(ANAKIN_SABER_BM_C_SRC)
+    anakin_set_upscope(ANAKIN_SABER_BM_STATIC_LIB)
+endif()
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
new file mode 100644
index 000000000..97feb1972
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
@@ -0,0 +1,814 @@
+#ifndef BMDNN_API_H
+#define BMDNN_API_H
+
+#include "bmdnn_runtime.h"
+#include "op_code.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * All the name-style of input/output are in the viewpoint of forward operation
+ */
+
+typedef struct kernel_param{
+    int g;
+    int oc;
+    int ic;
+    int h;
+    int w;
+}bm_kernel_param_t;
+
+typedef struct bm_conv_param{
+    int stride_h;
+    int stride_w;
+    int pad_h;
+    int pad_w;
+    int dilation_h;
+    int dilation_w;
+    bool result_add;
+}bm_conv_param_t;
+
+typedef struct bm_pool_param{
+  int stride_h;
+  int stride_w;
+  int pad_h;
+  int pad_w;
+  int kh;
+  int kw;
+  bool is_avg_pooling;
+}bm_pool_param_t;
+
+bm_status_t bmdnn_conv_relu_pool_forward(
+    bm_handle_t      handle,
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_pool_param_t     pool_param,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_conv_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_tensor_4d_t      output_shape,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_deconv_forward(
+    bm_handle_t      handle,
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_tensor_4d_t      output_shape,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_conv_backward_bias(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 groups,
+    int                 output_c,
+    int                 kh,
+    int                 kw,
+    int                 pad_h,
+    int                 pad_w,
+    int                 stride_h,
+    int                 stride_w,
+    int                 result_add,
+    //output
+    bm_device_mem_t  bias_diff);
+
+bm_status_t bmdnn_pooling_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 kh,
+    int                 kw,
+    int                 pad_h,
+    int                 pad_w,
+    int                 stride_h,
+    int                 stride_w,
+    int                 is_avg_pooling,
+    //output
+    bm_device_mem_t  output
+    );
+bm_status_t bmdnn_upsample_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 size,
+    //output
+    bm_device_mem_t  output
+    );
+bm_status_t bmdnn_roi_pooling_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  rois,
+    int              input_n,
+    int              input_c,
+    int              input_h,
+    int              input_w,
+    int              pooled_h,
+    int              pooled_w,
+    int              roi_num,
+    int              spatial_scale,
+    //output
+    bm_device_mem_t  output
+    );
+
+bm_status_t bmdnn_fc_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    int              batch_size,
+    int              num_output_neuron,
+    int              num_input_neuron,
+    int              transpose,
+    int              using_bias,
+    int              using_relu,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_fc_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    int              num_output_neuron,
+    int              batch_size,
+    int              num_input_neuron,
+    int              using_bias,
+    int              propagate_down_bias_diff,
+    int              propagate_down_weight_diff,
+    int              propagate_down_bottom,
+    //output
+    bm_device_mem_t  weight_diff,
+    bm_device_mem_t  bias_diff,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_dropout_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    float            dropout_ratio,
+    int              input_n,
+    int              input_dim,
+    //output
+    bm_device_mem_t  output,
+    bm_device_mem_t  mask);
+
+bm_status_t bmdnn_dropout_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    float               dropout_ratio,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_batchnorm_forward_inference(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  mean_ma,
+    bm_device_mem_t  variance_ma,
+    float               scale_ma,
+    bm_device_mem_t  variance,
+    float               eps,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_batchnorm_forward_train(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    float               ma_fraction,
+    float               eps,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output,
+    bm_device_mem_t  mean,
+    bm_device_mem_t  variance,
+    bm_device_mem_t  mean_ma,
+    bm_device_mem_t  variance_ma);
+
+bm_status_t bmdnn_batchnorm_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    bm_device_mem_t  variance,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 using_global_stats,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_lrn_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 lrn_n,
+    float               alpha,
+    float               beta,
+    float               k,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_lrn_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    bm_device_mem_t  input,
+    int                 lrn_n,
+    float               alpha,
+    float               beta,
+    float               k,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_relu_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    float               negative_slope,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_relu_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    float               negative_slope,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_sigmoid_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_sigmoid_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_tanh_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_tanh_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_softmax_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_softmax_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_softmax_loss_forward(
+    bm_handle_t      handle,
+    bm_device_mem_t  input,
+    bm_device_mem_t  label,
+    float               normalizer,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    bm_device_mem_t  output,
+    bm_device_mem_t  loss);
+bm_status_t bmdnn_interp_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 pad_bag,
+    int                 pad_end,
+    int                 output_h,
+    int                 output_w,
+    //output
+    bm_device_mem_t  output
+    );
+bm_status_t bmdnn_softmax_loss_backward(
+    bm_handle_t      handle,
+    bm_device_mem_t  output,
+    bm_device_mem_t  label,
+    bm_device_mem_t  loss,
+    float               normalizer,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_softmax_loss_bidirection(
+    bm_handle_t      handle,
+    bm_device_mem_t  input,
+    bm_device_mem_t  label,
+    float               normalizer,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  loss);
+
+bm_status_t bmdnn_multiregion_forward_parallel(
+    bm_handle_t         handle,
+    //input
+    bm_device_mem_t*     input,
+    int*                 input_n,
+    int*                 input_c,
+    int*                 input_h,
+    int*                 input_w,
+    int                  input_num,
+    int                 classes,
+    int                 coords,
+    int                 nums,
+    int*                 Activate_parm,
+    //output
+    bm_device_mem_t*  output
+);
+
+bm_status_t bmdnn_accuracy(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  label_idx,
+    bm_device_mem_t  input_mem_buffer,
+    int                 input_num,
+    int                 input_dim,
+    int                 top_k,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_coeff_update_sgd(
+    bm_handle_t      handle,
+    bm_device_mem_t  weight_diff,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  history_weight,
+    int                 weight_count,
+    float               base_lr,
+    float               momentum,
+    float               weight_decay);
+
+bm_status_t bmdnn_fc_backward_sgd(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    //input and output
+    bm_device_mem_t  weight,
+    bm_device_mem_t  weight_history,
+    int                 num_output_neuron,
+    int                 batch_size,
+    int                 num_input_neuron,
+    int                 using_bias,
+    int                 propagate_down_bias_diff,
+    int                 propagate_down_weight_diff,
+    int                 propagate_down_bottom,
+    float               base_lr,
+    float               momentum,
+    float               weight_decay,
+    //output
+    bm_device_mem_t  bias_diff,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_permute(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_normalize_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  scale,
+    float               eps,
+    float               scale_val,
+    bool                across_spatial,
+    bool                channel_share,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output);
+
+/*
+ * MD Operations for user
+ */
+
+
+bm_status_t bmdnn_md_scalar(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    bm_device_mem_t  tensor_B,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    ALIGN_TENSOR_OP             align_tensor_op,
+    int                 result_add,
+    int                 A_is_constant,
+    int                 B_is_constant,
+    float               A_const_val,
+    float               B_const_val,
+    int                 B_N_is_1,
+    int                 B_index_is_1,
+    //output
+    bm_device_mem_t  tensor_R);
+
+bm_status_t bmdnn_md_cmp(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    bm_device_mem_t  tensor_B,
+    bm_device_mem_t  tensor_C,
+    bm_device_mem_t  tensor_D,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 A_is_constant,
+    int                 B_is_constant,
+    int                 C_is_constant,
+    int                 D_is_constant,
+    float               A_constant,
+    float               B_constant,
+    unsigned int        C_constant,
+    unsigned int        D_constant,
+    int                 result_skip,
+    //output
+    bm_device_mem_t  tensor_Y,
+    bm_device_mem_t  tensor_R);
+
+bm_status_t bmdnn_md_sfu(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    SFU_OP              sfu_op,
+    float               a,
+    int                 n,
+    //output
+    bm_device_mem_t  tensor_Y);
+
+bm_status_t bmdnn_md_sum(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 result_add,
+    //output
+    bm_device_mem_t  tensor_Y);
+
+
+bm_status_t bmdnn_md_linear(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    bm_device_mem_t  tensor_B,
+    bm_device_mem_t  tensor_S,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    LINEAR_OP           linear_op,
+    int                 result_add,
+    int                 B_is_const,
+    int                 S_is_const,
+    float               B_const_val,
+    float               S_const_val,
+    //output
+    bm_device_mem_t  tensor_Y);
+
+bm_status_t bmdnn_img_sum(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 result_add,
+    //output
+    bm_device_mem_t  tensor_Y);
+
+/*
+ * fullnet mode
+ */
+bm_status_t bmdnn_fullnet(
+        bm_handle_t handle,
+        unsigned long long bdc_cmd_offset,
+        unsigned long long gdma_cmd_offset,
+        unsigned long long cdma_cmd_offset,
+        unsigned long long cmd_num_offset
+        );
+
+/*
+ * multiple fullnet mode
+ */
+bm_status_t bmdnn_multi_fullnet(
+        bm_handle_t handle,
+        int input_num,
+        unsigned long long* user_input_global_offset,
+        unsigned long long* cmd_input_global_offset,
+        int* input_tensor_size,
+        int output_num,
+        unsigned long long* user_output_global_offset,
+        unsigned long long* cmd_output_global_offset,
+        int* output_tensor_size,
+        unsigned long long bdc_cmd_offset,
+        unsigned long long gdma_cmd_offset,
+        unsigned long long cdma_cmd_offset,
+        int* bdc_cmd_num,
+        int* gdma_cmd_num,
+        int* cdma_cmd_num,
+        int cmdgroup_num
+        );
+
+/*
+ * dynamic fullnet mode
+ */
+bm_status_t bmdnn_dynamic_fullnet(
+        bm_handle_t handle,
+        unsigned long long compiled_ir_global_addr,
+        unsigned int compiled_ir_length,
+        unsigned int batch_num,
+        unsigned int input_num,
+        unsigned long long* input_global_offset,
+        unsigned int* input_height,
+        unsigned int* input_width,
+        unsigned int output_num,
+        unsigned long long* output_global_offset,
+        unsigned long long apd_ctx_mem_offset
+#if defined(USING_CMODEL) && !defined(USING_FULLNET)
+        ,float**    p_refer_result
+#endif
+        );
+
+/**
+  * Depthwise convolution.
+  */
+bm_status_t bmdnn_depthwise_forward(
+        bm_handle_t         handle,
+        bm_device_mem_t     input,
+        bm_device_mem_t     weight,
+        bm_device_mem_t     bias,
+        int                 input_n,
+        int                 input_c,
+        int                 input_h,
+        int                 input_w,
+        int                 kernel_h,
+        int                 kernel_w,
+        int                 dilation_h,
+        int                 dilation_w,
+        int                 pad_h,
+        int                 pad_w,
+        int                 stride_h,
+        int                 stride_w,
+        int                 using_bias,
+        bm_device_mem_t     output);
+
+bm_status_t bmdnn_lstm_forward(
+        bm_handle_t      handle,
+        //input
+        bm_device_mem_t  input,
+        bm_device_mem_t  cont,
+        bm_device_mem_t  input_static,
+        /*bm_device_mem_t  w_hc,
+        bm_device_mem_t  w_xc,*/
+        bm_device_mem_t  w_hxc,
+        bm_device_mem_t  w_xstatic,
+        bm_device_mem_t  b_c,
+        bm_device_mem_t  h_0,
+        bm_device_mem_t  c_0,
+        int                 input_n,
+        int                 seq_len,
+        int                 input_dim,
+        int                 input_static_dim,
+        int                 output_dim,
+        int                 with_input_static,
+        //output
+        bm_device_mem_t  c,
+        bm_device_mem_t  gate,
+        bm_device_mem_t  h_T,
+        bm_device_mem_t  c_T,
+        bm_device_mem_t  h);
+
+bm_status_t bmdnn_netease_ocr_forward(
+        bm_handle_t      handle,
+        //input
+        bm_device_mem_t  conv1_ifmap,
+        bm_device_mem_t  params,
+        bm_device_mem_t  result);
+
+typedef struct dim4_s {
+    int n, c, h, w;
+} dim4_t;
+enum
+{
+    CONV_DEPTHWISE,
+    CONV_3D
+};
+typedef struct mobilenet_conv_param_s
+{
+    /** convolution. */
+    int type;
+    bm_device_mem_t kernel;
+    bm_device_mem_t bias;
+    dim4_t          kernel_shape;
+    int             dilation_h, dilation_w;
+    int             pad_h, pad_w;
+    int             stride_h, stride_w;
+    bool            using_bias;
+    /** batchnorm. */
+    bm_device_mem_t mean;
+    bm_device_mem_t variance;
+    /** relu. */
+    float           slope;
+} mobilenet_conv_param_t;
+bm_status_t bmdnn_mobilenet_forward(
+        bm_handle_t handle,
+        const mobilenet_conv_param_t   *conv,
+        int                             num,
+        const dim4_t                   &input_shape,
+        const bm_device_mem_t          &input_global_mem,
+        dim4_t                         &output_shape,
+        bm_device_mem_t                &output_global_mem,
+        float                           parallel_performance_factor = 1.f);
+
+bm_status_t bmdnn_conv_forward_bank_conflict(
+    bm_handle_t         handle,
+    //input
+    bm_device_mem_t     input,
+    bm_device_mem_t     weight,
+    bm_device_mem_t     bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_tensor_4d_t      output_shape,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    //output
+    bm_device_mem_t     output);
+
+bm_status_t bmdnn_pooling_forward_bank_conflict(
+    bm_handle_t         handle,
+    //input
+    bm_device_mem_t     input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 kh,
+    int                 kw,
+    int                 pad_h,
+    int                 pad_w,
+    int                 stride_h,
+    int                 stride_w,
+    int                 is_avg_pooling,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_fc_forward_bank_conflict(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    int              batch_size,
+    int              num_output_neuron,
+    int              num_input_neuron,
+    int              transpose,
+    int              using_bias,
+    int              using_relu,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_conv_forward_power_evaluation(
+    bm_handle_t         handle,
+    //input
+    bm_device_mem_t     input,
+    bm_device_mem_t     weight,
+    bm_device_mem_t     bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_tensor_4d_t      output_shape,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    //output
+    bm_device_mem_t     output);
+
+bm_status_t bmdnn_img_scale(
+        bm_handle_t handle, bm_device_mem_t dst, bm_device_mem_t src, int n,
+        int c, int dh, int sh, int dw, int sw);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BMDNN_API_H */
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
new file mode 100644
index 000000000..384cd4108
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
@@ -0,0 +1,438 @@
+#ifndef BMDNN_EXT_API_H
+#define BMDNN_EXT_API_H
+
+#include "bmdnn_runtime.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+bm_status_t bmdnn_threshold_forward(
+    bm_handle_t      handle,
+    float               threshold,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output
+    );
+
+bm_status_t bmdnn_exp_forward(
+    bm_handle_t      handle,
+    float               base,
+    float               input_scale,
+    float               input_shift,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output
+    );
+
+bm_status_t bmdnn_exp_backward(
+    bm_handle_t      handle,
+    float               base,
+    float               input_scale,
+    float               input_shift,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff
+    );
+
+bm_status_t bmdnn_power_forward(
+    bm_handle_t      handle,
+    float               power_,
+    float               scale_,
+    float               shift_,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output
+    );
+
+bm_status_t bmdnn_power_backward(
+    bm_handle_t      handle,
+    float               power_,
+    float               scale_,
+    float               shift_,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff
+    );
+
+bm_status_t bmdnn_euclidean_loss_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  label,
+    bm_device_mem_t  temp_,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  diff,
+    bm_device_mem_t  loss);
+
+bm_status_t bmdnn_euclidean_loss_backward(
+    bm_handle_t      handle,
+    float               alpha,
+    //input
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_silence_backward(
+    bm_handle_t      handle,
+    //input
+    //bm_device_mem_t  output_data,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_lstm_unit_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  X_i,
+    bm_device_mem_t  X_f,
+    bm_device_mem_t  X_o,
+    bm_device_mem_t  X_g,
+    bm_device_mem_t  C_prev,
+    bm_device_mem_t  cont_expand,
+    int                 num,
+    int                 hidden_dim,
+    //output
+    bm_device_mem_t  C,
+    bm_device_mem_t  H);
+
+bm_status_t bmdnn_lstm_unit_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  C_diff,
+    bm_device_mem_t  H_diff,
+    bm_device_mem_t  X_i,
+    bm_device_mem_t  X_f,
+    bm_device_mem_t  X_o,
+    bm_device_mem_t  X_g,
+    bm_device_mem_t  C_prev,
+    bm_device_mem_t  C,
+    bm_device_mem_t  cont_expand,
+    int                 num,
+    int                 hidden_dim,
+    //output
+    bm_device_mem_t  C_prev_diff,
+    bm_device_mem_t  X_i_diff,
+    bm_device_mem_t  X_f_diff,
+    bm_device_mem_t  X_o_diff,
+    bm_device_mem_t  X_g_diff);
+
+bm_status_t bmdnn_eltwise_forward(
+    bm_handle_t      handle,
+    int                 op_,
+    int                 flag_first,
+    float               coeffs_,
+    int                 index,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  target,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  mask_data,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_eltwise_backward(
+    bm_handle_t      handle,
+    int                 op_,
+    int                 flag_first,
+    float               coeffs_,
+    int                 index,
+    //input
+    bm_device_mem_t  output_data,
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input_data,
+    bm_device_mem_t  mask_data,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_bias_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  bias,
+    int                 outer_dim,
+    int                 dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_bias_backward(
+    bm_handle_t      handle,
+    int                 flag,
+    //input
+    bm_device_mem_t  output_diff,
+    int                 outer_dim,
+    int                 bias_dim,
+    int                 inner_dim,
+    //output
+    bm_device_mem_t  input_diff,
+    bm_device_mem_t  bias_diff);
+
+bm_status_t bmdnn_log_forward(
+    bm_handle_t      handle,
+    float               scale,
+    float               shift,
+    float               base,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_log_backward(
+    bm_handle_t      handle,
+    float               scale,
+    float               shift,
+    float               base,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_absval_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_absval_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_sigmoid_cross_entropy_loss_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  target,
+    bm_device_mem_t  buffer,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output,
+    bm_device_mem_t  loss);
+
+bm_status_t bmdnn_sigmoid_cross_entropy_loss_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output,
+    bm_device_mem_t  target,
+    bm_device_mem_t  output_diff,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_contrastive_loss_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input_0,
+    bm_device_mem_t  input_1,
+    bm_device_mem_t  label,
+    bm_device_mem_t  buffer,
+    int                 input_n,
+    int                 input_c,
+    float               margin,
+    bool                legacy_version,
+    //output
+    bm_device_mem_t  diff,
+    bm_device_mem_t  dist_sq,
+    bm_device_mem_t  loss);
+
+bm_status_t bmdnn_contrastive_loss_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  label,
+    bm_device_mem_t  diff,
+    bm_device_mem_t  dist_sq,
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  buffer,
+    int                 input_n,
+    int                 input_dim,
+    float               margin,
+    bool                legacy_version,
+    int                 propagate_down_flag,
+    //output
+    bm_device_mem_t  input_diff_0,
+    bm_device_mem_t  input_diff_1);
+
+bm_status_t bmdnn_filter_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  filter,
+    int                 input_n,
+    int                 output_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_filter_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  filter,
+    int                 input_n,
+    int                 output_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_split_backward(
+    bm_handle_t      handle,
+    //input
+    int                 is_first,
+    bm_device_mem_t  output_diff,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_bnll_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_bnll_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    float               threshold,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_prelu_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  slope,
+    float            slope0,
+    int                 channel_shared,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_prelu_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    bm_device_mem_t  slope,
+    int                 propagate_down_flag,
+    int                 channel_shared,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  slope_diff,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_scale_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  scale,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 scale_dim,
+    int                 inner_dim,
+    int                 scale_is_neuron,
+    //output
+    bm_device_mem_t  scale_extension,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_scale_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input_data,
+    bm_device_mem_t  scale_extension,
+    int                 propagate_down_flag,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 scale_dim,
+    int                 inner_dim,
+    int                 scale_is_neuron,
+    //output
+    bm_device_mem_t  scale_diff,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_elu_forward(
+    bm_handle_t      handle,
+    float               alpha,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_elu_backward(
+    bm_handle_t      handle,
+    float               alpha,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BMDNN_EXT_API_H */
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
new file mode 100644
index 000000000..6fede1338
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
@@ -0,0 +1,20 @@
+#ifndef BMDNN_RUNTIME_H_
+#define BMDNN_RUNTIME_H_
+
+#include "bmlib_runtime.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+bm_status_t bmdnn_init(
+    bm_handle_t     *handle);
+
+void bmdnn_deinit(
+    bm_handle_t      handle);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
new file mode 100644
index 000000000..f85846a8a
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
@@ -0,0 +1,62 @@
+#ifndef OP_CODE_H_
+#define OP_CODE_H_
+
+
+typedef enum align_tensor_op {
+    ALIGN_TENSOR_ADD,
+    ALIGN_TENSOR_SUB,
+    ALIGN_TENSOR_MUL,
+    ALIGN_TENSOR_DIV,
+    TENSOR_INVALID
+} ALIGN_TENSOR_OP;
+
+typedef enum linear_op {
+    LINEAR_MAC,
+    LINEAR_ADD_SQR,
+    LINEAR_SUB_SQR
+} LINEAR_OP;
+
+typedef enum sfu_op {
+    SFU_XN,
+    SFU_EX,
+    SFU_LNX,
+    SFU_RSQ,
+    SFU_INVALID
+} SFU_OP;
+typedef struct tensor_4d_t {
+    int n;
+    int c;
+    int h;
+    int w;
+}bm_tensor_4d_t;
+
+
+#define TENSOR_ADD 0
+#define TENSOR_SUB 1
+#define TENSOR_MUL 2
+//Note the div should be implmented by KAMAKE algorithm
+#define TENSOR_DIV 3
+#define TENSOR_MAX 4
+#define TENSOR_CPY 5
+#define TENSOR_MAC 6
+
+#define TENSOR_N_DIM 0
+#define TENSOR_C_DIM 1
+#define TENSOR_H_DIM 2
+#define TENSOR_W_DIM 3
+
+#define SHARE_REG_MESSAGE_WP            0
+#define SHARE_REG_MESSAGE_RP            1
+#define SHARE_REG_MESSAGE_IRQSTATUS     2
+#define SHARE_REG_CDMA_IRQSTATUS    3 
+
+#define SHAREMEM_MSG_FIXED_OFFSET  (8192)
+#define SHAREMEM_SIZE_BIT  8
+#define SHAREMEM_MASK      ((1<<SHAREMEM_SIZE_BIT) - 1)
+#define SHARE_REG_CNT      16
+
+#define IRQ_STATUS_CDMA_INT             0x1111
+#define IRQ_STATUS_MSG_DONE_INT         0x2222
+
+ 
+#endif /* OP_CODE_H_ */
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
new file mode 100644
index 000000000..932b17138
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
@@ -0,0 +1,229 @@
+#ifndef BMLIB_RUNTIME_H_
+#define BMLIB_RUNTIME_H_
+#include <stdbool.h>
+#include <stddef.h>
+
+#if !defined(__x86_64__) && !defined(__aarch64__)
+#error "BM needs 64-bit to compile"
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+typedef enum {
+  BM_SUCCESS                 = 0,
+  BM_ERR_DEVNOTREADY          = 1,   /* Device not ready yet */
+  BM_ERR_FAILURE             = 2,   /* General failure */
+  BM_ERR_TIMEOUT             = 3,   /* Timeout */
+  BM_ERR_PARAM               = 4,   /* Parameters invalid */
+  BM_ERR_NOMEM               = 5,   /* Not enough memory */
+  BM_ERR_DATA                = 6,   /* Data error */
+  BM_ERR_BUSY                = 7,   /* Busy */
+  BM_ERR_NOFEATURE           = 8,    /* Not supported yet */
+  BM_NOT_SUPPORTED           = 9
+} bm_status_t;
+
+typedef enum {
+  BM_MEM_TYPE_DEVICE  = 0,
+  BM_MEM_TYPE_HOST    = 1,
+  BM_MEM_TYPE_SYSTEM  = 2,
+  BM_MEM_TYPE_INT8_DEVICE  = 3,
+  BM_MEM_TYPE_INVALID = 4
+} bm_mem_type_t;
+
+#define BM_MEM_ADDR_NULL     (0xfffffffff)
+
+typedef struct bm_mem_desc {
+  unsigned char                 desc[16];
+} bm_mem_desc_t;
+
+struct bm_context;
+typedef struct bm_context *  bm_handle_t;
+typedef struct bm_mem_desc   bm_device_mem_t;
+typedef struct bm_mem_desc   bm_host_mem_t;
+typedef struct bm_mem_desc   bm_system_mem_t;
+
+#define BM_CHECK_RET(call)                         \
+    do {                                        \
+      bm_status_t ret = call;                \
+	  if ( ret != BM_SUCCESS ) {             \
+        printf("BM_CHECK_RET failed %d\n", ret);   \
+        ASSERT(0);                              \
+        exit(-ret);                             \
+      }                                         \
+    } while(0)
+
+/*
+ * control 
+ */
+void bm_flush(
+    bm_handle_t      handle);
+/*
+ * brief malloc host memory according to a tensor shape(each neuron is 32 bits)
+*/
+
+bm_status_t bm_malloc_neuron_device(
+    bm_handle_t      handle,
+    bm_device_mem_t *pmem,
+    int              n,
+    int              c,
+    int              h,
+    int              w);
+
+/*
+ * brief malloc host memory in size of dword(32 bits)
+*/
+
+bm_status_t bm_malloc_device_dword(
+    bm_handle_t      handle,
+    bm_device_mem_t *pmem,
+    int              count);
+
+/*
+ * brief malloc host memory in size of byte
+*/
+
+bm_status_t bm_malloc_device_byte(
+    bm_handle_t      handle,
+    bm_device_mem_t *pmem,
+    unsigned int     size);
+
+void bm_free_device(
+    bm_handle_t      handle,
+    bm_device_mem_t  mem);
+
+/*
+ * brief malloc host memory in size of byte
+ */
+bm_status_t bm_malloc_host(
+    bm_handle_t      handle,
+    bm_host_mem_t   *pmem,
+    unsigned int     size);
+
+void bm_free_host(
+    bm_handle_t      handle,
+    bm_host_mem_t    mem);
+
+void *bm_host_mem_get_pointer(
+    bm_host_mem_t    mem);
+
+/*
+ * Memory copy and set
+ */
+bm_status_t bm_memcpy_h2d(
+    bm_handle_t      handle,
+    bm_device_mem_t  dst,
+    bm_host_mem_t    src);
+
+bm_status_t bm_memcpy_d2h(
+    bm_handle_t      handle,
+    bm_host_mem_t    dst,
+    bm_device_mem_t  src);
+
+
+bm_status_t bm_memcpy_s2d(
+    bm_handle_t      handle,
+    bm_device_mem_t  dst,
+    bm_system_mem_t  src);
+
+bm_status_t bm_memcpy_d2s(
+    bm_handle_t      handle,
+    bm_system_mem_t  dst,
+    bm_device_mem_t  src);
+
+bm_status_t bm_memcpy_d2d(
+    bm_handle_t     handle,
+    bm_device_mem_t dst,
+    int             dst_offset,
+    bm_device_mem_t src,
+    int             src_offset,
+    int             len);
+
+bm_status_t bm_memset_device(
+    bm_handle_t      handle,
+    const int        value,
+    bm_device_mem_t  mem);
+
+bm_device_mem_t bm_mem_from_system(
+    void *              system_addr);
+
+/*
+*brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to
+device mem if need_copy is true
+*/
+
+bm_status_t bm_mem_convert_system_to_device_neuron(
+    bm_handle_t          handle,
+    struct bm_mem_desc  *dev_mem,
+    struct bm_mem_desc   sys_mem,
+    bool                 need_copy,
+    int                  n,
+    int                  c,
+    int                  h,
+    int                  w);
+
+/*
+*brief malloc one device memory with the size of coeff_count, copy the sys_mem to
+device mem if need_copy is true
+*/
+bm_status_t bm_mem_convert_system_to_device_coeff(
+    bm_handle_t          handle,
+    struct bm_mem_desc  *dev_mem,
+    struct bm_mem_desc   sys_mem,
+    bool                 need_copy,
+    int                  coeff_count);
+
+/*
+ * memory info get and set
+ */
+unsigned long long bm_mem_get_device_addr(struct bm_mem_desc mem);
+void               bm_mem_set_device_addr(struct bm_mem_desc & mem, unsigned long long addr);
+unsigned int       bm_mem_get_device_size(struct bm_mem_desc mem);
+void               bm_mem_set_device_size(struct bm_mem_desc & mem, unsigned int size);
+bm_mem_type_t      bm_mem_get_type(struct bm_mem_desc mem);
+
+/* 
+* brief Get the handle of bmlib_runtime
+* return : If the handle has been inited, return the handle it self , else init one and return it
+*/
+bm_handle_t get_bm_handle();
+
+/*
+ * Helper functions
+ */
+
+/**
+* \brief Get the number of nodechip (Constant 1 in bm1682)
+* \return
+* \ref NO
+*/
+int bm_get_nodechip_num(
+    bm_handle_t      handle);
+
+/**
+* \brief Get the number of nodechip (Constant 64 in bm1682)
+* \return
+* \ref NO
+*/
+int bm_get_npu_num(
+    bm_handle_t      handle);
+int bm_get_eu_num( bm_handle_t handle);
+/**
+* \brief Get the number of nodechip (Constant 64 in bm1682)
+* \return
+* \ref NO
+*/
+bm_device_mem_t bm_mem_null(void);
+#define BM_MEM_NULL  (bm_mem_null())
+
+bm_status_t bm_dev_getcount(int* count);
+bm_status_t bm_dev_query(int devid);
+bm_status_t bm_dev_request(bm_handle_t *handle, bool bmkernel_used, int devid);
+void bm_dev_free(bm_handle_t handle);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BM_RUNTIME_H_ */
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
new file mode 100644
index 000000000..e878343ef
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
@@ -0,0 +1,72 @@
+#ifndef BMLIB_UTILS_H
+#define BMLIB_UTILS_H
+#include <stdlib.h>
+
+/*
+ * Debug definitions for user app only
+ * Copy from common.h
+ * Don't include for internal usage
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define UNUSED(x)               (void)(x)
+
+#define __ALIGN_MASK(x,mask)    (((x)+(mask))&~(mask))
+#define ALIGN(x,a)              __ALIGN_MASK(x,(__typeof__(x))(a)-1)
+
+int array_cmp(
+    float *p_exp,
+    float *p_got,
+    int len,
+    const char *info_label,
+    float delta);
+
+int tri_array_cmp(
+    float *p_exp,
+    float *p_got,
+    float *third_party,
+    int len,
+    const char *info_label,
+    float delta,
+    int* err_idx);
+
+int array_cmp_int(
+    int *p_exp,
+    int *p_got,
+    int len,
+    const char *info_label
+);
+
+void dump_hex(char *desc, void *addr, int len);
+void dump_data_float(char *desc, void *addr, int n, int c, int h, int w);
+void dump_data_int(char *desc, void *addr, int n, int c, int h, int w);
+void dump_matrix_float(char *desc, void *addr, int row, int col);
+void dump_array_file(char * file, int row_num, int col_num, int transpose, float * parr);
+
+/* dump to file */
+void dump_float_tensor(const char * filename,
+    int length, float * dump_data);
+
+#ifdef __cplusplus
+/* not available in C */
+void random_param(
+    int &n, int &c, int &h, int &w,
+    int &kh, int &kw, int &ph, int &pw, int &sh, int &sw,
+    int &oc);
+
+void random_conv_param(
+    int &n, int &ic, int &ih, int &iw, int &oc,
+    int &kh, int &kw, int &dh, int &dw,
+    int &ph, int &pw, int &sh, int &sw);
+#endif
+
+int conv_coeff_storage_convert(float * coeff_orig, float ** coeff_reformat, unsigned int oc, unsigned int ic, unsigned int kh, unsigned int kw, unsigned int npu_num);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BMLIB_UTILS_H */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h b/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
new file mode 100644
index 000000000..f3e086f91
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
@@ -0,0 +1,97 @@
+#ifndef __BM_BLOB_H__
+#define __BM_BLOB_H__
+
+struct bm_mem_desc;
+typedef struct bm_mem_desc bm_device_mem_t;
+namespace bmcnn {
+
+typedef struct { int n, c, h, w; } Shape;
+
+class BMBlob
+{
+public:
+    /**
+     * \brief Constructor of blob.
+     *
+     * \param shape - Shape of blob
+     */
+    explicit BMBlob(const Shape &shape, void *handle);
+    /**
+     * \brief Deconstructor of blob.
+     */
+    virtual ~BMBlob();
+    /**
+     * \brief Reshape blob.
+     * 
+     * \param n - Batch number of blob
+     * \param c - Channel number of blob
+     * \param h - Height of blob section
+     * \param w - Width of blob section
+     *
+     * \note
+     * (1) For now, number of channels is not allowed to be reshaped.\n
+     * (2) After reshaping, data in this blob will be set vanished.\n
+     */
+    void Reshape(int n, int c, int h, int w);
+    /**
+     * \brief Get shape.
+     */
+    inline Shape shape() const
+    { return shape_; }
+    /**
+     * \brief Get batch size.
+     */
+    inline int batch_num() const
+    { return shape_.n; }
+    /**
+     * \brief Get feature
+     *
+     * \return Channel number of the blob\n
+     */
+    inline int channels() const
+    { return shape_.c; }
+    /**
+     * \brief Get height of section
+     */
+    int height() const
+    { return shape_.h; }
+    /**
+     * \brief Get width of section.
+     */
+    int width() const
+    { return shape_.w; }
+    /**
+     * \brief Get read-only pointer to data in cpu.
+     */
+    const float *cpu_data(); 
+    /**
+     * \brief Get mutable pointer of data in cpu.
+     */    
+    float *mutable_cpu_data();
+    /**
+     * \brief Get mutable pointer of memory in device.
+     */    
+    bm_device_mem_t *mutable_dev_mem();
+    /**
+     * \brief Get read-only pointer of memory in device.
+     */    
+    const bm_device_mem_t *dev_mem();
+private:
+    BMBlob(const BMBlob &other);
+    BMBlob &operator=(const BMBlob &other);
+    
+    bm_device_mem_t *dev_mem_;
+    float *sys_data_;
+    Shape shape_;
+    int data_pos_;
+    int capacity_;
+    void *handle_;
+    
+    enum { AIR = 0x00, SYS = 0x01, DEV = 0x10 };
+    void sync_s2d();
+    void sync_d2s();
+};
+
+} /* namespace bmcnn */
+
+#endif /* __BM_BLOB_H__ */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h b/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
new file mode 100644
index 000000000..6b0bfe857
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
@@ -0,0 +1,58 @@
+#ifndef __BM_CNN_CONTEXT_H__
+#define __BM_CNN_CONTEXT_H__
+
+#include <string>
+#include "bmruntime.h"
+
+namespace bmcnn {
+
+typedef void *bmcnn_ctx_t;
+/**
+ * \brief Create context of BMCNN.
+ *
+ * \param ctx_dir - Directory of context files generated by BMNETC
+ *
+ * \note
+ * The context will be created in the device of ID 0.\n
+ *  
+ * \return
+ * NULL - Creating failed.\n
+ * non-NULL - The handle of the context (creating succeeded).\n
+ */
+bmcnn_ctx_t bmcnn_ctx_create(const std::string &ctx_dir);
+/**
+ * \brief Destroy context of BMCNN
+ * 
+ * \param handle - Handle of the context to be destroyed
+ */
+void bmcnn_ctx_destroy(bmcnn_ctx_t handle);
+/**
+ * \brief Create context of BMCNN in specific devide.
+ * 
+ * \param ctx_dir - Directory of context files generated by BMNETC
+ * \param devid - ID of device where the context will be placed.
+ *
+ * \note
+ * Call \ref bm_dev_getcount to get total number of devices, e.g. N is returned,
+ * valid devid should be in range of 0 ~ (N-1).\n
+ *
+ * \return
+ * NULL - Creating failed that might be caused by incorrect parameter.\n
+ * non-NULL - The handle of the context (creating succeeded).\n
+ */
+bmcnn_ctx_t bmcnn_ctx_create_by_devid(const std::string &ctx_dir, int devid);
+/**
+ * \brief Append context of BMCNN.
+ *
+ * \param ctx_dir - Directory of context files generated by BMNETC or BMNETD.
+ * \param bmrt    - The created handle of context.
+ *  
+ * \return
+ * false - Appending failed.\n
+ * true  - Appending succeeded.\n
+ */
+bool bmcnn_ctx_append(const std::string &ctx_dir, bmruntime *bmrt);
+
+} /* namespace bmcnn */
+
+#endif /* __BM_CNN_CONTEXT_H__ */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h b/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
new file mode 100644
index 000000000..88005e1b8
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
@@ -0,0 +1,78 @@
+#ifndef __BM_NET_H__
+#define __BM_NET_H__
+
+#include "bmblob.h"
+#include "bmcnnctx.h"
+#include <vector>
+#include <map>
+#include <string>
+
+#ifdef CROSS_COMPILE
+  #include <memory>
+#else
+  #include <boost/shared_ptr.hpp>
+#endif
+
+
+#ifdef CROSS_COMPILE
+#define NAMESPACE_USED  std
+#else
+#define NAMESPACE_USED  boost
+#endif
+
+namespace bmcnn {
+    
+class BMNet
+{
+public:
+    /**
+     * \brief Constructor of net.
+     *
+     * \param handle - Handler of BMCNN context (created by \ref bmcnn_ctx_create)
+     * \param name - Name of net
+     */
+    explicit BMNet(bmcnn_ctx_t handle, const std::string &name);
+    /**
+     * \brief Deconstructor of blob.
+     */
+    virtual ~BMNet();
+    /**
+     * \brief Reshape all layers from bottom to top.
+     */
+    void Reshape();
+    /**
+     * \brief Run forward.
+     * 
+     * \param sync - Flag of synchronizing.
+     */
+    void Forward(bool sync = false);
+    /**
+     * \brief Get blob by name.
+     *
+     * \param name - Name of blob 
+     * \note
+     * (1) The name could only be of blob in input or output.\n
+     * (2) If the name is not spotted, null pointer will be returned.\n
+     */
+    const NAMESPACE_USED::shared_ptr<BMBlob> blob_by_name(const std::string &name) const;
+    /**
+     * \brief Get maximum shape allowed.
+     */
+    inline const Shape &max_shape() const
+    { return max_shape_; }
+private:
+    BMNet(const BMNet &other);
+    BMNet &operator=(const BMNet &other);
+
+    bmcnn_ctx_t bmcc_ctx_;
+    std::vector<NAMESPACE_USED::shared_ptr<BMBlob> > blobs_;
+    std::vector<BMBlob *> net_input_blobs_;
+    std::vector<BMBlob *> net_output_blobs_;
+    std::string name_;
+    std::map<std::string, size_t> blob_name_index_;
+    Shape max_shape_;
+};
+
+} /* namespace bmcnn */
+
+#endif /* __BM_NET_H__ */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
new file mode 100644
index 000000000..daa101fce
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
@@ -0,0 +1,154 @@
+#ifndef  BMRUNTIME_H_
+#define  BMRUNTIME_H_
+#include <algorithm>
+#include <vector>
+#include "bmlib_runtime.h"
+#include "bmruntime_common.h"
+#include "stdio.h"
+#include <string>
+#include <map>
+#include <set>
+#include <iostream>
+
+using std::vector;
+using std::map;
+using std::set;
+using std::string;
+using std::pair;
+using std::make_pair;
+using std::cout;
+using std::endl;
+typedef unsigned int            u32;
+typedef unsigned long long      u64;
+
+typedef struct stage_param_with_idx{
+  int height_high;
+  int height_low;
+  int width_high;
+  int width_low;
+  int stage_index;
+}stage_param_with_idx_t;
+
+class bmruntime {
+  public:
+    bmruntime(bm_handle_t bm_handle);
+    ~bmruntime();
+
+    bool load_context(const string& ctx_dir);
+
+    const set<string>& get_input_tensor(int net_idx) const;
+    const set<string>& get_input_tensor(const string& net_name);
+
+    const set<string>& get_output_tensor(int net_idx) const;
+    const set<string>& get_output_tensor(const string& net_name);
+
+    const bm_device_mem_t* get_input_blob(const string& tensor_name, int net_idx);
+    const bm_device_mem_t* get_input_blob(const string& tensor_name, const string& net_name);
+
+    const bm_device_mem_t* get_output_blob(const string& tensor_name, int net_idx);
+    const bm_device_mem_t* get_output_blob(const string& tensor_name, const string& net_name);
+
+    bool launch(int net_idx);
+    bool launch(const string& net_name);
+
+    bool launch(int net_idx, const bm_device_mem_t* input_tensors, int input_num,
+            const bm_device_mem_t* output_tensors, int output_num);
+    bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num,
+            const bm_device_mem_t* output_tensors, int output_num);
+
+    bool launch(int net_idx, int n, int h , int w);
+    bool launch(const string& net_name, int n, int h, int w);
+    bool launch(int net_idx, const bm_device_mem_t* input_tensors, int input_num,
+            const bm_device_mem_t* output_tensors, int output_num, int n, int h, int w);
+    bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num,
+            const bm_device_mem_t* output_tensors, int output_num, int n , int h, int w);
+
+    void get_input_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w);
+    void get_input_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int * max_c, int * max_h, int * max_w);
+    void get_output_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w);
+    void get_output_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int *max_c, int * max_h, int * max_w);
+
+    int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int ih);
+    int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int ih);
+    int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int iw);
+    int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int iw);
+
+
+
+
+    bool can_batch_size_change(int net_idx);
+    bool can_batch_size_change(const string& net_name);
+    bool can_height_and_width_change(int net_idx);
+    bool can_height_and_width_change(const string& net_name);
+
+    void show_neuron_network();
+
+    int get_network_number() {return net_num;}
+
+    inline bm_handle_t get_bm_handle() {return m_handle;}
+
+  protected:
+    bool setup_mem_context(const string& ctx_dir);
+    bool setup_cmd_context(const string& ctx_dir);
+    bool set_using_cmd_file(const string& ctx_dir);
+    void load_cmd(u32* cmd, int engine_id, bool last_cmd, u64 start_address, u64 append_mem_offset);
+    bool setup_ir_context(const string& ctx_dir);
+
+    void wrong_net_idx_handle(int net_idx) const;
+
+    int get_net_idx(const string& net_name);
+    int get_stage_idx(int net_idx, int h, int w);
+    u64 get_stage_offset(int net_idx, int stage_idx);
+
+    int compute_output_height(int input_height, int global_kh, int global_stride_h, int global_pad_h, int global_pool_kh);
+    int compute_output_width(int input_width, int global_kw, int global_stride_w, int global_pad_w, int global_pool_kw);
+
+    bm_handle_t m_handle;
+    std::vector<DEVICE_MEM_INFO>            m_device_mem_info_vec;
+    std::vector<bm_device_mem_t>            m_device_mem_vec;
+
+    vector<int>                             m_gdma_total_id_v;
+    vector<int>                             m_cdma_total_id_v;
+    vector<int>                             m_bdc_total_id_v;
+    vector<vector<int> >                    m_gdma_group_id_v;
+    vector<vector<int> >                    m_cdma_group_id_v;
+    vector<vector<int> >                    m_bdc_group_id_v;
+    vector<int>                             m_cmdgroup_num;
+    vector<u64>                             m_gdma_cmd_start_address_v;
+    vector<u64>                             m_cdma_cmd_start_address_v;
+    vector<u64>                             m_bdc_cmd_start_address_v;
+    vector<map<string, bm_device_mem_t> >   input_tensor_mem_map_v;
+    vector<map<string, bm_device_mem_t> >   output_tensor_mem_map_v;
+    vector<set<string> >                    m_input_tensor_set_v;
+    vector<set<string> >                    m_output_tensor_set_v;
+    int                                     net_num;
+    map<string,int>                         net_name_to_idx;
+    vector<int>                             stage_num;
+
+    bool                                    have_ir_info;
+    vector<vector<unsigned int> >           m_ir_info_len;
+    vector<u64>                             m_ir_info_start_address_v;
+    vector<vector<stage_param_with_idx_t> > stage_param_with_idx_vv;
+
+    //io tensor param
+    vector<int>                             n_can_change_v;
+    vector<int>                             h_w_can_change_v;
+
+    vector<vector<map<string, tensor_max_shape_t> > >           input_tensor_max_shape_vv;
+    vector<vector<map<string, tensor_max_shape_t> > >           output_tensor_max_shape_vv;
+    vector<vector<map<string, global_output_tensor_param_t> > > global_output_tensor_param_vv;
+
+    bool m_using_cmd_file;
+    FILE * m_gdma_cmd_file;
+    FILE * m_cdma_cmd_file;
+    FILE * m_bdc_cmd_file;
+
+    //previous value or state
+    int pre_net_num;
+    int pre_m_device_mem_info_vec_size;  
+
+    //append mem offset when appending another framework's context.
+    vector<u64> apd_ctx_mem_offset;
+};
+
+#endif
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
new file mode 100644
index 000000000..200656739
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
@@ -0,0 +1,65 @@
+#ifndef BMRUNTIME_COMMON_H
+#define BMRUNTIME_COMMON_H
+
+#define BMRT_ASSERT(_cond)                       \
+  do {                                           \
+    if (!(_cond)) {                              \
+      printf("ASSERT %s: %s: %d: %s\n",          \
+          __FILE__, __func__, __LINE__, #_cond); \
+      exit(-1);                                  \
+    }                                            \
+  } while(0)
+
+typedef enum neuron_device_mem_type {
+    INPUT_NEURON_TENSOR = 0,
+    INTERMEDIATE_NEURON_TENSOR = 1,
+    OUTPUT_NEURON_TENSOR = 2,
+    CMD_BUF_TENSOR = 3,
+    CMD_NUM_TENSOR = 4
+} NEURON_DEVICE_MEM_TYPE;
+
+typedef enum device_mem_type {
+    NEURON = 0,
+    COEFF = 1,
+#ifdef INT8_COEFF_FUNC
+    COEFF_INT8 = 2,
+    COEFF_INT8SCALE = 3,
+    LOCAL = 4
+#else
+    LOCAL = 2
+#endif
+} DEVICE_MEM_TYPE;
+
+typedef struct device_mem_info {
+    DEVICE_MEM_TYPE device_mem_type;
+    NEURON_DEVICE_MEM_TYPE neuron_device_mem_type;
+    int n;
+    int c;
+    int h;
+    int w;
+    int coeff_count;
+    int groups;
+    unsigned long long address;
+} DEVICE_MEM_INFO;
+
+//info for compute output tensor
+typedef struct tensor_max_shape {
+  int max_n;
+  int channel;
+  int max_h;
+  int max_w;
+} tensor_max_shape_t;
+
+typedef struct global_output_tensor_param {
+  int input_idx;
+  int global_kh;
+  int global_kw;
+  int global_stride_h;
+  int global_stride_w;
+  int global_pad_h;
+  int global_pad_w;
+  int global_pool_kh;
+  int global_pool_kw;
+} global_output_tensor_param_t; 
+
+#endif
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
new file mode 100644
index 000000000..4214674f3
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
@@ -0,0 +1,11 @@
+#ifndef BMRUNTIME_INTERFACE_H_
+#define BMRUNTIME_INTERFACE_H_
+
+#include "bmruntime.h"
+#include "bmdnn_runtime.h"
+
+bmruntime* create_bmruntime(bm_handle_t* bm_handle);
+
+void destroy_bmruntime(bm_handle_t bm_handle, bmruntime* p_bmrt);
+
+#endif
diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
new file mode 100644
index 000000000..45541add9
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_activation.h
@@ -0,0 +1,96 @@
+#ifndef ANAKIN_SABER_FUNCS_BMDNN_ACT_H
+#define ANAKIN_SABER_FUNCS_BMDNN_ACT_H
+#include "saber/funcs/impl/impl_activation.h"
+namespace anakin {
+
+namespace saber {
+
+template <DataType OpDtype ,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderActivation<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>, 
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ActivationParam<Tensor<BM, OpDtype, LayOutType_op> > > 
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderActivation()
+            : _handle(NULL), _active_descs(NULL), _input_descs(NULL), _output_descs(NULL) {}
+
+    ~VenderActivation() {
+        if (_input_descs) {
+            BMDNN_CHECK(bm_free_device(_input_descs));
+        }
+        if (_output_descs) {
+            BMDNN_CHECK(bm_free_device(_output_descs));
+        }
+    }
+
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ActivationParam<OpTensor>& param, Context<BM>& ctx) {
+        // not sure
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ActivationParam<OpTensor>& param, Context<BM>& ctx) {
+        // not sure
+        return SaberSuccess;
+    }
+
+    //call bmdnn activation funcs here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ActivationParam<OpTensor>& param) {
+
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width();
+        int input_n = inputs[0]->num();
+
+        switch (_active_type) {
+            case Active_sigmoid:
+                BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                break;
+            case Active_relu:
+                BMDNN_CHECK(bmdnn_relu_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                break;
+            case Active_tanh:
+                BMDNN_CHECK(bmdnn_tanh_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                break;
+        }
+        /* BMDNN_CHECK(cudnnActivationForward(_handle, _active_descs, */
+        /*                                    cudnn::cudnnTypeWrapper<InDataType>::kOne(), */
+        /*                                    _input_descs, in_data, */
+        /*                                    cudnn::cudnnTypeWrapper<InDataType>::kZero(), */
+        /*                                    _output_descs, out_data */
+        /* )); */
+        return SaberSuccess;
+    }
+
+private:
+    bm_handle_t _handle;
+    bm_device_mem_t _input_descs;
+    bm_device_mem_t _output_descs;
+    ActiveType _active_type;
+};
+template class VenderActivation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+}
+}
+
+#endif //ANAKIN_SABER_FUNCS_BMDNN_ACT_H
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
new file mode 100644
index 000000000..7efdfa611
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -0,0 +1,195 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
+
+#include "saber/funcs/impl/impl_conv.h"
+#include "saber/funcs/impl/bm/bmdnn_api.h"   
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ConvParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+    VenderConv2D()
+            : _handle(NULL)
+            , _workspaceData(NULL)
+            , _workspace(NULL)
+            , _conv_descs(NULL)
+            , _input_descs(NULL)
+            , _output_descs(NULL)
+            , _filter_desc(NULL)
+            , _workspace_fwd_sizes(0)
+            , _workspaceSizeInBytes(0)
+            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
+            , _input_nchw_descs(NULL)
+            , _output_nchw_descs(NULL)
+            , x8_data(NULL)
+            , y8_data(NULL)
+            , x8_data_size(0)
+            , y8_data_size(0)
+    {}
+
+    ~VenderConv2D() {
+
+        if (_conv_descs) {
+            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
+        }
+        if (_input_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
+        }
+        if (_output_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
+        }
+        if (_filter_desc) {
+            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
+        }
+        if (_handle != NULL) {
+            CUDNN_CHECK(cudnnDestroy(_handle));
+        }
+        if (_workspaceData != NULL) {
+            cudaFree(_workspaceData);
+        }
+        if (_input_nchw_descs != NULL) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs));
+        }
+        if (_output_nchw_descs != NULL) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs));
+        }
+        if (x8_data != NULL) {
+            CUDA_CHECK(cudaFree(x8_data));
+        }
+        if (y8_data != NULL) {
+            CUDA_CHECK(cudaFree(y8_data));
+        }
+    }
+
+    /**
+     * [Create description] Init all cudnn resource here
+     * @AuthorHTL
+     * @DateTime  2018-02-01T16:13:06+0800
+     * @param     inputs                    [description]
+     * @param     outputs                   [description]
+     * @param     param                [conv parameters]
+     */
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvParam<OpTensor>& param, Context<BM>& ctx) {
+
+        // ---- init cudnn resources ----
+
+        _workspaceSizeInBytes = 0;
+        _workspaceData = NULL;
+
+        _workspace_fwd_sizes = 0;
+
+        this->_ctx = ctx;
+        // ---- get cuda resources ----
+
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUDNN_CHECK(cudnnCreate(&_handle));
+        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+
+        _workspace = NULL;
+
+        int in_channels = inputs[0]->channel();
+
+        // ---- create cudnn Descs ----
+        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
+
+        cudnn::createTensorDesc<InDataType>(&_input_descs);
+        cudnn::createTensorDesc<InDataType>(&_output_descs);
+        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
+
+        if (param.bias()->size() > 0) {
+            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
+        }
+
+        cudnnCreateTensorDescriptor(&_input_nchw_descs);
+        cudnnCreateTensorDescriptor(&_output_nchw_descs);
+
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvParam<OpTensor>& param, Context<BM>& ctx);
+
+    //call cudnnConvolutionForward here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          ConvParam<OpTensor>& param);
+
+private:
+    cudnnHandle_t _handle;
+    cudnnConvolutionFwdAlgo_t _fwd_algo;
+
+    cudnnTensorDescriptor_t _input_descs;
+    cudnnTensorDescriptor_t _output_descs;
+    cudnnTensorDescriptor_t _bias_desc;
+
+    cudnnFilterDescriptor_t _filter_desc;
+
+    cudnnConvolutionDescriptor_t _conv_descs;
+
+    size_t _workspace_fwd_sizes;
+    size_t _workspaceSizeInBytes;  // size of underlying storage
+
+    void *_workspaceData;  // underlying storage
+    void *_workspace;  // aliases into _workspaceData
+
+    const bool _use_tensor_core = true;
+    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
+    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+
+    // create transform descriptor
+    cudnnTensorDescriptor_t _input_nchw_descs;
+    cudnnTensorDescriptor_t _output_nchw_descs;
+
+    void *x8_data;
+    void *y8_data;
+
+    int x8_data_size;
+    int y8_data_size;
+};
+
+
+}
+
+}
+#endif //ANAKIN_SABER_FUNCS_BMDNN_CONV2D_H
diff --git a/saber/funcs/impl/bm/vender_conv_act.h b/saber/funcs/impl/bm/vender_conv_act.h
new file mode 100644
index 000000000..4d9c9f3bb
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_conv_act.h
@@ -0,0 +1,198 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H
+#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H
+
+#include "saber/funcs/impl/impl_conv_act.h"
+#include "saber/funcs/impl/cuda/cudnn_helper.h"   
+#include <cudnn.h>
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderConv2DAct<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ConvActiveParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderConv2DAct()
+            : _handle(NULL)
+            , _workspaceData(NULL)
+            , _workspace(NULL)
+            , _conv_descs(NULL)
+            , _input_descs(NULL)
+            , _output_descs(NULL)
+            , _filter_desc(NULL)
+            , _workspace_fwd_sizes(0)
+            , _workspaceSizeInBytes(0)
+            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
+            , _input_nchw_descs(NULL)
+            , _output_nchw_descs(NULL)
+            , x8_data(NULL)
+            , y8_data(NULL)
+            , x8_data_size(0)
+            , y8_data_size(0)
+    {}
+
+    ~VenderConv2DAct() {
+
+        if (_conv_descs) {
+            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
+        }
+        if (_input_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
+        }
+        if (_output_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
+        }
+        if (_filter_desc) {
+            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
+        }
+        if (_handle != NULL) {
+            CUDNN_CHECK(cudnnDestroy(_handle));
+        }
+        if (_workspaceData != NULL) {
+            cudaFree(_workspaceData);
+        }
+        if (_input_nchw_descs != NULL) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs));
+        }
+        if (_output_nchw_descs != NULL) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs));
+        }
+        if (x8_data != NULL) {
+            CUDA_CHECK(cudaFree(x8_data));
+        }
+        if (y8_data != NULL) {
+            CUDA_CHECK(cudaFree(y8_data));
+        }
+    }
+
+    /**
+     * [Create description] Init all cudnn resource here
+     * @AuthorHTL
+     * @DateTime  2018-02-01T16:13:06+0800
+     * @param     inputs                    [description]
+     * @param     outputs                   [description]
+     * @param     conv_param                [conv parameters]
+     */
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvActiveParam<OpTensor>& param, Context<BM>& ctx) {
+        // ---- init cudnn resources ----
+
+        _workspaceSizeInBytes = 0;
+        _workspaceData = NULL;
+
+        _workspace_fwd_sizes = 0;
+
+        this->_ctx = ctx;
+        // ---- get cuda resources ----
+
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUDNN_CHECK(cudnnCreate(&_handle));
+        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+
+        _workspace = NULL;
+
+        int in_channels = inputs[0]->channel();
+
+        // ---- create cudnn Descs ----
+        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
+
+        cudnn::createTensorDesc<InDataType>(&_input_descs);
+        cudnn::createTensorDesc<OutDataType>(&_output_descs);
+        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
+        cudnn::create_activation_des<InDataType>(&_active_descs);
+
+        if (param.conv_param.bias()->size() > 0) {
+            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
+        }
+
+        cudnnCreateTensorDescriptor(&_input_nchw_descs);
+        cudnnCreateTensorDescriptor(&_output_nchw_descs);
+
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvActiveParam<OpTensor>& param, Context<BM>& ctx);
+    //call cudnnConvolutionForward here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          ConvActiveParam<OpTensor>& param);
+private:
+    cudnnHandle_t _handle;
+    cudnnConvolutionFwdAlgo_t _fwd_algo;
+
+    cudnnTensorDescriptor_t _input_descs;
+    cudnnTensorDescriptor_t _output_descs;
+    cudnnTensorDescriptor_t _bias_desc;
+
+    cudnnFilterDescriptor_t _filter_desc;
+
+    cudnnConvolutionDescriptor_t _conv_descs;
+
+    size_t _workspace_fwd_sizes;
+    size_t _workspaceSizeInBytes;  // size of underlying storage
+
+    void *_workspaceData;  // underlying storage
+    void *_workspace;  // aliases into workspaceData
+
+    const bool _use_tensor_core = true;
+    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
+    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+
+    // activation descriptor
+    cudnnActivationDescriptor_t _active_descs;
+
+    // create transform descriptor
+    cudnnTensorDescriptor_t _input_nchw_descs;
+    cudnnTensorDescriptor_t _output_nchw_descs;
+
+    void *x8_data;
+    void *y8_data;
+
+    int x8_data_size;
+    int y8_data_size;
+};
+
+
+}
+
+}
+#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H
diff --git a/saber/funcs/impl/bm/vender_conv_act_pooling.h b/saber/funcs/impl/bm/vender_conv_act_pooling.h
new file mode 100644
index 000000000..e602a693d
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_conv_act_pooling.h
@@ -0,0 +1,176 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H
+#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H
+
+#include "saber/funcs/impl/impl_conv_act_pooling.h"
+#include "saber/funcs/impl/cuda/cudnn_helper.h"   
+#include <cudnn.h>
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderConv2DActPooling<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ConvActivePoolingParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderConv2DActPooling()
+            : _handle(NULL)
+            , _workspaceData(NULL)
+            , _workspace(NULL)
+            , _conv_descs(NULL)
+            , _input_descs(NULL)
+            , _output_descs(NULL)
+            , _filter_desc(NULL)
+            , _workspace_fwd_sizes(0)
+            , _workspaceSizeInBytes(0)
+            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
+    {}
+    ~VenderConv2DActPooling() {
+
+        if (_conv_descs) {
+            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
+        }
+        if (_input_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
+        }
+        if (_output_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
+        }
+        if (_filter_desc) {
+            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
+        }
+        if (_handle != NULL) {
+            CUDNN_CHECK(cudnnDestroy(_handle));
+        }
+        if (_workspaceData != NULL) {
+            cudaFree(_workspaceData);
+        }
+    }
+
+    /**
+     * [Create description] Init all cudnn resource here
+     * @AuthorHTL
+     * @DateTime  2018-02-01T16:13:06+0800
+     * @param     inputs                    [description]
+     * @param     outputs                   [description]
+     * @param     conv_param                [conv parameters]
+     */
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvActivePoolingParam<OpTensor>& param, Context<BM>& ctx) {
+        // ---- init cudnn resources ----
+
+        _workspaceSizeInBytes = 0;
+        _workspaceData = NULL;
+
+        _workspace_fwd_sizes = 0;
+
+        this->_ctx = ctx;
+        // ---- get cuda resources ----
+
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUDNN_CHECK(cudnnCreate(&_handle));
+        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+
+        _workspace = NULL;
+
+        int in_channels = inputs[0]->channel();
+
+        // ---- create cudnn Descs ----
+        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
+
+        cudnn::createTensorDesc<InDataType>(&_input_descs);
+        cudnn::createTensorDesc<InDataType>(&_inner_descs);
+        cudnn::createTensorDesc<OutDataType>(&_output_descs);
+        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
+        if (param.has_activation) {
+            cudnn::create_activation_des<InDataType>(&_active_descs);
+        }
+        if (param.has_pooling) {
+            cudnn::create_pooling_des<InDataType>(&_pooling_descs);
+        }
+        if (param.conv_param.bias()->size() > 0) {
+            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
+        }
+
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvActivePoolingParam<OpTensor>& param, Context<BM>& ctx);
+    //call cudnnConvolutionForward here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          ConvActivePoolingParam<OpTensor>& param);
+private:
+    cudnnHandle_t _handle;
+    cudnnConvolutionFwdAlgo_t _fwd_algo;
+
+    cudnnTensorDescriptor_t _input_descs;
+    cudnnTensorDescriptor_t _output_descs;
+    cudnnTensorDescriptor_t _inner_descs;
+    cudnnTensorDescriptor_t _bias_desc;
+
+    cudnnFilterDescriptor_t _filter_desc;
+
+    cudnnConvolutionDescriptor_t _conv_descs;
+    cudnnPoolingDescriptor_t _pooling_descs;
+
+    size_t _workspace_fwd_sizes;
+    size_t _workspaceSizeInBytes;  // size of underlying storage
+
+    void *_workspaceData;  // underlying storage
+    void *_workspace;  // aliases into workspaceData
+
+    const bool _use_tensor_core = true;
+    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
+    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+
+    // activation descriptor
+    cudnnActivationDescriptor_t _active_descs;
+
+    Shape _inner_shape;
+    DataTensor_out _inner_tensor;
+};
+
+
+}
+
+}
+#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
new file mode 100644
index 000000000..5c7c23e67
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+#define ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+
+#include "saber/funcs/impl/impl_fc.h"
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+        DataType inDtype,
+        DataType outDtype,
+        typename LayOutType_op,
+        typename LayOutType_in,
+        typename LayOutType_out>
+class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>: \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>, \
+        Tensor<BM, outDtype, LayOutType_out>, \
+        Tensor<BM, OpDtype, LayOutType_op>, \
+        FcParam<Tensor<BM, OpDtype, LayOutType_op>>> {
+
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderFc() = default;
+    ~VenderFc() {
+        if (_handle != nullptr) {
+            CUBLAS_CHECK(cublasDestroy(_handle));
+        }
+    }
+
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            FcParam<OpTensor>& param, Context<BM>& ctx){
+        // get context
+        this->_ctx = ctx;
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUBLAS_CHECK(cublasCreate(&_handle));
+        CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream));
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            FcParam<OpTensor>& param, Context<BM>& ctx){
+
+        if (!(ctx == this->_ctx)) {
+            if (_handle != NULL) {
+                CUBLAS_CHECK(cublasDestroy(_handle));
+            }
+            this->_ctx = ctx;
+
+            cudaStream_t cuda_stream;
+            cuda_stream = ctx.get_compute_stream();
+            CUBLAS_CHECK(cublasCreate(&_handle));
+            CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream));
+        }
+
+        Shape shape_out = inputs[0]->valid_shape();
+        _M = inputs[0]->count_valid(0, param.axis);
+        _K = inputs[0]->count_valid(param.axis, inputs[0]->dims());
+        _N = param.num_output;
+        if (_N <= 0) {
+            int weight_size = param.weights->valid_size();
+            _N = weight_size / _K;
+        }
+        //! weights dims must be in h and w
+        _flag_trans_weights = param.is_transpose_weights;
+        return SaberSuccess;
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            FcParam<OpTensor>& param);
+
+
+private:
+    bool _flag_trans_weights{false};
+    int _M;
+    int _K;
+    int _N;
+    cublasHandle_t _handle;
+    bool _is_continue_buf{true};
+};
+
+template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+} //namespace saber
+
+} //namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
new file mode 100644
index 000000000..4990a5357
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
+#define ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
+
+#include "saber/funcs/impl/impl_pooling.h"
+#include "saber/funcs/impl/cuda/cudnn_helper.h"
+
+namespace anakin{
+
+namespace saber {
+
+template <DataType OpDtype ,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+ public ImplBase<
+    Tensor<NV, inDtype, LayOutType_in>, 
+    Tensor<NV, outDtype, LayOutType_out>,
+    Tensor<NV, OpDtype, LayOutType_op>,
+    PoolingParam<Tensor<NV, OpDtype, LayOutType_op>>> {
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderPooling() : _handle(NULL) {}
+
+    ~VenderPooling() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
+                  std::vector<DataTensor_out*>& outputs,
+                  PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
+
+        this->_ctx = ctx;
+
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUDNN_CHECK(cudnnCreate(&_handle));
+        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+
+        cudnn::createTensorDesc<InDataType>(&_input_descs);
+        cudnn::createTensorDesc<OutDataType>(&_output_descs);
+
+        cudnn::create_pooling_des<OpDataType>(&_pooling_descs);
+
+        return create(inputs, outputs, pooling_param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
+                std::vector<DataTensor_out*>& outputs,
+                PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
+        if (!(ctx == this->_ctx)) {
+            if (_handle != NULL) {
+                CUDNN_CHECK(cudnnDestroy(_handle));
+            }
+            this->_ctx = ctx;
+
+            cudaStream_t cuda_stream;
+            cuda_stream = ctx.get_compute_stream();
+
+            CUDNN_CHECK(cudnnCreate(&_handle));
+            CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+        }
+
+        int input_num = inputs[0]->num();
+        int input_channel = inputs[0]->channel();
+        int input_height = inputs[0]->height();
+        int input_width = inputs[0]->width();
+        int output_channel = outputs[0]->channel();
+        int output_height = outputs[0]->height();
+        int output_width = outputs[0]->width();
+
+        Shape stride_in = inputs[0]->get_stride();
+        Shape stride_out = outputs[0]->get_stride();
+
+        int dim_a[] = {input_num, input_channel,
+                       input_height, input_width};
+
+        int dim_b[] = {input_num, output_channel,
+                       output_height, output_width};
+
+        cudnn::setTensorNdDesc<InDataType>(&_input_descs,
+                                            inputs[0]->dims(), dim_a, &stride_in[0]);
+
+        cudnn::setTensorNdDesc<OutDataType>(&_output_descs,
+                                             outputs[0]->dims(), dim_b, &stride_out[0]);
+
+        int windowHeight[] = {pooling_param.window_h, pooling_param.window_w};
+        int padding[] = {pooling_param.pad_h, pooling_param.pad_w};
+
+        int stride[] = {pooling_param.stride_h, pooling_param.stride_w};
+
+        cudnn::set_nd_pooling_des<OpDataType>(&_pooling_descs, pooling_param.pooling_type,
+                                               inputs[0]->dims() - 2, windowHeight,
+                                               padding,stride);
+        return SaberSuccess;
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          PoolingParam<OpTensor> &param) {
+        const InDataType *in_data = inputs[0]->data();
+        OutDataType *out_data = outputs[0]->mutable_data();
+
+        CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs,
+                                        cudnn::cudnnTypeWrapper<InDataType>::kOne(),
+                                        _input_descs, in_data,
+                                        cudnn::cudnnTypeWrapper<OutDataType>::kZero(),
+                                        _output_descs, out_data
+        ));
+
+        return SaberSuccess;
+    }
+
+private:
+    cudnnHandle_t _handle;
+    cudnnTensorDescriptor_t _input_descs;
+    cudnnTensorDescriptor_t _output_descs;
+    cudnnPoolingDescriptor_t _pooling_descs;
+
+};
+
+template class VenderPooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+
+} //namespace saber
+
+} // namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index 4b0f170d5..6a109540e 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -379,6 +379,7 @@ struct ConvParam {
     opTensor* bias_tensor;
 };
 // specify for int8
+#ifdef USE_CUDA
 template <>
 struct ConvParam<Tensor<NV, AK_INT8, NCHW> > {
     ConvParam() : group(-1), pad_h(-1), pad_w(-1),
@@ -534,6 +535,90 @@ struct ConvParam<Tensor<NV, AK_INT8, NCHW_C4> > {
     Tensor<NV, AK_INT8, NCHW_C4>* weight_tensor;
     Tensor<NV, AK_FLOAT, NCHW>* bias_tensor;
 };
+#endif //USE_CUDA
+
+#ifdef USE_BM
+template <>
+struct ConvParam<Tensor<BM, AK_BM, NCHW> > {
+    ConvParam() : group(-1), pad_h(-1), pad_w(-1),
+                  stride_h(-1), stride_w(-1),
+                  dilation_h(-1), dilation_w(-1),
+                  weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){}
+    ConvParam(int group_in, int pad_h_in, int pad_w_in,
+              int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_,
+              Tensor<BM, AK_BM, NCHW>* weight, Tensor<BM, AK_BM, NCHW>* bias,
+              float alpha_in = 1.0, float beta_in = 0.0)
+            : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in)
+            , stride_h(stride_h_in), stride_w(stride_w_in)
+            , dilation_h(dilation_h_), dilation_w(dilation_w_)
+            , weight_tensor(weight), bias_tensor(bias)
+            , alpha(alpha_in), beta(beta_in)
+    {}
+    ConvParam(const ConvParam &right)
+            : group(right.group), pad_h(right.pad_h)
+            , pad_w(right.pad_w), stride_h(right.stride_h)
+            , stride_w(right.stride_w), dilation_h(right.dilation_h)
+            , dilation_w(right.dilation_w)
+            , weight_tensor(right.weight_tensor)
+            , bias_tensor(right.bias_tensor)
+            , alpha(right.alpha)
+            , beta(right.beta) {}
+    ConvParam &operator=(const ConvParam &right) {
+        group = right.group;
+        pad_h = right.pad_h;
+        pad_w = right.pad_w;
+        stride_h = right.stride_h;
+        stride_w = right.stride_w;
+        dilation_h = right.dilation_h;
+        dilation_w = right.dilation_w;
+        weight_tensor = right.weight_tensor;
+        bias_tensor = right.bias_tensor;
+        alpha = right.alpha;
+        beta = right.beta;
+        return *this;
+    }
+    bool operator==(const ConvParam &right) {
+        bool comp_eq = true;
+        comp_eq = comp_eq && (group == right.group);
+        comp_eq = comp_eq && (pad_h == right.pad_h);
+        comp_eq = comp_eq && (pad_w == right.pad_w);
+        comp_eq = comp_eq && (stride_h == right.stride_h);
+        comp_eq = comp_eq && (stride_w == right.stride_w);
+        comp_eq = comp_eq && (dilation_h == right.dilation_h);
+        comp_eq = comp_eq && (dilation_w == right.dilation_w);
+        comp_eq = comp_eq && (weight_tensor == right.weight_tensor);
+        comp_eq = comp_eq && (bias_tensor == right.bias_tensor);
+        comp_eq = comp_eq && (alpha == right.alpha);
+        comp_eq = comp_eq && (beta == right.beta);
+        return comp_eq;
+    }
+    inline const Tensor<BM, AK_BM, NCHW>* weight() {
+        return weight_tensor;
+    }
+    inline const Tensor<BM, AK_BM, NCHW>* bias() {
+        return bias_tensor;
+    }
+    inline Tensor<BM, AK_BM, NCHW>* mutable_weight() {
+        return weight_tensor;
+    }
+    inline Tensor<BM, AK_BM, NCHW>* mutable_bias() {
+        return bias_tensor;
+    }
+    int group;
+    int pad_h;
+    int pad_w;
+    int stride_h;
+    int stride_w;
+    int dilation_h;
+    int dilation_w;
+    float alpha;
+    float beta;
+private:
+    Tensor<BM, AK_BM, NCHW>* weight_tensor;
+    Tensor<BM, AK_BM, NCHW>* bias_tensor;
+};
+#endif //USE_BM
+
 template <typename opTensor>
 struct PermuteParam {
     PermuteParam() {}
diff --git a/saber/saber_types.h b/saber/saber_types.h
index 8f9b86237..3dccb5f3f 100644
--- a/saber/saber_types.h
+++ b/saber/saber_types.h
@@ -31,7 +31,8 @@ enum TargetTypeEnum {
     eARM = 3,
     eX86 = 4,
     eNVHX86 = 5,
-    eNVHARM = 6
+    eNVHARM = 6,
+    eBM = 7
 };
 
 template <TargetTypeEnum T>
@@ -44,6 +45,8 @@ typedef TargetType<eX86> X86;
 // NV device with pinned memory
 typedef TargetType<eNVHX86> NVHX86;
 //typedef TargetType<eNVHARM> NVHARM;
+// Bitmain device support
+typedef TargetType<eBM> BM;
 // invalid target type, for target has only one memory block
 typedef TargetType<eINVALID> INVLD;
 
@@ -82,7 +85,8 @@ enum DataType {
     AK_STRING       =       10,
     AK_BOOL         =       11,
     AK_SHAPE        =       12,
-    AK_TENSOR       =       13
+    AK_TENSOR       =       13,
+    AK_BM           =       14
 };
 
 typedef enum {
@@ -148,6 +152,29 @@ enum CodeType {
     CORNER_SIZE = 3
 };
 
+typedef enum {
+    ATRS_NormType_NONE = 0,
+    ATRS_NormType_WIDTH = 1,
+    ATRS_NormType_HEIGHT = 2,
+    ATRS_NormType_WIDTH_LOG = 3,
+    ATRS_NormType_HEIGHT_LOG = 4,
+} ATRS_NormType;
+
+typedef enum {
+    DetectionOutputSSD_HEIGHT_AND_WIDTH = 0,
+    DetectionOutputSSD_HEIGHT_OR_WIDTH = 1
+} DetectionOutputSSD_MIN_SIZE_MODE;
+
+typedef enum {
+    ProposalImgScaleToCamCoords_NormType_HEIGHT = 0,
+    ProposalImgScaleToCamCoords_NormType_HEIGHT_LOG = 1
+} ProposalImgScaleToCamCoords_NormType;
+
+typedef enum {
+    ProposalImgScaleToCamCoords_OrienType_PI = 0,
+    ProposalImgScaleToCamCoords_OrienType_PI2 = 1
+} ProposalImgScaleToCamCoords_OrienType;
+
 typedef enum {
     SABER_POWER_HIGH = 0,
     SABER_POWER_LOW  = 1,
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 140094f32..b013d2c4d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -22,6 +22,10 @@ if(NVIDIA_GPU)
 anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/cuda "cpp" ANAKIN_TEST_CASE_SRC)
 endif()
 
+if(USE_BM)
+anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/bm "cpp" ANAKIN_TEST_CASE_SRC)
+endif()
+
 if(USE_X86_PLACE)
 anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/x86 "cpp" ANAKIN_TEST_CASE_SRC)
 endif()
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
new file mode 100644
index 000000000..c6ee0811b
--- /dev/null
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -0,0 +1,16 @@
+#include "saber_types.h"
+#include "target_wrapper.h"
+#include <iostream>
+
+#ifdef USE_BM
+using namespace anakin::saber;
+int main() {
+    typedef TargetWrapper<BM> API;
+    void *pmem;
+    int dev_count;
+    API::get_device_count(&dev_count);
+    API::mem_alloc(&pmem, 3*200*200);
+    API::mem_free(pmem);
+}
+#endif
+
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
new file mode 100644
index 000000000..a204e7807
--- /dev/null
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -0,0 +1,116 @@
+#include "test_saber_buffer_bm.h"
+#include "saber/core/buffer.h"
+#include "saber/core/data_traits.h"
+
+using namespace anakin::saber;
+
+template <DataType datatype>
+void test_buffer() {
+
+    typedef TargetWrapper<X86> X86_API;
+    typedef TargetWrapper<BM> BM_API;
+    typedef typename DataTrait<datatype>::dtype Dtype;
+    typedef Buffer<X86> BufferH;
+    typedef Buffer<BM> BufferD;
+
+    int n0 = 1024;
+    int n1 = 2048;
+
+    void* tmp_x86;
+    Dtype* x86_ptr;
+    X86_API::mem_alloc(&tmp_x86, sizeof(Dtype) * n0);
+    x86_ptr = static_cast<Dtype*>(tmp_x86);
+
+    for (int i = 0; i < n0; i++) {
+        x86_ptr[i] = static_cast<Dtype>(i);
+    }
+
+    void* tmp_bm;
+    Dtype* bm_ptr;
+    BM_API::mem_alloc(&tmp_bm, sizeof(Dtype) * n0);
+    bm_ptr = static_cast<Dtype*>(tmp_bm);
+
+    LOG(INFO) << "Buffer: test default(empty) constructor";
+    BufferH x86_buf0;
+    BufferD bm_buf0;
+
+    LOG(INFO) << "Buffer: test constructor with data size";
+    BufferH x86_buf1(n0 * sizeof(Dtype));
+    BufferD bm_buf1(n0 * sizeof(Dtype));
+
+    LOG(INFO) << "Buffer: test constructor with data pointer, size and device id";
+    BufferH x86_buf2(x86_ptr, n0 * sizeof(Dtype), X86_API::get_device_id());
+    BufferD bm_buf2(bm_ptr, n0 * sizeof(Dtype), BM_API::get_device_id());
+
+    LOG(INFO) << "Buffer: test copy constructor";
+    BufferH x86_buf3(x86_buf2);
+    LOG(INFO) << "BM Buffer copy constructor";
+    LOG(INFO) << "bm target id: " << BM_API::get_device_id();
+    LOG(INFO) << "bm buffer target id: " << bm_buf2.get_id();
+    BufferD bm_buf3(bm_buf2);
+    CHECK_EQ(x86_buf3.get_count(), x86_buf2.get_count()) << \
+            "shared buffer should have same data count";
+    CHECK_EQ(bm_buf3.get_count(), bm_buf2.get_count()) << \
+            "shared buffer should have same data count";
+
+    LOG(INFO) << "Buffer: test operator =";
+    x86_buf0 = x86_buf2;
+    bm_buf0 = bm_buf2;
+    CHECK_EQ(x86_buf0.get_count(), x86_buf2.get_count()) << \
+            "shared buffer should have same data count";
+    CHECK_EQ(bm_buf0.get_count(), bm_buf2.get_count()) << \
+            "shared buffer should have same data count";
+
+    LOG(INFO) << "Buffer: test re_alloc";
+    x86_buf1.re_alloc(n1 * sizeof(Dtype));
+    bm_buf1.re_alloc(n1 * sizeof(Dtype));
+    CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+    CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error";
+    CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+    x86_buf1.re_alloc(n0 * sizeof(Dtype));
+    bm_buf1.re_alloc(n0 * sizeof(Dtype));
+    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+
+    LOG(INFO) << "Buffer: test get_id()";
+    LOG(INFO) << "X86 device id: " << x86_buf0.get_id() << \
+              ", bm device id: " << bm_buf0.get_id();
+    CHECK_EQ(X86_API::get_device_id(), x86_buf0.get_id()) << "x86 device id error";
+    CHECK_EQ(BM_API::get_device_id(), bm_buf0.get_id()) << "bm device id error";
+
+    LOG(INFO) << "Buffer: test deep_cpy()";
+    x86_buf1.sync_copy_from(x86_buf2);
+    LOG(INFO) << "deep copy between two host buffer: ";
+    const Dtype* ptr1 = static_cast<const Dtype*>(x86_buf1.get_data());
+    const Dtype* ptr2 = static_cast<const Dtype*>(x86_buf1.get_data());
+
+    for (int i = 0; i < 10; i++) {
+        std::cout << ptr1[i] << std::endl;
+    }
+
+    CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect";
+    LOG(INFO) << "deep copy from host buffer to device buffer";
+    bm_buf1.sync_copy_from(x86_buf2);
+    x86_buf1.sync_copy_from(bm_buf1);
+    LOG(INFO) << "deep copy from device buffer to host buffer: ";
+    ptr1 = static_cast<const Dtype*>(x86_buf1.get_data());
+
+    for (int i = 0; i < 10; i++) {
+        std::cout << ptr1[i] << std::endl;
+    }
+}
+
+TEST(TestSaberBufferBM, test_buffer_memcpy) {
+    test_buffer<AK_BM>();
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h
new file mode 100644
index 000000000..8bbbe4511
--- /dev/null
+++ b/test/saber/bm/test_saber_buffer_BM.h
@@ -0,0 +1,20 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+
+using namespace anakin::test;
+
+class TestSaberBufferBM : public Test {
+public:
+    TestSaberBufferBM() {}
+    ~TestSaberBufferBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp
new file mode 100644
index 000000000..e221ba8f4
--- /dev/null
+++ b/test/saber/bm/test_saber_context_BM.cpp
@@ -0,0 +1,31 @@
+#include "test_saber_context_BM.h"
+
+#ifdef USE_BM
+
+using namespace anakin::saber;
+
+TEST(TestSaberContextBM, test_BM_context) {
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+    typename API::event_t event;
+    API::create_event(event);
+    LOG(INFO) << "test context constructor";
+    Context<BM> ctx0;
+    Context<BM> ctx1(0, 1, 1);
+    LOG(INFO) << "test record event to context data stream and compute stream";
+    API::record_event(event, ctx0.get_data_stream());
+    API::record_event(event, ctx0.get_compute_stream());
+    API::record_event(event, ctx1.get_data_stream());
+    API::record_event(event, ctx1.get_compute_stream());
+}
+
+#endif
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h
new file mode 100644
index 000000000..653ee11fd
--- /dev/null
+++ b/test/saber/bm/test_saber_context_BM.h
@@ -0,0 +1,21 @@
+#ifndef SABER_TEST_SABER_CONTEXT_BM_H
+#define SABER_TEST_SABER_CONTEXT_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/context.h"
+
+using namespace anakin::test;
+
+class TestSaberContextBM : public Test {
+public:
+    TestSaberContextBM() {}
+    ~TestSaberContextBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //SABER_TEST_SABER_CONTEXT_BM_H
diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp
new file mode 100644
index 000000000..1c7086cf1
--- /dev/null
+++ b/test/saber/bm/test_saber_device_BM.cpp
@@ -0,0 +1,20 @@
+#include "test_saber_device_BM.h"
+
+#ifdef USE_BM
+
+using namespace anakin::saber;
+
+TEST(TestSaberDeviceBM, test_BM_device) {
+    Device<BM> dev_BM;
+}
+
+#endif
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h
new file mode 100644
index 000000000..3a6d61236
--- /dev/null
+++ b/test/saber/bm/test_saber_device_BM.h
@@ -0,0 +1,21 @@
+#ifndef SABER_TEST_SABER_DEVICE_BM_H
+#define SABER_TEST_SABER_DEVICE_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/device.h"
+
+using namespace anakin::test;
+
+class TestSaberDeviceBM : public Test {
+public:
+    TestSaberDeviceBM() {}
+    ~TestSaberDeviceBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //SABER_TEST_SABER_DEVICE_BM_H
diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h
new file mode 100644
index 000000000..61d27d6f9
--- /dev/null
+++ b/test/saber/bm/test_saber_func_BM.h
@@ -0,0 +1,38 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/tensor.h"
+#include <fstream>
+#include <vector>
+
+using namespace anakin::test;
+
+int read_file(std::vector<float> &results, const char* file_name) {
+
+    std::ifstream infile(file_name);
+    if (!infile.good()) {
+        std::cout << "Cannot open " << std::endl;
+        return false;
+    }
+    LOG(INFO)<<"found filename: "<<file_name;
+    std::string line;
+    while (std::getline(infile, line)) {
+        results.push_back((float)atof(line.c_str()));
+    }
+    return 0;
+}
+
+class TestSaberFuncBM : public Test {
+public:
+    TestSaberFuncBM() {}
+    ~TestSaberFuncBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp
new file mode 100644
index 000000000..5d30a6d64
--- /dev/null
+++ b/test/saber/bm/test_saber_func_activation_BM.cpp
@@ -0,0 +1,183 @@
+#include "core/context.h"
+#include "funcs/activation.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+template <typename Tensor>
+void print_tensor_shape(std::string name, Tensor& t0) {
+
+    LOG(INFO) << name << " valid shape is ["
+              << t0.valid_shape()[0] << ", "
+              << t0.valid_shape()[1] << ", "
+              << t0.valid_shape()[2] << ", "
+              << t0.valid_shape()[3] << "].";
+
+    LOG(INFO) << name << " real shape is ["
+              << t0.shape()[0] << ", "
+              << t0.shape()[1] << ", "
+              << t0.shape()[2] << ", "
+              << t0.shape()[3] << "].";
+
+    LOG(INFO) << name << " offset is ["
+              << t0.offset()[0] << ", "
+              << t0.offset()[1] << ", "
+              << t0.offset()[2] << ", "
+              << t0.offset()[3] << "].";
+}
+
+TEST(TestSaberFuncBM, test_func_constructor) {
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1);
+    }
+
+    img_dev.copy_from(img_host);
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+
+    Context<BM> ctx1(0, 1, 1);
+
+    ActivationParam<TensorDf4> param(Active_elu, 0.1f, 0.1f);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act;
+    act.compute_output_shape(input, output, param);
+    output_dev.re_alloc(output[0]->shape());
+
+    // init assume output tensor has been reshpaed by user.
+    act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+    act(input, output, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+    output_dev.sync();
+    print_tensor_device(output_dev);
+    cudaDeviceSynchronize();
+    CUDA_POST_KERNEL_CHECK;
+}
+
+TEST(TestSaberFuncBM, test_func_sub_tensor) {
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1);
+    }
+
+    img_dev.copy_from(img_host);
+    Shape img_s_t0(img_num, in_channels, 4, 4);
+
+    TensorDf4 t0;
+    TensorDf4 t1;
+
+    t0.share_sub_buffer(img_dev, img_s_t0, {0, 0, 0, 0});
+    t1.share_sub_buffer(img_dev, img_s_t0, {0, 0, 4, 4});
+
+    print_tensor_shape("t0", t0);
+    print_tensor_shape("t1", t1);
+
+    TensorDf4 output_dev;
+
+    TensorDf4 out0;
+    TensorDf4 out1;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+    Context<BM> ctx2(0, 2, 2);
+
+    ActivationParam<TensorDf4> param1(Active_elu, 0.1f, 0.1f);
+    ActivationParam<TensorDf4> param2(Active_elu, 0.1f, 0.1f);
+
+    std::vector<TensorDf4*> input1, input2;
+    std::vector<TensorDf4*> output1, output2;
+
+    input1.push_back(&t0);
+    input2.push_back(&t1);
+
+    output1.push_back(&out0);
+    output2.push_back(&out1);
+
+    //FIXME where do I get img_s and all those shapes ????
+    output_dev.re_alloc(img_s);
+
+    out0.share_sub_buffer(output_dev, img_s_t0, {0, 0, 0, 0});
+    out1.share_sub_buffer(output_dev, img_s_t0, {0, 0, 4, 4});
+
+    print_tensor_shape("output_dev", output_dev);
+
+    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act1;
+    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act2;
+
+    act1.compute_output_shape(output1, input1, param1);
+    act2.compute_output_shape(output2, input2, param2);
+
+    print_tensor_shape("out0", out0);
+    print_tensor_shape("out1", out1);
+
+    // init assume output tensor has been reshpaed by user.
+    act1.init(input1, output1, param1, SPECIFY, SABER_IMPL, ctx1);
+    act1(input1, output1, param1, ctx1);
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output1[0]->record_event(cuda_stream);
+
+    act2.init(input2, output2, param2, SPECIFY, SABER_IMPL, ctx2);
+    act2(input2, output2, param2, ctx2);
+    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
+    output2[0]->record_event(cuda_stream2);
+
+    out0.sync();
+    out1.sync();
+    print_tensor_device(output_dev);
+    cudaDeviceSynchronize();
+    CUDA_POST_KERNEL_CHECK;
+}
+
+int main(int argc, const char** argv) {
+    Env<BM>::env_init();
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
new file mode 100644
index 000000000..7881cdb97
--- /dev/null
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -0,0 +1,725 @@
+#include "core/context.h"
+#include "funcs/conv.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+//#include "cublas.h"
+
+using namespace anakin::saber;
+
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+template <typename Tensor>
+void print_tensor_shape(std::string name, Tensor &t0) {
+
+            LOG(INFO) << name << " valid shape is ["
+                      << t0.valid_shape()[0] << ", "
+                      << t0.valid_shape()[1] << ", "
+                      << t0.valid_shape()[2] << ", "
+                      << t0.valid_shape()[3] << "].";
+
+            LOG(INFO) << name << " real shape is ["
+                      << t0.shape()[0] << ", "
+                      << t0.shape()[1] << ", "
+                      << t0.shape()[2] << ", "
+                      << t0.shape()[3] << "].";
+
+            LOG(INFO) << name << " offset is ["
+                      << t0.offset()[0] << ", "
+                      << t0.offset()[1] << ", "
+                      << t0.offset()[2] << ", "
+                      << t0.offset()[3] << "].";
+}
+
+
+
+#if 1
+TEST(TestSaberFuncBM, test_depthwise_conv) {
+
+    int group = 2;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 3;
+    int kernel_w = 3;
+    int out_channels = 2;
+    
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    bool bias_term = true;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+    
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 63 & i;
+    }
+
+    img_dev.copy_from(img_host);
+    
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+    
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorHf4 output_host;
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+    
+    ConvParam<TensorDf4> param(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
+    conv.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    output_host.re_alloc(output[0]->shape());
+
+    LOG(INFO) << "regular start with group = " << group;
+    // init assume output tensor has been reshpaed by user.
+    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+    conv(input, output, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+
+    output_dev.sync();
+    print_tensor_device(output_dev);
+
+//    param.group = 1;
+//    param.pad_h = 1;
+//    param.pad_w = 1;
+//
+//    LOG(INFO) << " param changed start with group = "<<param.group;
+//    conv(input, output, param, ctx1);
+//
+//    output_dev.sync();
+//    print_tensor_device(output_dev);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_conv_param_change) {
+
+    int group = 4;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 3;
+    int kernel_w = 3;
+    int out_channels = 4;
+
+    int img_num = 1;
+    int in_channels = 4;
+    int img_h = 65;
+    int img_w = 63;
+
+    bool bias_term = true;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorHf4 output_host;
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+
+    ConvParam<TensorDf4> param(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
+    conv.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    output_host.re_alloc(output[0]->shape());
+
+            LOG(INFO)<<"regular start with group = "<<group;
+    // init assume output tensor has been reshpaed by user.
+    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+    conv(input, output, param, ctx1);
+    output_dev.sync();
+//    print_tensor_device(output_dev);
+
+    param.group = 1;
+    param.pad_h = 1;
+    param.pad_w = 1;
+
+    LOG(INFO)<<" param changed start with group = "<<param.group;
+    conv(input, output, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+
+    output_dev.sync();
+//    print_tensor_device(output_dev);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
+
+    int group = 1;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 3;
+    int kernel_w = 3;
+    int out_channels = 2;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    bool bias_term = false;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    Shape img_s_sub(img_num, in_channels, 4, 4);
+
+    TensorDf4 t0;
+    TensorDf4 t1;
+
+    t0.share_sub_buffer(img_dev, img_s_sub, {0,0,0,0});
+    t1.share_sub_buffer(img_dev, img_s_sub, {0,0,4,4});
+
+    print_tensor_shape("t0", t0);
+    print_tensor_shape("t1", t1);
+
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+    Context<BM> ctx2(0, 2, 2);
+
+    TensorDf4 out0;
+    TensorDf4 out1;
+
+    ConvParam<TensorDf4> param0(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    ConvParam<TensorDf4> param1(group, pad_h, pad_w,
+                                stride_h, stride_w,
+                                dilation_h, dilation_w,
+                                &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input0, input1;
+    std::vector<TensorDf4*> output0, output1;
+
+    input0.push_back(&t0);
+    input1.push_back(&t1);
+
+    output0.push_back(&out0);
+    output1.push_back(&out1);
+
+    // FIXME ? where do i get output shape
+    output_dev.re_alloc(img_s);
+
+    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv0;
+    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv1;
+
+    conv0.compute_output_shape(input0, output0, param0);
+    conv1.compute_output_shape(input1, output1, param1);
+
+    out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0});
+    out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4});
+
+    conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1);
+    conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2);
+
+    conv0(input0, output0, param0, ctx1);
+    conv1(input1, output1, param1, ctx2);
+
+    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
+    output0[0]->record_event(cuda_stream1);
+
+    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
+    output1[0]->record_event(cuda_stream2);
+
+    out0.sync();
+    out1.sync();
+
+    print_tensor_device(output_dev);
+
+//    print_tensor_device(output_dev);
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+#endif
+
+TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
+
+    int group = 1;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 1;
+    int kernel_w = 1;
+    int out_channels = 128;
+
+    int img_num = 7;
+    int in_channels = 13;
+    int img_h = 32;
+    int img_w = 32;
+
+    bool bias_term = false;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 1;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+
+    ConvParam<TensorDf4> param(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Conv<BM, AK_FLOAT> conv;
+    conv.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \
+        << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]";
+    //LOG(INFO) << " blocks = [ " <<  i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; 
+    //选择k最小的那一组，如果一样，则选128*N，N最大的那一组
+    int k0 = i_div_up(out_channels, 128) * 128 - out_channels;
+    int k1 = i_div_up(out_channels, 64) * 64 - out_channels;
+    int k2 = i_div_up(out_channels, 32) * 32 - out_channels;
+    int kk = std::min(std::min(k0,k1),k2);
+    LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk;
+    if (kk == k0)
+        LOG(INFO) << "thread = [256,1,1] 128*128" ;
+    if (kk == k1)
+        LOG(INFO) << "thread = [128,1,1] 128*64" ;
+    if (kk == k2)
+        LOG(INFO) << "thread = [128,1,1] 128*32" ;
+
+    LOG(INFO) << "saber conv init";
+    conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
+
+    LOG(INFO) << "saber conv dispatch";
+    conv(input, output, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+
+    output_dev.sync();
+
+    SaberTimer<BM> t1;
+    int ts = 1;
+
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        conv(input, output, param, ctx1);
+        output_dev.sync();
+        t1.end(ctx1);
+    }
+
+    LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
+                         TensorDf4 &weights, int kernel_size, int stride, int pad,
+                         int in_channel, int out_channel, TensorDf4 &bias,
+                         anakin::saber::ImplEnum impl) {
+
+    ConvParam<TensorDf4> conv_param(1, pad, pad,
+                                    stride, stride,
+                                    1, 1,
+                                    &weights, &bias);
+    Conv<BM, AK_FLOAT> conv;
+    conv.compute_output_shape(inputs, outputs, conv_param);
+    outputs[0]->re_alloc(outputs[0]->shape());
+    Context<BM> ctx1(0, 1, 1);
+
+    SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1));
+
+    conv(inputs, outputs, conv_param, ctx1);
+    outputs[0]->record_event(ctx1.get_compute_stream());
+    outputs[0]->sync();
+
+    cudaDeviceSynchronize();
+
+    SaberTimer<BM> t1;
+    int ts = 100;
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        conv(inputs, outputs, conv_param, ctx1);
+        outputs[0]->record_event(ctx1.get_compute_stream());
+        outputs[0]->sync();
+        t1.end(ctx1);
+    }
+            LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
+
+    cudaDeviceSynchronize();
+}
+
+
+cublasHandle_t  cublas_handle;
+
+void caffe_gemm(const int M, const int N, const int K,\
+					 const float alpha, const float* A,\
+					 const float* B, const float beta, float* C) {
+    int lda = K;
+    int ldb = N;
+    CUBLAS_CHECK(cublasSgemm(cublas_handle,
+                             CUBLAS_OP_N,
+                             CUBLAS_OP_N,
+                             N, M, K,
+                             &alpha, B,
+                             ldb, A,
+                             lda, &beta,
+                             C, N));
+}
+
+TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
+    int img_num = 1;
+    int kernel = 1;
+
+//    int out_channels = 32;
+//    int in_channels = 128;
+//    int img_h = 52;
+//    int img_w = 112;
+//    int out_channels = 64;
+//    int in_channels = 256;
+//    int img_h = 26;
+//    int img_w = 56;
+    int out_channels = 128;
+    int in_channels = 512;
+    int img_h = 13;
+    int img_w = 28;
+
+//    int out_channels = 512;
+//    int in_channels = 128;
+//    int img_h = 13;
+//    int img_w = 28;
+
+    int pad = 0;
+    int stride = 1;
+    Context<BM> ctx1(0, 1, 1);
+
+    CUBLAS_CHECK(cublasCreate(&cublas_handle));
+    CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream()));
+
+    TensorDf4 weights;
+    weights.re_alloc({out_channels, in_channels, 1, 1});
+
+    TensorDf4 img;
+    img.re_alloc({1, in_channels, img_h, img_w});
+
+    TensorDf4 out;
+    out.re_alloc({1, out_channels, img_h, img_w});
+    TensorDf4 out_gemm;
+    out_gemm.re_alloc({1, out_channels, img_h, img_w});
+
+    fill_tensor_device_rand(weights, -1.f, 1.f);
+    fill_tensor_device_rand(img, -1.f, 1.f);
+
+    LOG(INFO) << "img_num: " << img_num;
+    LOG(INFO) << "kernel: " << kernel;
+    LOG(INFO) << "out_channels: " << out_channels;
+    LOG(INFO) << "in_channels: " << in_channels;
+    LOG(INFO) << "img_h: " << img_h;
+    LOG(INFO) << "img_w: " << img_w;
+    LOG(INFO) << "pad: " << pad;
+    LOG(INFO) << "stride: " << stride;
+
+    TensorDf4 bias;
+
+    std::vector<TensorDf4*> input_v;
+    std::vector<TensorDf4*> output_gemm_v, output_v;
+
+    input_v.push_back(&img);
+    output_v.push_back(&out);
+    output_gemm_v.push_back(&out_gemm);
+    cudaDeviceSynchronize();
+    test_conv_fp32_speed(input_v, output_v,
+                         weights, kernel, stride, pad,
+            in_channels, out_channels, bias,
+            SABER_IMPL);
+    cudaDeviceSynchronize();
+    caffe_gemm(out_channels, img_h * img_w, in_channels,\
+					 1.f, weights.data(),\
+					 img.data(), 0.f, out_gemm.mutable_data());
+    cudaDeviceSynchronize();
+    SaberTimer<BM> t1;
+    int ts = 100;
+
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        caffe_gemm(out_channels, img_h * img_w, in_channels,\
+					 1.f, weights.data(),\
+					 img.data(), 0.f, out_gemm.mutable_data());
+        out_gemm.record_event(ctx1.get_compute_stream());
+        out_gemm.sync();
+        t1.end(ctx1);
+    }
+    LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
+
+    cudaDeviceSynchronize();
+//    print_tensor_device(out);
+//    print_tensor_device(out_gemm);
+    TensorHf4 out_host;
+    TensorHf4 out_gemm_host;
+    out_host.re_alloc(out.shape());
+    out_host.copy_from(out);
+
+    out_gemm_host.re_alloc(out_gemm.shape());
+    out_gemm_host.copy_from(out_gemm);
+    double max_r, max_d;
+    tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d);
+    LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d;
+}
+
+int main(int argc, const char** argv){
+    anakin::saber::Env<BM>::env_init();
+
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
new file mode 100644
index 000000000..5101c75f8
--- /dev/null
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -0,0 +1,148 @@
+#include "core/context.h"
+#include "funcs/fc.h"
+#include "test_saber_func_fc_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+typedef TargetWrapper<BM> API;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef TensorDf4::Dtype ftype;
+
+void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
+                const TensorHf4& bias, TensorHf4& tout) {
+
+    int m = tin.num();
+    int k = tin.valid_size() / m;
+    int n = weight.valid_size() / k;
+    bool bias_term = bias.valid_size() > 0;
+
+    const float* din = tin.data();
+    const float* w = weight.data();
+    float* dout = tout.mutable_data();
+
+    for (int i = 0; i < m; ++i) {
+        float* pdout = dout + i * n;
+        const float* pdin = din + i * k;
+
+        for (int j = 0; j < n; ++j) {
+            if (bias_term) {
+                pdout[j] = bias.data()[j];
+            } else {
+                pdout[j] = 0;
+            }
+
+            for (int l = 0; l < k; ++l) {
+                pdout[j] += pdin[l] * w[l * n + j];
+            }
+        }
+    }
+}
+
+TEST(TestSaberFuncFcBM, test_func_fc) {
+
+    int test_iter = 100;
+    int w_in = 7;
+    int h_in = 7;
+    int ch_in = 512;
+    int num_in = 1;
+
+    int num_out = 4096;
+    int axis = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = {num_in, num_out, 1, 1};
+
+    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
+    TensorDf4 weight(sh_w);
+    Shape sh_b{1, 1, 1, num_out};
+    TensorDf4 bias(sh_b);
+    fill_tensor_device_const(weight, 1.f);
+    fill_tensor_device_const(bias, 1.f);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    TensorDf4 tdin;
+    TensorDf4 tdout;
+    tdin.re_alloc(shape_in);
+    fill_tensor_device_const(tdin, 1.f);
+    input_dev_4d.push_back(&tdin);
+    output_dev_4d.push_back(&tdout);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
+
+    Fc<BM, AK_FLOAT> fc;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
+    Shape va_sh = tdout.valid_shape();
+    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
+              va_sh[2] << ", " << va_sh[3];
+    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
+
+    LOG(INFO) << "FC initialization";
+    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
+
+    LOG(INFO) << "FC compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+        //cudaDeviceSynchronize();
+    }
+
+    CUDA_POST_KERNEL_CHECK;
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
+    //print_tensor_device(*output_dev_4d[0]);
+    //cudaDeviceSynchronize();
+
+    //! check result
+    TensorHf4 thin(shape_in);
+    TensorHf4 thout(shape_out);
+    TensorHf4 thw(sh_w);
+    TensorHf4 thb(sh_b);
+    thin.copy_from(tdin);
+    thw.copy_from(weight);
+    thb.copy_from(bias);
+    fc_compute(thin, thw, thb, thout);
+    //print_tensor_host(thout);
+
+    TensorHf4 thout_d(shape_out);
+    thout_d.copy_from(tdout);
+    double max_ratio = 0;
+    double max_diff = 0;
+    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
+    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
+    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
+
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
new file mode 100644
index 000000000..04b963675
--- /dev/null
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -0,0 +1,311 @@
+#include "core/context.h"
+#include "funcs/pooling.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include "funcs/timer.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+TEST(TestSaberFuncBM, test_func_pooling) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+    typename API::event_t event;
+    API::create_event(event);
+
+    typedef TargetWrapper<X86> X86_API;
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 4;
+    int img_h = 800;
+    int img_w = 1440;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorHf4 output_host;
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+
+    Context<BM> ctx1(0, 1, 1);
+    int window_h = 2;
+    int window_w = 2;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    LOG(INFO) << " img_num: " << img_num;
+    LOG(INFO) << " in_channels: " << in_channels;
+    LOG(INFO) << " img_h: " << img_h;
+    LOG(INFO) << " img_w: " << img_w;
+    LOG(INFO) << " window_h: " << window_h;
+    LOG(INFO) << " window_w: " << window_w;
+    LOG(INFO) << " pad_h: " << pad_h;
+    LOG(INFO) << " pad_w: " << pad_w;
+    LOG(INFO) << " stride_h: " << stride_h;
+    LOG(INFO) << " stride_w: " << stride_w;
+
+    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
+                                  , stride_h, stride_w, Pooling_max);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Pooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> pooling;
+    pooling.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    output_host.re_alloc(output[0]->shape());
+
+    // init assume output tensor has been reshpaed by user.
+    pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+    pooling(input, output, param, ctx1);
+
+    SaberTimer<BM> t1;
+    int ts = 1000;
+
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        pooling(input, output, param, ctx1);
+        output[0]->sync();
+        t1.end(ctx1);
+    }
+
+    output_dev.sync();
+    cudaDeviceSynchronize();
+    LOG(INFO) << " average time: " << t1.get_average_ms() << " ms";
+    LOG(INFO) << " tile 10% time: " << t1.get_tile_time(10) << " ms";
+    LOG(INFO) << " tile 50% time: " << t1.get_tile_time(50) << " ms";
+    LOG(INFO) << " tile 90% time: " << t1.get_tile_time(90) << " ms";
+    LOG(INFO) << " tile 95% time: " << t1.get_tile_time(95) << " ms";
+    LOG(INFO) << " tile 99% time: " << t1.get_tile_time(99) << " ms";
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_pooling_result) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+    typename API::event_t event;
+    API::create_event(event);
+
+    typedef TargetWrapper<X86> X86_API;
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+
+    Context<BM> ctx1(0, 1, 1);
+    int window_h = 2;
+    int window_w = 2;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+
+    LOG(INFO) << " img_num: " << img_num;
+    LOG(INFO) << " in_channels: " << in_channels;
+    LOG(INFO) << " img_h: " << img_h;
+    LOG(INFO) << " img_w: " << img_w;
+    LOG(INFO) << " window_h: " << window_h;
+    LOG(INFO) << " window_w: " << window_w;
+    LOG(INFO) << " pad_h: " << pad_h;
+    LOG(INFO) << " pad_w: " << pad_w;
+    LOG(INFO) << " stride_h: " << stride_h;
+    LOG(INFO) << " stride_w: " << stride_w;
+
+    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
+                                  , stride_h, stride_w, Pooling_max);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Pooling<BM, AK_FLOAT> pooling;
+    pooling.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+
+    // init assume output tensor has been reshpaed by user.
+    pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+    pooling(input, output, param, ctx1);
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+
+    output_dev.sync();
+    print_tensor_device(output_dev);
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+    typename API::event_t event;
+    API::create_event(event);
+
+    typedef TargetWrapper<X86> X86_API;
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorDf4 t0;
+    TensorDf4 t1;
+    Shape img_s_sub(img_num, in_channels, img_h / 2, img_w / 2);
+
+    t0.share_sub_buffer(img_dev, img_s_sub, {0, 0, 0, 0});
+    t1.share_sub_buffer(img_dev, img_s_sub, {0, 0, 4, 4});
+
+    TensorDf4 output_dev;
+
+    TensorDf4 out0;
+    TensorDf4 out1;
+
+    // start Reshape & doInfer
+
+    Context<BM> ctx1(0, 1, 1);
+    int window_h = 2;
+    int window_w = 2;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+
+    LOG(INFO) << " img_num: " << img_num;
+    LOG(INFO) << " in_channels: " << in_channels;
+    LOG(INFO) << " img_h: " << img_h;
+    LOG(INFO) << " img_w: " << img_w;
+    LOG(INFO) << " window_h: " << window_h;
+    LOG(INFO) << " window_w: " << window_w;
+    LOG(INFO) << " pad_h: " << pad_h;
+    LOG(INFO) << " pad_w: " << pad_w;
+    LOG(INFO) << " stride_h: " << stride_h;
+    LOG(INFO) << " stride_w: " << stride_w;
+
+    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
+                                  , stride_h, stride_w, Pooling_max);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Pooling<BM, AK_FLOAT> pooling;
+    Pooling<BM, AK_FLOAT> pooling0;
+    Pooling<BM, AK_FLOAT> pooling1;
+
+    pooling.compute_output_shape(input,output,  param);
+
+    Shape total_shape = output[0]->shape();
+
+    output_dev.re_alloc(total_shape);
+    Shape out_sub_shape = {total_shape[0], total_shape[1], total_shape[2] / 2, total_shape[3] / 2};
+
+    out0.share_sub_buffer(output_dev, out_sub_shape, {0, 0, 0, 0});
+    out1.share_sub_buffer(output_dev, out_sub_shape, {0, 0, out_sub_shape[2], out_sub_shape[3]});
+
+    std::vector<TensorDf4*> input0, input1;
+    std::vector<TensorDf4*> output0, output1;
+
+    input0.push_back(&t0);
+    input1.push_back(&t1);
+    output0.push_back(&out0);
+    output1.push_back(&out1);
+
+    // init assume output tensor has been reshpaed by user.
+    pooling0.init(input0, output0, param, SPECIFY, VENDER_IMPL, ctx1);
+    pooling0(input0, output0, param, ctx1);
+
+    pooling1.init(input1, output1, param, SPECIFY, VENDER_IMPL, ctx1);
+    pooling1(input1, output1, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    out0.record_event(cuda_stream);
+
+    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
+    out1.record_event(cuda_stream1);
+
+    out0.sync();
+    out1.sync();
+
+    print_tensor_device(output_dev);
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp
new file mode 100644
index 000000000..18479cd18
--- /dev/null
+++ b/test/saber/bm/test_saber_shape_BM.cpp
@@ -0,0 +1,126 @@
+#include "test_saber_shape_BM.h"
+#include "shape.h"
+#include "anakin_config.h"
+
+#ifdef USE_OPENMP
+#include <omp.h>
+#include <core/shape.h>
+#endif
+
+using namespace anakin;
+using namespace saber;
+
+
+TEST(TestSaberShapeBM, test_saber_shape) {
+
+    int dim = 4;
+    Shape sh4d0{0, 0, 0, 0};
+    CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error";
+
+    for (int i = 0; i < dim; ++i) {
+        CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error";
+    }
+
+    CHECK_EQ(sh4d0.count(), 0) << "check shape count error";
+
+    int N = 1;
+    int C = 3;
+    int H = 11;
+    int W = 11;
+    std::vector<int> sh_size = {N, C, H, W};
+    //Shape sh4d1(sh_size);
+    Shape sh4d1(N, C, H, W);
+    LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size();
+    CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!";
+    //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!";
+
+    CHECK_EQ(sh4d1[0], N) << "get shape size error";
+    CHECK_EQ(sh4d1[1], C) << "get shape size error";
+    CHECK_EQ(sh4d1[2], H) << "get shape size error";
+    CHECK_EQ(sh4d1[3], W) << "get shape size error";
+
+    //CHECK_EQ(sh4d2[0], N) << "get shape size error";
+    //CHECK_EQ(sh4d2[1], C) << "get shape size error";
+    //CHECK_EQ(sh4d2[2], H) << "get shape size error";
+    //CHECK_EQ(sh4d2[3], W) << "get shape size error";
+
+    CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed";
+
+    C = 10;
+    sh4d1[1] = C;
+    CHECK_EQ(sh4d1[1], C) << "set shape size error";
+
+    bool is_equal = (sh4d0 == sh4d1);
+    CHECK_EQ(is_equal, false) << "check shape is_equal failed";
+
+    sh4d0 = sh4d1;
+    CHECK_EQ(sh4d1[0], N) << "constructor failed";
+    CHECK_EQ(sh4d1[1], C) << "get shape size error";
+    CHECK_EQ(sh4d1[2], H) << "get shape size error";
+    CHECK_EQ(sh4d1[3], W) << "get shape size error";
+
+    Shape sh4d3 = sh4d1;
+    CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error";
+
+    Shape sh4d4(sh4d1);
+    CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error";
+
+    Shape sh1d0{0};
+    //std::vector<int> sh1d_size = {W};
+
+    //Shape sh1d1(sh1d_size);
+    //Shape sh1d0{W};
+    Shape sh1d1(W);
+
+    Shape sh1d3 = sh1d1;
+    Shape sh1d4(sh1d1);
+
+    CHECK_EQ(sh1d0.dims(), 1) << "shape dim error";
+
+    CHECK_EQ(sh1d0.count(), 0) << "shape size error";
+
+    CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error";
+
+    CHECK_EQ(sh1d1[0], W) << "get shape size error";
+
+    //CHECK_EQ(sh1d2.count(0), W) << "shape dim error";
+
+    CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error";
+
+    CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error";
+
+    CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error";
+
+    Shape sh0{2, 2, 3, 4};
+    Shape sh1{2, 1, 1, 24};
+    Shape sh2{2, 2, 3, 4};
+    Shape sh3{1, 1, 2, 3};
+
+    CHECK_EQ(sh0 == sh2, true) << "error ==";
+    CHECK_EQ(sh3 < sh0, true) << "error <";
+    CHECK_EQ(sh3 >= sh0, false) << "error >=";
+    CHECK_EQ(sh3 > sh0, false) << "error >";
+    CHECK_EQ(sh0 > sh3, true) << "error >";
+    CHECK_EQ(sh0 < sh1, false) << "error <";
+    CHECK_EQ(sh0 <= sh2, true) << "error <=";
+    CHECK_EQ(sh0 >= sh2, true) << "error >=";
+
+    Shape sh001 = Shape::zero(2);
+    Shape sh002 = Shape::zero(3);
+
+    if (sh001 > sh002) {
+        LOG(ERROR) << "error <";
+    }
+
+}
+
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
+
diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h
new file mode 100644
index 000000000..a2ca02c9b
--- /dev/null
+++ b/test/saber/bm/test_saber_shape_BM.h
@@ -0,0 +1,25 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "saber/core/shape.h"
+
+using namespace anakin::test;
+
+class TestSaberShapeBM : public Test {
+public:
+    TestSaberShapeBM() {}
+    ~TestSaberShapeBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+protected:
+    std::string name;
+    std::string _test;
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
new file mode 100644
index 000000000..d9c65c7b4
--- /dev/null
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -0,0 +1,642 @@
+#include "test_saber_tensor_BM.h"
+#include "tensor_op.h"
+#include <vector>
+using namespace anakin::saber;
+
+typedef TargetWrapper<X86> X86_API;
+typedef TargetWrapper<BM> BM_API;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef TensorHf4::Dtype dtype;
+
+TEST(TestSaberTensorBM, test_tensor_constructor) {
+
+    //! test empty constructor
+    LOG(INFO) << "test default (empty) constructor";
+    TensorHf4 thost0;
+    TensorDf4 tdev0;
+
+    //! test tensor re_alloc function empty constructor
+    Shape sh0(2, 2, 8, 8);
+    LOG(INFO) << "|--test tensor re_alloc function on empty tensor";
+    thost0.re_alloc(sh0);
+    tdev0.re_alloc(sh0);
+    LOG(INFO) << "|--tensor size of host: " << thost0.size();
+    LOG(INFO) << "|--tensor size of device: " << tdev0.size();
+    CHECK_EQ(thost0.size(), 256) << "error with tensor size";
+    CHECK_EQ(tdev0.size(), 256) << "error with tensor size";
+/*
+    //! test tensor re_alloc function on tensor with data
+    LOG(INFO) << "|--test tensor re_alloc function on tensor with data";
+    Shape sh1(1, 2, 4, 4);
+    thost0.re_alloc(sh1);
+    tdev0.re_alloc(sh1);
+    LOG(INFO) << "|--tensor size of host: " << thost0.size();
+    LOG(INFO) << "|--tensor size of device: " << tdev0.size();
+    CHECK_EQ(thost0.size(), 32) << "error with tensor size";
+    CHECK_EQ(tdev0.size(), 32) << "error with tensor size";
+
+    //! test tensor shape() function
+    LOG(INFO) << "|--test tensor shape() function";
+    Shape sho = thost0.shape();
+    LOG(INFO) << "|--shape of tensor: " << sho[0] << ", " << sho[1] << "," << sho[2] << "," << sho[3];
+    LOG(INFO) << "|--test get tensor n, c, h, w function, num = " \
+              << thost0.num() << ", channel = " << thost0.channel() << ", height = " \
+              << thost0.height() << ", width = " << thost0.width();
+
+    //! test tensor mutable_data() function
+    LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 1.f";
+    fill_tensor_host_const(thost0, 1.f);
+    LOG(INFO) << "|--test tensor data() function, show the const data, 1.f";
+    print_tensor_host(thost0);
+
+    //! test tensor constructor with shape
+    LOG(INFO) << "test tensor constructor with shape";
+    TensorHf4 thost1(sh1);
+    TensorDf4 tdev1(sh1);
+
+    //! test tensor copy_from() function
+    LOG(INFO) << "test copy_from() function, input tensor could be any target";
+    thost1.copy_from(thost0);
+    tdev1.copy_from(thost0);
+    print_tensor_device(tdev1);
+    cudaDeviceSynchronize();
+    thost1.copy_from(tdev1);
+    tdev1.copy_from(tdev0);
+    print_tensor_host(thost1);
+
+    //! test tensor constructor with shape and real_shape
+    LOG(INFO) << "test tensor constructor with shape and real_shape";
+    //! constructor with 3 shapes is removed
+    TensorHf4 thost2(sh0);
+    TensorDf4 tdev2(sh0);
+
+    //! test tensor constructor with data, if target is different, create buffer, and copy the data
+    LOG(INFO) <<
+              "test tensor constructor with data, if target is different, create buffer, and copy the data";
+    dtype* host_data_ptr;
+    dtype* dev_data_ptr;
+    void* tmp_pt_host;
+    void* tmp_pt_dev;
+    X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count());
+    host_data_ptr = static_cast<dtype*>(tmp_pt_host);
+
+    for (int i = 0; i < sh1.count(); ++i) {
+        host_data_ptr[i] = i;
+    }
+
+    NV_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
+    dev_data_ptr = static_cast<dtype*>(tmp_pt_dev);
+    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
+    LOG(INFO) << "|--construct host tensor from host data ptr";
+    TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+    LOG(INFO) << "|--constructor device tensor from host data ptr";
+    TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+    print_tensor_host(thost3);
+    print_tensor_device(tdev3);
+    cudaDeviceSynchronize();
+
+    LOG(INFO) << "|--construct host tensor from device data ptr";
+    TensorHf4 thost4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1);
+    LOG(INFO) << "|--constructor device tensor from device data ptr";
+    TensorDf4 tdev4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1);
+    print_tensor_host(thost4);
+    print_tensor_device(tdev4);
+    NV_API::stream_t dev_stream0;
+    NV_API::create_stream_with_flag(dev_stream0, 1);
+    cudaDeviceSynchronize();
+
+    //! test tensor copy constructor
+    LOG(INFO) << "test tensor copy constructor";
+    LOG(INFO) << "|--normal copy constructor";
+    TensorHf4 thost5(thost4);
+    TensorDf4 tdev5(tdev4);
+
+    LOG(INFO) << "|--push back to vector";
+    std::vector<TensorHf4> vthost;
+    std::vector<TensorDf4> vtdev;
+    vthost.push_back(thost0);
+    vthost.push_back(thost1);
+    vthost.push_back(thost2);
+    vthost.push_back(thost3);
+    vthost.push_back(thost4);
+    vthost.push_back(thost5);
+    vtdev.push_back(tdev0);
+    vtdev.push_back(tdev1);
+    vtdev.push_back(tdev2);
+    vtdev.push_back(tdev3);
+    vtdev.push_back(tdev4);
+    vtdev.push_back(tdev5);
+    print_tensor_host(vthost[5]);
+    print_tensor_device(vtdev[5]);
+    cudaDeviceSynchronize();
+
+    //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied
+    LOG(INFO) << "test share_from function";
+    TensorHf4 thost6, thost7;
+    TensorDf4 tdev6, tdev7;
+    thost6.set_shape(thost4.shape());
+    thost7.set_shape(thost4.shape());
+    tdev6.set_shape(thost4.shape());
+    tdev7.set_shape(thost4.shape());
+    Shape sh2(1, 2, 2, 2);
+    Shape offset(0, 0, 1, 1);
+    LOG(INFO) << "|--shared host";
+    thost6.share_sub_buffer(thost4, sh2, offset);
+    LOG(INFO) << "|--copied host";
+    tdev6.share_from(thost4);
+    LOG(INFO) << "|--copied device";
+    thost7.share_from(tdev4);
+    LOG(INFO) << "|--shared device";
+    tdev7.share_from(tdev4);
+
+    LOG(INFO) << "|--change data in shared tensor";
+
+    //Shape sh_real = thost6.shape();
+    //Shape sh_act = thost6.valid_shape();
+    //Shape offset_act = thost6.offset();
+
+    //int start_w = offset_act[3];
+    //int start_h = offset_act[2];
+    //int start_c = offset_act[1];
+    //int start_n = offset_act[0];
+    //int stride_h = sh_real.count(3);
+    //int stride_c = sh_real.count(2);
+    //int stride_n = sh_real.count(1);
+    //int stride_n = sh_real.count(0);
+    Shape stride = thost6.get_stride();
+    int w = thost6.width();
+    int h = thost6.height();
+    int c = thost6.channel();
+    int n = thost6.num();
+
+    dtype* ptr_host = thost6.mutable_data();
+
+    for (int in = 0; in < n; ++in) {
+        dtype* ptr_batch = ptr_host + in * stride[0];
+
+        for (int ic = 0; ic < c; ++ic) {
+            dtype* ptr_channel = ptr_batch + ic * stride[1];
+
+            for (int ih = 0; ih < h; ++ih) {
+                dtype* ptr_row = ptr_channel + ih * stride[2];
+
+                for (int iw = 0; iw < w; ++iw) {
+                    ptr_row[iw] = 1.f;
+                }
+            }
+        }
+    }
+
+    LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
+    print_tensor_host(thost4);
+
+    //! test record tensor event
+    LOG(INFO) << "test record tensor event";
+    NV_API::stream_t dev_stream;
+    NV_API::stream_t dev_stream1;
+    NV_API::create_stream_with_flag(dev_stream, 1);
+    NV_API::create_stream_with_flag(dev_stream1, 1);
+    X86_API::stream_t host_stream;
+    X86_API::create_stream_with_flag(host_stream, 1);
+    LOG(INFO) << "|--test record event on host tensor";
+    fill_tensor_host_const(thost4, 888.f);
+    thost4.record_event(host_stream);
+    thost4.sync();
+    print_tensor_host(thost4);
+    LOG(INFO) << "|--test record event on device tensor";
+    fill_tensor_device_const(tdev4, 666.f, dev_stream);
+    tdev4.record_event(dev_stream);
+    tdev4.sync();
+    print_tensor_device(tdev4, dev_stream1);
+    tdev4.record_event(dev_stream1);
+    tdev4.sync();
+}
+
+TEST(TestSaberTensorNV, test_tensor_deepcopy) {
+    //! tensor constructor with alloc data, if target is different, create buffer, and copy the data
+    LOG(INFO) << "test tensor deep copy";
+    Shape sh0(2, 2, 4, 4);
+    Shape va_sh0(2, 2, 2, 2);
+    Shape off_sh0(0, 0, 1, 1);
+
+    Shape sh1(2, 2, 4, 4);
+    Shape va_sh1(va_sh0);
+    Shape off_sh1(0, 0, 1, 0);
+
+    Shape sh2(2, 32);
+    Shape va_sh2(2, 8);
+    Shape off_sh2(0, 8);
+
+    X86_API::stream_t x86_stream;
+    NV_API::stream_t nv_stream;
+    X86_API::create_stream(x86_stream);
+    NV_API::create_stream(nv_stream);
+
+    //! create source tensor, th0, td0, th01, td01, th1, td1;
+    TensorHf4 th0(sh0);
+
+    for (int i = 0; i < sh0.count(); ++i) {
+        th0.mutable_data()[i] = i;
+    }
+
+    TensorHf4 th1(va_sh0);
+
+    for (int i = 0; i < va_sh0.count(); ++i) {
+        th1.mutable_data()[i] = i;
+    }
+
+    TensorHf4 th01;
+    th01.share_sub_buffer(th0, va_sh0, off_sh0);
+
+    TensorDf4 td0, td1, td01;
+    td0.set_shape(th0.shape());
+    td1.set_shape(th1.shape());
+    td0.share_from(th0);
+    td1.share_from(th1);
+    TensorDf4 dev_tmp0;
+    dev_tmp0.set_shape(th0.shape());
+    dev_tmp0.share_from(th0);
+    td01.share_sub_buffer(dev_tmp0, va_sh0, off_sh0);
+
+    print_tensor_host(th0);
+    print_tensor_host(th1);
+    print_tensor_device(td0);
+    print_tensor_device(td1);
+
+    //! create th2, th3, th21, td2, td3, td21 as dst tensor
+    TensorHf2 th2(sh2);
+    fill_tensor_host_const(th2, 0.f);
+    TensorHf2 th21;
+    th21.share_sub_buffer(th2, va_sh2, off_sh2);
+    TensorHf2 th3(va_sh2);
+
+    TensorDf2 td2(sh2);
+    fill_tensor_device_const(td2, 0.f);
+    cudaDeviceSynchronize();
+    TensorDf2 td21;
+    td21.share_sub_buffer(td2, va_sh2, off_sh2);
+    TensorDf2 td3(va_sh2);
+
+    double max_diff;
+    double  max_ratio;
+    //! test tensor deep copy, entire buffer copy
+    LOG(INFO) << "test tensor deep copy, entire buffer copy, H2H";
+    th3.copy_from(th1);
+    print_tensor_host(th3);
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, H2H";
+    fill_tensor_host_const(th3, 0.f);
+    th3.async_copy_from(th1, x86_stream);
+    th3.record_event(x86_stream);
+    th3.sync();
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, H2H";
+
+    LOG(INFO) << "test tensor deep copy, entire buffer copy, D2H";
+    th3.copy_from(td1);
+    print_tensor_host(th3);
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H";
+    fill_tensor_host_const(th3, 0.f);
+    th3.async_copy_from(td1, nv_stream);
+    th3.record_event(x86_stream);
+    th3.sync();
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2H";
+
+    LOG(INFO) << "test tensor deep copy, entire buffer copy, H2D";
+    td3.copy_from(th1);
+    print_tensor_device(td3);
+    cudaDeviceSynchronize();
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H";
+    fill_tensor_device_const(td3, 0.f);
+    cudaDeviceSynchronize();
+    td3.async_copy_from(th1, nv_stream);
+    td3.record_event(nv_stream);
+    td3.sync();
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2H";
+
+    LOG(INFO) << "test tensor deep copy, entire buffer copy, D2D";
+    td3.copy_from(td1);
+    print_tensor_device(td3);
+    cudaDeviceSynchronize();
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2D";
+    fill_tensor_device_const(td3, 0.f);
+    cudaDeviceSynchronize();
+    td3.async_copy_from(td1, nv_stream);
+    td3.record_event(nv_stream);
+    td3.sync();
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2D";
+
+
+    //! test tensor deep copy, src with roi
+    LOG(INFO) << "test tensor deep copy, src with roi, H2H";
+    th3.copy_from(th01);
+    print_tensor_host(th3);
+
+    LOG(INFO) << "test tensor deep copy, src with roi, D2H";
+    th3.copy_from(td01);
+    print_tensor_host(th3);
+
+    LOG(INFO) << "test tensor deep copy, src with roi, H2D";
+    td3.copy_from(th01);
+    print_tensor_device(td3);
+    cudaDeviceSynchronize();
+
+    LOG(INFO) << "test tensor deep copy, src with roi, D2D";
+    td3.copy_from(td01);
+    print_tensor_device(td3);
+    cudaDeviceSynchronize();
+
+
+    //! test tensor deep copy, dst with roi
+    LOG(INFO) << "test tensor deep copy, dst with roi, H2H";
+    print_tensor_host(th21);
+    print_tensor_host(th1);
+    th21.copy_from(th1);
+    print_tensor_host(th21);
+
+    LOG(INFO) << "test tensor deep copy, dst with roi, D2H";
+    th21.copy_from(td1);
+    print_tensor_host(th21);
+
+    LOG(INFO) << "test tensor deep copy, dst with roi, H2D";
+    td21.copy_from(th1);
+    print_tensor_device(td21);
+    cudaDeviceSynchronize();
+
+    LOG(INFO) << "test tensor deep copy, dst with roi, D2D";
+    td21.copy_from(td1);
+    print_tensor_device(td21);
+    cudaDeviceSynchronize();
+
+
+    //! test tensor deep copy, src and dst are with roi
+    LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2H";
+    th21.copy_from(th01);
+    print_tensor_host(th21);
+
+    LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2H";
+    th21.copy_from(td01);
+    print_tensor_host(th21);
+
+    LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2D";
+    td21.copy_from(th01);
+    print_tensor_device(td21);
+    cudaDeviceSynchronize();
+
+    LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2D";
+    td21.copy_from(td01);
+    print_tensor_device(td21);
+    cudaDeviceSynchronize();
+}
+
+TEST(TestSaberTensorNV, test_tensor_shape) {
+    typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4_0;
+    typedef Tensor<X86, AK_FLOAT, NHWC> Tensor4_1;
+    typedef Tensor<X86, AK_FLOAT, HW> Tensor2;
+
+    int nin = 2;
+    int cin = 2;
+    int hin = 4;
+    int win = 4;
+
+    LOG(INFO) << "test tensor interface";
+
+    Tensor4_0 t1(Shape(nin, cin, hin, win));
+    Tensor4_1 t2(Shape(nin, hin, win, cin));
+    Tensor2 t3(Shape(hin, win));
+
+    LOG(INFO) << "test tensor with layout of NCHW";
+    LOG(INFO) << "num: " << t1.num() << ", num idx: " << t1.num_index() << \
+              ", channel: " << t1.channel() << ", channel idx: " << t1.channel_index() << \
+              ", height: " << t1.height() << ", height idx: " << t1.height_index() << \
+              ", widhth: " << t1.width() << ", width idx: " << t1.width_index();
+
+    CHECK_EQ(t1.num(), nin) << "NCHW get num error";
+    CHECK_EQ(t1.channel(), cin) << "NCHW get channel error";
+    CHECK_EQ(t1.height(), hin) << "NCHW get height error";
+    CHECK_EQ(t1.width(), win) << "NCHW get width error";
+
+    CHECK_EQ(t1.num_index(), 0) << "NCHW get num index error";
+    CHECK_EQ(t1.channel_index(), 1) << "NCHW get channel index error";
+    CHECK_EQ(t1.height_index(), 2) << "NCHW get height index error";
+    CHECK_EQ(t1.width_index(), 3) << "NCHW get width index error";
+
+    LOG(INFO) << "test tensor with layout of NHWC";
+    LOG(INFO) << "num: " << t2.num() << ", num idx: " << t2.num_index() << \
+              ", channel: " << t2.channel() << ", channel idx: " << t2.channel_index() << \
+              ", height: " << t2.height() << ", height idx: " << t2.height_index() << \
+              ", widhth: " << t2.width() << ", width idx: " << t2.width_index();
+
+    CHECK_EQ(t2.num(), nin) << "NHWC get num error";
+    CHECK_EQ(t2.channel(), cin) << "NHWC get channel error";
+    CHECK_EQ(t2.height(), hin) << "NHWC get height error";
+    CHECK_EQ(t2.width(), win) << "NHWC get width error";
+
+    CHECK_EQ(t2.num_index(), 0) << "NHWC get num index error";
+    CHECK_EQ(t2.channel_index(), 3) << "NHWC get channel index error";
+    CHECK_EQ(t2.height_index(), 1) << "NHWC get height index error";
+    CHECK_EQ(t2.width_index(), 2) << "NHWC get width index error";
+
+    LOG(INFO) << "test tensor with layout of HW";
+    LOG(INFO) << "num: " << t3.num() << ", num idx: " << t3.num_index() << \
+              ", channel: " << t3.channel() << ", channel idx: " << t3.channel_index() << \
+              ", height: " << t3.height() << ", height idx: " << t3.height_index() << \
+              ", widhth: " << t3.width() << ", width idx: " << t3.width_index();
+
+    CHECK_EQ(t3.num(), 1) << "HW get num error";
+    CHECK_EQ(t3.channel(), 1) << "HW get channel error";
+    CHECK_EQ(t3.height(), hin) << "HW get height error";
+    CHECK_EQ(t3.width(), win) << "HW get width error";
+
+    CHECK_EQ(t3.num_index(), -1) << "HW get num index error";
+    CHECK_EQ(t3.channel_index(), -1) << "HW get channel index error";
+    CHECK_EQ(t3.height_index(), 0) << "HW get height index error";
+    CHECK_EQ(t3.width_index(), 1) << "HW get width index error";
+
+}
+
+TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
+
+    LOG(INFO) << "test tensor reshape and re_alloc funcs";
+
+    Shape sh0(2, 2, 2, 2);
+    Shape sh1(2, 2, 4, 4);
+    TensorHf4 th0(sh1);
+    TensorDf4 td0(sh1);
+    fill_tensor_host_const(th0, 1);
+    fill_tensor_device_const(td0, 1);
+    LOG(INFO) << "ori tensor with size: " << th0.valid_size();
+    print_tensor_host(th0);
+    print_tensor_device(td0);
+    cudaDeviceSynchronize();
+
+    th0.reshape(sh0);
+    td0.reshape(sh0);
+    LOG(INFO) << "tensor after reshape(from big space to small) with size: " << th0.valid_size();
+    print_tensor_host(th0);
+    print_tensor_device(td0);
+    cudaDeviceSynchronize();
+    fill_tensor_host_const(th0, 1);
+    fill_tensor_device_const(td0, 1);
+    cudaDeviceSynchronize();
+
+    th0.reshape(sh1);
+    td0.reshape(sh1);
+    LOG(INFO) << "tensor after reshape(from small to big, not larger than ori) with size: " <<
+              th0.valid_size();
+    print_tensor_host(th0);
+    print_tensor_device(td0);
+    cudaDeviceSynchronize();
+
+    th0.re_alloc(sh0);
+    td0.re_alloc(sh0);
+    LOG(INFO) << "tensor after re_alloc(from big space to small) with size: " << th0.valid_size();
+    print_tensor_host(th0);
+    print_tensor_device(td0);
+    cudaDeviceSynchronize();
+
+    TensorHf4 th1(sh0);
+    TensorDf4 td1(sh0);
+    LOG(INFO) << "ori tensor with size: " << th1.valid_size();
+    fill_tensor_host_const(th1, 1);
+    fill_tensor_device_const(td1, 1);
+    cudaDeviceSynchronize();
+    print_tensor_host(th1);
+    print_tensor_device(td1);
+    cudaDeviceSynchronize();
+
+    th1.reshape(sh1);
+    td1.reshape(sh1);
+    LOG(INFO) << "tensor after reshape(from small space to big) with size: " << th1.valid_size();
+    //printf("real_shape: %d,%d, %d, %d, valid_shape: %d, %d, %d, %d\n", \
+    th1.shape()[0], th1.shape()[1], th1.shape()[2], th1.shape()[3], \
+    th1.valid_shape()[0], th1.valid_shape()[1], th1.valid_shape()[2], th1.valid_shape()[3]);
+    print_tensor_host(th1);
+    print_tensor_device(td1);
+    cudaDeviceSynchronize();
+    fill_tensor_host_const(th1, 1);
+    fill_tensor_device_const(td1, 1);
+    cudaDeviceSynchronize();
+
+    th1.reshape(sh0);
+    td1.reshape(sh0);
+
+    LOG(INFO) << "tensor after re_alloc(from small space to big) with size: " << th1.valid_size();
+    th1.re_alloc(sh1);
+    td1.re_alloc(sh1);
+    print_tensor_host(th1);
+    print_tensor_device(td1);
+    cudaDeviceSynchronize();
+
+}
+
+TEST(TestSaberTensorNV, test_tensor_op) {
+    Shape sh{1, 2, 2, 10};
+    TensorDf4 td1(sh);
+    TensorHf4 th1(sh);
+    Tensor<NV, AK_INT8, NCHW> td2(sh);
+    Tensor<X86, AK_INT8, NCHW> th2(sh);
+    LOG(INFO) << "testing host fill tensor with const 1.";
+    fill_tensor_host_const(th1, 1.f);
+    LOG(INFO) << "data type: float";
+    print_tensor_host(th1);
+    fill_tensor_host_const(th2, 1);
+    LOG(INFO) << "data type: int8";
+    print_tensor_host(th2);
+
+    LOG(INFO) << "testing device fill tensor with const 1.";
+    fill_tensor_device_const(td1, 1.f);
+    LOG(INFO) << "data type: float";
+    print_tensor_device(td1);
+    fill_tensor_device_const(td2, 1);
+    LOG(INFO) << "data type: int8";
+    print_tensor_device(td2);
+
+    LOG(INFO) << "testing host fill tensor with rand";
+    fill_tensor_host_rand(th1);
+    LOG(INFO) << "data type: float";
+    print_tensor_host(th1);
+    fill_tensor_host_rand(th2);
+    LOG(INFO) << "data type: int8";
+    print_tensor_host(th2);
+
+    LOG(INFO) << "testing device fill tensor with rand";
+    fill_tensor_device_rand(td1);
+    LOG(INFO) << "data type: float";
+    print_tensor_device(td1);
+    fill_tensor_device_rand(td2);
+    LOG(INFO) << "data type: int8";
+    print_tensor_device(td2);
+
+    LOG(INFO) << "testing host fill tensor with rand from 1 to 10";
+    fill_tensor_host_rand(th1, 1, 10);
+    LOG(INFO) << "data type: float";
+    print_tensor_host(th1);
+    fill_tensor_host_rand(th2, 1, 10);
+    LOG(INFO) << "data type: int8";
+    print_tensor_host(th2);
+
+    LOG(INFO) << "testing device fill tensor with rand from 1 to 10";
+    fill_tensor_device_rand(td1, 1, 10);
+    LOG(INFO) << "data type: float";
+    print_tensor_device(td1);
+    fill_tensor_device_rand(td2, 1, 10);
+    LOG(INFO) << "data type: int8";
+    print_tensor_device(td2);
+}
+
+TEST(TestSaberTensorNV, test_tensor_share_diff_dtype) {
+    Shape sh{1, 1, 2, 10};
+    Tensor<NV, AK_FLOAT, NCHW> td1(sh);
+    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
+    Tensor<NV, AK_INT8, NCHW> td2;
+    Tensor<X86, AK_INT8, NCHW> th2;
+    td2.set_shape(sh);
+    th2.set_shape(sh);
+    LOG(INFO) << "testing host fill tensor with const 1.";
+    fill_tensor_host_const(th1, -1);
+    LOG(INFO) << "data type: float";
+    print_tensor_host(th1);
+    fill_tensor_device_const(td1, -1);
+    LOG(INFO) << "data type: int8";
+    print_tensor_device(td1);
+    cudaDeviceSynchronize();
+
+    td2.share_from(td1);
+    th2.share_from(th1);
+
+    print_tensor_host(th2);
+    print_tensor_device(td2);
+    cudaDeviceSynchronize();
+}
+
+TEST(TestSaberTensorNV, test_tensor_base_type) {
+    Shape sh(1, 3, 10, 10);
+    Tensor<NV, AK_FLOAT, NCHW> td1(sh);
+    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
+    fill_tensor_host_rand(th1, 0.f, 255.f);
+    td1.copy_from(th1);
+    TensorBase* tb1;
+    TensorBase* tb2;
+    tb1 = &th1;
+    Shape sh1(1, 1, 10, 10);
+    tb1->set_shape(sh1);
+    Shape sh11 = th1.valid_shape();
+    LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \
+              ", h=" << sh11[2] << ", w=" << sh11[3];
+*/
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_tensor_BM.h b/test/saber/bm/test_saber_tensor_BM.h
new file mode 100644
index 000000000..32a402258
--- /dev/null
+++ b/test/saber/bm/test_saber_tensor_BM.h
@@ -0,0 +1,21 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/tensor.h"
+
+using namespace anakin::test;
+
+class TestSaberTensorBM : public Test {
+public:
+    TestSaberTensorBM() {}
+    ~TestSaberTensorBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H

From 7f726a39e850635d056a43d9a1a751046a57d745 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 18 Jun 2018 14:14:23 +0800
Subject: [PATCH 002/318] Fix cmake issues

---
 CMakeLists.txt       | 12 ++++++------
 saber/CMakeLists.txt |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0a81d7c02..ccb37468f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,12 +65,12 @@ anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plan
 anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM)
 
 # compile options for BM place
-anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU)
-anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM)
-anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM)
-anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM)
-anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM)
-anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM)
+#anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU)
+#anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM)
+#anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM)
+#anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM)
+#anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM)
+#anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM)
 
 
 if(USE_CUDA)
diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 440d1de07..90c7f5c19 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -84,10 +84,10 @@ if(USE_BM)
     set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
     set(CMAKE_CXX_FLAGS "")
     if(BUILD_SHARED)
-        CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+        #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
     endif()
     if(BUILD_STATIC)
-        CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+        #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
     endif()
     set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
 

From 9221195043c6034f3dc8190601e6858c1d5d45ca Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 19 Jun 2018 11:04:03 +0800
Subject: [PATCH 003/318] Resolve BM library compilation issue

---
 saber/CMakeLists.txt                    | 10 +++++-----
 saber/funcs/impl/bm/base/CMakeLists.txt |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 90c7f5c19..ac0ebba2a 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -59,7 +59,7 @@ if(USE_CUDA)
 	set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
 	set(CMAKE_CXX_FLAGS "")
 	if(BUILD_SHARED)
-    		CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+        CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
 	endif()
 	if(BUILD_STATIC)
 		CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS STATIC ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
@@ -83,12 +83,12 @@ if(USE_BM)
 
     set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
     set(CMAKE_CXX_FLAGS "")
-    if(BUILD_SHARED)
+    #if(BUILD_SHARED)
         #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
-    endif()
-    if(BUILD_STATIC)
+    #endif()
+    #if(BUILD_STATIC)
         #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
-    endif()
+    #endif()
     set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
 
     set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY}
diff --git a/saber/funcs/impl/bm/base/CMakeLists.txt b/saber/funcs/impl/bm/base/CMakeLists.txt
index fd4b3d680..59b82abb5 100644
--- a/saber/funcs/impl/bm/base/CMakeLists.txt
+++ b/saber/funcs/impl/bm/base/CMakeLists.txt
@@ -7,7 +7,7 @@
 
 if(USE_BM)
     anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/include "h" ANAKIN_SABER_BM_C_SRC)
-    anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "o" ANAKIN_SABER_BM_STATIC_LIB)
+    anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "so" ANAKIN_SABER_BM_STATIC_LIB)
 endif()
 
 macro(anakin_set_upscope src)

From 0244de9ea7f10b8525eb4bbf4d9be98fb2347721 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 19 Jun 2018 11:30:42 +0800
Subject: [PATCH 004/318] Remove unnecessary files

---
 .../impl/bm/base/include/bmruntime/bmcnnctx.h | 58 --------------
 .../impl/bm/base/include/bmruntime/bmnet.h    | 78 -------------------
 2 files changed, 136 deletions(-)
 delete mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmnet.h

diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h b/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
deleted file mode 100644
index 6b0bfe857..000000000
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef __BM_CNN_CONTEXT_H__
-#define __BM_CNN_CONTEXT_H__
-
-#include <string>
-#include "bmruntime.h"
-
-namespace bmcnn {
-
-typedef void *bmcnn_ctx_t;
-/**
- * \brief Create context of BMCNN.
- *
- * \param ctx_dir - Directory of context files generated by BMNETC
- *
- * \note
- * The context will be created in the device of ID 0.\n
- *  
- * \return
- * NULL - Creating failed.\n
- * non-NULL - The handle of the context (creating succeeded).\n
- */
-bmcnn_ctx_t bmcnn_ctx_create(const std::string &ctx_dir);
-/**
- * \brief Destroy context of BMCNN
- * 
- * \param handle - Handle of the context to be destroyed
- */
-void bmcnn_ctx_destroy(bmcnn_ctx_t handle);
-/**
- * \brief Create context of BMCNN in specific devide.
- * 
- * \param ctx_dir - Directory of context files generated by BMNETC
- * \param devid - ID of device where the context will be placed.
- *
- * \note
- * Call \ref bm_dev_getcount to get total number of devices, e.g. N is returned,
- * valid devid should be in range of 0 ~ (N-1).\n
- *
- * \return
- * NULL - Creating failed that might be caused by incorrect parameter.\n
- * non-NULL - The handle of the context (creating succeeded).\n
- */
-bmcnn_ctx_t bmcnn_ctx_create_by_devid(const std::string &ctx_dir, int devid);
-/**
- * \brief Append context of BMCNN.
- *
- * \param ctx_dir - Directory of context files generated by BMNETC or BMNETD.
- * \param bmrt    - The created handle of context.
- *  
- * \return
- * false - Appending failed.\n
- * true  - Appending succeeded.\n
- */
-bool bmcnn_ctx_append(const std::string &ctx_dir, bmruntime *bmrt);
-
-} /* namespace bmcnn */
-
-#endif /* __BM_CNN_CONTEXT_H__ */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h b/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
deleted file mode 100644
index 88005e1b8..000000000
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef __BM_NET_H__
-#define __BM_NET_H__
-
-#include "bmblob.h"
-#include "bmcnnctx.h"
-#include <vector>
-#include <map>
-#include <string>
-
-#ifdef CROSS_COMPILE
-  #include <memory>
-#else
-  #include <boost/shared_ptr.hpp>
-#endif
-
-
-#ifdef CROSS_COMPILE
-#define NAMESPACE_USED  std
-#else
-#define NAMESPACE_USED  boost
-#endif
-
-namespace bmcnn {
-    
-class BMNet
-{
-public:
-    /**
-     * \brief Constructor of net.
-     *
-     * \param handle - Handler of BMCNN context (created by \ref bmcnn_ctx_create)
-     * \param name - Name of net
-     */
-    explicit BMNet(bmcnn_ctx_t handle, const std::string &name);
-    /**
-     * \brief Deconstructor of blob.
-     */
-    virtual ~BMNet();
-    /**
-     * \brief Reshape all layers from bottom to top.
-     */
-    void Reshape();
-    /**
-     * \brief Run forward.
-     * 
-     * \param sync - Flag of synchronizing.
-     */
-    void Forward(bool sync = false);
-    /**
-     * \brief Get blob by name.
-     *
-     * \param name - Name of blob 
-     * \note
-     * (1) The name could only be of blob in input or output.\n
-     * (2) If the name is not spotted, null pointer will be returned.\n
-     */
-    const NAMESPACE_USED::shared_ptr<BMBlob> blob_by_name(const std::string &name) const;
-    /**
-     * \brief Get maximum shape allowed.
-     */
-    inline const Shape &max_shape() const
-    { return max_shape_; }
-private:
-    BMNet(const BMNet &other);
-    BMNet &operator=(const BMNet &other);
-
-    bmcnn_ctx_t bmcc_ctx_;
-    std::vector<NAMESPACE_USED::shared_ptr<BMBlob> > blobs_;
-    std::vector<BMBlob *> net_input_blobs_;
-    std::vector<BMBlob *> net_output_blobs_;
-    std::string name_;
-    std::map<std::string, size_t> blob_name_index_;
-    Shape max_shape_;
-};
-
-} /* namespace bmcnn */
-
-#endif /* __BM_NET_H__ */

From 6f63d664878acd2f97e3d08d0cb30f46a4ecb619 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 19 Jun 2018 17:43:20 +0800
Subject: [PATCH 005/318] Put empty implementation for BM sync_mem for now

---
 saber/CMakeLists.txt        | 2 +-
 saber/core/target_wrapper.h | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index ac0ebba2a..82d9bcdab 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -100,7 +100,7 @@ endif()
 
 # add saber library to static
 if(UNIX OR APPLE) 
-    ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BM_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC})
+    ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC})
 							#$<TARGET_OBJECTS:ANAKIN_SABER_BASE_OBJS>) 
     if(USE_X86_PLACE)
 		message(STATUS ${ANAKIN_SABER_DEPENDENCIES})
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 6d5d6a8d1..7c6e2d2fb 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -398,16 +398,16 @@ struct TargetWrapper<BM, __device_target> {
     // brief create event, empty function for bitmain target
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoD);
+        size_t count, __DtoD) {};
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __HtoD);
+        size_t count, __HtoD) {};
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoH);
+        size_t count, __DtoH) {};
 
     static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-        int src_dev, size_t count);
+        int src_dev, size_t count) {};
 
     /**
      * \brief device target return currently used device id

From ea5a5be4c3dc55a598636f17eea2bb4502fe91f3 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 19 Jun 2018 18:03:13 +0800
Subject: [PATCH 006/318] Fix wrong input param

---
 test/saber/bm/test_TargetWrapper_BM.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index c6ee0811b..c54b392d1 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -7,8 +7,8 @@ using namespace anakin::saber;
 int main() {
     typedef TargetWrapper<BM> API;
     void *pmem;
-    int dev_count;
-    API::get_device_count(&dev_count);
+    int dev_count = 0;
+    API::get_device_count(dev_count);
     API::mem_alloc(&pmem, 3*200*200);
     API::mem_free(pmem);
 }

From cf0afb04b0302958c8d4e204b2e7f3c0a1966666 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 11:46:35 +0800
Subject: [PATCH 007/318] Fix param type issue

---
 saber/core/impl/bm/bm_impl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 3ff30773a..143fbec9a 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -55,7 +55,7 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t mem = bm_mem_from_system(ptr);
+    bm_device_mem_t mem = bm_mem_from_system(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
 }
         

From 5fed8fb96cbbb50d2c99c1cdc373b627d8021f46 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 14:17:06 +0800
Subject: [PATCH 008/318] Initialize BM handler

---
 saber/core/impl/bm/bm_impl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 143fbec9a..bee5ddab6 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,7 +37,7 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
-static bm_handle_t handle;
+static bm_handle_t handle = get_bm_handle();
 
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));

From a4dee6249dc498a79b3e13df03a5008e1abdd7c7 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 15:33:44 +0800
Subject: [PATCH 009/318] Add more unit test for tensor

---
 test/saber/bm/test_TargetWrapper_BM.cpp |  16 ---
 test/saber/bm/test_saber_tensor_BM.cpp  | 130 ++++++++++--------------
 2 files changed, 55 insertions(+), 91 deletions(-)
 delete mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp

diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
deleted file mode 100644
index c54b392d1..000000000
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "saber_types.h"
-#include "target_wrapper.h"
-#include <iostream>
-
-#ifdef USE_BM
-using namespace anakin::saber;
-int main() {
-    typedef TargetWrapper<BM> API;
-    void *pmem;
-    int dev_count = 0;
-    API::get_device_count(dev_count);
-    API::mem_alloc(&pmem, 3*200*200);
-    API::mem_free(pmem);
-}
-#endif
-
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index d9c65c7b4..0634d0a2d 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -5,8 +5,8 @@ using namespace anakin::saber;
 
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<X86, AK_BM, NCHW> TensorHf4;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {
@@ -25,7 +25,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) << "|--tensor size of device: " << tdev0.size();
     CHECK_EQ(thost0.size(), 256) << "error with tensor size";
     CHECK_EQ(tdev0.size(), 256) << "error with tensor size";
-/*
+
     //! test tensor re_alloc function on tensor with data
     LOG(INFO) << "|--test tensor re_alloc function on tensor with data";
     Shape sh1(1, 2, 4, 4);
@@ -60,7 +60,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     thost1.copy_from(thost0);
     tdev1.copy_from(thost0);
     print_tensor_device(tdev1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     thost1.copy_from(tdev1);
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
@@ -85,7 +85,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
         host_data_ptr[i] = i;
     }
 
-    NV_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
+    BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
     dev_data_ptr = static_cast<dtype*>(tmp_pt_dev);
     cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
     LOG(INFO) << "|--construct host tensor from host data ptr";
@@ -94,17 +94,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
     print_tensor_host(thost3);
     print_tensor_device(tdev3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     LOG(INFO) << "|--construct host tensor from device data ptr";
-    TensorHf4 thost4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1);
+    TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from device data ptr";
-    TensorDf4 tdev4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1);
+    TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     print_tensor_host(thost4);
     print_tensor_device(tdev4);
-    NV_API::stream_t dev_stream0;
-    NV_API::create_stream_with_flag(dev_stream0, 1);
-    cudaDeviceSynchronize();
+
+    //BM_API::stream_t dev_stream0;
+    //BM_API::create_stream_with_flag(dev_stream0, 1);
+    //cudaDeviceSynchronize();
 
     //! test tensor copy constructor
     LOG(INFO) << "test tensor copy constructor";
@@ -129,7 +130,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     vtdev.push_back(tdev5);
     print_tensor_host(vthost[5]);
     print_tensor_device(vtdev[5]);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied
     LOG(INFO) << "test share_from function";
@@ -190,30 +191,10 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
     print_tensor_host(thost4);
-
-    //! test record tensor event
-    LOG(INFO) << "test record tensor event";
-    NV_API::stream_t dev_stream;
-    NV_API::stream_t dev_stream1;
-    NV_API::create_stream_with_flag(dev_stream, 1);
-    NV_API::create_stream_with_flag(dev_stream1, 1);
-    X86_API::stream_t host_stream;
-    X86_API::create_stream_with_flag(host_stream, 1);
-    LOG(INFO) << "|--test record event on host tensor";
-    fill_tensor_host_const(thost4, 888.f);
-    thost4.record_event(host_stream);
-    thost4.sync();
-    print_tensor_host(thost4);
-    LOG(INFO) << "|--test record event on device tensor";
-    fill_tensor_device_const(tdev4, 666.f, dev_stream);
-    tdev4.record_event(dev_stream);
-    tdev4.sync();
-    print_tensor_device(tdev4, dev_stream1);
-    tdev4.record_event(dev_stream1);
-    tdev4.sync();
 }
 
-TEST(TestSaberTensorNV, test_tensor_deepcopy) {
+/*
+TEST(TestSaberTensorBM, test_tensor_deepcopy) {
     //! tensor constructor with alloc data, if target is different, create buffer, and copy the data
     LOG(INFO) << "test tensor deep copy";
     Shape sh0(2, 2, 4, 4);
@@ -229,9 +210,9 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     Shape off_sh2(0, 8);
 
     X86_API::stream_t x86_stream;
-    NV_API::stream_t nv_stream;
+    BM_API::stream_t nv_stream;
     X86_API::create_stream(x86_stream);
-    NV_API::create_stream(nv_stream);
+    BM_API::create_stream(nv_stream);
 
     //! create source tensor, th0, td0, th01, td01, th1, td1;
     TensorHf4 th0(sh0);
@@ -273,7 +254,7 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
 
     TensorDf2 td2(sh2);
     fill_tensor_device_const(td2, 0.f);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     TensorDf2 td21;
     td21.share_sub_buffer(td2, va_sh2, off_sh2);
     TensorDf2 td3(va_sh2);
@@ -308,11 +289,11 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, entire buffer copy, H2D";
     td3.copy_from(th1);
     print_tensor_device(td3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
     CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H";
     fill_tensor_device_const(td3, 0.f);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     td3.async_copy_from(th1, nv_stream);
     td3.record_event(nv_stream);
     td3.sync();
@@ -322,10 +303,10 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, entire buffer copy, D2D";
     td3.copy_from(td1);
     print_tensor_device(td3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2D";
     fill_tensor_device_const(td3, 0.f);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     td3.async_copy_from(td1, nv_stream);
     td3.record_event(nv_stream);
     td3.sync();
@@ -344,12 +325,12 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, src with roi, H2D";
     td3.copy_from(th01);
     print_tensor_device(td3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     LOG(INFO) << "test tensor deep copy, src with roi, D2D";
     td3.copy_from(td01);
     print_tensor_device(td3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
 
     //! test tensor deep copy, dst with roi
@@ -366,12 +347,12 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, dst with roi, H2D";
     td21.copy_from(th1);
     print_tensor_device(td21);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     LOG(INFO) << "test tensor deep copy, dst with roi, D2D";
     td21.copy_from(td1);
     print_tensor_device(td21);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
 
     //! test tensor deep copy, src and dst are with roi
@@ -386,18 +367,18 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2D";
     td21.copy_from(th01);
     print_tensor_device(td21);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2D";
     td21.copy_from(td01);
     print_tensor_device(td21);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 }
 
-TEST(TestSaberTensorNV, test_tensor_shape) {
-    typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4_0;
-    typedef Tensor<X86, AK_FLOAT, NHWC> Tensor4_1;
-    typedef Tensor<X86, AK_FLOAT, HW> Tensor2;
+TEST(TestSaberTensorBM, test_tensor_shape) {
+    typedef Tensor<X86, AK_BM, NCHW> Tensor4_0;
+    typedef Tensor<X86, AK_BM, NHWC> Tensor4_1;
+    typedef Tensor<X86, AK_BM, HW> Tensor2;
 
     int nin = 2;
     int cin = 2;
@@ -460,7 +441,7 @@ TEST(TestSaberTensorNV, test_tensor_shape) {
 
 }
 
-TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
+TEST(TestSaberTensorBM, test_tensor_reshape_realloc) {
 
     LOG(INFO) << "test tensor reshape and re_alloc funcs";
 
@@ -473,17 +454,17 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
     LOG(INFO) << "ori tensor with size: " << th0.valid_size();
     print_tensor_host(th0);
     print_tensor_device(td0);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th0.reshape(sh0);
     td0.reshape(sh0);
     LOG(INFO) << "tensor after reshape(from big space to small) with size: " << th0.valid_size();
     print_tensor_host(th0);
     print_tensor_device(td0);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     fill_tensor_host_const(th0, 1);
     fill_tensor_device_const(td0, 1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th0.reshape(sh1);
     td0.reshape(sh1);
@@ -491,24 +472,24 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
               th0.valid_size();
     print_tensor_host(th0);
     print_tensor_device(td0);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th0.re_alloc(sh0);
     td0.re_alloc(sh0);
     LOG(INFO) << "tensor after re_alloc(from big space to small) with size: " << th0.valid_size();
     print_tensor_host(th0);
     print_tensor_device(td0);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     TensorHf4 th1(sh0);
     TensorDf4 td1(sh0);
     LOG(INFO) << "ori tensor with size: " << th1.valid_size();
     fill_tensor_host_const(th1, 1);
     fill_tensor_device_const(td1, 1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     print_tensor_host(th1);
     print_tensor_device(td1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th1.reshape(sh1);
     td1.reshape(sh1);
@@ -518,10 +499,10 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
     th1.valid_shape()[0], th1.valid_shape()[1], th1.valid_shape()[2], th1.valid_shape()[3]);
     print_tensor_host(th1);
     print_tensor_device(td1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     fill_tensor_host_const(th1, 1);
     fill_tensor_device_const(td1, 1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th1.reshape(sh0);
     td1.reshape(sh0);
@@ -531,15 +512,15 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
     td1.re_alloc(sh1);
     print_tensor_host(th1);
     print_tensor_device(td1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
 }
 
-TEST(TestSaberTensorNV, test_tensor_op) {
+TEST(TestSaberTensorBM, test_tensor_op) {
     Shape sh{1, 2, 2, 10};
     TensorDf4 td1(sh);
     TensorHf4 th1(sh);
-    Tensor<NV, AK_INT8, NCHW> td2(sh);
+    Tensor<BM, AK_INT8, NCHW> td2(sh);
     Tensor<X86, AK_INT8, NCHW> th2(sh);
     LOG(INFO) << "testing host fill tensor with const 1.";
     fill_tensor_host_const(th1, 1.f);
@@ -590,11 +571,11 @@ TEST(TestSaberTensorNV, test_tensor_op) {
     print_tensor_device(td2);
 }
 
-TEST(TestSaberTensorNV, test_tensor_share_diff_dtype) {
+TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) {
     Shape sh{1, 1, 2, 10};
-    Tensor<NV, AK_FLOAT, NCHW> td1(sh);
-    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
-    Tensor<NV, AK_INT8, NCHW> td2;
+    Tensor<BM, AK_BM, NCHW> td1(sh);
+    Tensor<X86, AK_BM, NCHW> th1(sh);
+    Tensor<BM, AK_INT8, NCHW> td2;
     Tensor<X86, AK_INT8, NCHW> th2;
     td2.set_shape(sh);
     th2.set_shape(sh);
@@ -605,20 +586,20 @@ TEST(TestSaberTensorNV, test_tensor_share_diff_dtype) {
     fill_tensor_device_const(td1, -1);
     LOG(INFO) << "data type: int8";
     print_tensor_device(td1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     td2.share_from(td1);
     th2.share_from(th1);
 
     print_tensor_host(th2);
     print_tensor_device(td2);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 }
 
-TEST(TestSaberTensorNV, test_tensor_base_type) {
+TEST(TestSaberTensorBM, test_tensor_base_type) {
     Shape sh(1, 3, 10, 10);
-    Tensor<NV, AK_FLOAT, NCHW> td1(sh);
-    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
+    Tensor<BM, AK_BM, NCHW> td1(sh);
+    Tensor<X86, AK_BM, NCHW> th1(sh);
     fill_tensor_host_rand(th1, 0.f, 255.f);
     td1.copy_from(th1);
     TensorBase* tb1;
@@ -629,8 +610,7 @@ TEST(TestSaberTensorNV, test_tensor_base_type) {
     Shape sh11 = th1.valid_shape();
     LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \
               ", h=" << sh11[2] << ", w=" << sh11[3];
-*/
-}
+}*/
 
 int main(int argc, const char** argv) {
     // initial logger

From bdd8588fbb8053722169fda8b3bb145b05d7c761 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 16:01:18 +0800
Subject: [PATCH 010/318] Update Dtype for host

---
 test/saber/bm/test_saber_tensor_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 0634d0a2d..14f86c8b5 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -5,7 +5,7 @@ using namespace anakin::saber;
 
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
-typedef Tensor<X86, AK_BM, NCHW> TensorHf4;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
 typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 

From 4ca42d6a43f8e7659abe45979efd453c48b3cf35 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 16:38:08 +0800
Subject: [PATCH 011/318] Conversion from void* to bm_device_mem_t*

---
 saber/core/impl/bm/bm_impl.cpp         | 10 ++++++----
 test/saber/bm/test_saber_tensor_BM.cpp |  2 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index bee5ddab6..faca480f0 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -55,20 +55,22 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t mem = bm_mem_from_system(*ptr);
-    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
+    bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr)
+    BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        bm_free_device(handle, bm_mem_from_system(ptr));
+        bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr)
+        bm_free_device(handle, *pmem);
     }
 }
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
-    BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr)));
+    bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr)
+    BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
 //! target wrapper
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 14f86c8b5..af279797e 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,6 +55,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
+    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -191,6 +192,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
     print_tensor_host(thost4);
+     */
 }
 
 /*

From 78c978a95a5dd5afdec89c6cbbe49e826baa1f93 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 16:39:48 +0800
Subject: [PATCH 012/318] Convert from void* to bm_device_mem_t*

---
 saber/core/impl/bm/bm_impl.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index faca480f0..f2993426c 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -55,21 +55,21 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr)
+    bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr)
+        bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
         bm_free_device(handle, *pmem);
     }
 }
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
-    bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr)
+    bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
     BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 

From d99ce8a6bd3b0ea9d012f9aa9b844d7a7e6e1373 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 16:45:27 +0800
Subject: [PATCH 013/318] Revert back first

---
 saber/core/impl/bm/bm_impl.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index f2993426c..f432cc863 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -55,22 +55,26 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
-    BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
+    bm_device_mem_t mem = bm_mem_from_system(*ptr);
+    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
+    //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
+    //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
-        bm_free_device(handle, *pmem);
+        bm_free_device(handle, bm_mem_from_system(ptr));
+        //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
+        //bm_free_device(handle, *pmem);
     }
 }
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
-    bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
-    BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
+    BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr)));
+    //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
+    //BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
 //! target wrapper

From 5ea5263eaf7f103b975e710d74f38617227fd117 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 17:00:14 +0800
Subject: [PATCH 014/318] test

---
 saber/core/impl/bm/bm_impl.cpp         |  2 +-
 saber/saber_funcs_param.h              | 16 ++++++++--------
 test/saber/bm/test_saber_tensor_BM.cpp |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index f432cc863..baa25f484 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -84,7 +84,7 @@ template struct TargetWrapper<BM, __device_target>;
 template class Buffer<BM>;
 
 //! BM Tensor
-INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
+INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW);
 
 template struct Env<BM>;
 
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index 6a109540e..d648bf94b 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -539,14 +539,14 @@ struct ConvParam<Tensor<NV, AK_INT8, NCHW_C4> > {
 
 #ifdef USE_BM
 template <>
-struct ConvParam<Tensor<BM, AK_BM, NCHW> > {
+struct ConvParam<Tensor<BM, AK_FLOAT, NCHW> > {
     ConvParam() : group(-1), pad_h(-1), pad_w(-1),
                   stride_h(-1), stride_w(-1),
                   dilation_h(-1), dilation_w(-1),
                   weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){}
     ConvParam(int group_in, int pad_h_in, int pad_w_in,
               int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_,
-              Tensor<BM, AK_BM, NCHW>* weight, Tensor<BM, AK_BM, NCHW>* bias,
+              Tensor<BM, AK_FLOAT, NCHW>* weight, Tensor<BM, AK_FLOAT, NCHW>* bias,
               float alpha_in = 1.0, float beta_in = 0.0)
             : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in)
             , stride_h(stride_h_in), stride_w(stride_w_in)
@@ -592,16 +592,16 @@ struct ConvParam<Tensor<BM, AK_BM, NCHW> > {
         comp_eq = comp_eq && (beta == right.beta);
         return comp_eq;
     }
-    inline const Tensor<BM, AK_BM, NCHW>* weight() {
+    inline const Tensor<BM, AK_FLOAT, NCHW>* weight() {
         return weight_tensor;
     }
-    inline const Tensor<BM, AK_BM, NCHW>* bias() {
+    inline const Tensor<BM, AK_FLOAT, NCHW>* bias() {
         return bias_tensor;
     }
-    inline Tensor<BM, AK_BM, NCHW>* mutable_weight() {
+    inline Tensor<BM, AK_FLOAT, NCHW>* mutable_weight() {
         return weight_tensor;
     }
-    inline Tensor<BM, AK_BM, NCHW>* mutable_bias() {
+    inline Tensor<BM, AK_FLOAT, NCHW>* mutable_bias() {
         return bias_tensor;
     }
     int group;
@@ -614,8 +614,8 @@ struct ConvParam<Tensor<BM, AK_BM, NCHW> > {
     float alpha;
     float beta;
 private:
-    Tensor<BM, AK_BM, NCHW>* weight_tensor;
-    Tensor<BM, AK_BM, NCHW>* bias_tensor;
+    Tensor<BM, AK_FLOAT, NCHW>* weight_tensor;
+    Tensor<BM, AK_FLOAT, NCHW>* bias_tensor;
 };
 #endif //USE_BM
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index af279797e..ce0bad95a 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -6,7 +6,7 @@ using namespace anakin::saber;
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {
@@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
-    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -66,6 +65,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
+    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From 1cc471ee9f0845ac0e59f422ebc7622338ae9947 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 17:10:11 +0800
Subject: [PATCH 015/318] Revert "test"

This reverts commit 5ea5263eaf7f103b975e710d74f38617227fd117.
---
 saber/core/impl/bm/bm_impl.cpp         |  2 +-
 saber/saber_funcs_param.h              | 16 ++++++++--------
 test/saber/bm/test_saber_tensor_BM.cpp |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index baa25f484..f432cc863 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -84,7 +84,7 @@ template struct TargetWrapper<BM, __device_target>;
 template class Buffer<BM>;
 
 //! BM Tensor
-INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW);
+INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
 
 template struct Env<BM>;
 
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index d648bf94b..6a109540e 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -539,14 +539,14 @@ struct ConvParam<Tensor<NV, AK_INT8, NCHW_C4> > {
 
 #ifdef USE_BM
 template <>
-struct ConvParam<Tensor<BM, AK_FLOAT, NCHW> > {
+struct ConvParam<Tensor<BM, AK_BM, NCHW> > {
     ConvParam() : group(-1), pad_h(-1), pad_w(-1),
                   stride_h(-1), stride_w(-1),
                   dilation_h(-1), dilation_w(-1),
                   weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){}
     ConvParam(int group_in, int pad_h_in, int pad_w_in,
               int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_,
-              Tensor<BM, AK_FLOAT, NCHW>* weight, Tensor<BM, AK_FLOAT, NCHW>* bias,
+              Tensor<BM, AK_BM, NCHW>* weight, Tensor<BM, AK_BM, NCHW>* bias,
               float alpha_in = 1.0, float beta_in = 0.0)
             : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in)
             , stride_h(stride_h_in), stride_w(stride_w_in)
@@ -592,16 +592,16 @@ struct ConvParam<Tensor<BM, AK_FLOAT, NCHW> > {
         comp_eq = comp_eq && (beta == right.beta);
         return comp_eq;
     }
-    inline const Tensor<BM, AK_FLOAT, NCHW>* weight() {
+    inline const Tensor<BM, AK_BM, NCHW>* weight() {
         return weight_tensor;
     }
-    inline const Tensor<BM, AK_FLOAT, NCHW>* bias() {
+    inline const Tensor<BM, AK_BM, NCHW>* bias() {
         return bias_tensor;
     }
-    inline Tensor<BM, AK_FLOAT, NCHW>* mutable_weight() {
+    inline Tensor<BM, AK_BM, NCHW>* mutable_weight() {
         return weight_tensor;
     }
-    inline Tensor<BM, AK_FLOAT, NCHW>* mutable_bias() {
+    inline Tensor<BM, AK_BM, NCHW>* mutable_bias() {
         return bias_tensor;
     }
     int group;
@@ -614,8 +614,8 @@ struct ConvParam<Tensor<BM, AK_FLOAT, NCHW> > {
     float alpha;
     float beta;
 private:
-    Tensor<BM, AK_FLOAT, NCHW>* weight_tensor;
-    Tensor<BM, AK_FLOAT, NCHW>* bias_tensor;
+    Tensor<BM, AK_BM, NCHW>* weight_tensor;
+    Tensor<BM, AK_BM, NCHW>* bias_tensor;
 };
 #endif //USE_BM
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index ce0bad95a..af279797e 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -6,7 +6,7 @@ using namespace anakin::saber;
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {
@@ -55,6 +55,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
+    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -65,7 +66,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
-    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From fa2e41ba4df5bfb4f8d739d94230bb709e6aaa18 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 17:14:21 +0800
Subject: [PATCH 016/318] Debug on copy_from

---
 test/saber/bm/test_saber_tensor_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index af279797e..13b9deff1 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
-    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -66,6 +65,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
+    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From 75f5063122cdcae1045b78f7d29055ca6b058e42 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 17:40:20 +0800
Subject: [PATCH 017/318] Revert "Revert "test""

This reverts commit 1cc471ee9f0845ac0e59f422ebc7622338ae9947.
---
 saber/core/impl/bm/bm_impl.cpp         |  2 +-
 saber/saber_funcs_param.h              | 16 ++++++++--------
 test/saber/bm/test_saber_tensor_BM.cpp |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index f432cc863..baa25f484 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -84,7 +84,7 @@ template struct TargetWrapper<BM, __device_target>;
 template class Buffer<BM>;
 
 //! BM Tensor
-INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
+INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW);
 
 template struct Env<BM>;
 
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index 6a109540e..d648bf94b 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -539,14 +539,14 @@ struct ConvParam<Tensor<NV, AK_INT8, NCHW_C4> > {
 
 #ifdef USE_BM
 template <>
-struct ConvParam<Tensor<BM, AK_BM, NCHW> > {
+struct ConvParam<Tensor<BM, AK_FLOAT, NCHW> > {
     ConvParam() : group(-1), pad_h(-1), pad_w(-1),
                   stride_h(-1), stride_w(-1),
                   dilation_h(-1), dilation_w(-1),
                   weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){}
     ConvParam(int group_in, int pad_h_in, int pad_w_in,
               int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_,
-              Tensor<BM, AK_BM, NCHW>* weight, Tensor<BM, AK_BM, NCHW>* bias,
+              Tensor<BM, AK_FLOAT, NCHW>* weight, Tensor<BM, AK_FLOAT, NCHW>* bias,
               float alpha_in = 1.0, float beta_in = 0.0)
             : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in)
             , stride_h(stride_h_in), stride_w(stride_w_in)
@@ -592,16 +592,16 @@ struct ConvParam<Tensor<BM, AK_BM, NCHW> > {
         comp_eq = comp_eq && (beta == right.beta);
         return comp_eq;
     }
-    inline const Tensor<BM, AK_BM, NCHW>* weight() {
+    inline const Tensor<BM, AK_FLOAT, NCHW>* weight() {
         return weight_tensor;
     }
-    inline const Tensor<BM, AK_BM, NCHW>* bias() {
+    inline const Tensor<BM, AK_FLOAT, NCHW>* bias() {
         return bias_tensor;
     }
-    inline Tensor<BM, AK_BM, NCHW>* mutable_weight() {
+    inline Tensor<BM, AK_FLOAT, NCHW>* mutable_weight() {
         return weight_tensor;
     }
-    inline Tensor<BM, AK_BM, NCHW>* mutable_bias() {
+    inline Tensor<BM, AK_FLOAT, NCHW>* mutable_bias() {
         return bias_tensor;
     }
     int group;
@@ -614,8 +614,8 @@ struct ConvParam<Tensor<BM, AK_BM, NCHW> > {
     float alpha;
     float beta;
 private:
-    Tensor<BM, AK_BM, NCHW>* weight_tensor;
-    Tensor<BM, AK_BM, NCHW>* bias_tensor;
+    Tensor<BM, AK_FLOAT, NCHW>* weight_tensor;
+    Tensor<BM, AK_FLOAT, NCHW>* bias_tensor;
 };
 #endif //USE_BM
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 13b9deff1..ce0bad95a 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -6,7 +6,7 @@ using namespace anakin::saber;
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {

From 35f96827ec2898616e7d191552a1f1a6e5ab8b1e Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 09:44:59 +0800
Subject: [PATCH 018/318] Print tensor for BM

---
 test/saber/bm/test_saber_tensor_BM.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index ce0bad95a..cc2adc774 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -59,8 +59,8 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
     tdev1.copy_from(thost0);
-    print_tensor_device(tdev1);
-    //cudaDeviceSynchronize();
+    //TODO: print tensor for BM device
+    print_tensor_host(tdev1);
     thost1.copy_from(tdev1);
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);

From c32f16d678979042d44056cd297f15138aab93a7 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 09:45:35 +0800
Subject: [PATCH 019/318] Revert "Revert "Revert "test"""

This reverts commit 75f5063122cdcae1045b78f7d29055ca6b058e42.
---
 saber/core/impl/bm/bm_impl.cpp         |  2 +-
 saber/saber_funcs_param.h              | 16 ++++++++--------
 test/saber/bm/test_saber_tensor_BM.cpp |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index baa25f484..f432cc863 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -84,7 +84,7 @@ template struct TargetWrapper<BM, __device_target>;
 template class Buffer<BM>;
 
 //! BM Tensor
-INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW);
+INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
 
 template struct Env<BM>;
 
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index d648bf94b..6a109540e 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -539,14 +539,14 @@ struct ConvParam<Tensor<NV, AK_INT8, NCHW_C4> > {
 
 #ifdef USE_BM
 template <>
-struct ConvParam<Tensor<BM, AK_FLOAT, NCHW> > {
+struct ConvParam<Tensor<BM, AK_BM, NCHW> > {
     ConvParam() : group(-1), pad_h(-1), pad_w(-1),
                   stride_h(-1), stride_w(-1),
                   dilation_h(-1), dilation_w(-1),
                   weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){}
     ConvParam(int group_in, int pad_h_in, int pad_w_in,
               int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_,
-              Tensor<BM, AK_FLOAT, NCHW>* weight, Tensor<BM, AK_FLOAT, NCHW>* bias,
+              Tensor<BM, AK_BM, NCHW>* weight, Tensor<BM, AK_BM, NCHW>* bias,
               float alpha_in = 1.0, float beta_in = 0.0)
             : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in)
             , stride_h(stride_h_in), stride_w(stride_w_in)
@@ -592,16 +592,16 @@ struct ConvParam<Tensor<BM, AK_FLOAT, NCHW> > {
         comp_eq = comp_eq && (beta == right.beta);
         return comp_eq;
     }
-    inline const Tensor<BM, AK_FLOAT, NCHW>* weight() {
+    inline const Tensor<BM, AK_BM, NCHW>* weight() {
         return weight_tensor;
     }
-    inline const Tensor<BM, AK_FLOAT, NCHW>* bias() {
+    inline const Tensor<BM, AK_BM, NCHW>* bias() {
         return bias_tensor;
     }
-    inline Tensor<BM, AK_FLOAT, NCHW>* mutable_weight() {
+    inline Tensor<BM, AK_BM, NCHW>* mutable_weight() {
         return weight_tensor;
     }
-    inline Tensor<BM, AK_FLOAT, NCHW>* mutable_bias() {
+    inline Tensor<BM, AK_BM, NCHW>* mutable_bias() {
         return bias_tensor;
     }
     int group;
@@ -614,8 +614,8 @@ struct ConvParam<Tensor<BM, AK_FLOAT, NCHW> > {
     float alpha;
     float beta;
 private:
-    Tensor<BM, AK_FLOAT, NCHW>* weight_tensor;
-    Tensor<BM, AK_FLOAT, NCHW>* bias_tensor;
+    Tensor<BM, AK_BM, NCHW>* weight_tensor;
+    Tensor<BM, AK_BM, NCHW>* bias_tensor;
 };
 #endif //USE_BM
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index cc2adc774..db0edce6d 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -6,7 +6,7 @@ using namespace anakin::saber;
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {

From 65a1bbfdf0adc6c0fc4ff809fc75022af0881ccb Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 10:00:22 +0800
Subject: [PATCH 020/318] Passing through BM handler

---
 saber/core/context.h           | 12 ++++++++++++
 saber/core/impl/bm/bm_impl.cpp |  5 +++++
 saber/core/target_wrapper.h    |  2 ++
 3 files changed, 19 insertions(+)

diff --git a/saber/core/context.h b/saber/core/context.h
index 847f91e81..15ec2e0b6 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -18,6 +18,12 @@
 #include "core/env.h"
 #include "saber/saber_types.h"
 
+#ifdef USE_BM
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+#endif
+
 namespace anakin{
 
 namespace saber{
@@ -105,6 +111,12 @@ class Context final{
         return _stream_compute;
     }
 
+#ifdef USE_BM
+    bm_handle_t get_handler() {
+        return API::get_handler();
+    }
+#endif
+
 #ifdef USE_ARM_PLACE
     void set_power_mode(PowerMode mode);
     void set_act_cores(std::vector<int> ids);
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index f432cc863..ecfe755d6 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,8 +37,13 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
+//TODO: check exception
 static bm_handle_t handle = get_bm_handle();
 
+bm_handle_t BM_API::get_handler() {
+    return handle;
+}
+
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
 }
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 7c6e2d2fb..e724235d8 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -414,6 +414,8 @@ struct TargetWrapper<BM, __device_target> {
      * @return          currently activated device id
      */
     static int get_device_id();
+
+    static bm_handle_t get_handler();
 };
 
 #endif //USE_BM

From 50aca5f8267b683df4a1a51a7a344da634296ccd Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 11:20:14 +0800
Subject: [PATCH 021/318] Implement copy_from for BM; Add back
 test_TargetWrapper_BM

---
 saber/core/tensor.h                     | 17 +++++++++++++++++
 test/saber/bm/test_TargetWrapper_BM.cpp | 16 ++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 33e655752..3ac4ae7a9 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -20,6 +20,12 @@
 #include "core/events.h"
 #include "core/tensor_traits.h"
 
+#ifdef USE_BM
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+#endif
+
 namespace anakin{
 
 namespace saber{
@@ -570,6 +576,17 @@ class Tensor : public TensorBase {
         return SaberSuccess;
     }
 
+#ifdef USE_BM
+    SaberStatus copy_from(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+        CHECK_EQ(valid_size(), tensor.valid_size()) \
+            << "sizes of two valid shapes must be the same";
+
+        BMDNN_CHECK(m_memcpy_s2d(API::get_handler(), mutable_data(), bm_mem_from_system(tensor.data())));
+
+        return SaberSuccess;
+    }
+#endif
+
     /**
      *  \brief Deep copy data within region of interest from input tensor.
      */
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
new file mode 100644
index 000000000..c54b392d1
--- /dev/null
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -0,0 +1,16 @@
+#include "saber_types.h"
+#include "target_wrapper.h"
+#include <iostream>
+
+#ifdef USE_BM
+using namespace anakin::saber;
+int main() {
+    typedef TargetWrapper<BM> API;
+    void *pmem;
+    int dev_count = 0;
+    API::get_device_count(dev_count);
+    API::mem_alloc(&pmem, 3*200*200);
+    API::mem_free(pmem);
+}
+#endif
+

From 65734d5dfdec189599aa607476945d524474be08 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 11:35:11 +0800
Subject: [PATCH 022/318] check tensor target type

---
 saber/core/tensor.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 3ac4ae7a9..7fc829555 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -578,6 +578,9 @@ class Tensor : public TensorBase {
 
 #ifdef USE_BM
     SaberStatus copy_from(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+        CHECK_EQ(typeof(BM), typeof(targetType_t)) \
+            << "this method is only for BM tensor";
+
         CHECK_EQ(valid_size(), tensor.valid_size()) \
             << "sizes of two valid shapes must be the same";
 

From 6ebd0287d6be3e9ac41b0d6c8c333771b958124f Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 11:39:13 +0800
Subject: [PATCH 023/318] Change back to compliable version

---
 saber/core/tensor.h                    | 14 --------------
 test/saber/bm/test_saber_tensor_BM.cpp |  2 +-
 2 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 7fc829555..e543f7197 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -576,20 +576,6 @@ class Tensor : public TensorBase {
         return SaberSuccess;
     }
 
-#ifdef USE_BM
-    SaberStatus copy_from(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
-        CHECK_EQ(typeof(BM), typeof(targetType_t)) \
-            << "this method is only for BM tensor";
-
-        CHECK_EQ(valid_size(), tensor.valid_size()) \
-            << "sizes of two valid shapes must be the same";
-
-        BMDNN_CHECK(m_memcpy_s2d(API::get_handler(), mutable_data(), bm_mem_from_system(tensor.data())));
-
-        return SaberSuccess;
-    }
-#endif
-
     /**
      *  \brief Deep copy data within region of interest from input tensor.
      */
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index db0edce6d..8aead4bb1 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,6 +55,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
+    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -65,7 +66,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
-    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From b1aa39dcfc107d06fa1f150cfcda30ffbbe40bdf Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Thu, 21 Jun 2018 15:38:08 +0800
Subject: [PATCH 024/318] modify activation op and test

---
 saber/funcs/impl/bm/vender_activation.h       | 38 +++-----
 saber/funcs/impl/bm/vender_fc.h               | 46 ++-------
 .../bm/test_saber_func_activation_BM.cpp      | 97 +------------------
 3 files changed, 21 insertions(+), 160 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
index 45541add9..fadd817b9 100644
--- a/saber/funcs/impl/bm/vender_activation.h
+++ b/saber/funcs/impl/bm/vender_activation.h
@@ -27,17 +27,9 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderActivation()
-            : _handle(NULL), _active_descs(NULL), _input_descs(NULL), _output_descs(NULL) {}
+    VenderActivation(): _handle(NULL), _active_type(NULL) {}
 
-    ~VenderActivation() {
-        if (_input_descs) {
-            BMDNN_CHECK(bm_free_device(_input_descs));
-        }
-        if (_output_descs) {
-            BMDNN_CHECK(bm_free_device(_output_descs));
-        }
-    }
+    ~VenderActivation() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
@@ -64,33 +56,29 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
         int input_n = inputs[0]->num();
 
         switch (_active_type) {
-            case Active_sigmoid:
-                BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
-                break;
             case Active_relu:
-                BMDNN_CHECK(bmdnn_relu_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, input_n, input_dim, out_data));
+                break;
+            case Active_sigmoid:
+                BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data));
                 break;
             case Active_tanh:
-                BMDNN_CHECK(bmdnn_tanh_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                BMDNN_CHECK(bmdnn_tanh_forward(_handle, in_data, input_n, input_dim, out_data));
+                break;
+            case Active_elu:
+                BMDNN_CHECK(bmdnn_elu_forward(_handle, 1.0, in_data, input_n, input_dim, out_data));
                 break;
         }
-        /* BMDNN_CHECK(cudnnActivationForward(_handle, _active_descs, */
-        /*                                    cudnn::cudnnTypeWrapper<InDataType>::kOne(), */
-        /*                                    _input_descs, in_data, */
-        /*                                    cudnn::cudnnTypeWrapper<InDataType>::kZero(), */
-        /*                                    _output_descs, out_data */
-        /* )); */
         return SaberSuccess;
     }
 
 private:
     bm_handle_t _handle;
-    bm_device_mem_t _input_descs;
-    bm_device_mem_t _output_descs;
     ActiveType _active_type;
 };
+
 template class VenderActivation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
-}
-}
+} // namespace saber
 
+} // namespace anakin
 #endif //ANAKIN_SABER_FUNCS_BMDNN_ACT_H
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 5c7c23e67..3b018686c 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -1,20 +1,5 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
-#define ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+#ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H
+#define ANAKIN_SABER_FUNCS_BMDNN_FC_H
 
 #include "saber/funcs/impl/impl_fc.h"
 
@@ -43,23 +28,12 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderFc() = default;
-    ~VenderFc() {
-        if (_handle != nullptr) {
-            CUBLAS_CHECK(cublasDestroy(_handle));
-        }
-    }
+    VenderFc() {};
+    ~VenderFc() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
-        // get context
-        this->_ctx = ctx;
-        cudaStream_t cuda_stream;
-        cuda_stream = ctx.get_compute_stream();
-
-        CUBLAS_CHECK(cublasCreate(&_handle));
-        CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream));
         return create(inputs, outputs, param, ctx);
     }
 
@@ -94,16 +68,10 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
-                            FcParam<OpTensor>& param);
+                            FcParam<OpTensor>& param){
 
+    };
 
-private:
-    bool _flag_trans_weights{false};
-    int _M;
-    int _K;
-    int _N;
-    cublasHandle_t _handle;
-    bool _is_continue_buf{true};
 };
 
 template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
@@ -111,4 +79,4 @@ template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
 
 } //namespace anakin
 
-#endif //ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+#endif // ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp
index 5d30a6d64..523e94121 100644
--- a/test/saber/bm/test_saber_func_activation_BM.cpp
+++ b/test/saber/bm/test_saber_func_activation_BM.cpp
@@ -58,7 +58,7 @@ TEST(TestSaberFuncBM, test_func_constructor) {
 
     Context<BM> ctx1(0, 1, 1);
 
-    ActivationParam<TensorDf4> param(Active_elu, 0.1f, 0.1f);
+    ActivationParam<TensorDf4> param(Active_relu, 0.1f, 0.1f);
 
     std::vector<TensorDf4*> input;
     std::vector<TensorDf4*> output;
@@ -74,102 +74,7 @@ TEST(TestSaberFuncBM, test_func_constructor) {
     act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
     act(input, output, param, ctx1);
 
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-    output_dev.sync();
     print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_POST_KERNEL_CHECK;
-}
-
-TEST(TestSaberFuncBM, test_func_sub_tensor) {
-
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1);
-    }
-
-    img_dev.copy_from(img_host);
-    Shape img_s_t0(img_num, in_channels, 4, 4);
-
-    TensorDf4 t0;
-    TensorDf4 t1;
-
-    t0.share_sub_buffer(img_dev, img_s_t0, {0, 0, 0, 0});
-    t1.share_sub_buffer(img_dev, img_s_t0, {0, 0, 4, 4});
-
-    print_tensor_shape("t0", t0);
-    print_tensor_shape("t1", t1);
-
-    TensorDf4 output_dev;
-
-    TensorDf4 out0;
-    TensorDf4 out1;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-    Context<BM> ctx2(0, 2, 2);
-
-    ActivationParam<TensorDf4> param1(Active_elu, 0.1f, 0.1f);
-    ActivationParam<TensorDf4> param2(Active_elu, 0.1f, 0.1f);
-
-    std::vector<TensorDf4*> input1, input2;
-    std::vector<TensorDf4*> output1, output2;
-
-    input1.push_back(&t0);
-    input2.push_back(&t1);
-
-    output1.push_back(&out0);
-    output2.push_back(&out1);
-
-    //FIXME where do I get img_s and all those shapes ????
-    output_dev.re_alloc(img_s);
-
-    out0.share_sub_buffer(output_dev, img_s_t0, {0, 0, 0, 0});
-    out1.share_sub_buffer(output_dev, img_s_t0, {0, 0, 4, 4});
-
-    print_tensor_shape("output_dev", output_dev);
-
-    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act1;
-    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act2;
-
-    act1.compute_output_shape(output1, input1, param1);
-    act2.compute_output_shape(output2, input2, param2);
-
-    print_tensor_shape("out0", out0);
-    print_tensor_shape("out1", out1);
-
-    // init assume output tensor has been reshpaed by user.
-    act1.init(input1, output1, param1, SPECIFY, SABER_IMPL, ctx1);
-    act1(input1, output1, param1, ctx1);
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output1[0]->record_event(cuda_stream);
-
-    act2.init(input2, output2, param2, SPECIFY, SABER_IMPL, ctx2);
-    act2(input2, output2, param2, ctx2);
-    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
-    output2[0]->record_event(cuda_stream2);
-
-    out0.sync();
-    out1.sync();
-    print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_POST_KERNEL_CHECK;
 }
 
 int main(int argc, const char** argv) {

From 250451cde19e1d677b2957c9c1ba0166d7cf0893 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 18:31:38 +0800
Subject: [PATCH 025/318] Enable copy from tensor with different Dtype

---
 saber/core/data_traits.h               | 11 +++++++++++
 saber/core/tensor.h                    | 16 ++++++++++++++++
 test/saber/bm/test_saber_tensor_BM.cpp |  6 ++++--
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h
index 0bb732aba..64de4af9f 100644
--- a/saber/core/data_traits.h
+++ b/saber/core/data_traits.h
@@ -17,6 +17,12 @@
 
 #include "saber_types.h"
 
+#ifdef USE_BM
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+#endif
+
 namespace anakin{
 
 namespace saber{
@@ -76,6 +82,11 @@ struct DataTrait<AK_UINT32> {
     typedef unsigned int dtype;
 };
 
+template <>
+struct DataTrait<AK_BM> {
+    typedef bm_device_mem_t dtype;
+};
+
 } //namespace saber
 
 } //namespace anakin
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index e543f7197..d24287c44 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -21,6 +21,7 @@
 #include "core/tensor_traits.h"
 
 #ifdef USE_BM
+#include <typeinfo>
 #include "bmlib_runtime.h"
 #include "bmdnn_api.h"
 #include "bmlib_utils.h"
@@ -728,6 +729,21 @@ class Tensor : public TensorBase {
         return SaberSuccess;
     }
 
+#ifdef USE_BM
+    template <typename TargetType_t, DataType DataType_t, typename LayOutType_t>
+    SaberStatus copy_from(const Tensor<TargetType_t, DataType_t, LayOutType_t>& tensor) {
+        if (typeid(BM) == typeid(targetType_t) &&
+            typeid(X86) == typeid(TargetType_t) &&
+            typeid(AK_FLOAT) == typeid(DataType_t)){
+
+            Dtype* device_data_ptr = mutable_data();
+            BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
+        }
+
+        return SaberSuccess;
+    };
+#endif
+
     /**
      * \brief Asynchronously copy entire buffer from source tensor.
      */
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 8aead4bb1..83eb472b7 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,11 +55,13 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
-    /*
+
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
-    thost1.copy_from(thost0);
+    //thost1.copy_from(thost0);
     tdev1.copy_from(thost0);
+
+    /*
     //TODO: print tensor for BM device
     print_tensor_host(tdev1);
     thost1.copy_from(tdev1);

From 435ca51e18f046abf4708a28f18f6de7ac627e0f Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Fri, 22 Jun 2018 09:25:03 +0800
Subject: [PATCH 026/318] Complete copy_from method

---
 saber/core/tensor.h | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index d24287c44..b0e22ec20 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -732,15 +732,36 @@ class Tensor : public TensorBase {
 #ifdef USE_BM
     template <typename TargetType_t, DataType DataType_t, typename LayOutType_t>
     SaberStatus copy_from(const Tensor<TargetType_t, DataType_t, LayOutType_t>& tensor) {
+
+        CHECK_EQ(valid_size(), tensor.valid_size()) \
+            << "sizes of two valid shapes must be the same";
+
+        /// copy from system to device
         if (typeid(BM) == typeid(targetType_t) &&
+            typeid(AK_BM) == typeid(datatype) &&
             typeid(X86) == typeid(TargetType_t) &&
             typeid(AK_FLOAT) == typeid(DataType_t)){
 
             Dtype* device_data_ptr = mutable_data();
             BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
+
+            return SaberSuccess;
         }
 
-        return SaberSuccess;
+        /// copy from device to system
+        if (typeid(X86) == typeid(targetType_t) &&
+            typeid(AK_FLOAT) == typeid(datatype) &&
+            typeid(BM) == typeid(TargetType_t) &&
+            typeid(AK_BM) == typeid(DataType_t)){
+
+            Dtype* device_data_ptr = tensor.data();
+            BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
+
+            return SaberSuccess;
+        }
+
+        /// other types are not allowed here
+        return SaberInvalidValue;
     };
 #endif
 

From be04e3b2a541d22aab7b4f93a6b843b13af70e3f Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Fri, 22 Jun 2018 09:41:21 +0800
Subject: [PATCH 027/318] const_cast the immutable target data pointer

---
 saber/core/tensor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index b0e22ec20..d43cedef1 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -754,7 +754,7 @@ class Tensor : public TensorBase {
             typeid(BM) == typeid(TargetType_t) &&
             typeid(AK_BM) == typeid(DataType_t)){
 
-            Dtype* device_data_ptr = tensor.data();
+            Dtype* device_data_ptr = const_cast<Dtype*>(tensor.data());
             BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
 
             return SaberSuccess;

From 7cc4c5781c0375a23cc6dc354235e591c6885812 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Fri, 22 Jun 2018 10:06:33 +0800
Subject: [PATCH 028/318] Revert back to compilable version

---
 saber/core/tensor.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index d43cedef1..29ec1d006 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -749,16 +749,16 @@ class Tensor : public TensorBase {
         }
 
         /// copy from device to system
-        if (typeid(X86) == typeid(targetType_t) &&
+        /*if (typeid(X86) == typeid(targetType_t) &&
             typeid(AK_FLOAT) == typeid(datatype) &&
             typeid(BM) == typeid(TargetType_t) &&
             typeid(AK_BM) == typeid(DataType_t)){
 
-            Dtype* device_data_ptr = const_cast<Dtype*>(tensor.data());
+            auto* device_data_ptr = tensor.data();
             BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
 
             return SaberSuccess;
-        }
+        }*/
 
         /// other types are not allowed here
         return SaberInvalidValue;

From 59f2c69d569990d7761e2e0ffdb37f2131ab820c Mon Sep 17 00:00:00 2001
From: root <weihao.huang@bitmain.com>
Date: Fri, 22 Jun 2018 02:43:01 +0000
Subject: [PATCH 029/318] Modify handle usage & mem_alloc function

---
 saber/core/impl/bm/bm_impl.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index ecfe755d6..6088b3af6 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,12 +37,17 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
+<<<<<<< HEAD
 //TODO: check exception
 static bm_handle_t handle = get_bm_handle();
 
 bm_handle_t BM_API::get_handler() {
     return handle;
 }
+=======
+//static bm_handle_t handle = get_bm_handle();
+static bm_handle_t handle;
+>>>>>>> Modify handle usage & mem_alloc function
 
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
@@ -60,18 +65,31 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
+<<<<<<< HEAD
     bm_device_mem_t mem = bm_mem_from_system(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
     //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
     //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
+=======
+    //bm_device_mem_t mem = bm_mem_from_system(*ptr);
+    handle = get_bm_handle();
+    bm_device_mem_t *mem = new bm_device_mem_t[1];
+    mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
+    BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
+>>>>>>> Modify handle usage & mem_alloc function
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
         bm_free_device(handle, bm_mem_from_system(ptr));
+<<<<<<< HEAD
         //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
         //bm_free_device(handle, *pmem);
+=======
+        //handle = get_bm_handle();
+	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(*ptr));
+>>>>>>> Modify handle usage & mem_alloc function
     }
 }
         

From 428306bf0d42987f8031c0eb2f675f54bc99d7c8 Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Fri, 22 Jun 2018 04:01:48 +0000
Subject: [PATCH 030/318] Modify handle usage & mem_alloc function

---
 saber/core/impl/bm/bm_impl.cpp | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 6088b3af6..5ad6af84e 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,17 +37,13 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
-<<<<<<< HEAD
 //TODO: check exception
-static bm_handle_t handle = get_bm_handle();
+//static bm_handle_t handle = get_bm_handle();
+static bm_handle_t handle;
 
 bm_handle_t BM_API::get_handler() {
     return handle;
 }
-=======
-//static bm_handle_t handle = get_bm_handle();
-static bm_handle_t handle;
->>>>>>> Modify handle usage & mem_alloc function
 
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
@@ -65,31 +61,25 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-<<<<<<< HEAD
     bm_device_mem_t mem = bm_mem_from_system(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
     //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
     //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
-=======
     //bm_device_mem_t mem = bm_mem_from_system(*ptr);
     handle = get_bm_handle();
     bm_device_mem_t *mem = new bm_device_mem_t[1];
     mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
->>>>>>> Modify handle usage & mem_alloc function
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
         bm_free_device(handle, bm_mem_from_system(ptr));
-<<<<<<< HEAD
         //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
         //bm_free_device(handle, *pmem);
-=======
         //handle = get_bm_handle();
 	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(*ptr));
->>>>>>> Modify handle usage & mem_alloc function
     }
 }
         

From 1fddf4e36cabce6398d6a78906a59872f958b448 Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Fri, 22 Jun 2018 05:31:05 +0000
Subject: [PATCH 031/318] Modify test_TargetWrapper

---
 saber/core/impl/bm/bm_impl.cpp          | 2 --
 test/saber/bm/test_TargetWrapper_BM.cpp | 9 ++++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 5ad6af84e..4aecb169d 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -61,8 +61,6 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t mem = bm_mem_from_system(*ptr);
-    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
     //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
     //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
     //bm_device_mem_t mem = bm_mem_from_system(*ptr);
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index c54b392d1..a76bef279 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -4,13 +4,20 @@
 
 #ifdef USE_BM
 using namespace anakin::saber;
+static bm_handle_t handle;
 int main() {
+    bmdnn_init(&handle);
     typedef TargetWrapper<BM> API;
     void *pmem;
     int dev_count = 0;
     API::get_device_count(dev_count);
+    std::cout << dev_count << std::endl;
     API::mem_alloc(&pmem, 3*200*200);
-    API::mem_free(pmem);
+    //API::mem_free(pmem);
+    std::cout << "Press any key to finish execution." << std::endl;
+    int a;
+    std::cin >> a;
+    bmdnn_deinit(handle);
 }
 #endif
 

From e32c50a08175f59730dcca6bc2ee26848bf730bb Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Fri, 22 Jun 2018 13:52:18 +0800
Subject: [PATCH 032/318] fill activation and fc op; compile error

---
 saber/funcs/impl/bm/vender_activation.h |  1 -
 saber/funcs/impl/bm/vender_fc.h         | 42 ++++++----------
 saber/funcs/timer.h                     | 66 +++++++++++++++++++++++++
 test/saber/bm/test_saber_buffer_BM.cpp  |  2 +-
 test/saber/bm/test_saber_func_fc_BM.cpp |  6 +--
 5 files changed, 85 insertions(+), 32 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
index fadd817b9..c4baf8365 100644
--- a/saber/funcs/impl/bm/vender_activation.h
+++ b/saber/funcs/impl/bm/vender_activation.h
@@ -49,7 +49,6 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             ActivationParam<OpTensor>& param) {
-
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
         int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width();
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 3b018686c..82dd6000c 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -28,7 +28,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderFc() {};
+    VenderFc(): _handle(NULL) {};
     ~VenderFc() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
@@ -40,38 +40,28 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
-
-        if (!(ctx == this->_ctx)) {
-            if (_handle != NULL) {
-                CUBLAS_CHECK(cublasDestroy(_handle));
-            }
-            this->_ctx = ctx;
-
-            cudaStream_t cuda_stream;
-            cuda_stream = ctx.get_compute_stream();
-            CUBLAS_CHECK(cublasCreate(&_handle));
-            CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream));
-        }
-
-        Shape shape_out = inputs[0]->valid_shape();
-        _M = inputs[0]->count_valid(0, param.axis);
-        _K = inputs[0]->count_valid(param.axis, inputs[0]->dims());
-        _N = param.num_output;
-        if (_N <= 0) {
-            int weight_size = param.weights->valid_size();
-            _N = weight_size / _K;
-        }
-        //! weights dims must be in h and w
-        _flag_trans_weights = param.is_transpose_weights;
         return SaberSuccess;
     }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param){
-
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data();
+        const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        int batch_size = inputs[0]->num();
+        int input_len = inputs[0]->channel();
+        int output_len = param.num_output;
+        int is_transpose = param.is_transpose_weights ? 1 : 0;
+        BMDNN_CHECK(bmdnn_fc_forward(_handle, in_data, weights, bias,
+                                    batch_size, output_len, input_len, is_transpose, 1, 0,
+                                    out_data));
+        return SaberSuccess;
     };
 
+private:
+    bm_handle_t _handle;
 };
 
 template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
@@ -79,4 +69,4 @@ template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
 
 } //namespace anakin
 
-#endif // ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+#endif // ANAKIN_SABER_FUNCS_BMDNN_FC_H
diff --git a/saber/funcs/timer.h b/saber/funcs/timer.h
index 4b1689383..e5014a9cb 100644
--- a/saber/funcs/timer.h
+++ b/saber/funcs/timer.h
@@ -173,6 +173,72 @@ class SaberTimer<NV> final {
 };
 #endif
 
+#ifdef USE_BM
+template <>
+class SaberTimer<BM> final {
+
+public:
+    SaberTimer() {}
+
+    ~SaberTimer() {}
+
+    void clear() {
+        ms_time.clear();
+    }
+
+    void start(Context<BM> &ctx) {
+        tstart = std::chrono::system_clock::now();
+    }
+
+    void end(Context<BM> &ctx) {
+        tend = std::chrono::system_clock::now();
+        auto ts = std::chrono::duration_cast<std::chrono::microseconds>(tend - tstart);
+        float elapse_ms = 1000.f * float(ts.count()) * std::chrono::microseconds::period::num / \
+            std::chrono::microseconds::period::den;
+        ms_time.push_back(elapse_ms);
+    }
+
+    float get_average_ms() {
+        if (ms_time.size() == 0) {
+            return 0.f;
+        }
+        float sum = 0.f;
+        for (auto i : ms_time){
+            sum += i;
+        }
+        return sum / ms_time.size();
+    }
+
+    // return tile (0-99) time.
+    float get_tile_time(float tile) {
+
+        if (tile <0 || tile > 100) {
+            return -1.f;
+        }
+        int total_items = (int)ms_time.size();
+        if (total_items <= 0) {
+            return -2.f;
+        }
+        ms_time.sort();
+        int pos = (int)(tile * total_items / 100);
+        auto it = ms_time.begin();
+        for (int i = 0; i < pos; ++i) {
+            ++it;
+        }
+        return *it;
+    }
+
+    const std::list<float> get_time_stat() {
+        return ms_time;
+    }
+
+private:
+    std::chrono::time_point<std::chrono::system_clock> tstart;
+    std::chrono::time_point<std::chrono::system_clock> tend;
+    std::list<float> ms_time;
+};
+#endif // USE_BM
+
 }
 }
 
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index a204e7807..93aa6d36e 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -1,4 +1,4 @@
-#include "test_saber_buffer_bm.h"
+#include "test_saber_buffer_BM.h"
 #include "saber/core/buffer.h"
 #include "saber/core/data_traits.h"
 
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
index 5101c75f8..869ff1bfd 100644
--- a/test/saber/bm/test_saber_func_fc_BM.cpp
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -1,6 +1,6 @@
 #include "core/context.h"
 #include "funcs/fc.h"
-#include "test_saber_func_fc_BM.h"
+#include "test_saber_func_BM.h"
 #include "tensor_op.h"
 #include "saber_types.h"
 #include <vector>
@@ -41,7 +41,7 @@ void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
     }
 }
 
-TEST(TestSaberFuncFcBM, test_func_fc) {
+TEST(TestSaberFuncBM, test_func_fc) {
 
     int test_iter = 100;
     int w_in = 7;
@@ -109,12 +109,10 @@ TEST(TestSaberFuncFcBM, test_func_fc) {
         //cudaDeviceSynchronize();
     }
 
-    CUDA_POST_KERNEL_CHECK;
     t1.end(ctx_dev);
     float ts = t1.get_average_ms();
     LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
     //print_tensor_device(*output_dev_4d[0]);
-    //cudaDeviceSynchronize();
 
     //! check result
     TensorHf4 thin(shape_in);

From 1c4439a7b40a654c9b2ea95ee44319f770607da7 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 15:58:33 +0800
Subject: [PATCH 033/318] allow copy from tensor with different data type

---
 saber/core/tensor.cpp                  | 24 +++++++++++++++
 saber/core/tensor.h                    | 42 ++------------------------
 test/saber/bm/test_saber_tensor_BM.cpp | 18 ++++++++---
 3 files changed, 40 insertions(+), 44 deletions(-)
 create mode 100644 saber/core/tensor.cpp

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
new file mode 100644
index 000000000..9283aac90
--- /dev/null
+++ b/saber/core/tensor.cpp
@@ -0,0 +1,24 @@
+#include "tensor.h"
+
+#ifdef USE_BM
+
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+
+template<>
+template<>
+SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+    //auto* device_data_ptr = mutable_data();
+    //BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
+    return SaberSuccess;
+}
+
+template<>
+template<>
+SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
+    return SaberSuccess;
+}
+
+#endif
+
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 29ec1d006..2272549cd 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -20,13 +20,6 @@
 #include "core/events.h"
 #include "core/tensor_traits.h"
 
-#ifdef USE_BM
-#include <typeinfo>
-#include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmlib_utils.h"
-#endif
-
 namespace anakin{
 
 namespace saber{
@@ -730,39 +723,10 @@ class Tensor : public TensorBase {
     }
 
 #ifdef USE_BM
-    template <typename TargetType_t, DataType DataType_t, typename LayOutType_t>
-    SaberStatus copy_from(const Tensor<TargetType_t, DataType_t, LayOutType_t>& tensor) {
-
-        CHECK_EQ(valid_size(), tensor.valid_size()) \
-            << "sizes of two valid shapes must be the same";
-
-        /// copy from system to device
-        if (typeid(BM) == typeid(targetType_t) &&
-            typeid(AK_BM) == typeid(datatype) &&
-            typeid(X86) == typeid(TargetType_t) &&
-            typeid(AK_FLOAT) == typeid(DataType_t)){
-
-            Dtype* device_data_ptr = mutable_data();
-            BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
-
-            return SaberSuccess;
-        }
-
-        /// copy from device to system
-        /*if (typeid(X86) == typeid(targetType_t) &&
-            typeid(AK_FLOAT) == typeid(datatype) &&
-            typeid(BM) == typeid(TargetType_t) &&
-            typeid(AK_BM) == typeid(DataType_t)){
-
-            auto* device_data_ptr = tensor.data();
-            BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
-
-            return SaberSuccess;
-        }*/
-
-        /// other types are not allowed here
+    template <typename NewTargetType_t, DataType NewDataType_t, typename NewLayOutType_t>
+    SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
         return SaberInvalidValue;
-    };
+    }
 #endif
 
     /**
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 83eb472b7..ed3ff0503 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -58,16 +58,24 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
-    //thost1.copy_from(thost0);
-    tdev1.copy_from(thost0);
 
-    /*
+    // host to host
+    thost1.copy_from(thost0);
+    print_tensor_host(thost1);
+
+    // host to device
+    tdev1.copy_from(thost0);
     //TODO: print tensor for BM device
-    print_tensor_host(tdev1);
+    //print_tensor_host(tdev1);
+
+    // device to host
     thost1.copy_from(tdev1);
-    tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
+    /*
+    // device to device
+    tdev1.copy_from(tdev0);
+
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From cbf68a70b11ad3d4694108b138b707545b83615d Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 16:28:04 +0800
Subject: [PATCH 034/318] AK_BM size should return 1

---
 saber/core/tensor.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
index 9283aac90..3a283c1f6 100644
--- a/saber/core/tensor.cpp
+++ b/saber/core/tensor.cpp
@@ -6,6 +6,9 @@
 #include "bmdnn_api.h"
 #include "bmlib_utils.h"
 
+template<>
+size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
+
 template<>
 template<>
 SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {

From dd763744ff43886e33a24417221e4fa7e2962d8d Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 19:23:11 +0800
Subject: [PATCH 035/318] Comment out specialization of _type_len for now.

---
 saber/core/tensor.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
index 3a283c1f6..3203f4779 100644
--- a/saber/core/tensor.cpp
+++ b/saber/core/tensor.cpp
@@ -1,13 +1,19 @@
 #include "tensor.h"
 
 #ifdef USE_BM
-
 #include "bmlib_runtime.h"
 #include "bmdnn_api.h"
 #include "bmlib_utils.h"
+#endif
 
-template<>
-size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
+namespace anakin {
+
+namespace saber {
+
+#ifdef USE_BM
+
+        //template<>
+//size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
 
 template<>
 template<>
@@ -25,3 +31,5 @@ SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor
 
 #endif
 
+}
+}
\ No newline at end of file

From 166d7c8818679e32b7983bf295c623229fe4ced0 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 19:59:29 +0800
Subject: [PATCH 036/318] Add implementation for copy_from between device and
 system

---
 saber/core/tensor.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
index 3203f4779..1978666bc 100644
--- a/saber/core/tensor.cpp
+++ b/saber/core/tensor.cpp
@@ -12,20 +12,22 @@ namespace saber {
 
 #ifdef USE_BM
 
-        //template<>
+//template<>
 //size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
 
 template<>
 template<>
 SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
-    //auto* device_data_ptr = mutable_data();
-    //BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
+    auto* device_data_ptr = mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
     return SaberSuccess;
 }
 
 template<>
 template<>
 SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
+    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+    BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
     return SaberSuccess;
 }
 

From 56d2054fbf46895e93278847da33a465e3b8b9ca Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 22:46:42 +0800
Subject: [PATCH 037/318] Redefine _type_len as function so that we can do
 specialization

---
 saber/core/tensor.cpp |  6 ++++--
 saber/core/tensor.h   | 24 +++++++++++++-----------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
index 1978666bc..081854c86 100644
--- a/saber/core/tensor.cpp
+++ b/saber/core/tensor.cpp
@@ -12,8 +12,10 @@ namespace saber {
 
 #ifdef USE_BM
 
-//template<>
-//size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
+template<>
+size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
+    return 1;
+}
 
 template<>
 template<>
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 2272549cd..086436c0e 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -90,7 +90,7 @@ class Tensor : public TensorBase {
         _shape = shape;
         _valid_shape = shape;
         _offset = Shape::zero(shape.dims());
-        _buf = std::make_shared<Buffer<TargetType>>(shape.count() * _type_len);
+        _buf = std::make_shared<Buffer<TargetType>>(shape.count() * _type_len());
         _is_subbuf = false;
     }
 #if 0
@@ -126,7 +126,7 @@ class Tensor : public TensorBase {
         _valid_shape = shape;
         _offset = Shape::zero(shape.dims());
         std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
-            std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len, id);
+            std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len(), id);
         BufferMemShare(_buf, buf_from_date);
         _is_subbuf = false;
     }
@@ -224,7 +224,7 @@ class Tensor : public TensorBase {
         _shape = shape;
         _valid_shape = _shape;
         _offset =Shape::zero(_shape.dims());
-        _buf->alloc(_shape.count() * _type_len);
+        _buf->alloc(_shape.count() * _type_len());
         return SaberSuccess;
     }
 
@@ -286,13 +286,13 @@ class Tensor : public TensorBase {
             CHECK_EQ(_valid_shape + _offset <= _shape, true) << \
                 "valid_shape + offet should <= shape";
         }
-        bool exceed_flag = _shape.count() * _type_len > _buf->get_capacity() \
+        bool exceed_flag = _shape.count() * _type_len() > _buf->get_capacity() \
             && (_is_subbuf || _is_shared);
         //if (exceed_flag) {
         //    return SaberOutOfAuthority;
         //}
         CHECK_EQ(exceed_flag, false) << "shared tensor shape exceed origin data buffer size";
-        SABER_CHECK(_buf->re_alloc(_shape.count() * _type_len));
+        SABER_CHECK(_buf->re_alloc(_shape.count() * _type_len()));
         return SaberSuccess;
     }
 
@@ -529,7 +529,7 @@ class Tensor : public TensorBase {
         CHECK_EQ(_shape > Shape::zero(TensorAPI::layout_dims::value), true) << \
             "current tensor is not initialized (no shape info, use set_shape)";
         typedef typename Tensor_t::Dtype dtype_t;
-        CHECK_LE(size() * _type_len, tensor.size() * sizeof(dtype_t)) << \
+        CHECK_LE(size() * _type_len(), tensor.size() * sizeof(dtype_t)) << \
             "current tensor size should <= input tensor size";
 
         _is_shared = BufferMemShare(_buf, tensor.get_buf()) > 0;
@@ -599,7 +599,7 @@ class Tensor : public TensorBase {
             Dtype* ptr_dst = mutable_data();
             const Dtype* ptr_src = tensor.data();
             process_API::sync_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \
-                _type_len * valid_size(), flag_type());
+                _type_len() * valid_size(), flag_type());
             return SaberSuccess;
         }
 
@@ -717,7 +717,7 @@ class Tensor : public TensorBase {
             Dtype* ptr_dst = dst + idx_dst;//_buf->get_data_mutable() + idx_dst;
             const Dtype* ptr_src = src + idx_src;//tensor.get_buf()->get_data() + idx_src;
             process_API::sync_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \
-                _type_len * cpy_len, flag_type());
+                _type_len() * cpy_len, flag_type());
         }
         return SaberSuccess;
     }
@@ -758,7 +758,7 @@ class Tensor : public TensorBase {
             Dtype* ptr_dst = mutable_data();
             const Dtype* ptr_src = tensor.data();
             process_API::async_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \
-                _type_len * valid_size(), stream, flag_type());
+                _type_len() * valid_size(), stream, flag_type());
             return SaberSuccess;
         }
 
@@ -876,7 +876,7 @@ class Tensor : public TensorBase {
             Dtype* ptr_dst = dst + idx_dst;//_buf->get_data_mutable() + idx_dst;
             const Dtype* ptr_src = src + idx_src;//tensor.get_buf()->get_data() + idx_src;
             process_API::async_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \
-                _type_len * cpy_len, stream, flag_type());
+                _type_len() * cpy_len, stream, flag_type());
         }
         return SaberSuccess;
     }
@@ -906,7 +906,9 @@ class Tensor : public TensorBase {
 
 private:
     ///< Length of datatype.
-    size_t _type_len{sizeof(Dtype)};
+    size_t _type_len(){
+        return sizeof(Dtype);
+    }
     ///< Represent the raw mem shape.
     Shape _shape;
     ///< Represent the mem you have right to access shape.

From 6ea9474ee6a18ab0698299ab46e1c40012e7e8cc Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Sun, 24 Jun 2018 05:50:24 +0000
Subject: [PATCH 038/318] Fix mem_free function

---
 saber/core/impl/bm/bm_impl.cpp          | 21 ++++++++++++++++-----
 test/saber/bm/test_TargetWrapper_BM.cpp | 19 ++++++++++++-------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 4aecb169d..d4a312fcf 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -65,21 +65,32 @@ void BM_API::mem_alloc(void** ptr, size_t n){
     //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
     //bm_device_mem_t mem = bm_mem_from_system(*ptr);
     handle = get_bm_handle();
-    bm_device_mem_t *mem = new bm_device_mem_t[1];
-    mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
+    //bm_device_mem_t *mem = new bm_device_mem_t[1];
+    bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        bm_free_device(handle, bm_mem_from_system(ptr));
+        //bm_free_device(handle, bm_mem_from_system(ptr));
         //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
         //bm_free_device(handle, *pmem);
-        //handle = get_bm_handle();
-	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(*ptr));
+        handle = get_bm_handle();
+        bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
+	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(ptr));
+	bm_free_device(handle, *mem);
     }
 }
+
+void BM_API::mem_free_BM(bm_device_mem_t mem){
+    //(bm_handle_t handle, bm_device_mem_t mem){
+    if(&mem != nullptr){
+        handle = get_bm_handle();
+	bm_free_device(handle, mem);
+    }
+}
+ 
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index a76bef279..c50df3fa3 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -8,15 +8,20 @@ static bm_handle_t handle;
 int main() {
     bmdnn_init(&handle);
     typedef TargetWrapper<BM> API;
-    void *pmem;
     int dev_count = 0;
     API::get_device_count(dev_count);
-    std::cout << dev_count << std::endl;
-    API::mem_alloc(&pmem, 3*200*200);
-    //API::mem_free(pmem);
-    std::cout << "Press any key to finish execution." << std::endl;
-    int a;
-    std::cin >> a;
+    std::cout << "dev_count: " << dev_count << std::endl;
+    
+    //void *pmem;
+    bm_device_mem_t *pmem = new bm_device_mem_t();
+    std::cout << "mem addr before mem_alloc: " << pmem << std::endl;
+    API::mem_alloc(&pmem, 3*200*400);
+    std::cout << "mem addr after  mem_alloc: " << pmem << std::endl;
+    
+    bm_device_mem_t *test = reinterpret_cast<bm_device_mem_t *>(pmem);
+    API::mem_free_BM((bm_device_mem_t)(*test));
+    std::cout << "End mem_free test." << std::endl;
+    delete pmem;
     bmdnn_deinit(handle);
 }
 #endif

From 89a645f0b1fc40ca22d0602491c1495c726e0f3b Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Sun, 24 Jun 2018 07:26:59 +0000
Subject: [PATCH 039/318] Fix mem_free function

---
 saber/core/target_wrapper.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index e724235d8..1f283a004 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -380,7 +380,9 @@ struct TargetWrapper<BM, __device_target> {
 
     //template <typename void>
     static void mem_free(void * ptr);
-
+    
+    static void mem_free_BM(bm_device_mem_t mem);
+    
     //template <typename void>
     static void mem_set(void* ptr, int value, size_t n);
 

From 12e7c27e643514e3d3a7db6637e992f943c089aa Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Sun, 24 Jun 2018 19:55:07 +0800
Subject: [PATCH 040/318] change mem_free_BM to mem_free; tensor test passed

---
 saber/core/impl/bm/bm_impl.cpp          | 22 +---------------------
 test/saber/bm/test_TargetWrapper_BM.cpp | 12 +++++-------
 2 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index d4a312fcf..e2e5b9e65 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -60,37 +60,17 @@ int BM_API::get_device_id(){
 }
         
 void BM_API::mem_alloc(void** ptr, size_t n){
-    //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
-    //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
-    //bm_device_mem_t mem = bm_mem_from_system(*ptr);
     handle = get_bm_handle();
-    //bm_device_mem_t *mem = new bm_device_mem_t[1];
     bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
 }
         
 void BM_API::mem_free(void* ptr){
-    //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        //bm_free_device(handle, bm_mem_from_system(ptr));
-        //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
-        //bm_free_device(handle, *pmem);
         handle = get_bm_handle();
-        bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
-	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(ptr));
-	bm_free_device(handle, *mem);
+        bm_free_device(handle, *(struct bm_mem_desc*)(ptr));
     }
 }
-
-void BM_API::mem_free_BM(bm_device_mem_t mem){
-    //(bm_handle_t handle, bm_device_mem_t mem){
-    if(&mem != nullptr){
-        handle = get_bm_handle();
-	bm_free_device(handle, mem);
-    }
-}
- 
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index c50df3fa3..9d445f16a 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -8,18 +8,16 @@ static bm_handle_t handle;
 int main() {
     bmdnn_init(&handle);
     typedef TargetWrapper<BM> API;
-    int dev_count = 0;
-    API::get_device_count(dev_count);
-    std::cout << "dev_count: " << dev_count << std::endl;
+    //int dev_count = 0;
+    //API::get_device_count(dev_count);
+    //std::cout << "dev_count: " << dev_count << std::endl;
     
-    //void *pmem;
     bm_device_mem_t *pmem = new bm_device_mem_t();
     std::cout << "mem addr before mem_alloc: " << pmem << std::endl;
     API::mem_alloc(&pmem, 3*200*400);
     std::cout << "mem addr after  mem_alloc: " << pmem << std::endl;
-    
-    bm_device_mem_t *test = reinterpret_cast<bm_device_mem_t *>(pmem);
-    API::mem_free_BM((bm_device_mem_t)(*test));
+    std::cout << "Start mem_free test." << pmem << std::endl;
+    API::mem_free(pmem);
     std::cout << "End mem_free test." << std::endl;
     delete pmem;
     bmdnn_deinit(handle);

From 8a14228683d62dee4c1611246f1b2cdb6b7715c8 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 10:44:03 +0800
Subject: [PATCH 041/318] remove stream test in context

---
 test/saber/bm/test_saber_context_BM.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp
index e221ba8f4..ed93866cf 100644
--- a/test/saber/bm/test_saber_context_BM.cpp
+++ b/test/saber/bm/test_saber_context_BM.cpp
@@ -12,11 +12,8 @@ TEST(TestSaberContextBM, test_BM_context) {
     LOG(INFO) << "test context constructor";
     Context<BM> ctx0;
     Context<BM> ctx1(0, 1, 1);
-    LOG(INFO) << "test record event to context data stream and compute stream";
-    API::record_event(event, ctx0.get_data_stream());
-    API::record_event(event, ctx0.get_compute_stream());
-    API::record_event(event, ctx1.get_data_stream());
-    API::record_event(event, ctx1.get_compute_stream());
+
+    //for BM no need to test stream as it is not in use
 }
 
 #endif

From e318008f7367822a7b4c6771aff457f63ba3d454 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 11:46:19 +0800
Subject: [PATCH 042/318] Update buffer test for BM

---
 test/saber/bm/test_saber_buffer_BM.cpp | 68 +++++++++++++++-----------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 93aa6d36e..ea8d7101d 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -4,12 +4,22 @@
 
 using namespace anakin::saber;
 
-template <DataType datatype>
+static bm_handle_t handle;
+
+int get_bm_size() {
+    return 1;
+}
+
+template <DataType Ddatatype, DataType Hdatatype>
 void test_buffer() {
 
+    //TODO: init in another place
+    bmdnn_init(&handle);
+
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
-    typedef typename DataTrait<datatype>::dtype Dtype;
+    typedef typename DataTrait<Ddatatype>::dtype Ddtype;
+    typedef typename DataTrait<Hdatatype>::dtype Hdtype;
     typedef Buffer<X86> BufferH;
     typedef Buffer<BM> BufferD;
 
@@ -17,30 +27,30 @@ void test_buffer() {
     int n1 = 2048;
 
     void* tmp_x86;
-    Dtype* x86_ptr;
-    X86_API::mem_alloc(&tmp_x86, sizeof(Dtype) * n0);
-    x86_ptr = static_cast<Dtype*>(tmp_x86);
+    Hdtype* x86_ptr;
+    X86_API::mem_alloc(&tmp_x86, sizeof(Hdtype) * n0);
+    x86_ptr = static_cast<Hdtype*>(tmp_x86);
 
     for (int i = 0; i < n0; i++) {
-        x86_ptr[i] = static_cast<Dtype>(i);
+        x86_ptr[i] = static_cast<Hdtype>(i);
     }
 
     void* tmp_bm;
-    Dtype* bm_ptr;
-    BM_API::mem_alloc(&tmp_bm, sizeof(Dtype) * n0);
-    bm_ptr = static_cast<Dtype*>(tmp_bm);
+    Ddtype* bm_ptr;
+    BM_API::mem_alloc(&tmp_bm, get_bm_size() * n0);
+    bm_ptr = static_cast<Ddtype*>(tmp_bm);
 
     LOG(INFO) << "Buffer: test default(empty) constructor";
     BufferH x86_buf0;
     BufferD bm_buf0;
 
     LOG(INFO) << "Buffer: test constructor with data size";
-    BufferH x86_buf1(n0 * sizeof(Dtype));
-    BufferD bm_buf1(n0 * sizeof(Dtype));
+    BufferH x86_buf1(n0 * sizeof(Hdtype));
+    BufferD bm_buf1(n0 * sizeof(Ddtype));
 
     LOG(INFO) << "Buffer: test constructor with data pointer, size and device id";
-    BufferH x86_buf2(x86_ptr, n0 * sizeof(Dtype), X86_API::get_device_id());
-    BufferD bm_buf2(bm_ptr, n0 * sizeof(Dtype), BM_API::get_device_id());
+    BufferH x86_buf2(x86_ptr, n0 * sizeof(Hdtype), X86_API::get_device_id());
+    BufferD bm_buf2(bm_ptr, n0 * get_bm_size(), BM_API::get_device_id());
 
     LOG(INFO) << "Buffer: test copy constructor";
     BufferH x86_buf3(x86_buf2);
@@ -62,18 +72,18 @@ void test_buffer() {
             "shared buffer should have same data count";
 
     LOG(INFO) << "Buffer: test re_alloc";
-    x86_buf1.re_alloc(n1 * sizeof(Dtype));
-    bm_buf1.re_alloc(n1 * sizeof(Dtype));
-    CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
-    CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error";
-    CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
-    x86_buf1.re_alloc(n0 * sizeof(Dtype));
-    bm_buf1.re_alloc(n0 * sizeof(Dtype));
-    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
-    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+    x86_buf1.re_alloc(n1 * sizeof(Hdtype));
+    bm_buf1.re_alloc(n1 * sizeof(Ddtype));
+    CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Hdtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
+    CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Ddtype)) << "buffer count error";
+    CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Ddtype)) << "buffer capacity error";
+    x86_buf1.re_alloc(n0 * sizeof(Hdtype));
+    bm_buf1.re_alloc(n0 * sizeof(Ddtype));
+    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
+    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
 
     LOG(INFO) << "Buffer: test get_id()";
     LOG(INFO) << "X86 device id: " << x86_buf0.get_id() << \
@@ -84,8 +94,8 @@ void test_buffer() {
     LOG(INFO) << "Buffer: test deep_cpy()";
     x86_buf1.sync_copy_from(x86_buf2);
     LOG(INFO) << "deep copy between two host buffer: ";
-    const Dtype* ptr1 = static_cast<const Dtype*>(x86_buf1.get_data());
-    const Dtype* ptr2 = static_cast<const Dtype*>(x86_buf1.get_data());
+    const Hdtype* ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
+    const Hdtype* ptr2 = static_cast<const Hdtype*>(x86_buf1.get_data());
 
     for (int i = 0; i < 10; i++) {
         std::cout << ptr1[i] << std::endl;
@@ -96,7 +106,7 @@ void test_buffer() {
     bm_buf1.sync_copy_from(x86_buf2);
     x86_buf1.sync_copy_from(bm_buf1);
     LOG(INFO) << "deep copy from device buffer to host buffer: ";
-    ptr1 = static_cast<const Dtype*>(x86_buf1.get_data());
+    ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
 
     for (int i = 0; i < 10; i++) {
         std::cout << ptr1[i] << std::endl;
@@ -104,7 +114,7 @@ void test_buffer() {
 }
 
 TEST(TestSaberBufferBM, test_buffer_memcpy) {
-    test_buffer<AK_BM>();
+    test_buffer<AK_BM, AK_FLOAT>();
 }
 
 int main(int argc, const char** argv) {

From fa95d52c35c18bd9e339517d95a7af4e173e7dbb Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 13:26:51 +0800
Subject: [PATCH 043/318] Specialization for Env<BM>

---
 saber/core/env.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 saber/core/env.cpp

diff --git a/saber/core/env.cpp b/saber/core/env.cpp
new file mode 100644
index 000000000..b294fead4
--- /dev/null
+++ b/saber/core/env.cpp
@@ -0,0 +1,19 @@
+#include "env.h"
+
+namespace anakin {
+
+    namespace saber {
+
+#ifdef USE_BM
+
+        template<>
+        void Env<BM>::env_init(int max_stream){
+            //TODO: decide what to put here
+            LOG(INFO) << "env init for BM";
+        }
+
+#endif
+
+
+    }
+}

From d70704c7ab2141bf081f81018106a983f144274c Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Mon, 25 Jun 2018 14:07:30 +0800
Subject: [PATCH 044/318] env skip bm

---
 saber/core/env.cpp | 19 -------------------
 saber/core/env.h   |  5 +++++
 2 files changed, 5 insertions(+), 19 deletions(-)
 delete mode 100644 saber/core/env.cpp

diff --git a/saber/core/env.cpp b/saber/core/env.cpp
deleted file mode 100644
index b294fead4..000000000
--- a/saber/core/env.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "env.h"
-
-namespace anakin {
-
-    namespace saber {
-
-#ifdef USE_BM
-
-        template<>
-        void Env<BM>::env_init(int max_stream){
-            //TODO: decide what to put here
-            LOG(INFO) << "env init for BM";
-        }
-
-#endif
-
-
-    }
-}
diff --git a/saber/core/env.h b/saber/core/env.h
index 3ae42165b..ceabb868c 100644
--- a/saber/core/env.h
+++ b/saber/core/env.h
@@ -16,6 +16,7 @@
 #define ANAKIN_SABER_CORE_ENV_H
 
 #include "core/device.h"
+#include <type_traits>
 
 namespace anakin{
 
@@ -31,6 +32,10 @@ class Env {
         return *_g_env;
     }
     static void env_init(int max_stream = 4){
+        if(std::is_same<TargetType,BM>::value){
+            LOG(INFO) << "env init for BM";
+            return;
+        }
         Devs& devs = cur_env();
         if (devs.size() > 0){
             return;

From 03fa00150613bc4be5f66112d4d074bdc59c58eb Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Mon, 25 Jun 2018 15:10:32 +0800
Subject: [PATCH 045/318] modify mem_alloc for void*

---
 saber/core/impl/bm/bm_impl.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index e2e5b9e65..c93703a5d 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -61,8 +61,10 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     handle = get_bm_handle();
-    bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr);
+    /* bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr); */
+    bm_device_mem_t *mem = new bm_device_mem_t();
     BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
+    *ptr = mem;
 }
         
 void BM_API::mem_free(void* ptr){

From f601c0b4b2ac6698ca528dec0c2aca0997554789 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 15:40:43 +0800
Subject: [PATCH 046/318] Specialization for copy_from

---
 saber/core/tensor.cpp | 39 ---------------------------------------
 saber/core/tensor.h   | 28 ++++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 39 deletions(-)
 delete mode 100644 saber/core/tensor.cpp

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
deleted file mode 100644
index 081854c86..000000000
--- a/saber/core/tensor.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include "tensor.h"
-
-#ifdef USE_BM
-#include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmlib_utils.h"
-#endif
-
-namespace anakin {
-
-namespace saber {
-
-#ifdef USE_BM
-
-template<>
-size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
-    return 1;
-}
-
-template<>
-template<>
-SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
-    auto* device_data_ptr = mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
-    return SaberSuccess;
-}
-
-template<>
-template<>
-SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
-    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
-    BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
-    return SaberSuccess;
-}
-
-#endif
-
-}
-}
\ No newline at end of file
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 086436c0e..3051e16c6 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -725,6 +725,7 @@ class Tensor : public TensorBase {
 #ifdef USE_BM
     template <typename NewTargetType_t, DataType NewDataType_t, typename NewLayOutType_t>
     SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
+        LOG(INFO) << "base copy_from";
         return SaberInvalidValue;
     }
 #endif
@@ -939,6 +940,33 @@ class Tensor : public TensorBase {
     std::vector<int> _seq_offset;
 };
 
+#ifdef USE_BM
+
+template<> inline
+size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
+    return 1;
+}
+
+template<>
+template<> inline
+SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+    LOG(INFO) << "BM copy_from";
+    auto* device_data_ptr = mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
+    return SaberSuccess;
+}
+
+template<>
+template<> inline
+SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
+    LOG(INFO) << "X86 copy_from";
+    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+    BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
+    return SaberSuccess;
+}
+
+#endif
+
 } //namespace saber
 
 } //namespace anakin

From a8eef8ba0730205c2e7a70784e4e71a2021759e7 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 15:50:04 +0800
Subject: [PATCH 047/318] Revert speical handling for Env<BM>

---
 saber/core/env.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/saber/core/env.h b/saber/core/env.h
index ceabb868c..3ae42165b 100644
--- a/saber/core/env.h
+++ b/saber/core/env.h
@@ -16,7 +16,6 @@
 #define ANAKIN_SABER_CORE_ENV_H
 
 #include "core/device.h"
-#include <type_traits>
 
 namespace anakin{
 
@@ -32,10 +31,6 @@ class Env {
         return *_g_env;
     }
     static void env_init(int max_stream = 4){
-        if(std::is_same<TargetType,BM>::value){
-            LOG(INFO) << "env init for BM";
-            return;
-        }
         Devs& devs = cur_env();
         if (devs.size() > 0){
             return;

From d6457d9553b9ea382853c069df8d63dda9ed6786 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Mon, 25 Jun 2018 17:29:29 +0800
Subject: [PATCH 048/318] add conv op, did't test

---
 saber/core/impl/bm/bm_impl.cpp          |   1 +
 saber/funcs/impl/bm/vender_conv.h       | 167 ++++--------------------
 test/saber/bm/test_TargetWrapper_BM.cpp |   6 +-
 3 files changed, 29 insertions(+), 145 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index c93703a5d..1bdb5d140 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -71,6 +71,7 @@ void BM_API::mem_free(void* ptr){
     if(ptr != nullptr){
         handle = get_bm_handle();
         bm_free_device(handle, *(struct bm_mem_desc*)(ptr));
+        delete ptr;
     }
 }
         
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 7efdfa611..a0a3b3fb5 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -1,18 +1,3 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
 #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 
@@ -44,105 +29,13 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_in::Dtype InDataType;
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
-    VenderConv2D()
-            : _handle(NULL)
-            , _workspaceData(NULL)
-            , _workspace(NULL)
-            , _conv_descs(NULL)
-            , _input_descs(NULL)
-            , _output_descs(NULL)
-            , _filter_desc(NULL)
-            , _workspace_fwd_sizes(0)
-            , _workspaceSizeInBytes(0)
-            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
-            , _input_nchw_descs(NULL)
-            , _output_nchw_descs(NULL)
-            , x8_data(NULL)
-            , y8_data(NULL)
-            , x8_data_size(0)
-            , y8_data_size(0)
-    {}
-
-    ~VenderConv2D() {
 
-        if (_conv_descs) {
-            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
-        }
-        if (_input_descs) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
-        }
-        if (_output_descs) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
-        }
-        if (_filter_desc) {
-            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
-        }
-        if (_handle != NULL) {
-            CUDNN_CHECK(cudnnDestroy(_handle));
-        }
-        if (_workspaceData != NULL) {
-            cudaFree(_workspaceData);
-        }
-        if (_input_nchw_descs != NULL) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs));
-        }
-        if (_output_nchw_descs != NULL) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs));
-        }
-        if (x8_data != NULL) {
-            CUDA_CHECK(cudaFree(x8_data));
-        }
-        if (y8_data != NULL) {
-            CUDA_CHECK(cudaFree(y8_data));
-        }
-    }
+    VenderConv2D(): _handle(NULL) {}
+    ~VenderConv2D() {}
 
-    /**
-     * [Create description] Init all cudnn resource here
-     * @AuthorHTL
-     * @DateTime  2018-02-01T16:13:06+0800
-     * @param     inputs                    [description]
-     * @param     outputs                   [description]
-     * @param     param                [conv parameters]
-     */
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             ConvParam<OpTensor>& param, Context<BM>& ctx) {
-
-        // ---- init cudnn resources ----
-
-        _workspaceSizeInBytes = 0;
-        _workspaceData = NULL;
-
-        _workspace_fwd_sizes = 0;
-
-        this->_ctx = ctx;
-        // ---- get cuda resources ----
-
-        cudaStream_t cuda_stream;
-        cuda_stream = ctx.get_compute_stream();
-
-        CUDNN_CHECK(cudnnCreate(&_handle));
-        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
-
-        _workspace = NULL;
-
-        int in_channels = inputs[0]->channel();
-
-        // ---- create cudnn Descs ----
-        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
-
-        cudnn::createTensorDesc<InDataType>(&_input_descs);
-        cudnn::createTensorDesc<InDataType>(&_output_descs);
-        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
-
-        if (param.bias()->size() > 0) {
-            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
-        }
-
-        cudnnCreateTensorDescriptor(&_input_nchw_descs);
-        cudnnCreateTensorDescriptor(&_output_nchw_descs);
-
         return create(inputs, outputs, param, ctx);
     }
 
@@ -150,46 +43,36 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
                             std::vector<DataTensor_out *>& outputs,
                             ConvParam<OpTensor>& param, Context<BM>& ctx);
 
-    //call cudnnConvolutionForward here
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
                           std::vector<DataTensor_out*>& outputs,
-                          ConvParam<OpTensor>& param);
+                          ConvParam<OpTensor>& param) {
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        const InDataType *weight = (const InDataType *) param.weight()->data();
+        const InDataType *bias = (const InDataType *) param.bias()->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+        int group = param.group;
+        int output_c = outputs[0]->channel();
+        int kh = param.weight()->height();
+        int kw = param.weight()->width();
+        int pad_h = param.pad_h;
+        int pad_w = param.pad_w;
+        int stride_h = param.stride_h;
+        int stride_w = param.stride_w;
+        BMDNN_CHECK(bmdnn_conv_forward(_handle, in_data, weights, bias,
+                                    input_n, input_c, input_h, input_w, group, output_c,
+                                    kh, kw, pad_h, pad_w, stride_h, stride_w, 1, 0, 0, 
+                                    out_data, NULL));
+        return SaberSuccess;
+    }
 
 private:
     cudnnHandle_t _handle;
-    cudnnConvolutionFwdAlgo_t _fwd_algo;
-
-    cudnnTensorDescriptor_t _input_descs;
-    cudnnTensorDescriptor_t _output_descs;
-    cudnnTensorDescriptor_t _bias_desc;
-
-    cudnnFilterDescriptor_t _filter_desc;
-
-    cudnnConvolutionDescriptor_t _conv_descs;
-
-    size_t _workspace_fwd_sizes;
-    size_t _workspaceSizeInBytes;  // size of underlying storage
-
-    void *_workspaceData;  // underlying storage
-    void *_workspace;  // aliases into _workspaceData
-
-    const bool _use_tensor_core = true;
-    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
-    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-
-    // create transform descriptor
-    cudnnTensorDescriptor_t _input_nchw_descs;
-    cudnnTensorDescriptor_t _output_nchw_descs;
-
-    void *x8_data;
-    void *y8_data;
-
-    int x8_data_size;
-    int y8_data_size;
 };
 
-
 }
-
 }
 #endif //ANAKIN_SABER_FUNCS_BMDNN_CONV2D_H
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index 9d445f16a..b893183a2 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -12,14 +12,14 @@ int main() {
     //API::get_device_count(dev_count);
     //std::cout << "dev_count: " << dev_count << std::endl;
     
-    bm_device_mem_t *pmem = new bm_device_mem_t();
+    //bm_device_mem_t *pmem = new bm_device_mem_t();
+    void* pmem;
     std::cout << "mem addr before mem_alloc: " << pmem << std::endl;
     API::mem_alloc(&pmem, 3*200*400);
     std::cout << "mem addr after  mem_alloc: " << pmem << std::endl;
-    std::cout << "Start mem_free test." << pmem << std::endl;
+    std::cout << "Start mem_free test." << std::endl;
     API::mem_free(pmem);
     std::cout << "End mem_free test." << std::endl;
-    delete pmem;
     bmdnn_deinit(handle);
 }
 #endif

From 08bd31288cc014bef30a11be509039cfba39c04b Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Mon, 25 Jun 2018 13:21:36 +0000
Subject: [PATCH 049/318] Add sync_memcpy function & fix test_saber_buffer_BM

---
 saber/core/impl/bm/bm_impl.cpp         | 19 +++++++++++++++++++
 saber/core/target_wrapper.h            |  6 ++----
 test/saber/bm/test_saber_buffer_BM.cpp | 21 ++++++++++++++++++++-
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 1bdb5d140..dacca58b6 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -82,6 +82,25 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
     //BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
+//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+//    size_t count, __DtoD) {};
+
+//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+//    size_t count, __HtoD) {};
+
+void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+    size_t count, __DtoH) {
+    handle = get_bm_handle(); 
+    //auto* dev_ptr = const_cast<bm_device_mem_t *>(src);
+    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
+    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *src));
+    LOG(INFO) << "End sync_memcpy process";
+};
+
+//static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+//    int src_dev, size_t count) {};
+
+
 //! target wrapper
 template struct TargetWrapper<BM, __device_target>;
 
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 1f283a004..2a2a4be88 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -379,9 +379,7 @@ struct TargetWrapper<BM, __device_target> {
     static void mem_alloc(void** ptr, size_t n);
 
     //template <typename void>
-    static void mem_free(void * ptr);
-    
-    static void mem_free_BM(bm_device_mem_t mem);
+    static void mem_free(void * ptr); 
     
     //template <typename void>
     static void mem_set(void* ptr, int value, size_t n);
@@ -406,7 +404,7 @@ struct TargetWrapper<BM, __device_target> {
         size_t count, __HtoD) {};
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoH) {};
+        size_t count, __DtoH);
 
     static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
         int src_dev, size_t count) {};
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index ea8d7101d..434bd221a 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -104,11 +104,30 @@ void test_buffer() {
     CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect";
     LOG(INFO) << "deep copy from host buffer to device buffer";
     bm_buf1.sync_copy_from(x86_buf2);
+    
+    /*
+    const Hdtype* x86_buf2_ptr = static_cast<const Hdtype*>(x86_buf2.get_data());
+    for (int i = 0; i < 10; i++) {
+	std::cout << "x86: " << x86_buf2_ptr[i] << std::endl;
+    }
+
+    const Hdtype* bm_buf1_ptr = static_cast<const Hdtype*>(bm_buf1.get_data());
+    for (int i = 0; i < 10; i++) {
+	std::cout << "bm: " << bm_buf1_ptr[i] << std::endl;
+    }
+
+    LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count();
+    LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); 
+    LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype);
+    LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype);
+    */
+
+    x86_buf1.re_alloc(bm_buf1.get_capacity());
     x86_buf1.sync_copy_from(bm_buf1);
     LOG(INFO) << "deep copy from device buffer to host buffer: ";
     ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
 
-    for (int i = 0; i < 10; i++) {
+    for (int i = 0; i < 30; i++) {
         std::cout << ptr1[i] << std::endl;
     }
 }

From b95facb0c9c3ff9539ada6016483344daffa094d Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Mon, 25 Jun 2018 21:55:27 +0800
Subject: [PATCH 050/318] init handle for tensor test

---
 test/saber/bm/test_saber_tensor_BM.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index ed3ff0503..d42665528 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -625,6 +625,10 @@ TEST(TestSaberTensorBM, test_tensor_base_type) {
 }*/
 
 int main(int argc, const char** argv) {
+    //TODO: init in another place
+    static bm_handle_t handle;
+    bmdnn_init(&handle);
+
     // initial logger
     logger::init(argv[0]);
     InitTest();

From 001f2bd4ecdd24815f8ad8d33fee34bed3f503a9 Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Mon, 25 Jun 2018 22:06:13 +0800
Subject: [PATCH 051/318] init handle for BM context test

---
 test/saber/bm/test_saber_context_BM.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp
index ed93866cf..f2df59c88 100644
--- a/test/saber/bm/test_saber_context_BM.cpp
+++ b/test/saber/bm/test_saber_context_BM.cpp
@@ -19,6 +19,10 @@ TEST(TestSaberContextBM, test_BM_context) {
 #endif
 
 int main(int argc, const char** argv) {
+    //TODO: init in another place
+    static bm_handle_t handle;
+    bmdnn_init(&handle);
+    
     // initial logger
     logger::init(argv[0]);
     InitTest();

From 2f1a8bafc5ec164e7e467fbd8fa45081864bd715 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 26 Jun 2018 09:12:18 +0800
Subject: [PATCH 052/318] handle init rearrange

---
 test/saber/bm/test_saber_buffer_BM.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 434bd221a..00f77d308 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -4,8 +4,6 @@
 
 using namespace anakin::saber;
 
-static bm_handle_t handle;
-
 int get_bm_size() {
     return 1;
 }
@@ -13,9 +11,6 @@ int get_bm_size() {
 template <DataType Ddatatype, DataType Hdatatype>
 void test_buffer() {
 
-    //TODO: init in another place
-    bmdnn_init(&handle);
-
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
     typedef typename DataTrait<Ddatatype>::dtype Ddtype;
@@ -137,6 +132,10 @@ TEST(TestSaberBufferBM, test_buffer_memcpy) {
 }
 
 int main(int argc, const char** argv) {
+    //TODO: init in another place
+    static bm_handle_t handle;
+    bmdnn_init(&handle);
+
     // initial logger
     logger::init(argv[0]);
     InitTest();

From 094e7b664e565ae907323e8f4a8a9278ed997e9f Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Tue, 26 Jun 2018 09:43:57 +0800
Subject: [PATCH 053/318] add pooling wrapper, didn't test

---
 saber/funcs/impl/bm/vender_pooling.h         | 95 +++++---------------
 test/saber/bm/test_saber_func_pooling_BM.cpp | 33 ++-----
 2 files changed, 30 insertions(+), 98 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index 4990a5357..0da1a1106 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -44,78 +44,19 @@ class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderPooling() : _handle(NULL) {}
+    VenderPooling() : _handle(NULL), _pooling_type(NULL) {}
 
     ~VenderPooling() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
                   std::vector<DataTensor_out*>& outputs,
                   PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
-
-        this->_ctx = ctx;
-
-        cudaStream_t cuda_stream;
-        cuda_stream = ctx.get_compute_stream();
-
-        CUDNN_CHECK(cudnnCreate(&_handle));
-        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
-
-        cudnn::createTensorDesc<InDataType>(&_input_descs);
-        cudnn::createTensorDesc<OutDataType>(&_output_descs);
-
-        cudnn::create_pooling_des<OpDataType>(&_pooling_descs);
-
         return create(inputs, outputs, pooling_param, ctx);
     }
 
     virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
                 std::vector<DataTensor_out*>& outputs,
                 PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
-        if (!(ctx == this->_ctx)) {
-            if (_handle != NULL) {
-                CUDNN_CHECK(cudnnDestroy(_handle));
-            }
-            this->_ctx = ctx;
-
-            cudaStream_t cuda_stream;
-            cuda_stream = ctx.get_compute_stream();
-
-            CUDNN_CHECK(cudnnCreate(&_handle));
-            CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
-        }
-
-        int input_num = inputs[0]->num();
-        int input_channel = inputs[0]->channel();
-        int input_height = inputs[0]->height();
-        int input_width = inputs[0]->width();
-        int output_channel = outputs[0]->channel();
-        int output_height = outputs[0]->height();
-        int output_width = outputs[0]->width();
-
-        Shape stride_in = inputs[0]->get_stride();
-        Shape stride_out = outputs[0]->get_stride();
-
-        int dim_a[] = {input_num, input_channel,
-                       input_height, input_width};
-
-        int dim_b[] = {input_num, output_channel,
-                       output_height, output_width};
-
-        cudnn::setTensorNdDesc<InDataType>(&_input_descs,
-                                            inputs[0]->dims(), dim_a, &stride_in[0]);
-
-        cudnn::setTensorNdDesc<OutDataType>(&_output_descs,
-                                             outputs[0]->dims(), dim_b, &stride_out[0]);
-
-        int windowHeight[] = {pooling_param.window_h, pooling_param.window_w};
-        int padding[] = {pooling_param.pad_h, pooling_param.pad_w};
-
-        int stride[] = {pooling_param.stride_h, pooling_param.stride_w};
-
-        cudnn::set_nd_pooling_des<OpDataType>(&_pooling_descs, pooling_param.pooling_type,
-                                               inputs[0]->dims() - 2, windowHeight,
-                                               padding,stride);
-        return SaberSuccess;
     }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
@@ -123,23 +64,31 @@ class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
                           PoolingParam<OpTensor> &param) {
         const InDataType *in_data = inputs[0]->data();
         OutDataType *out_data = outputs[0]->mutable_data();
-
-        CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs,
-                                        cudnn::cudnnTypeWrapper<InDataType>::kOne(),
-                                        _input_descs, in_data,
-                                        cudnn::cudnnTypeWrapper<OutDataType>::kZero(),
-                                        _output_descs, out_data
-        ));
-
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+        int kh = param.window_h;
+        int kw = param.window_w;
+        int pad_h = param.pad_h;
+        int pad_w = param.pad_w;
+        int stride_h = param.stride_h;
+        int stride_w = param.stride_w;
+        if(_pooling_type == Pooling_max){
+            int is_avg_pooling = 0;
+        } else {
+            int is_avg_pooling = 1;
+        }
+        BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, 
+                            input_n, input_c, input_h, input_w, kh, hw, pad_h, pad_w, 
+                            stride_h, stride_w, is_avg_pooling, 0,
+                            out_data, NULL, NULL));
         return SaberSuccess;
     }
 
 private:
-    cudnnHandle_t _handle;
-    cudnnTensorDescriptor_t _input_descs;
-    cudnnTensorDescriptor_t _output_descs;
-    cudnnPoolingDescriptor_t _pooling_descs;
-
+    bm_handle_t _handle;
+    PoolType _pooling_type;
 };
 
 template class VenderPooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index 04b963675..ce8e7f8f5 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -18,7 +18,7 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
     typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
     int img_num = 1;
     int in_channels = 4;
@@ -71,7 +71,7 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> pooling;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling;
     pooling.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -92,15 +92,12 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     }
 
     output_dev.sync();
-    cudaDeviceSynchronize();
     LOG(INFO) << " average time: " << t1.get_average_ms() << " ms";
     LOG(INFO) << " tile 10% time: " << t1.get_tile_time(10) << " ms";
     LOG(INFO) << " tile 50% time: " << t1.get_tile_time(50) << " ms";
     LOG(INFO) << " tile 90% time: " << t1.get_tile_time(90) << " ms";
     LOG(INFO) << " tile 95% time: " << t1.get_tile_time(95) << " ms";
     LOG(INFO) << " tile 99% time: " << t1.get_tile_time(99) << " ms";
-
-    CUDA_CHECK(cudaPeekAtLastError());
 }
 
 TEST(TestSaberFuncBM, test_pooling_result) {
@@ -113,7 +110,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
     typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
     int img_num = 1;
     int in_channels = 2;
@@ -166,7 +163,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_FLOAT> pooling;
+    Pooling<BM, AK_BM> pooling;
     pooling.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -174,14 +171,9 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     // init assume output tensor has been reshpaed by user.
     pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
     pooling(input, output, param, ctx1);
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
 
     output_dev.sync();
     print_tensor_device(output_dev);
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
 }
 
 TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
@@ -194,7 +186,7 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
     typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
     int img_num = 1;
     int in_channels = 2;
@@ -257,9 +249,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_FLOAT> pooling;
-    Pooling<BM, AK_FLOAT> pooling0;
-    Pooling<BM, AK_FLOAT> pooling1;
+    Pooling<BM, AK_BM> pooling;
+    Pooling<BM, AK_BM> pooling0;
+    Pooling<BM, AK_BM> pooling1;
 
     pooling.compute_output_shape(input,output,  param);
 
@@ -286,19 +278,10 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     pooling1.init(input1, output1, param, SPECIFY, VENDER_IMPL, ctx1);
     pooling1(input1, output1, param, ctx1);
 
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    out0.record_event(cuda_stream);
-
-    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
-    out1.record_event(cuda_stream1);
-
     out0.sync();
     out1.sync();
 
     print_tensor_device(output_dev);
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
 }
 
 int main(int argc, const char** argv) {

From b3e78bf64175766b8e18b0218431b8dbc8c114b0 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 26 Jun 2018 09:44:43 +0800
Subject: [PATCH 054/318] ptr2 should be from buf2

---
 test/saber/bm/test_saber_buffer_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 00f77d308..9910638fb 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -90,7 +90,7 @@ void test_buffer() {
     x86_buf1.sync_copy_from(x86_buf2);
     LOG(INFO) << "deep copy between two host buffer: ";
     const Hdtype* ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
-    const Hdtype* ptr2 = static_cast<const Hdtype*>(x86_buf1.get_data());
+    const Hdtype* ptr2 = static_cast<const Hdtype*>(x86_buf2.get_data());
 
     for (int i = 0; i < 10; i++) {
         std::cout << ptr1[i] << std::endl;

From 82f81aaa793e6c1244b5d9b518ed292021d6ec49 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 26 Jun 2018 10:03:24 +0800
Subject: [PATCH 055/318] Restrict copy_from for different types

---
 saber/core/tensor.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 3051e16c6..32ad81ac3 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -725,7 +725,7 @@ class Tensor : public TensorBase {
 #ifdef USE_BM
     template <typename NewTargetType_t, DataType NewDataType_t, typename NewLayOutType_t>
     SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
-        LOG(INFO) << "base copy_from";
+        LOG(WARNING) << "Invalid: copy_from is not allowed for current type.";
         return SaberInvalidValue;
     }
 #endif
@@ -951,6 +951,8 @@ template<>
 template<> inline
 SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
     LOG(INFO) << "BM copy_from";
+    CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
+
     auto* device_data_ptr = mutable_data();
     BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
     return SaberSuccess;
@@ -960,6 +962,8 @@ template<>
 template<> inline
 SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
     LOG(INFO) << "X86 copy_from";
+    CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
+
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
     BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
     return SaberSuccess;

From 2e84467d981da3d6a9a94ba294a35b65913d4732 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 26 Jun 2018 11:24:19 +0800
Subject: [PATCH 056/318] Implement fill_tensor_device_rand &
 fill_tensor_device_const for BM

No test yet
---
 saber/core/tensor_op.cpp | 109 ++++++++++++---------------------------
 saber/core/tensor_op.h   |  10 ++++
 2 files changed, 44 insertions(+), 75 deletions(-)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 046fef53c..56d8b7244 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -363,98 +363,57 @@ Context<NV> ctx) {
 #endif
 
 
-/*#ifdef USE_BM
+#ifdef USE_BM
 
 template<>
-SaberStatus
-DataTensorTransformHelper::convert_weights<Tensor<X86, AK_INT8, NCHW_C4>,
-                          Tensor<X86, AK_FLOAT, NCHW> >(Tensor<X86, AK_INT8, NCHW_C4>& out_tensor,
-                                  const Tensor<X86, AK_FLOAT, NCHW>& in_tensor,
-Context<BM> ctx) {
-    int input_channel = in_tensor.channel();
-    int output_channel = out_tensor.shape()[1];
-    //            LOG(INFO)<<"input_channel = "<<input_channel<<" output_channel = "<<output_channel;
-    _vector_weight_scale.resize(input_channel);
-
-    int weight_inner_dim = in_tensor.channel()
-                           * in_tensor.height()
-                           * in_tensor.width();
-    const float* in_weight_data = in_tensor.data();
-
-    for (int c = 0; c < input_channel; ++c) {
-        float max_val = -1.f;
-
-        for (int i = 0; i < weight_inner_dim; ++i) {
-            float read_data = fabs(in_weight_data[i]);
-            max_val = (read_data > max_val) ? read_data : max_val;
-        }
+void fill_tensor_device_rand<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor, \
+    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {
 
-        _vector_weight_scale[c] = max_val / 127.f;
-        in_weight_data += weight_inner_dim;
-        //                LOG(INFO)<<"max_val = "<<max_val<<" vector: "<<max_val / 127.f;
+    float *host_mem_input = new float[tensor.size()];
+    for (int i = 0; i < tensor.size(); ++i) {
+        host_mem_input[i] = static_cast<float>(rand());
     }
 
-    int o_num = out_tensor.num();
-    int o_channel = output_channel;
-    int o_height = out_tensor.height();
-    int o_width = out_tensor.width();
-
-    int out_n_stride = o_channel * o_height * o_width;
-    int out_c_stride = o_height * o_width;
-    int out_h_stride = o_width;
-
-    Shape in_stride = in_tensor.get_stride();
-
-    in_weight_data = in_tensor.data();
-    char* out_weight_data = out_tensor.mutable_data();
+    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
-    for (int idx = 0; idx < o_num * o_channel * o_height * o_width; ++idx) {
+    delete [] host_mem_input;
+}
 
-        int n = (idx / (out_n_stride)) % o_num;
-        int in_offset = ((idx / (out_n_stride)) % o_num) * in_stride[0]
-                        + ((idx / (out_c_stride)) % o_channel) * (in_stride[1] * 4)
-                        + ((idx / (out_h_stride)) % o_height) * in_stride[2]
-                        + (idx % o_width) * in_stride[3];
+void fill_tensor_device_rand(Tensor<BM, AK_BM, NCHW>& tensor, float vstart, \
+    float vend, typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL){
 
-        int out_offset = ((idx / (out_n_stride)) % o_num) * out_n_stride
-                         + ((idx / (out_c_stride)) % o_channel) * out_c_stride
-                         + ((idx / (out_h_stride)) % o_height) * out_h_stride
-                         + (idx % o_width);
-        out_weight_data[out_offset * 4 + 0] = (char)(round(
-                in_weight_data[in_offset + 0 * in_stride[1]] / _vector_weight_scale[n]));
-        out_weight_data[out_offset * 4 + 1] = (char)(round(
-                in_weight_data[in_offset + 1 * in_stride[1]] / _vector_weight_scale[n]));
-        out_weight_data[out_offset * 4 + 2] = (char)(round(
-                in_weight_data[in_offset + 2 * in_stride[1]] / _vector_weight_scale[n]));
-        out_weight_data[out_offset * 4 + 3] = (char)(round(
-                in_weight_data[in_offset + 3 * in_stride[1]] / _vector_weight_scale[n]));
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(0, 1.f);
 
+    float *host_mem_input = new float[tensor.size()];
+    for (int i = 0; i < tensor.size(); ++i) {
+        float random_num = vstart + (vend - vstart) * dis(gen);
+        host_mem_input[i] = random_num;
     }
 
-    return SaberSuccess;
+    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+
+    delete [] host_mem_input;
 }
-template<>
-SaberStatus
-DataTensorTransformHelper::convert_bias<Tensor<X86, AK_FLOAT, NCHW>,
-                          Tensor<X86, AK_FLOAT, NCHW> >(Tensor<X86, AK_FLOAT, NCHW>& out_tensor,
-                                  const Tensor<X86, AK_FLOAT, NCHW>& in_tensor,
-Context<BM> ctx) {
-    unsigned long weight_size = _vector_weight_scale.size();
-    unsigned long bias_size = in_tensor.size();
-    CHECK_GT(_in_scale, 0);
-    CHECK_GT(weight_size, 0);
-    CHECK_EQ(bias_size, weight_size);
 
-    const float* in_data = in_tensor.data();
-    float* out_data = out_tensor.mutable_data();
+void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
+    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL){
 
-    for (int i = 0; i < bias_size; ++i) {
-        out_data[i] = in_data[i] / _in_scale / _vector_weight_scale[i];
+    float *host_mem_input = new float[tensor.size()];
+    for (int i = 0; i < tensor.size(); ++i) {
+        host_mem_input[i] = value;
     }
 
-    return SaberSuccess;
+    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+
+    delete [] host_mem_input;
 }
-#endif*/
+
+#endif
 
 } //namespace saber
 
diff --git a/saber/core/tensor_op.h b/saber/core/tensor_op.h
index 166d8f32b..c4d7a7661 100644
--- a/saber/core/tensor_op.h
+++ b/saber/core/tensor_op.h
@@ -171,6 +171,16 @@ class DataTensorTransformHelper{
 
 #endif
 
+#ifdef USE_BM
+
+void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
+    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL);
+
+void fill_tensor_device_rand(Tensor<BM, AK_BM, NCHW>& tensor, float vstart, \
+    float vend, typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL);
+
+#endif
+
 } // namespace saber
 
 } // namespace anakin

From d9e9669c9ffcd1cf9e9c45400f838bd0eefdf5c4 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 11:51:04 +0800
Subject: [PATCH 057/318] get handle directly by calling get_handler()

---
 saber/core/context.h           | 5 -----
 saber/core/impl/bm/bm_impl.cpp | 4 ----
 saber/core/target_wrapper.h    | 2 --
 saber/core/tensor.h            | 4 ++--
 4 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/saber/core/context.h b/saber/core/context.h
index 15ec2e0b6..1667f36e0 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -111,11 +111,6 @@ class Context final{
         return _stream_compute;
     }
 
-#ifdef USE_BM
-    bm_handle_t get_handler() {
-        return API::get_handler();
-    }
-#endif
 
 #ifdef USE_ARM_PLACE
     void set_power_mode(PowerMode mode);
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index dacca58b6..d2790d0a9 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -41,10 +41,6 @@ typedef TargetWrapper<BM, __device_target> BM_API;
 //static bm_handle_t handle = get_bm_handle();
 static bm_handle_t handle;
 
-bm_handle_t BM_API::get_handler() {
-    return handle;
-}
-
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
 }
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 2a2a4be88..6e6f67b55 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -414,8 +414,6 @@ struct TargetWrapper<BM, __device_target> {
      * @return          currently activated device id
      */
     static int get_device_id();
-
-    static bm_handle_t get_handler();
 };
 
 #endif //USE_BM
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 32ad81ac3..d8a319cd5 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -954,7 +954,7 @@ SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
 
     auto* device_data_ptr = mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
     return SaberSuccess;
 }
 
@@ -965,7 +965,7 @@ SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
 
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
-    BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
+    BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
     return SaberSuccess;
 }
 

From 1e75380499f3d67d32e2a6a8f1a731c13c6ef5b9 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Tue, 26 Jun 2018 12:38:48 +0800
Subject: [PATCH 058/318] modify pooling, test failed

---
 saber/funcs/impl/bm/vender_pooling.h         | 50 +++++++-------------
 saber/funcs/pooling.h                        |  4 ++
 test/saber/bm/test_saber_func_pooling_BM.cpp |  6 ---
 3 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index 0da1a1106..b857eacdd 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -1,23 +1,7 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
-#define ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H
 
 #include "saber/funcs/impl/impl_pooling.h"
-#include "saber/funcs/impl/cuda/cudnn_helper.h"
 
 namespace anakin{
 
@@ -29,12 +13,12 @@ template <DataType OpDtype ,
     typename LayOutType_op,
     typename LayOutType_in,
     typename LayOutType_out>
-class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
  public ImplBase<
-    Tensor<NV, inDtype, LayOutType_in>, 
-    Tensor<NV, outDtype, LayOutType_out>,
-    Tensor<NV, OpDtype, LayOutType_op>,
-    PoolingParam<Tensor<NV, OpDtype, LayOutType_op>>> {
+    Tensor<BM, inDtype, LayOutType_in>, 
+    Tensor<BM, outDtype, LayOutType_out>,
+    Tensor<BM, OpDtype, LayOutType_op>,
+    PoolingParam<Tensor<BM, OpDtype, LayOutType_op>>> {
 public:
     typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
     typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
@@ -62,8 +46,8 @@ class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
                           std::vector<DataTensor_out*>& outputs,
                           PoolingParam<OpTensor> &param) {
-        const InDataType *in_data = inputs[0]->data();
-        OutDataType *out_data = outputs[0]->mutable_data();
+        const InDataType in_data = *(inputs[0]->data());
+        OutDataType out_data = *(outputs[0]->mutable_data());
         int input_n = inputs[0]->num();
         int input_c = inputs[0]->channel();
         int input_h = inputs[0]->height();
@@ -74,27 +58,29 @@ class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
         int pad_w = param.pad_w;
         int stride_h = param.stride_h;
         int stride_w = param.stride_w;
+        int is_avg_pooling;
         if(_pooling_type == Pooling_max){
-            int is_avg_pooling = 0;
+            is_avg_pooling = 0;
         } else {
-            int is_avg_pooling = 1;
+            is_avg_pooling = 1;
         }
+        _handle = get_bm_handle();
         BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, 
-                            input_n, input_c, input_h, input_w, kh, hw, pad_h, pad_w, 
+                            input_n, input_c, input_h, input_w, kh, kw, pad_h, pad_w, 
                             stride_h, stride_w, is_avg_pooling, 0,
-                            out_data, NULL, NULL));
+                            out_data, bm_mem_null, bm_mem_null));
         return SaberSuccess;
     }
 
 private:
     bm_handle_t _handle;
-    PoolType _pooling_type;
+    PoolingType _pooling_type;
 };
 
-template class VenderPooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+template class VenderPooling<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
 
 } //namespace saber
 
 } // namespace anakin
 
-#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H
diff --git a/saber/funcs/pooling.h b/saber/funcs/pooling.h
index 09ab8029e..739d05851 100644
--- a/saber/funcs/pooling.h
+++ b/saber/funcs/pooling.h
@@ -27,6 +27,10 @@
 #include "saber/funcs/impl/x86/saber_pooling.h"
 #endif
 
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_pooling.h"
+#endif
+
 namespace anakin {
 namespace saber {
 
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index ce8e7f8f5..2a490c588 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -12,8 +12,6 @@ TEST(TestSaberFuncBM, test_func_pooling) {
 
     Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
 
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
@@ -104,8 +102,6 @@ TEST(TestSaberFuncBM, test_pooling_result) {
 
     Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
 
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
@@ -180,8 +176,6 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
 
     Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
 
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;

From 21b23796d4411c61c8a6f1d46578116ebf12139d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 13:18:23 +0800
Subject: [PATCH 059/318] Implement print_tensor_device for BM

---
 saber/core/tensor_op.cpp | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 56d8b7244..841c9c208 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -413,6 +413,42 @@ void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
     delete [] host_mem_input;
 }
 
+template <>
+void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor,  \
+    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {
+
+    LOG(INFO) << "BM device tensor data:" << tensor.size();
+
+    /*
+    const bm_device_mem_t* device_data_ptr = tensor.data();
+    unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
+    bm_flush(get_bm_handle());
+    float* device_data = (float*)bm_get_global_addr(gaddr);
+
+    for (int i = 0; i < tensor.size(); ++i) {
+        printf("%.2f ", device_data[i]);
+
+        if ((i + 1) % (4 * tensor.width()) == 0) {
+            printf("\n");
+        }
+    }*/
+
+    float *host_mem = new float[tensor.size()];
+    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+    bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
+
+    for (int i = 0; i < tensor.size(); ++i) {
+        printf("%.2f ", host_mem[i]);
+
+        if ((i + 1) % (4 * tensor.width()) == 0) {
+            printf("\n");
+        }
+    }
+    printf("\n");
+
+    delete [] host_mem;
+}
+
 #endif
 
 } //namespace saber

From b5583187c7f647f50ed05fd562f3c4b8e04a21e5 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 13:25:48 +0800
Subject: [PATCH 060/318] Update BM tensor test

---
 saber/core/tensor_op.cpp               | 2 ++
 test/saber/bm/test_saber_tensor_BM.cpp | 9 ++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 841c9c208..72de1d0b3 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -434,6 +434,8 @@ void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tenso
     }*/
 
     float *host_mem = new float[tensor.size()];
+    bm_flush(get_bm_handle());
+
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
     bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index d42665528..dfd8d90c9 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
-
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
 
@@ -65,17 +64,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     // host to device
     tdev1.copy_from(thost0);
-    //TODO: print tensor for BM device
-    //print_tensor_host(tdev1);
+    print_tensor_device(tdev1);
 
     // device to host
     thost1.copy_from(tdev1);
     print_tensor_host(thost1);
 
-    /*
-    // device to device
+    //device to device
     tdev1.copy_from(tdev0);
+    print_tensor_device(tdev1);
 
+    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From 99493a44021b5468d8400e7a0e9373ae4d9f4464 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Tue, 26 Jun 2018 13:38:28 +0800
Subject: [PATCH 061/318] fix pooling api error

---
 saber/funcs/impl/bm/vender_pooling.h |  3 +--
 saber/funcs/pooling.h                | 11 +++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index b857eacdd..108a70708 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -67,8 +67,7 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
         _handle = get_bm_handle();
         BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, 
                             input_n, input_c, input_h, input_w, kh, kw, pad_h, pad_w, 
-                            stride_h, stride_w, is_avg_pooling, 0,
-                            out_data, bm_mem_null, bm_mem_null));
+                            stride_h, stride_w, is_avg_pooling, out_data));
         return SaberSuccess;
     }
 
diff --git a/saber/funcs/pooling.h b/saber/funcs/pooling.h
index 739d05851..aff883505 100644
--- a/saber/funcs/pooling.h
+++ b/saber/funcs/pooling.h
@@ -34,6 +34,16 @@
 namespace anakin {
 namespace saber {
 
+#ifdef USE_BM
+template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_BM,
+        DataType outDtype = AK_BM,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#else
 template<typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -42,6 +52,7 @@ template<typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
+#endif
 class Pooling : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,

From 1f02e147afea710c19869c523b73e8086436ed5e Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 14:50:10 +0800
Subject: [PATCH 062/318] Update pooling test

---
 test/saber/bm/test_saber_func_pooling_BM.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index 2a490c588..944ab6a18 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -9,8 +9,6 @@
 using namespace anakin::saber;
 
 TEST(TestSaberFuncBM, test_func_pooling) {
-
-    Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef TargetWrapper<X86> X86_API;
@@ -42,6 +40,8 @@ TEST(TestSaberFuncBM, test_func_pooling) {
 
     // start Reshape & doInfer
 
+    LOG(INFO) << "init env...";
+    Env<BM>::env_init();
     Context<BM> ctx1(0, 1, 1);
     int window_h = 2;
     int window_w = 2;
@@ -279,6 +279,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
 }
 
 int main(int argc, const char** argv) {
+    //TODO: init in another place
+    static bm_handle_t handle;
+    bmdnn_init(&handle);
     // initial logger
     //logger::init(argv[0]);
     InitTest();

From a1e82149076024dc85006d4cf8794db96069b06f Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 15:01:16 +0800
Subject: [PATCH 063/318] Skip context init for BM

---
 saber/core/context.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/saber/core/context.h b/saber/core/context.h
index 1667f36e0..a661cce46 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -17,6 +17,7 @@
 
 #include "core/env.h"
 #include "saber/saber_types.h"
+#include <type_traits>
 
 #ifdef USE_BM
 #include "bmlib_runtime.h"
@@ -40,6 +41,11 @@ class Context final{
      * @param compute_stream_id
      */
     Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){
+        if(std::is_same<TargetType, BM>::value){
+            LOG(INFO) << "context init for BM";
+            return;
+        }
+
         CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!";
         if (device_id >= devs.size()){
             LOG(WARNING) << "device index exceeds the number of devices, set to default device(0)!";
@@ -63,6 +69,11 @@ class Context final{
     }
 
     Context(const Context<TargetType>& ctx){
+        if(std::is_same<TargetType, BM>::value){
+            LOG(INFO) << "context init for BM";
+            return;
+        }
+
         _device_id = ctx._device_id;
         _data_stream_id = ctx._data_stream_id;
         _compute_stream_id = ctx._compute_stream_id;

From b1b9f7c920617d023783b47e9227c14fba7b3b32 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 15:09:02 +0800
Subject: [PATCH 064/318] remove flush action in print

---
 saber/core/tensor_op.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 72de1d0b3..841c9c208 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -434,8 +434,6 @@ void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tenso
     }*/
 
     float *host_mem = new float[tensor.size()];
-    bm_flush(get_bm_handle());
-
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
     bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
 

From 27517ca93eb72070d13ff220c8072bdab5640080 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 16:09:58 +0800
Subject: [PATCH 065/318] ignore set_device for BM for now

---
 saber/core/impl/bm/bm_impl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index d2790d0a9..fa51bf2d7 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -47,7 +47,7 @@ void BM_API::get_device_count(int &count) {
 
 void BM_API::set_device(int id){
     //(bm_handle_t &handle, bool bmkernel_used, int id){
-    BMDNN_CHECK(bm_dev_request(&handle, 0, id));
+    //BMDNN_CHECK(bm_dev_request(&handle, 0, id));
 }
 
 //TODO: Do we have this functionality?

From 949c4c49fc359a54ad87eec17502e307b1715a4d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 16:26:19 +0800
Subject: [PATCH 066/318] Update logs for copy_from

---
 saber/core/tensor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index d8a319cd5..af3495b1f 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -950,7 +950,7 @@ size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
 template<>
 template<> inline
 SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
-    LOG(INFO) << "BM copy_from";
+    LOG(INFO) << "BM copy_from X86";
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
 
     auto* device_data_ptr = mutable_data();
@@ -961,7 +961,7 @@ SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor
 template<>
 template<> inline
 SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
-    LOG(INFO) << "X86 copy_from";
+    LOG(INFO) << "X86 copy_from BM";
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
 
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());

From 51f0f2b4df677e5bad5ccfd5eb057f90dc4d423d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 17:42:26 +0800
Subject: [PATCH 067/318] Initialize bm handle only in one place

---
 saber/core/impl/bm/bm_impl.cpp               | 4 ++--
 test/saber/bm/test_TargetWrapper_BM.cpp      | 6 +++---
 test/saber/bm/test_saber_buffer_BM.cpp       | 4 ----
 test/saber/bm/test_saber_func_pooling_BM.cpp | 3 ---
 test/saber/bm/test_saber_tensor_BM.cpp       | 4 ----
 5 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index fa51bf2d7..60e52088e 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,9 +37,9 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
-//TODO: check exception
-//static bm_handle_t handle = get_bm_handle();
+// Init handle only once in the lifetime
 static bm_handle_t handle;
+static bm_status_t init_handle{bmdnn_init(&handle)};
 
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index b893183a2..8de77498a 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -4,9 +4,9 @@
 
 #ifdef USE_BM
 using namespace anakin::saber;
-static bm_handle_t handle;
+//static bm_handle_t handle;
 int main() {
-    bmdnn_init(&handle);
+    //bmdnn_init(&handle);
     typedef TargetWrapper<BM> API;
     //int dev_count = 0;
     //API::get_device_count(dev_count);
@@ -20,7 +20,7 @@ int main() {
     std::cout << "Start mem_free test." << std::endl;
     API::mem_free(pmem);
     std::cout << "End mem_free test." << std::endl;
-    bmdnn_deinit(handle);
+    //bmdnn_deinit(handle);
 }
 #endif
 
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 9910638fb..dce1fae15 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -132,10 +132,6 @@ TEST(TestSaberBufferBM, test_buffer_memcpy) {
 }
 
 int main(int argc, const char** argv) {
-    //TODO: init in another place
-    static bm_handle_t handle;
-    bmdnn_init(&handle);
-
     // initial logger
     logger::init(argv[0]);
     InitTest();
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index 944ab6a18..e988bc573 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -279,9 +279,6 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
 }
 
 int main(int argc, const char** argv) {
-    //TODO: init in another place
-    static bm_handle_t handle;
-    bmdnn_init(&handle);
     // initial logger
     //logger::init(argv[0]);
     InitTest();
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index dfd8d90c9..2dcd61c41 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -624,10 +624,6 @@ TEST(TestSaberTensorBM, test_tensor_base_type) {
 }*/
 
 int main(int argc, const char** argv) {
-    //TODO: init in another place
-    static bm_handle_t handle;
-    bmdnn_init(&handle);
-
     // initial logger
     logger::init(argv[0]);
     InitTest();

From 1fe4f195b8d736a78e1f8dbd131640de8342d070 Mon Sep 17 00:00:00 2001
From: lian <327842846@qq.com>
Date: Tue, 26 Jun 2018 10:46:30 +0000
Subject: [PATCH 068/318] chage tensor type_len

---
 saber/core/target_wrapper.h                   |  11 +
 saber/core/tensor.h                           |   7 +-
 test/framework/core/base_types_test.cpp       | 143 ----
 test/framework/core/core_test.h               |  46 --
 test/framework/graph/graph_base_test.cpp      |  82 --
 .../graph/graph_parser_from_model_test.cpp    |  88 ---
 test/framework/graph/graph_test.h             |  47 --
 test/framework/net/benchmark.cpp              | 162 ----
 test/framework/net/chinese_ner_test.cpp       | 213 -----
 test/framework/net/model_test.cpp             | 175 -----
 .../net/net_exec_multi_thread_test.cpp        | 149 ----
 test/framework/net/net_exec_test.cpp          | 273 -------
 test/framework/net/net_test.h                 |  98 ---
 test/framework/net/padde_api_test.cpp         | 121 ---
 test/framework/net/paddle_api.h               |  87 ---
 test/framework/operators/operator_tests.h     |  47 --
 test/framework/operators/pooling_test.cpp     |  43 --
 test/saber/bm/test_saber_buffer_BM.cpp        | 126 ---
 test/saber/bm/test_saber_buffer_BM.h          |  20 -
 test/saber/bm/test_saber_context_BM.cpp       |  28 -
 test/saber/bm/test_saber_context_BM.h         |  21 -
 test/saber/bm/test_saber_device_BM.cpp        |  20 -
 test/saber/bm/test_saber_device_BM.h          |  21 -
 test/saber/bm/test_saber_func_BM.h            |  38 -
 .../bm/test_saber_func_activation_BM.cpp      |  88 ---
 test/saber/bm/test_saber_func_conv_BM.cpp     | 725 ------------------
 test/saber/bm/test_saber_func_fc_BM.cpp       | 146 ----
 test/saber/bm/test_saber_func_pooling_BM.cpp  | 311 --------
 test/saber/bm/test_saber_shape_BM.cpp         | 126 ---
 test/saber/bm/test_saber_shape_BM.h           |  25 -
 test/saber/bm/test_saber_tensor_BM.cpp        |  40 +-
 31 files changed, 43 insertions(+), 3484 deletions(-)
 delete mode 100644 test/framework/core/base_types_test.cpp
 delete mode 100644 test/framework/core/core_test.h
 delete mode 100644 test/framework/graph/graph_base_test.cpp
 delete mode 100644 test/framework/graph/graph_parser_from_model_test.cpp
 delete mode 100644 test/framework/graph/graph_test.h
 delete mode 100644 test/framework/net/benchmark.cpp
 delete mode 100644 test/framework/net/chinese_ner_test.cpp
 delete mode 100644 test/framework/net/model_test.cpp
 delete mode 100644 test/framework/net/net_exec_multi_thread_test.cpp
 delete mode 100644 test/framework/net/net_exec_test.cpp
 delete mode 100644 test/framework/net/net_test.h
 delete mode 100644 test/framework/net/padde_api_test.cpp
 delete mode 100644 test/framework/net/paddle_api.h
 delete mode 100644 test/framework/operators/operator_tests.h
 delete mode 100644 test/framework/operators/pooling_test.cpp
 delete mode 100644 test/saber/bm/test_saber_buffer_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_buffer_BM.h
 delete mode 100644 test/saber/bm/test_saber_context_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_context_BM.h
 delete mode 100644 test/saber/bm/test_saber_device_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_device_BM.h
 delete mode 100644 test/saber/bm/test_saber_func_BM.h
 delete mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_pooling_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_shape_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_shape_BM.h

diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 1f283a004..c1325f7fb 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -368,6 +368,15 @@ struct TargetWrapper<NV, __device_target> {
 */
 template <>
 struct TargetWrapper<BM, __device_target> {
+//    TargetWrapper<BM, __device_target> ()
+//    {
+//        CHECK_EQ(bmdnn_init(&handle),BM_SUCCESS) << "Error:bmdnn_init failed";
+//    }
+//    ~TargetWrapper<BM, __device_target> ()
+//    {
+//        CHECK_EQ(bmdnn_deinit(handle),BM_SUCCESS) << "Error:bmdnn_deinit failed";
+//    }
+
     typedef void* event_t;
     typedef void* stream_t;
 
@@ -418,6 +427,8 @@ struct TargetWrapper<BM, __device_target> {
     static int get_device_id();
 
     static bm_handle_t get_handler();
+    
+//    bm_handle_t handle;
 };
 
 #endif //USE_BM
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 3051e16c6..945c46d00 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -941,10 +941,11 @@ class Tensor : public TensorBase {
 };
 
 #ifdef USE_BM
-
+#ifndef BM_TENSOR_COPY
+#define BM_TENSOR_COPY
 template<> inline
 size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
-    return 1;
+    return 4;
 }
 
 template<>
@@ -964,7 +965,7 @@ SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor
     BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
     return SaberSuccess;
 }
-
+#endif
 #endif
 
 } //namespace saber
diff --git a/test/framework/core/base_types_test.cpp b/test/framework/core/base_types_test.cpp
deleted file mode 100644
index 0109493bf..000000000
--- a/test/framework/core/base_types_test.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-#include "core_test.h"
-#include "any.h"
-#include "singleton.h"
-#include "tls.h"
-#include "parameter.h"
-#include "thread_pool.h"
-
-#ifdef USE_CUDA
-#include "cuda_funcs.h"
-#include "sass_funcs.h"
-#endif
-
-#include "tensor.h"
-
-#ifdef USE_CUDA
-TEST(CoreComponentsTest, sass_test) {
-    LOG(INFO) << "test for cuda code function";
-    //anakin::saber::Tensor<3, RTCUDA, float, NCHW> ts;
-    //LOG(WARNING) << " tensor num " << ts.num();
-    //ts.set_offset(8);
-    //my_print();
-    LOG(INFO) << "test for sass code function 1";
-    invoke_test();
-    LOG(INFO) << "test for sass code function 2";
-    invoke_test_2();
-}
-#endif
-
-TEST(CoreComponentsTest, core_base_types_any_test) {
-    LOG(INFO) << "test for any class .";
-    LOG(WARNING) << " level 1 : base type int (set 42 to any)";
-    const int a = 42;
-    any any_a(42);
-    int result_a = any_cast<int>(any_a);
-
-    LOG(INFO) << "casted result : " <<  result_a;
-    LOG(WARNING) << " level 2 : base type float (set 42.8 to any)";
-    float b = 42.8;
-    any any_b = b;
-    float result_b = any_cast<float>(any_b);
-    LOG(INFO) << "casted result : " <<  result_b << " decide: ";
-
-    LOG(WARNING) << " level 3 : ptuple type (set PTuple<float> to any)";
-    PTuple<float> p_tuple_float(3.2f, 3.3f, 3.5f);
-    p_tuple_float.push_back(4.3); // push_back
-
-    any p_tuple_float_any = p_tuple_float;
-    auto result_p_tuple_float_any = any_cast<PTuple<float>>(p_tuple_float_any);
-
-    for (int i = 0; i < result_p_tuple_float_any.size(); i++) {
-        LOG(INFO) << " any casted PTuple<float>[" << i << "]: " << result_p_tuple_float_any[i];
-    }
-
-    struct target {
-        void print() {
-            LOG(INFO) << " target struct Successfully recovered.";
-        }
-    };
-
-    LOG(WARNING) << " level 5 : struct type";
-
-    target tg;
-
-    any any_tg = tg;
-
-    target result_tg = any_cast<target>(any_tg);
-
-    result_tg.print();
-
-    LOG(WARNING) << " level other : struct type";
-
-    any any_tg_copy = any_tg;
-
-    target result_tg_copy = any_cast<target>(any_tg);
-
-    result_tg_copy.print();
-}
-
-void at_exit_in_test() {
-    LOG(WARNING) << "core_base_types_singleton_test exit successfully!";
-}
-
-TEST(CoreComponentsTest, core_base_types_singleton_test) {
-    struct target {
-        target() {
-            LOG(INFO) << " singleton target constructed";
-        }
-    };
-    typedef Singleton<target, at_exit_in_test> sg_target;
-    sg_target::Global();
-}
-
-typedef AnakinThreadLocalVar<int> sg_tls;
-void thread_func_0() {
-    int* tmp = sg_tls::value();
-    *tmp = 3;
-    LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value());
-}
-void thread_func_1() {
-    int* tmp = sg_tls::value();
-    *tmp = 4;
-
-    LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value());
-}
-TEST(CoreComponentsTest, core_base_types_tls_test) {
-    LOG(INFO) << " Create tls var 0 , check in two thread.";
-    std::thread first(thread_func_0);
-    std::thread sec(thread_func_1);
-    first.join();
-    sec.join();
-    LOG(INFO) << " main thread var: " << *(sg_tls::value());
-}
-
-int thread_pool_func(int i) {
-    LOG(INFO) << " thread_pool_func input : " << i;
-    //std::this_thread::sleep_for(std::chrono::seconds(0));
-    return i;
-}
-
-TEST(CoreComponentsTest, core_base_types_thread_pool_test) {
-    LOG(INFO) << " Create thread pool with thread num = 12 ";
-    ThreadPool thread_pool_test(100);
-    thread_pool_test.launch();
-    std::function<int(int)> test = thread_pool_func;
-
-    for (int i = 0; i < 50; i++) {
-        // run async
-        auto ret = thread_pool_test.RunAsync(test, i);
-        LOG(INFO) << " return : " << ret.get();
-
-        // run sync
-        //auto sync_ret = thread_pool_test.RunSync(test, i);
-    }
-}
-
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/framework/core/core_test.h b/test/framework/core/core_test.h
deleted file mode 100644
index 6107eef4b..000000000
--- a/test/framework/core/core_test.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_GRAPH_TEST_H
-#define ANAKIN_GRAPH_TEST_H
-
-#include <iostream>
-#include <string>
-#include <thread>
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-
-using namespace anakin;
-using ::anakin::test::Test;
-
-class CoreComponentsTest : public Test {
-public:
-    CoreComponentsTest(){}
-
-    void SetUp(){}
-
-    void TearDown(){}
-
-protected:
-};
-
-
-
-
-
-
-#endif
-
-
diff --git a/test/framework/graph/graph_base_test.cpp b/test/framework/graph/graph_base_test.cpp
deleted file mode 100644
index d42e86c02..000000000
--- a/test/framework/graph/graph_base_test.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-#include <string>
-#include "graph_test.h"
-#include "graph_base.h"
-
-using namespace anakin;
-using namespace anakin::graph;
-
-//! Usage sample
-class GraphTestClass : public GraphBase<std::string, int, int> {
-public:
-    GraphTestClass() {}
-    ~GraphTestClass() {}
-    virtual bool directed() {
-        return true;
-    };
-};
-class edge : public Arc<std::string, int> {
-public:
-    edge(std::string btm, std::string top, int weight): Arc<std::string, int>(btm, top, weight) {}
-    ~edge() {}
-};
-
-TEST(GraphTest, graph_base_test) {
-    LOG(INFO) << "test for graph base .";
-
-    GraphTestClass graph;
-    graph.add_vertex("a", 42);
-    graph.add_vertex("b", 43);
-    graph.add_vertex("c", 44);
-    graph.add_vertex("d", 45);
-    graph.add_vertex("e", 46);
-    graph.add_vertex("f", 47);
-
-    edge arc0("a", "b", 0);
-    edge arc1("b", "c", 1);
-    edge arc2("c", "d", 2);
-    edge arc3("d", "e", 3);
-    edge arc4("e", "f", 4);
-    edge arc5("f", "a", 5);
-
-    graph.add_in_arc(arc0);
-    graph.add_in_arc(arc1);
-    graph.add_in_arc(arc2);
-    graph.add_in_arc(arc3);
-    graph.add_in_arc(arc4);
-    graph.add_in_arc(arc5);
-    graph.add_out_arc(arc0);
-    graph.add_out_arc(arc1);
-    graph.add_out_arc(arc2);
-    graph.add_out_arc(arc3);
-    graph.add_out_arc(arc4);
-    graph.add_out_arc(arc5);
-
-    LOG(WARNING) << "Construction of graph.";
-    LOG(INFO) << graph.to_string();
-
-    LOG(WARNING) << "Remove a from graph.";
-    graph.remove("a");
-    LOG(INFO) << graph.to_string();
-
-    LOG(WARNING) << "Add arc: f->b to graph.";
-    edge arc_f_b("f", "b", 10);
-    graph.add_in_arc(arc_f_b);
-    graph.add_out_arc(arc_f_b);
-    LOG(INFO) << graph.to_string();
-
-    LOG(WARNING) << "Add vertex:a and arc: a->e to graph.";
-    graph.add_vertex("a", 47);
-    edge arc_a_e("a", "e", 10);
-    graph.add_out_arc(arc_a_e);
-    graph.add_in_arc(arc_a_e);
-    LOG(INFO) << graph.to_string();
-}
-
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/framework/graph/graph_parser_from_model_test.cpp b/test/framework/graph/graph_parser_from_model_test.cpp
deleted file mode 100644
index 883a12858..000000000
--- a/test/framework/graph/graph_parser_from_model_test.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-#include <string>
-#include "graph_test.h"
-#include "graph_base.h"
-#include "graph.h"
-#include "scheduler.h"
-
-using namespace anakin;
-using namespace anakin::graph;
-
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/google_net/googlenet.anakin.bin";
-std::string model_path = "/home/chaowen/anakin_v2/model_v2/yolo/yolo.anakin.bin";
-
-
-TEST(GraphTest, graph_load_model) {
-    /*Graph<ARM, float, Precision::FP32>* graph = new Graph<ARM, float, Precision::FP32>();
-    LOG(WARNING) << "load anakin model file from " << model_path << " ...";
-    // load anakin model files.
-    graph->load(model_path);
-
-    DLOG(INFO) << graph->to_string();
-    // exec optimization
-    graph->Optimize();  */
-}
-
-#ifdef USE_CUDA
-TEST(GraphTest, nvidia_graph_save_model) {
-    Graph<NV, AK_FLOAT, Precision::FP32>* graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-    // load anakin model files.
-    LOG(INFO) << "load anakin model file from " << model_path << " ...";
-    graph->load(model_path);
-
-    // regisiter output tensor
-    //graph->RegistOut("data_perm",  "data_scale");
-
-    //  exec optimization
-    graph->Optimize();
-
-    // save the optimized model to disk.
-    std::string save_model_path = model_path + std::string(".saved");
-    Status status = graph->save(save_model_path);
-}
-#endif
-
-#ifdef USE_X86_PLACE
-TEST(GraphTest, x86_graph_save_model) {
-    Graph<X86, AK_FLOAT, Precision::FP32>* graph = new Graph<X86, AK_FLOAT, Precision::FP32>();
-    // load anakin model files.
-    LOG(INFO) << "load anakin model file from " << model_path << " ...";
-    graph->load(model_path);
-
-    // regisiter output tensor
-    //graph->RegistOut("data_perm",  "data_scale");
-
-    //  exec optimization
-    graph->Optimize();
-
-    // save the optimized model to disk.
-    std::string save_model_path = model_path + std::string(".saved");
-    Status status = graph->save(save_model_path);
-}
-#endif
-
-#ifdef USE_ARM_PLACE
-TEST(GraphTest, arm_graph_save_model) {
-    Graph<ARM, AK_FLOAT, Precision::FP32>* graph = new Graph<ARM, AK_FLOAT, Precision::FP32>();
-    // load anakin model files.
-    LOG(INFO) << "load anakin model file from " << model_path << " ...";
-    graph->load(model_path);
-
-    // regisiter output tensor
-    //graph->RegistOut("data_perm",  "data_scale");
-
-    //  exec optimization
-    graph->Optimize();
-
-    // save the optimized model to disk.
-    std::string save_model_path = model_path + std::string(".saved");
-    Status status = graph->save(save_model_path);
-}
-#endif
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/framework/graph/graph_test.h b/test/framework/graph/graph_test.h
deleted file mode 100644
index db837c84a..000000000
--- a/test/framework/graph/graph_test.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_GRAPH_TEST_H
-#define ANAKIN_GRAPH_TEST_H
-
-#include <iostream>
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-
-using namespace anakin;
-using ::anakin::test::Test;
-
-/**
- * \brief Graph test is base Test class for anakin graph funciton.  
- */
-class GraphTest: public Test {
-public:
-    GraphTest(){}
-
-    void SetUp(){}
-
-    void TearDown(){}
-
-protected:
-};
-
-
-
-
-
-
-#endif
-
-
diff --git a/test/framework/net/benchmark.cpp b/test/framework/net/benchmark.cpp
deleted file mode 100644
index 41c31c83e..000000000
--- a/test/framework/net/benchmark.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-#include <string>
-#include "net_test.h"
-#include "saber/funcs/timer.h"
-#include <chrono>
-#include "saber/core/tensor_op.h"
-#include <dirent.h> 
-#include <sys/stat.h> 
-#include <sys/types.h> 
-#include <unistd.h>  
-#include <fcntl.h>
-#include <map>
-
-#ifdef USE_GFLAGS
-#include <gflags/gflags.h>
-
-DEFINE_string(model_dir, "", "model dir");
-DEFINE_string(model_file, "", "model file");
-DEFINE_int32(num, 1, "batchSize");
-DEFINE_int32(warmup_iter, 10, "warm up iterations");
-DEFINE_int32(epoch, 1000, "time statistic epoch");
-#else
-std::string FLAGS_model_dir;
-std::string FLAGS_model_file;
-int FLAGS_num = 1;
-int FLAGS_warmup_iter = 10;
-int FLAGS_epoch = 1000;
-#endif
-
-#ifdef USE_CUDA
-typedef NV Target;
-#elif defined(USE_X86_PLACE)
-typedef X86 Target;
-#else
-typedef ARM Target;
-#endif
-
-void getModels(std::string path, std::vector<std::string>& files) {
-    DIR *dir;
-    struct dirent *ptr;
-    if ((dir = opendir(path.c_str())) == NULL) {
-        perror("Open dri error...");
-        exit(1);
-    }
-    while((ptr = readdir(dir)) != NULL) {
-        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0)
-            continue;
-        else if (ptr->d_type == 8)//file
-            files.push_back(path + "/" + ptr->d_name);
-        else if (ptr->d_type == 4) {
-            getModels(path + "/" + ptr->d_name, files);
-        }
-    }
-    closedir(dir);
-}
-TEST(NetTest, net_execute_base_test) {
-    std::vector<std::string> models;
-    if (FLAGS_model_file == "") {
-        getModels(FLAGS_model_dir, models);
-    } else {
-        models.push_back(FLAGS_model_dir + FLAGS_model_file);
-    }
-    for (auto iter = models.begin(); iter < models.end(); iter++)
-    {
-        LOG(WARNING) << "load anakin model file from " << *iter << " ...";
-        Graph<Target, AK_FLOAT, Precision::FP32> graph;   
-        auto status = graph.load(*iter);
-        if (!status) {
-            LOG(FATAL) << " [ERROR] " << status.info();
-        }
-        graph.ResetBatchSize("input_0", FLAGS_num);        
-        graph.Optimize();
-        // constructs the executer net
-        Net<Target, AK_FLOAT, Precision::FP32> net_executer(graph, true);
-        // get in
-        auto d_tensor_in_p = net_executer.get_in("input_0");
-        Tensor4d<X86, AK_FLOAT> h_tensor_in;
-        auto valid_shape_in = d_tensor_in_p->valid_shape();
-        for (int i = 0; i < valid_shape_in.size(); i++) {
-            LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i];
-        }
-        h_tensor_in.re_alloc(valid_shape_in);
-        fill_tensor_host_rand(h_tensor_in, -1.0f,1.0f);
-        d_tensor_in_p->copy_from(h_tensor_in);
-        // do inference
-        Context<Target> ctx(0, 0, 0);
-        saber::SaberTimer<Target> my_time;
-        LOG(WARNING) << "EXECUTER !!!!!!!! ";
-        for (int i = 0; i < FLAGS_warmup_iter; i++) {
-            net_executer.prediction();
-        }
-#ifdef ENABLE_OP_TIMER
-        net_executer.reset_op_time();
-#endif
-        my_time.start(ctx);
-        //auto start = std::chrono::system_clock::now();
-        for (int i = 0; i < FLAGS_epoch; i++) {
-        //DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") ";
-            net_executer.prediction();
-        }
-        my_time.end(ctx);
-#ifdef ENABLE_OP_TIMER
-        std::vector<float> op_time = net_executer.get_op_time();
-        auto exec_funcs = net_executer.get_exec_funcs();
-        auto op_param = net_executer.get_op_param();
-        for (int i = 0; i <  op_time.size(); i++) {
-            LOG(INFO) << "name: " << exec_funcs[i].name << " op_type: " << exec_funcs[i].op_name << " op_param: " << op_param[i] << " time " << op_time[i]/FLAGS_epoch;
-        }
-        std::map<std::string, float> op_map;
-        for (int i = 0; i < op_time.size(); i++) {
-            auto it = op_map.find(op_param[i]);
-            if (it != op_map.end())
-                op_map[op_param[i]] += op_time[i];
-            else
-                op_map.insert(std::pair<std::string, float>(op_param[i], op_time[i]));
-        }
-        for (auto it = op_map.begin(); it != op_map.end(); ++it) {
-            LOG(INFO)<< it->first << "  " << (it->second) / FLAGS_epoch<< " ms";
-        }
-#endif
-        size_t end = (*iter).find(".anakin.bin");
-        size_t start = FLAGS_model_dir.length();
-        std::string model_name = (*iter).substr(start, end-start);
-        
-        LOG(INFO) << model_name << " batch_size " << FLAGS_num << " average time "<< my_time.get_average_ms() / FLAGS_epoch << " ms";
-    }
-}
-int main(int argc, const char** argv){
-    // initial logger
-    logger::init(argv[0]);
-
-#ifdef USE_GFLAGS
-    google::ParseCommandLineFlags(&argc, &argv, true);
-#else 
-    LOG(INFO)<< "BenchMark usage:";
-    LOG(INFO)<< "   $benchmark <model_dir> <model_file> <num> <warmup_iter> <epoch>";
-    LOG(INFO)<< "   model_dir:      model directory";
-    LOG(INFO)<< "   model_file:     path to model";
-    LOG(INFO)<< "   num:            batchSize default to 1";
-    LOG(INFO)<< "   warmup_iter:    warm up iterations default to 10";
-    LOG(INFO)<< "   epoch:          time statistic epoch default to 1000";
-    if(argc < 3) {
-        LOG(ERROR) << "You should fill in the variable model_dir and model_file at least.";
-        return 0;
-    }
-    FLAGS_model_dir = argv[1];
-    if(argc > 2) {
-        FLAGS_model_file = argv[2];
-    }
-    if(argc > 3) {
-        FLAGS_num = atoi(argv[3]);
-    }
-    if(argc > 4) {
-        FLAGS_warmup_iter = atoi(argv[4]);
-    }
-    if(argc > 5) {
-        FLAGS_epoch = atoi(argv[5]);
-    }
-#endif
-    InitTest();
-    RUN_ALL_TESTS(argv[0]); 
-    return 0;
-}
diff --git a/test/framework/net/chinese_ner_test.cpp b/test/framework/net/chinese_ner_test.cpp
deleted file mode 100644
index 37785f721..000000000
--- a/test/framework/net/chinese_ner_test.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-#include "anakin_config.h"
-#include <string>
-#include <fstream>
-#include "net_test.h"
-#include "saber/funcs/timer.h"
-#include <chrono>
-#include "saber/core/tensor_op.h"
-#include <dirent.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <map>
-
-#define DEFINE_GLOBAL(type, var, value) \
-        type (GLB_##var) = (value)
-DEFINE_GLOBAL(std::string, model_dir, "");
-DEFINE_GLOBAL(std::string, input_file, "");
-
-//#define WITH_MENTION
-
-void getModels(std::string path, std::vector<std::string>& files) {
-    DIR* dir= nullptr;
-    struct dirent* ptr;
-
-    if ((dir = opendir(path.c_str())) == NULL) {
-        perror("Open dri error...");
-        exit(1);
-    }
-
-    while ((ptr = readdir(dir)) != NULL) {
-        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) {
-            continue;
-        } else if (ptr->d_type == 8) { //file
-            files.push_back(path + "/" + ptr->d_name);
-        } else if (ptr->d_type == 4) {
-            //files.push_back(ptr->d_name);//dir
-            getModels(path + "/" + ptr->d_name, files);
-        }
-    }
-    closedir(dir);
-}
-void SplitString(const std::string& s,
-                 std::vector<std::string>& v, const std::string& c)
-{
-    std::string::size_type pos1, pos2;
-    pos2 = s.find(c);
-    pos1 = 0;
-    while(std::string::npos != pos2)
-    {
-        v.push_back(s.substr(pos1, pos2-pos1));
-
-        pos1 = pos2 + c.size();
-        pos2 = s.find(c, pos1);
-    }
-    if(pos1 != s.length())
-        v.push_back(s.substr(pos1));
-}
-
-bool split_word_mention_idx_from_file(
-        std::vector<std::vector<float> > &word_idx,
-        std::vector<std::vector<float> > &mention_idx,
-        const std::string input_file_path) {
-
-    std::ifstream infile(input_file_path.c_str());
-    if (!infile.good()) {
-        std::cout << "Cannot open " << std::endl;
-        return false;
-    }
-    LOG(INFO)<<"found filename: "<<input_file_path;
-    std::string line;
-    std::vector<std::string> split_v;
-    std::vector<std::string> split_w;
-    std::vector<std::string> split_m;
-    while (std::getline(infile, line)) {
-        split_v.clear();
-        SplitString(line, split_v, ";");
-        CHECK_GE(split_v.size(), 4) << " file need ; split";
-        std::vector<float> word;
-        std::vector<float> mention;
-        split_w.clear();
-        SplitString(split_v[1], split_w, " ");
-        split_m.clear();
-        SplitString(split_v[3], split_m, " ");
-        for (auto w : split_w) {
-            word.push_back(atof(w.c_str()));
-        }
-        for (auto m : split_m) {
-            mention.push_back(atof(m.c_str()));
-        }
-        word_idx.push_back(word);
-        mention_idx.push_back(mention);
-    }
-    return true;
-}
-
-int get_batch_data_offset(
-        std::vector<float> &out_data,
-        const std::vector<std::vector<float> > &seq_data,
-        std::vector<int> &seq_offset,
-        const int start_idx,
-        const int batch_num) {
-    seq_offset.clear();
-    out_data.clear();
-    seq_offset.push_back(0);
-    int len = 0;
-    for (int i = 0; i < batch_num; ++i) {
-        for (auto d : seq_data[i + start_idx]) {
-            len += 1;
-            out_data.push_back(d);
-        }
-        seq_offset.push_back(len);
-    }
-    return len;
-}
-
-#ifdef USE_X86_PLACE
-TEST(NetTest, chinese_ner_executor) {
-    std::vector<std::string> models;
-    getModels(GLB_model_dir, models);
-    std::vector<std::vector<float> > word_idx;
-    std::vector<std::vector<float> > mention_idx;
-    split_word_mention_idx_from_file(word_idx, mention_idx, GLB_input_file);
-    std::vector<float> word_idx_data;
-    std::vector<float> mention_idx_data;
-    std::vector<int> word_seq_offset;
-    std::vector<int> mention_seq_offset;
-    int batch_num = 6;
-
-    Graph<X86, AK_FLOAT, Precision::FP32>* graph = new Graph<X86, AK_FLOAT, Precision::FP32>();
-    LOG(WARNING) << "load anakin model file from " << models[0] << " ...";
-    // load anakin model files.
-    auto status = graph->load(models[0]);
-    if(!status ) {
-        LOG(FATAL) << " [ERROR] " << status.info();
-    }
-    graph->Reshape("input_0", {1000, 1, 1, 1});
-#ifdef WITH_MENTION
-    graph->Reshape("input_1", {1000, 1, 1, 1});
-#endif
-    //anakin graph optimization
-    graph->Optimize();
-    Net<X86, AK_FLOAT, Precision::FP32> net_executer(*graph, true);
-    SaberTimer<X86> timer;
-    Context<X86> ctx;
-    for (int i = 0; i < word_idx.size(); i += batch_num) {
-//    {
-//        int i = 0;
-        int word_len = get_batch_data_offset(word_idx_data, word_idx, word_seq_offset, i, batch_num);
-#ifdef WITH_MENTION
-        int mention_len = get_batch_data_offset(mention_idx_data, mention_idx, mention_seq_offset, i, batch_num);
-#endif
-//        for (auto w : word_idx_data) {
-//            std::cout << w << ",";
-//        }
-//        std::cout << std::endl;
-//        for (auto s : word_seq_offset) {
-//            std::cout << s << ", ";
-//        }
-//        std::cout << std::endl << std::endl << std::endl;
-//        word_idx_data = {20, 21, 22, 23, 24, 25, 26};
-//        word_seq_offset = {0, 5, 7};
-//        int word_len = 7;
-//        mention_idx_data = {2, 1, 22, 23, 24, 25, 26};
-//        mention_seq_offset = {0, 5, 7};
-//        int mention_len = 7;
-
-        auto word_in_p = net_executer.get_in("input_0");
-        word_in_p->reshape({word_len, 1, 1, 1});
-        for (int j = 0; j < word_idx_data.size(); ++j) {
-            word_in_p->mutable_data()[j] = word_idx_data[j];
-        }
-        word_in_p->set_seq_offset(word_seq_offset);
-#ifdef WITH_MENTION
-        auto mention_in_p = net_executer.get_in("input_1");
-        mention_in_p->reshape({mention_len, 1, 1, 1});
-        for (int j = 0; j < mention_idx_data.size(); ++j) {
-            mention_in_p->mutable_data()[j] = mention_idx_data[j];
-        }
-        mention_in_p->set_seq_offset(mention_seq_offset);
-#endif
-        timer.start(ctx);
-        net_executer.prediction();
-        timer.end(ctx);
-//        auto tensor_out_5_p = net_executer.get_out("crf_decoding_0.tmp_0_out");
-//        int v_size = tensor_out_5_p->valid_size();
-//        for (int j = 0; j < v_size; ++j) {
-//            std::cout << tensor_out_5_p->data()[j]<<" ";
-//        }
-//        std::cout << std::endl;
-    }
-    LOG(INFO)<<"elapse time: "<<timer.get_average_ms()<<" ms";
-}
-#endif
-
-int main(int argc, const char** argv) {
-    // initial logger
-    LOG(INFO) << "argc " << argc;
-
-    if (argc < 3) {
-        LOG(INFO) << "Example of Usage:\n \
-        ./output/unit_test/model_test\n \
-            anakin_models\n input file\n";
-        exit(0);
-    } else if (argc == 3) {
-        GLB_model_dir = std::string(argv[1]);
-        GLB_input_file = std::string(argv[2]);
-    }
-//    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/framework/net/model_test.cpp b/test/framework/net/model_test.cpp
deleted file mode 100644
index 1f8055dbe..000000000
--- a/test/framework/net/model_test.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-#include <string>
-#include "net_test.h"
-#include "saber/funcs/timer.h"
-#include <chrono>
-#include "saber/core/tensor_op.h"
-#include <dirent.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <map>
-#define DEFINE_GLOBAL(type, var, value) \
-        type (GLB_##var) = (value)
-DEFINE_GLOBAL(std::string, model_dir, "");
-DEFINE_GLOBAL(int, num, 1);
-DEFINE_GLOBAL(int, channel, 8);
-DEFINE_GLOBAL(int, height, 640);
-DEFINE_GLOBAL(int, width, 640);
-DEFINE_GLOBAL(bool, is_input_shape, false);
-
-void getModels(std::string path, std::vector<std::string>& files) {
-    DIR* dir= nullptr;
-    struct dirent* ptr;
-
-    if ((dir = opendir(path.c_str())) == NULL) {
-        perror("Open dri error...");
-        exit(1);
-    }
-
-    while ((ptr = readdir(dir)) != NULL) {
-        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) {
-            continue;
-        } else if (ptr->d_type == 8) { //file
-            files.push_back(path + "/" + ptr->d_name);
-        } else if (ptr->d_type == 4) {
-            //files.push_back(ptr->d_name);//dir
-            getModels(path + "/" + ptr->d_name, files);
-        }
-    }
-
-    closedir(dir);
-}
-
-#ifdef USE_CUDA
-TEST(NetTest, nv_net_execute_base_test) {
-    std::vector<std::string> models;
-    getModels(GLB_model_dir, models);
-
-    for (auto iter = models.begin(); iter < models.end(); iter++) {
-        LOG(WARNING) << "load anakin model file from " << *iter << " ...";
-#if 1
-        Graph<NV, AK_FLOAT, Precision::FP32> graph;
-        auto status = graph.load(*iter);
-
-        if (!status) {
-            LOG(FATAL) << " [ERROR] " << status.info();
-        }
-
-        if (GLB_is_input_shape) {
-            graph.Reshape("input_0", {GLB_num, GLB_channel, GLB_height, GLB_width});
-        } else {
-            graph.ResetBatchSize("input_0", GLB_num);
-        }
-
-        graph.Optimize();
-        // constructs the executer net
-        Net<NV, AK_FLOAT, Precision::FP32> net_executer(graph, true);
-        // get in
-        auto d_tensor_in_p = net_executer.get_in("input_0");
-        Tensor4d<X86, AK_FLOAT> h_tensor_in;
-        auto valid_shape_in = d_tensor_in_p->valid_shape();
-
-        for (int i = 0; i < valid_shape_in.size(); i++) {
-            LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i];
-        }
-
-        h_tensor_in.re_alloc(valid_shape_in);
-        fill_tensor_host_rand(h_tensor_in, -1.0f, 1.0f);
-        d_tensor_in_p->copy_from(h_tensor_in);
-        int warmup_iter = 10;
-        int epoch = 1000;
-        // do inference
-        Context<NV> ctx(0, 0, 0);
-        saber::SaberTimer<NV> my_time;
-        LOG(WARNING) << "EXECUTER !!!!!!!! ";
-
-        for (int i = 0; i < warmup_iter; i++) {
-            net_executer.prediction();
-        }
-
-#ifdef ENABLE_OP_TIMER
-        net_executer.reset_op_time();
-#endif
-        my_time.start(ctx);
-
-        //auto start = std::chrono::system_clock::now();
-        for (int i = 0; i < epoch; i++) {
-            //DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") ";
-            net_executer.prediction();
-        }
-
-        my_time.end(ctx);
-#ifdef ENABLE_OP_TIMER
-        std::vector<float> op_time = net_executer.get_op_time();
-        auto exec_funcs = net_executer.get_exec_funcs();
-        auto op_param = net_executer.get_op_param();
-
-        for (int i = 0; i <  op_time.size(); i++) {
-            LOG(INFO) << "name: " << exec_funcs[i].name << " op_type: " << exec_funcs[i].op_name <<
-                      " op_param: " << op_param[i] << " time " << op_time[i] / epoch;
-        }
-
-        std::map<std::string, float> op_map;
-
-        for (int i = 0; i < op_time.size(); i++) {
-            auto it = op_map.find(op_param[i]);
-
-            if (it != op_map.end()) {
-                op_map[op_param[i]] += op_time[i];
-            } else {
-                op_map.insert(std::pair<std::string, float>(op_param[i], op_time[i]));
-            }
-        }
-
-        for (auto it = op_map.begin(); it != op_map.end(); ++it) {
-            LOG(INFO) << it->first << "  " << (it->second) / epoch << " ms";
-        }
-
-#endif
-        LOG(INFO) << *iter << " aveage time " << my_time.get_average_ms() / epoch << " ms";
-        // save the optimized model to disk.
-        //        std::string save_model_path = GLB_model_dir + std::string("opt.saved");
-        //        status = graph.save(save_model_path);
-        //        if (!status ) {
-        //            LOG(FATAL) << " [ERROR] " << status.info();
-        //        }
-#endif
-    }
-}
-#endif
-
-int main(int argc, const char** argv) {
-    // initial logger
-    LOG(INFO) << "argc " << argc;
-
-    if (argc < 1) {
-        LOG(INFO) << "Example of Usage:\n \
-        ./output/unit_test/model_test\n \
-            anakin_models\n \
-            num\n \
-            channel\n \
-            height\n \
-            width\n ";
-        exit(0);
-    } else if (argc == 2) {
-        GLB_model_dir = std::string(argv[1]);
-        GLB_is_input_shape = false;
-    } else if (argc == 3) {
-        GLB_model_dir = std::string(argv[1]);
-        GLB_num = atoi(argv[2]);
-        GLB_is_input_shape = false;
-    } else {
-        GLB_model_dir = std::string(argv[1]);
-        GLB_num = atoi(argv[2]);
-        GLB_channel = atoi(argv[3]);
-        GLB_height = atoi(argv[4]);
-        GLB_width = atoi(argv[5]);
-        GLB_is_input_shape = true;
-    }
-
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/framework/net/net_exec_multi_thread_test.cpp b/test/framework/net/net_exec_multi_thread_test.cpp
deleted file mode 100644
index 7a8bf5401..000000000
--- a/test/framework/net/net_exec_multi_thread_test.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-#include <string>
-#include "net_test.h"
-#include "saber/funcs/timer.h"
-#include <chrono>
-
-std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_lane_v2.anakin.bin";
-
-#ifdef USE_CUDA
-#if 1
-TEST(NetTest, nv_net_execute_muti_thread_sync_test) {
-#if 1 // use host input
-    //Env<NV>::env_init(1);
-    LOG(WARNING) << "Sync Runing multi_threads for model: " << model_path;
-    Worker<NV, AK_FLOAT, Precision::FP32>  workers(model_path, 10); 
-    workers.register_inputs({"input_0"});
-    workers.register_outputs({"softmax_out"});    
-    workers.Reshape("input_0", {1, 384, 960, 3});
-
-    workers.launch();
-
-    std::vector<Tensor4dPtr<target_host<NV>::type, AK_FLOAT> > host_tensor_p_in_list;
-    // get in
-    saber::Shape valid_shape_in({1, 384, 960, 3});
-    Tensor4dPtr<target_host<NV>::type, AK_FLOAT> h_tensor_in = new Tensor4d<target_host<NV>::type, AK_FLOAT>(valid_shape_in);
-    float* h_data = h_tensor_in->mutable_data();
-    for (int i=0; i<h_tensor_in->size(); i++) {
-        h_data[i] = 1.0f;
-    }
-    host_tensor_p_in_list.push_back(h_tensor_in);
-
-    int epoch = 1000;
-
-    // Running 
-    for(int i=0; i<epoch; i++) {
-        auto  d_tensor_p_out_list = workers.sync_prediction(host_tensor_p_in_list);
-
-        // get the output
-        auto d_tensor_p = d_tensor_p_out_list[0];
-    }
-
-    // get exec times
-#ifdef ENABLE_OP_TIMER
-    auto& times_map = workers.get_task_exe_times_map_of_sync_api();
-    for (auto it = times_map.begin(); it!=times_map.end(); it++) {
-        LOG(WARNING) << " threadId: " << it->first << " processing " << it->second.size() << " tasks";
-        for (auto time_in_ms : it->second) { 
-            LOG(INFO) << "      \\__task avg time: " << time_in_ms;
-        }
-    }
-#endif
-
-#endif
-
-#if 0 // use device input
-    Env<NV>::env_init(1);
-    LOG(WARNING) << "Sync Runing multi_threads for model: " << model_path;
-    Worker<NV, AK_FLOAT, Precision::FP32>  workers(model_path, 1); 
-    workers.register_inputs({"input_0"});
-    workers.register_outputs({"softmax_out"});    
-    workers.Reshape("input_0", {1, 384, 960, 3});
-
-    workers.launch();
-
-    std::vector<Tensor4dPtr<target_host<NV>::type, AK_FLOAT> > host_tensor_p_in_list;
-    // get in
-    saber::Shape valid_shape_in({1, 384, 960, 3});
-    Tensor4dPtr<target_host<NV>::type, AK_FLOAT> h_tensor_in = new Tensor4d<target_host<NV>::type, AK_FLOAT>(valid_shape_in);
-    float* h_data = h_tensor_in->mutable_data();
-    for (int i=0; i<h_tensor_in->size(); i++) {
-        h_data[i] = 1.0f;
-    }
-    host_tensor_p_in_list.push_back(h_tensor_in);
-
-    std::vector<Tensor4dPtr<NV, AK_FLOAT> > device_tensor_p_in_list;
-    for (int i=0; i<host_tensor_p_in_list.size(); i++) {
-        Tensor4dPtr<NV, AK_FLOAT> d_tensor_in = new Tensor4d<NV, AK_FLOAT>(host_tensor_p_in_list[i]->valid_shape());
-        d_tensor_in->copy_from(*(host_tensor_p_in_list[i]));
-        device_tensor_p_in_list.push_back(d_tensor_in);
-    }
-
-    int epoch = 10;
-
-    // Running 
-    for (int i=0; i<epoch; i++) {
-        Context<NV> ctx(0, 0, 0);
-        saber::SaberTimer<NV> my_time;
-
-        my_time.start(ctx);
-        auto  d_tensor_p_out_list = workers.sync_prediction_device(device_tensor_p_in_list);
-        my_time.end(ctx);
-        LOG(INFO)<<"muti thread single task exec time: "<<my_time.get_average_ms()/epoch << " ms";
-
-        // get the output
-        auto d_tensor_p = d_tensor_p_out_list[0];
-    }
-#endif
-
-}
-#endif
-
-#if 0
-TEST(NetTest, net_execute_muti_thread_async_test) {
-    LOG(WARNING) << "Async Runing multi_threads for model: " << model_path;
-    Worker<NV, AK_FLOAT, Precision::FP32>  workers(model_path, 10); 
-    workers.register_inputs({"input_0"});
-    workers.register_outputs({"softmax_out"});    
-    workers.Reshape("input_0", {1, 384, 960, 3});
-
-    workers.launch();
-
-    std::vector<Tensor4dPtr<target_host<NV>::type, AK_FLOAT> > host_tensor_p_in_list;
-    // get in
-    saber::Shape valid_shape_in({1, 384, 960, 3});
-    Tensor4dPtr<target_host<NV>::type, AK_FLOAT> h_tensor_in = new Tensor4d<target_host<NV>::type, AK_FLOAT>(valid_shape_in);
-    float* h_data = h_tensor_in->mutable_data();
-    for (int i=0; i<h_tensor_in->size(); i++) {
-        h_data[i] = 1.0f;
-    }
-    host_tensor_p_in_list.push_back(h_tensor_in);
-
-    int epoch = 10000;
-
-    // Running 
-    for(int i=0; i<epoch; i++) {
-        workers.async_prediction(host_tensor_p_in_list);
-    }
-
-    // get the output
-    int iterator = epoch;
-    while(iterator) {
-        if(!workers.empty()) {
-            auto d_tensor_p = workers.async_get_result()[0];
-            // get hte data of d_tensor_p
-            
-            iterator--;
-        }
-    }
-
-}
-#endif 
-#endif
-
-int main(int argc, const char** argv){
-    // initial logger
-    logger::init(argv[0]);
-	InitTest();
-	RUN_ALL_TESTS(argv[0]);	
-	return 0;
-}
diff --git a/test/framework/net/net_exec_test.cpp b/test/framework/net/net_exec_test.cpp
deleted file mode 100644
index 3f40fc341..000000000
--- a/test/framework/net/net_exec_test.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-#include <string>
-#include "net_test.h"
-#include "saber/funcs/timer.h"
-#include <chrono>
-
-//#define USE_DIEPSE
-
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/diepsie_light_head.anakin.bin";
-
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/diepsie_light_head_base.anakin.bin";
-
-
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/densebox.anakin.bin";
-
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/cnn_seg.anakin.bin";
-
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_camera_detector.anakin.bin";
-
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_lane_v2.anakin.bin";
-
-// alignment of face
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/net_deploy_stageI.anakin.bin";
-
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/net_deploy_stageII.anakin.bin";
-
-// residual 7 patch of face
-//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/residual_net_7patch_3hc.anakin.bin";
-
-// resnet 50
-//std::string model_path = "/home/cuichaowen/anakin2/anakin2/benchmark/CNN/mobilenet_v2.anakin.bin";
-
-// vgg16
-std::string model_path = "/home/cuichaowen/anakin2/anakin2/benchmark/CNN/models/vgg16.anakin.bin";
-
-#ifdef USE_CUDA
-#if 1
-TEST(NetTest, net_execute_base_test) {
-    Graph<NV, AK_FLOAT, Precision::FP32>* graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-    LOG(WARNING) << "load anakin model file from " << model_path << " ...";
-    // load anakin model files.
-    auto status = graph->load(model_path);
-    if(!status ) {
-        LOG(FATAL) << " [ERROR] " << status.info();
-    }
-
-    // reshape the input_0 's shape for graph model
-    //graph->Reshape("input_0", {1, 8, 640, 640});
-
-    // register all tensor inside graph
-    //graph->RegistAllOut();
-	
-    // register edge
-    // graph->RegistOut("conv2_2/expand/scale", "relu2_2/expand");
-
-    //anakin graph optimization
-    graph->Optimize();
-
-    // constructs the executer net
-	{ // inner scope
-#ifdef USE_DIEPSE
-    Net<NV, AK_FLOAT, Precision::FP32, OpRunType::SYNC> net_executer(*graph, true);
-#else
-    Net<NV, AK_FLOAT, Precision::FP32> net_executer(*graph, true);
-#endif
-
-    // get in
-    auto d_tensor_in_p = net_executer.get_in("input_0");
-    Tensor4d<X86, AK_FLOAT> h_tensor_in;
-
-    auto valid_shape_in = d_tensor_in_p->valid_shape();
-    for (int i=0; i<valid_shape_in.size(); i++) {
-        LOG(INFO) << "detect input_0 dims[" << i << "]" << valid_shape_in[i];
-    }
-
-    h_tensor_in.re_alloc(valid_shape_in);
-    float* h_data = h_tensor_in.mutable_data();
-
-    for (int i=0; i<h_tensor_in.size(); i++) {
-        h_data[i] = 1.0f;
-    }
-
-    d_tensor_in_p->copy_from(h_tensor_in);
-
-#ifdef USE_DIEPSE
-    // for diepse model
-    auto d_tensor_in_1_p = net_executer.get_in("input_1");
-    Tensor4d<X86, AK_FLOAT> h_tensor_in_1;
-
-    h_tensor_in_1.re_alloc(d_tensor_in_1_p->valid_shape());
-    for (int i=0; i<d_tensor_in_1_p->valid_shape().size(); i++) {
-        LOG(INFO) << "detect input_1 dims[" << i << "]" << d_tensor_in_1_p->valid_shape()[i];
-    }
-    h_data = h_tensor_in_1.mutable_data();
-    h_data[0] = 1408;
-    h_data[1] = 800;
-    h_data[2] = 0.733333;
-    h_data[3] = 0.733333;
-    h_data[4] = 0;
-    h_data[5] = 0;
-    d_tensor_in_1_p->copy_from(h_tensor_in_1);
-
-    auto d_tensor_in_2_p = net_executer.get_in("input_2");
-    Tensor4d<X86, AK_FLOAT> h_tensor_in_2;
-
-    h_tensor_in_2.re_alloc(d_tensor_in_2_p->valid_shape());
-    for (int i=0; i<d_tensor_in_2_p->valid_shape().size(); i++) {
-        LOG(INFO) << "detect input_2 dims[" << i << "]" << d_tensor_in_2_p->valid_shape()[i];
-    }
-    h_data = h_tensor_in_2.mutable_data();
-    h_data[0] = 2022.56;
-    h_data[1] = 989.389;
-    h_data[2] = 2014.05;
-    h_data[3] = 570.615;
-    h_data[4] = 1.489;
-    h_data[5] = -0.02;
-    d_tensor_in_2_p->copy_from(h_tensor_in_2);
-#endif
-
-    int epoch = 1;
-    // do inference
-    Context<NV> ctx(0, 0, 0);
-    saber::SaberTimer<NV> my_time;
-    LOG(WARNING) << "EXECUTER !!!!!!!! ";
-	// warm up
-	/*for(int i=0; i<10; i++) {
-		net_executer.prediction();
-	}*/
-
-    my_time.start(ctx);
-
-
-    //auto start = std::chrono::system_clock::now();
-    for(int i=0; i<epoch; i++) {
-		//DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") ";
-        net_executer.prediction();
-    }
-   /* // running part of model
-    net_executer.execute_stop_at_node("relu2_2/expand");
-#ifdef USE_CUDA
-    cudaDeviceSynchronize();
-#endif
-
-	// get inner tensor after stop
-    auto tensor_out_inner_p = net_executer.get_tensor_from_edge("conv2_2/expand", "relu2_2/expand");
-    LOG(WARNING) << "inner tensor avg value : " << tensor_average(tensor_out_inner_p);
-#ifdef USE_CUDA
-	cudaDeviceSynchronize();
-#endif
-    
-    for (int i = 0; i < 3; i++) {
-    	net_executer.execute_start_from_node("relu2_2/expand");
-    }
-
-#ifdef USE_CUDA
-    cudaDeviceSynchronize();
-#endif*/
-
-    //auto end = std::chrono::system_clock::now();
-
-    //double time = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-    //LOG(WARNING) << "avg time : " << time/epoch <<" ms";
-
-    my_time.end(ctx);
-    LOG(INFO)<<"aveage time "<<my_time.get_average_ms()/epoch << " ms";
-
-	} // inner scope over
-
-	LOG(ERROR) << "inner net exe over !";
-
-    //auto& tensor_out_inner_p = net_executer.get_tensor_from_edge("data_perm", "conv1");
-
-    // get out yolo_v2
-    /*auto tensor_out_0_p = net_executer.get_out("loc_pred_out");
-    auto tensor_out_1_p = net_executer.get_out("obj_pred_out");
-    auto tensor_out_2_p = net_executer.get_out("cls_pred_out");
-    auto tensor_out_3_p = net_executer.get_out("ori_pred_out");
-    auto tensor_out_4_p = net_executer.get_out("dim_pred_out");*/
-
-	// get outs cnn_seg 
-	/*auto tensor_out_0_p = net_executer.get_out("slice_[dump, mask]_out");
-	auto tensor_out_1_p = net_executer.get_out("category_score_out");
-	auto tensor_out_2_p = net_executer.get_out("instance_pt_out");
-   	auto tensor_out_3_p = net_executer.get_out("confidence_score_out");
-	auto tensor_out_4_p = net_executer.get_out("class_score_out");
-	auto tensor_out_5_p = net_executer.get_out("heading_pt_out");
-	auto tensor_out_6_p = net_executer.get_out("height_pt_out");*/
-    // get out result
-    //test_print<NV>(tensor_out_4_p);
-
-
-    // save the optimized model to disk.
-    /*std::string save_model_path = model_path + std::string(".saved");
-    status = graph->save(save_model_path);
-    if (!status ) { 
-        LOG(FATAL) << " [ERROR] " << status.info(); 
-    }*/
-}
-#endif 
-#endif
-
-#if 0
-TEST(NetTest, net_execute_reconstruction_test) {
-    graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-    LOG(WARNING) << "load anakin model file from optimized model " << model_saved_path << " ...";
-    // load anakin model files.
-    auto status = graph->load(model_saved_path);
-    if (!status ) {
-        LOG(FATAL) << " [ERROR] " << status.info();
-    }
-
-    // regisiter output tensor
-    //graph->RegistOut("data_perm",  "data_scale");
-    graph->RegistOut("data_perm",  "conv1");
-
-    //anakin graph optimization
-    graph->Optimize();
-
-    // constructs the executer net
-    Net<NV, AK_FLOAT, Precision::FP32> net_executer(*graph);
-
-    // get in
-    auto d_tensor_in_p = net_executer.get_in("input_0");
-    Tensor4d<X86, AK_FLOAT> h_tensor_in;
-
-    auto valid_shape_in = d_tensor_in_p->valid_shape();
-    for (int i=0; i<valid_shape_in.size(); i++) {
-        LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i];
-    }
-
-    h_tensor_in.re_alloc(valid_shape_in);
-    float* h_data = h_tensor_in.mutable_data();
-
-    for (int i=0; i<h_tensor_in.size(); i++) {
-        h_data[i] = 1.0f;
-    }
-
-    d_tensor_in_p->copy_from(h_tensor_in);
-
-    // do inference
-    Context<NV> ctx(0, 0, 0);
-    saber::SaberTimer<NV> my_time;
-    my_time.start(ctx);
-
-    LOG(WARNING) << "EXECUTER !!!!!!!! ";
-    for (int i=0; i<1; i++) {
-        net_executer.prediction();
-
-    }
-    my_time.end(ctx);
-    LOG(INFO)<<"aveage time "<<my_time.get_average_ms()/1 << " ms";
-
-    auto tensor_out_inner_p = net_executer.get_tensor_from_edge("data_perm",  "conv1");
-
-    // get out
-    auto tensor_out_0_p = net_executer.get_out("loc_pred_out");
-    auto tensor_out_1_p = net_executer.get_out("obj_pred_out");
-    auto tensor_out_2_p = net_executer.get_out("cls_pred_out");
-    auto tensor_out_3_p = net_executer.get_out("ori_pred_out");
-    auto tensor_out_4_p = net_executer.get_out("dim_pred_out");
-
-    
-    // get out result
-    test_print<NV>(tensor_out_inner_p);
-}
-#endif
-
-int main(int argc, const char** argv){
-    // initial logger
-    logger::init(argv[0]);
-	InitTest();
-	RUN_ALL_TESTS(argv[0]);	
-	return 0;
-}
diff --git a/test/framework/net/net_test.h b/test/framework/net/net_test.h
deleted file mode 100644
index c240afbf0..000000000
--- a/test/framework/net/net_test.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_NET_TEST_H
-#define ANAKIN_NET_TEST_H
-
-#include <iostream>
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "graph_base.h"
-#include "graph.h"
-#include "scheduler.h"
-#include "net.h"
-#include "worker.h"
-
-using namespace anakin;
-using ::anakin::test::Test;
-
-using namespace anakin::graph;
-
-/**
- * \brief Graph test is base Test class for anakin graph funciton.  
- */
-class NetTest: public Test {
-public:
-    NetTest(){}
-
-    void SetUp(){}
-
-    void TearDown(){}
-
-protected:
-};
-
-#ifdef USE_CUDA
-void test_print(Tensor4dPtr<NV, AK_FLOAT>& out_tensor_p) {
-    Tensor4d<target_host<NV>::type, AK_FLOAT> h_tensor_result;
-    h_tensor_result.re_alloc(out_tensor_p->valid_shape());
-    LOG(ERROR) << "result count : " << h_tensor_result.valid_shape().count();
-    h_tensor_result.copy_from(*out_tensor_p);
-    for (int i = 0; i < h_tensor_result.valid_size(); i++) {
-        LOG(INFO) << " GET OUT (" << i << ") " << h_tensor_result.mutable_data()[i];
-    }
-}
-#endif
-
-template<typename Ttype, DataType Dtype>
-double tensor_average(Tensor4dPtr<Ttype, Dtype>& out_tensor_p) {
-    double sum = 0.0f;
-#ifdef USE_CUDA
-    float* h_data = new float[out_tensor_p->valid_size()];
-    const float* d_data = out_tensor_p->data();
-    CUDA_CHECK(cudaMemcpy(h_data, d_data, out_tensor_p->valid_size()*sizeof(float), cudaMemcpyDeviceToHost));
-#else
-	float* h_data = out_tensor_p->data();
-#endif
-    for (int i=0; i<out_tensor_p->valid_size(); i++) {
-		sum+=h_data[i];
-    }
-    return sum/out_tensor_p->valid_size();
-}
-
-
-#ifdef USE_X86_PLACE
-static int record_dev_tensorfile(const Tensor4d<X86, AK_FLOAT>* dev_tensor, const char* locate) {
-    Tensor<target_host<X86>::type, AK_FLOAT, NCHW> host_temp;
-    host_temp.re_alloc(dev_tensor->valid_shape());
-    host_temp.copy_from(*dev_tensor);
-    FILE* fp = fopen(locate, "w+");
-    int size = host_temp.valid_shape().count();
-    if (fp == 0) {
-        LOG(ERROR) << "[ FAILED ] file open target txt: " << locate;
-    } else {
-        for (int i = 0; i < size; ++i) {
-            fprintf(fp, "%.18f \n", i, (host_temp.data()[i]));
-        }
-        fclose(fp);
-    }
-    LOG(INFO) << "[ SUCCESS ] Write " << size << " data to: " << locate;
-    return 0;
-}
-#endif
-
-#endif
-
-
diff --git a/test/framework/net/padde_api_test.cpp b/test/framework/net/padde_api_test.cpp
deleted file mode 100644
index 6e0dfe878..000000000
--- a/test/framework/net/padde_api_test.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-#include <string>
-#include "net_test.h"
-#include "saber/funcs/timer.h"
-#include <chrono>
-#include "saber/core/tensor_op.h"
-#include <dirent.h> 
-#include <sys/stat.h> 
-#include <sys/types.h> 
-#include <unistd.h>  
-#include <fcntl.h>
-#include <map>
-#include "paddle_api.h"
-#define DEFINE_GLOBAL(type, var, value) \
-        type (GLB_##var) = (value)
-DEFINE_GLOBAL(std::string, model_dir, "");
-DEFINE_GLOBAL(int, num, 1);
-DEFINE_GLOBAL(int, channel, 8);
-DEFINE_GLOBAL(int, height, 640);
-DEFINE_GLOBAL(int, width, 640);
-DEFINE_GLOBAL(bool, is_input_shape, false);
-
-#ifdef USE_CUDA
-typedef NV Target;
-#elif defined(USE_X86_PLACE)
-typedef X86 Target;
-#else
-typedef ARM Target;
-#endif
-
-void getModels(std::string path, std::vector<std::string>& files)
-{
-    DIR *dir;
-    struct dirent *ptr;
-    if((dir=opendir(path.c_str()))==NULL){
-        perror("Open dri error...");
-        exit(1);
-    }
-    while((ptr=readdir(dir))!=NULL){
-        if(strcmp(ptr->d_name,".")==0||strcmp(ptr->d_name,"..")==0)
-            continue;
-        else if(ptr->d_type==8)//file
-            files.push_back(path+"/"+ptr->d_name);
-        else if(ptr->d_type==4){
-            //files.push_back(ptr->d_name);//dir
-            getModels(path+"/"+ptr->d_name,files);
-        }
-    }
-    closedir(dir);
-}
-
-
-TEST(NetTest, net_execute_base_test) {
-    std::vector<std::string> models;
-    getModels(GLB_model_dir, models);
-    for (auto iter = models.begin(); iter < models.end(); iter++)
-    {
-        AnakinEngine<Target, AK_FLOAT, Precision::FP32> anakin_engine;
-        LOG(WARNING) << "load anakin model file from " << *iter << " ...";
-        std::vector<int> shape{GLB_num, GLB_channel, GLB_height, GLB_width};
-        //anakin_engine.Build(*iter, shape);
-        anakin_engine.Build(*iter);
-
-        printf("Args = %d %d %d %d\n",GLB_num, GLB_channel, GLB_height, GLB_width);
-        //fill input
-        Tensor4d<X86, AK_FLOAT> h_tensor_in;
-        h_tensor_in.re_alloc({GLB_num, GLB_channel, GLB_height, GLB_width});
-        fill_tensor_host_rand(h_tensor_in, -1.0f,1.0f);
-
-        anakin_engine.SetInputFromCPU("input_0", h_tensor_in.data(), h_tensor_in.valid_size());
-
-        int warmup_iter = 10;
-        int epoch = 1000;
-        // do inference
-        Context<Target> ctx(0, 0, 0);
-        saber::SaberTimer<Target> my_time;
-        LOG(WARNING) << "EXECUTER !!!!!!!! ";
-        for (int i = 0; i < warmup_iter; i++) {
-            anakin_engine.Execute();
-        }
-        my_time.start(ctx);
-        //auto start = std::chrono::system_clock::now();
-        for (int i = 0; i < epoch; i++) {
-            anakin_engine.Execute();
-        }
-        my_time.end(ctx);
-        LOG(INFO) << *iter << " aveage time "<< my_time.get_average_ms() / epoch << " ms";            
-    }
-}
-
-int main(int argc, const char** argv){
-    // initial logger
-    LOG(INFO)<<"argc"<<argc;
-    if (argc < 1) {
-        LOG(INFO) << "Example of Usage:\n \
-        ./output/unit_test/model_test\n \
-            anakin_models\n \
-            num\n \
-            channel\n \
-            height\n \
-            width\n ";
-        exit(0);
-    } else if (argc == 2){
-        GLB_model_dir = std::string(argv[1]);
-        GLB_is_input_shape = false;
-    } else if (argc == 3){
-        GLB_model_dir = std::string(argv[1]);
-        GLB_num = atoi(argv[2]);
-        GLB_is_input_shape = false;
-    } else {
-        GLB_model_dir = std::string(argv[1]);
-        GLB_num = atoi(argv[2]);
-        GLB_channel = atoi(argv[3]);
-        GLB_height = atoi(argv[4]);
-        GLB_width = atoi(argv[5]);
-        GLB_is_input_shape = true;
-    }
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]); 
-    return 0;
-}
diff --git a/test/framework/net/paddle_api.h b/test/framework/net/paddle_api.h
deleted file mode 100644
index 59cb92cd9..000000000
--- a/test/framework/net/paddle_api.h
+++ /dev/null
@@ -1,87 +0,0 @@
-
-#include <string>
-#include "saber/funcs/timer.h"
-#include <chrono>
-#include "saber/core/tensor_op.h"
-#include "saber/saber_types.h"
-#include <dirent.h> 
-#include <sys/stat.h> 
-#include <sys/types.h> 
-#include <unistd.h>  
-#include <fcntl.h>
-#include <map>
-
-class EngineBase {
- public:
-  // Build the model and do some preparation, for example, in TensorRT, run
-  // createInferBuilder, buildCudaEngine.
-  virtual void Build(const std::string& model_file, int batch_size = 1) = 0;
-  virtual void Build(const std::string& model_file, const std::vector<int>& shape) = 0;
-  // Execute the engine, that will run the inference network.
-  virtual void Execute() = 0;
-
-  virtual ~EngineBase() {}
-};  // class EngineBase
-
-template <typename Ttype, DataType Dtype, Precision Ptype>
-class AnakinEngine : public EngineBase {
-public:
-  typedef typename anakin::saber::DataTrait<Dtype>::dtype Dtype_t;
-  typedef anakin::saber::TargetWrapper<X86> X86_API;
-  typedef anakin::saber::TargetWrapper<Ttype> NV_API;
-  AnakinEngine(){}
-
-  ~AnakinEngine(){};
-
-  void Build(const std::string& model_file, int batch_size = 1) override
-  {
-    _graph.load(model_file);
-    _graph.ResetBatchSize("input_0", batch_size);
-    _graph.Optimize();
-    _net_executer.init(_graph);
-  };
-
-  void Build(const std::string& model_file, const std::vector<int>& shape) override
-  {
-    _graph.load(model_file);
-    _graph.Reshape("input_0", shape);
-    _graph.Optimize();
-    _net_executer.init(_graph);
-  };
-
-  void Execute() override
-  {
-    _net_executer.prediction();    
-  };
-
-  // Fill an input from CPU memory with name and size.
-  void SetInputFromCPU(const std::string name, Dtype_t* data, size_t size)
-  {
-    auto input_tensor = _net_executer.get_in(name);
-    anakin::Tensor<Ttype, Dtype> tmp_tensor(data, anakin::saber::X86(), X86_API::get_device_id(), input_tensor->valid_shape());
-    *input_tensor = tmp_tensor;
-  };
-
-  // accessed directly. Fill an input from GPU memory with name and size.
-  void SetInputFromGPU(const std::string& name, Dtype_t* data, size_t size)
-  {
-    auto input_tensor = _net_executer.get_in(name);
-    CHECK_EQ(size, input_tensor->valid_size());
-    anakin::Tensor<Ttype, Dtype> tmp_tensor(data, NV(), NV_API::get_device_id(), input_tensor->valid_shape());
-    *input_tensor = tmp_tensor;
-  };
-  // Get an output called name, the output of tensorrt is in GPU, so this method
-  // will just return the output's GPU memory address.
-  anakin::Tensor<Ttype, Dtype>* GetOutputInGPU(const std::string& name)
-  {
-    return _net_executer.get_out(name);
-  }
-
-private:
-    anakin::graph::Graph<Ttype, Dtype, Ptype> _graph;
-    anakin::Net<Ttype, Dtype, Ptype> _net_executer;
-};  // class TensorRTEngine
-template 
-class AnakinEngine<NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>;
-
-
diff --git a/test/framework/operators/operator_tests.h b/test/framework/operators/operator_tests.h
deleted file mode 100644
index 38f16b87d..000000000
--- a/test/framework/operators/operator_tests.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_OPERATOR_TESTS_H
-#define ANAKIN_OPERATOR_TESTS_H
-
-#include <iostream>
-#include <string>
-#include <thread>
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "framework/operators/ops.h"
-
-using namespace anakin;
-using ::anakin::test::Test;
-
-class OperatorsTest : public Test {
-public:
-    OperatorsTest(){}
-
-    void SetUp(){}
-
-    void TearDown(){}
-
-protected:
-};
-
-
-
-
-
-
-#endif
-
-
diff --git a/test/framework/operators/pooling_test.cpp b/test/framework/operators/pooling_test.cpp
deleted file mode 100644
index 47b66be23..000000000
--- a/test/framework/operators/pooling_test.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-#include "operator_tests.h"
-#include "thread_pool.h"
-
-#ifdef USE_CUDA
-using Target = NV;
-#elif defined(USE_X86_PLACE)
-using Target = X86;
-#else
-using Target = ARM;
-#endif
-
-TEST(OperatorsTest, PoolingFactoryTest) {
-    OpContext<Target> opctx;
-    std::vector<Tensor4dPtr<Target, AK_FLOAT> > in;
-    std::vector<Tensor4dPtr<Target, AK_FLOAT> > out;
-
-
-    /*Operator<RTCUDA, float>*/ auto* Op_name1 =
-        OpFactory<Target, AK_FLOAT, Precision::FP32>::Global()["pooling"];
-    /*Operator<RTCUDA, float>**/auto* Op_name2 =
-        OpFactory<Target, AK_FLOAT, Precision::FP32>::Global()["pool"];
-    auto& op_list = OpFactory<Target, AK_FLOAT, Precision::FP32>::Global().get_list_op_name();
-
-    for (auto& item : op_list) {
-        LOG(INFO) << " op: " << item;
-    }
-
-    LOG(WARNING) << " op name alias 1 : pooling";
-    LOG(INFO) << "  run forward function";
-    (*Op_name1)(opctx, in, out);
-    LOG(WARNING) << " op name alias 2 : pool";
-    LOG(INFO) << "  run forward function";
-    (*Op_name2)(opctx, in, out);
-}
-
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
deleted file mode 100644
index ea8d7101d..000000000
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "test_saber_buffer_BM.h"
-#include "saber/core/buffer.h"
-#include "saber/core/data_traits.h"
-
-using namespace anakin::saber;
-
-static bm_handle_t handle;
-
-int get_bm_size() {
-    return 1;
-}
-
-template <DataType Ddatatype, DataType Hdatatype>
-void test_buffer() {
-
-    //TODO: init in another place
-    bmdnn_init(&handle);
-
-    typedef TargetWrapper<X86> X86_API;
-    typedef TargetWrapper<BM> BM_API;
-    typedef typename DataTrait<Ddatatype>::dtype Ddtype;
-    typedef typename DataTrait<Hdatatype>::dtype Hdtype;
-    typedef Buffer<X86> BufferH;
-    typedef Buffer<BM> BufferD;
-
-    int n0 = 1024;
-    int n1 = 2048;
-
-    void* tmp_x86;
-    Hdtype* x86_ptr;
-    X86_API::mem_alloc(&tmp_x86, sizeof(Hdtype) * n0);
-    x86_ptr = static_cast<Hdtype*>(tmp_x86);
-
-    for (int i = 0; i < n0; i++) {
-        x86_ptr[i] = static_cast<Hdtype>(i);
-    }
-
-    void* tmp_bm;
-    Ddtype* bm_ptr;
-    BM_API::mem_alloc(&tmp_bm, get_bm_size() * n0);
-    bm_ptr = static_cast<Ddtype*>(tmp_bm);
-
-    LOG(INFO) << "Buffer: test default(empty) constructor";
-    BufferH x86_buf0;
-    BufferD bm_buf0;
-
-    LOG(INFO) << "Buffer: test constructor with data size";
-    BufferH x86_buf1(n0 * sizeof(Hdtype));
-    BufferD bm_buf1(n0 * sizeof(Ddtype));
-
-    LOG(INFO) << "Buffer: test constructor with data pointer, size and device id";
-    BufferH x86_buf2(x86_ptr, n0 * sizeof(Hdtype), X86_API::get_device_id());
-    BufferD bm_buf2(bm_ptr, n0 * get_bm_size(), BM_API::get_device_id());
-
-    LOG(INFO) << "Buffer: test copy constructor";
-    BufferH x86_buf3(x86_buf2);
-    LOG(INFO) << "BM Buffer copy constructor";
-    LOG(INFO) << "bm target id: " << BM_API::get_device_id();
-    LOG(INFO) << "bm buffer target id: " << bm_buf2.get_id();
-    BufferD bm_buf3(bm_buf2);
-    CHECK_EQ(x86_buf3.get_count(), x86_buf2.get_count()) << \
-            "shared buffer should have same data count";
-    CHECK_EQ(bm_buf3.get_count(), bm_buf2.get_count()) << \
-            "shared buffer should have same data count";
-
-    LOG(INFO) << "Buffer: test operator =";
-    x86_buf0 = x86_buf2;
-    bm_buf0 = bm_buf2;
-    CHECK_EQ(x86_buf0.get_count(), x86_buf2.get_count()) << \
-            "shared buffer should have same data count";
-    CHECK_EQ(bm_buf0.get_count(), bm_buf2.get_count()) << \
-            "shared buffer should have same data count";
-
-    LOG(INFO) << "Buffer: test re_alloc";
-    x86_buf1.re_alloc(n1 * sizeof(Hdtype));
-    bm_buf1.re_alloc(n1 * sizeof(Ddtype));
-    CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Hdtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
-    CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Ddtype)) << "buffer count error";
-    CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Ddtype)) << "buffer capacity error";
-    x86_buf1.re_alloc(n0 * sizeof(Hdtype));
-    bm_buf1.re_alloc(n0 * sizeof(Ddtype));
-    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
-    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
-
-    LOG(INFO) << "Buffer: test get_id()";
-    LOG(INFO) << "X86 device id: " << x86_buf0.get_id() << \
-              ", bm device id: " << bm_buf0.get_id();
-    CHECK_EQ(X86_API::get_device_id(), x86_buf0.get_id()) << "x86 device id error";
-    CHECK_EQ(BM_API::get_device_id(), bm_buf0.get_id()) << "bm device id error";
-
-    LOG(INFO) << "Buffer: test deep_cpy()";
-    x86_buf1.sync_copy_from(x86_buf2);
-    LOG(INFO) << "deep copy between two host buffer: ";
-    const Hdtype* ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
-    const Hdtype* ptr2 = static_cast<const Hdtype*>(x86_buf1.get_data());
-
-    for (int i = 0; i < 10; i++) {
-        std::cout << ptr1[i] << std::endl;
-    }
-
-    CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect";
-    LOG(INFO) << "deep copy from host buffer to device buffer";
-    bm_buf1.sync_copy_from(x86_buf2);
-    x86_buf1.sync_copy_from(bm_buf1);
-    LOG(INFO) << "deep copy from device buffer to host buffer: ";
-    ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
-
-    for (int i = 0; i < 10; i++) {
-        std::cout << ptr1[i] << std::endl;
-    }
-}
-
-TEST(TestSaberBufferBM, test_buffer_memcpy) {
-    test_buffer<AK_BM, AK_FLOAT>();
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h
deleted file mode 100644
index 8bbbe4511..000000000
--- a/test/saber/bm/test_saber_buffer_BM.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-
-using namespace anakin::test;
-
-class TestSaberBufferBM : public Test {
-public:
-    TestSaberBufferBM() {}
-    ~TestSaberBufferBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp
deleted file mode 100644
index ed93866cf..000000000
--- a/test/saber/bm/test_saber_context_BM.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "test_saber_context_BM.h"
-
-#ifdef USE_BM
-
-using namespace anakin::saber;
-
-TEST(TestSaberContextBM, test_BM_context) {
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
-    LOG(INFO) << "test context constructor";
-    Context<BM> ctx0;
-    Context<BM> ctx1(0, 1, 1);
-
-    //for BM no need to test stream as it is not in use
-}
-
-#endif
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h
deleted file mode 100644
index 653ee11fd..000000000
--- a/test/saber/bm/test_saber_context_BM.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef SABER_TEST_SABER_CONTEXT_BM_H
-#define SABER_TEST_SABER_CONTEXT_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/context.h"
-
-using namespace anakin::test;
-
-class TestSaberContextBM : public Test {
-public:
-    TestSaberContextBM() {}
-    ~TestSaberContextBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //SABER_TEST_SABER_CONTEXT_BM_H
diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp
deleted file mode 100644
index 1c7086cf1..000000000
--- a/test/saber/bm/test_saber_device_BM.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "test_saber_device_BM.h"
-
-#ifdef USE_BM
-
-using namespace anakin::saber;
-
-TEST(TestSaberDeviceBM, test_BM_device) {
-    Device<BM> dev_BM;
-}
-
-#endif
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h
deleted file mode 100644
index 3a6d61236..000000000
--- a/test/saber/bm/test_saber_device_BM.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef SABER_TEST_SABER_DEVICE_BM_H
-#define SABER_TEST_SABER_DEVICE_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/device.h"
-
-using namespace anakin::test;
-
-class TestSaberDeviceBM : public Test {
-public:
-    TestSaberDeviceBM() {}
-    ~TestSaberDeviceBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //SABER_TEST_SABER_DEVICE_BM_H
diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h
deleted file mode 100644
index 61d27d6f9..000000000
--- a/test/saber/bm/test_saber_func_BM.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/tensor.h"
-#include <fstream>
-#include <vector>
-
-using namespace anakin::test;
-
-int read_file(std::vector<float> &results, const char* file_name) {
-
-    std::ifstream infile(file_name);
-    if (!infile.good()) {
-        std::cout << "Cannot open " << std::endl;
-        return false;
-    }
-    LOG(INFO)<<"found filename: "<<file_name;
-    std::string line;
-    while (std::getline(infile, line)) {
-        results.push_back((float)atof(line.c_str()));
-    }
-    return 0;
-}
-
-class TestSaberFuncBM : public Test {
-public:
-    TestSaberFuncBM() {}
-    ~TestSaberFuncBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp
deleted file mode 100644
index 523e94121..000000000
--- a/test/saber/bm/test_saber_func_activation_BM.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-#include "core/context.h"
-#include "funcs/activation.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-template <typename Tensor>
-void print_tensor_shape(std::string name, Tensor& t0) {
-
-    LOG(INFO) << name << " valid shape is ["
-              << t0.valid_shape()[0] << ", "
-              << t0.valid_shape()[1] << ", "
-              << t0.valid_shape()[2] << ", "
-              << t0.valid_shape()[3] << "].";
-
-    LOG(INFO) << name << " real shape is ["
-              << t0.shape()[0] << ", "
-              << t0.shape()[1] << ", "
-              << t0.shape()[2] << ", "
-              << t0.shape()[3] << "].";
-
-    LOG(INFO) << name << " offset is ["
-              << t0.offset()[0] << ", "
-              << t0.offset()[1] << ", "
-              << t0.offset()[2] << ", "
-              << t0.offset()[3] << "].";
-}
-
-TEST(TestSaberFuncBM, test_func_constructor) {
-
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1);
-    }
-
-    img_dev.copy_from(img_host);
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-
-    Context<BM> ctx1(0, 1, 1);
-
-    ActivationParam<TensorDf4> param(Active_relu, 0.1f, 0.1f);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act;
-    act.compute_output_shape(input, output, param);
-    output_dev.re_alloc(output[0]->shape());
-
-    // init assume output tensor has been reshpaed by user.
-    act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-    act(input, output, param, ctx1);
-
-    print_tensor_device(output_dev);
-}
-
-int main(int argc, const char** argv) {
-    Env<BM>::env_init();
-    // initial logger
-    //logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
deleted file mode 100644
index 7881cdb97..000000000
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ /dev/null
@@ -1,725 +0,0 @@
-#include "core/context.h"
-#include "funcs/conv.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-//#include "cublas.h"
-
-using namespace anakin::saber;
-
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-
-template <typename Tensor>
-void print_tensor_shape(std::string name, Tensor &t0) {
-
-            LOG(INFO) << name << " valid shape is ["
-                      << t0.valid_shape()[0] << ", "
-                      << t0.valid_shape()[1] << ", "
-                      << t0.valid_shape()[2] << ", "
-                      << t0.valid_shape()[3] << "].";
-
-            LOG(INFO) << name << " real shape is ["
-                      << t0.shape()[0] << ", "
-                      << t0.shape()[1] << ", "
-                      << t0.shape()[2] << ", "
-                      << t0.shape()[3] << "].";
-
-            LOG(INFO) << name << " offset is ["
-                      << t0.offset()[0] << ", "
-                      << t0.offset()[1] << ", "
-                      << t0.offset()[2] << ", "
-                      << t0.offset()[3] << "].";
-}
-
-
-
-#if 1
-TEST(TestSaberFuncBM, test_depthwise_conv) {
-
-    int group = 2;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 2;
-    
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    bool bias_term = true;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-    
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 63 & i;
-    }
-
-    img_dev.copy_from(img_host);
-    
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-    
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorHf4 output_host;
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-    
-    ConvParam<TensorDf4> param(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
-    conv.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    output_host.re_alloc(output[0]->shape());
-
-    LOG(INFO) << "regular start with group = " << group;
-    // init assume output tensor has been reshpaed by user.
-    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-
-    conv(input, output, param, ctx1);
-
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-
-    output_dev.sync();
-    print_tensor_device(output_dev);
-
-//    param.group = 1;
-//    param.pad_h = 1;
-//    param.pad_w = 1;
-//
-//    LOG(INFO) << " param changed start with group = "<<param.group;
-//    conv(input, output, param, ctx1);
-//
-//    output_dev.sync();
-//    print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-TEST(TestSaberFuncBM, test_conv_param_change) {
-
-    int group = 4;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 4;
-
-    int img_num = 1;
-    int in_channels = 4;
-    int img_h = 65;
-    int img_w = 63;
-
-    bool bias_term = true;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorHf4 output_host;
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-
-    ConvParam<TensorDf4> param(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
-    conv.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    output_host.re_alloc(output[0]->shape());
-
-            LOG(INFO)<<"regular start with group = "<<group;
-    // init assume output tensor has been reshpaed by user.
-    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-
-    conv(input, output, param, ctx1);
-    output_dev.sync();
-//    print_tensor_device(output_dev);
-
-    param.group = 1;
-    param.pad_h = 1;
-    param.pad_w = 1;
-
-    LOG(INFO)<<" param changed start with group = "<<param.group;
-    conv(input, output, param, ctx1);
-
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-
-    output_dev.sync();
-//    print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
-
-    int group = 1;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 2;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    bool bias_term = false;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    Shape img_s_sub(img_num, in_channels, 4, 4);
-
-    TensorDf4 t0;
-    TensorDf4 t1;
-
-    t0.share_sub_buffer(img_dev, img_s_sub, {0,0,0,0});
-    t1.share_sub_buffer(img_dev, img_s_sub, {0,0,4,4});
-
-    print_tensor_shape("t0", t0);
-    print_tensor_shape("t1", t1);
-
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-    Context<BM> ctx2(0, 2, 2);
-
-    TensorDf4 out0;
-    TensorDf4 out1;
-
-    ConvParam<TensorDf4> param0(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    ConvParam<TensorDf4> param1(group, pad_h, pad_w,
-                                stride_h, stride_w,
-                                dilation_h, dilation_w,
-                                &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input0, input1;
-    std::vector<TensorDf4*> output0, output1;
-
-    input0.push_back(&t0);
-    input1.push_back(&t1);
-
-    output0.push_back(&out0);
-    output1.push_back(&out1);
-
-    // FIXME ? where do i get output shape
-    output_dev.re_alloc(img_s);
-
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv0;
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv1;
-
-    conv0.compute_output_shape(input0, output0, param0);
-    conv1.compute_output_shape(input1, output1, param1);
-
-    out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0});
-    out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4});
-
-    conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1);
-    conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2);
-
-    conv0(input0, output0, param0, ctx1);
-    conv1(input1, output1, param1, ctx2);
-
-    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
-    output0[0]->record_event(cuda_stream1);
-
-    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
-    output1[0]->record_event(cuda_stream2);
-
-    out0.sync();
-    out1.sync();
-
-    print_tensor_device(output_dev);
-
-//    print_tensor_device(output_dev);
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-#endif
-
-TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
-
-    int group = 1;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 1;
-    int kernel_w = 1;
-    int out_channels = 128;
-
-    int img_num = 7;
-    int in_channels = 13;
-    int img_h = 32;
-    int img_w = 32;
-
-    bool bias_term = false;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 1;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-
-    ConvParam<TensorDf4> param(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Conv<BM, AK_FLOAT> conv;
-    conv.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \
-        << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]";
-    //LOG(INFO) << " blocks = [ " <<  i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; 
-    //选择k最小的那一组，如果一样，则选128*N，N最大的那一组
-    int k0 = i_div_up(out_channels, 128) * 128 - out_channels;
-    int k1 = i_div_up(out_channels, 64) * 64 - out_channels;
-    int k2 = i_div_up(out_channels, 32) * 32 - out_channels;
-    int kk = std::min(std::min(k0,k1),k2);
-    LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk;
-    if (kk == k0)
-        LOG(INFO) << "thread = [256,1,1] 128*128" ;
-    if (kk == k1)
-        LOG(INFO) << "thread = [128,1,1] 128*64" ;
-    if (kk == k2)
-        LOG(INFO) << "thread = [128,1,1] 128*32" ;
-
-    LOG(INFO) << "saber conv init";
-    conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
-
-    LOG(INFO) << "saber conv dispatch";
-    conv(input, output, param, ctx1);
-
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-
-    output_dev.sync();
-
-    SaberTimer<BM> t1;
-    int ts = 1;
-
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        conv(input, output, param, ctx1);
-        output_dev.sync();
-        t1.end(ctx1);
-    }
-
-    LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
-                         TensorDf4 &weights, int kernel_size, int stride, int pad,
-                         int in_channel, int out_channel, TensorDf4 &bias,
-                         anakin::saber::ImplEnum impl) {
-
-    ConvParam<TensorDf4> conv_param(1, pad, pad,
-                                    stride, stride,
-                                    1, 1,
-                                    &weights, &bias);
-    Conv<BM, AK_FLOAT> conv;
-    conv.compute_output_shape(inputs, outputs, conv_param);
-    outputs[0]->re_alloc(outputs[0]->shape());
-    Context<BM> ctx1(0, 1, 1);
-
-    SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1));
-
-    conv(inputs, outputs, conv_param, ctx1);
-    outputs[0]->record_event(ctx1.get_compute_stream());
-    outputs[0]->sync();
-
-    cudaDeviceSynchronize();
-
-    SaberTimer<BM> t1;
-    int ts = 100;
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        conv(inputs, outputs, conv_param, ctx1);
-        outputs[0]->record_event(ctx1.get_compute_stream());
-        outputs[0]->sync();
-        t1.end(ctx1);
-    }
-            LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
-
-    cudaDeviceSynchronize();
-}
-
-
-cublasHandle_t  cublas_handle;
-
-void caffe_gemm(const int M, const int N, const int K,\
-					 const float alpha, const float* A,\
-					 const float* B, const float beta, float* C) {
-    int lda = K;
-    int ldb = N;
-    CUBLAS_CHECK(cublasSgemm(cublas_handle,
-                             CUBLAS_OP_N,
-                             CUBLAS_OP_N,
-                             N, M, K,
-                             &alpha, B,
-                             ldb, A,
-                             lda, &beta,
-                             C, N));
-}
-
-TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
-    int img_num = 1;
-    int kernel = 1;
-
-//    int out_channels = 32;
-//    int in_channels = 128;
-//    int img_h = 52;
-//    int img_w = 112;
-//    int out_channels = 64;
-//    int in_channels = 256;
-//    int img_h = 26;
-//    int img_w = 56;
-    int out_channels = 128;
-    int in_channels = 512;
-    int img_h = 13;
-    int img_w = 28;
-
-//    int out_channels = 512;
-//    int in_channels = 128;
-//    int img_h = 13;
-//    int img_w = 28;
-
-    int pad = 0;
-    int stride = 1;
-    Context<BM> ctx1(0, 1, 1);
-
-    CUBLAS_CHECK(cublasCreate(&cublas_handle));
-    CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream()));
-
-    TensorDf4 weights;
-    weights.re_alloc({out_channels, in_channels, 1, 1});
-
-    TensorDf4 img;
-    img.re_alloc({1, in_channels, img_h, img_w});
-
-    TensorDf4 out;
-    out.re_alloc({1, out_channels, img_h, img_w});
-    TensorDf4 out_gemm;
-    out_gemm.re_alloc({1, out_channels, img_h, img_w});
-
-    fill_tensor_device_rand(weights, -1.f, 1.f);
-    fill_tensor_device_rand(img, -1.f, 1.f);
-
-    LOG(INFO) << "img_num: " << img_num;
-    LOG(INFO) << "kernel: " << kernel;
-    LOG(INFO) << "out_channels: " << out_channels;
-    LOG(INFO) << "in_channels: " << in_channels;
-    LOG(INFO) << "img_h: " << img_h;
-    LOG(INFO) << "img_w: " << img_w;
-    LOG(INFO) << "pad: " << pad;
-    LOG(INFO) << "stride: " << stride;
-
-    TensorDf4 bias;
-
-    std::vector<TensorDf4*> input_v;
-    std::vector<TensorDf4*> output_gemm_v, output_v;
-
-    input_v.push_back(&img);
-    output_v.push_back(&out);
-    output_gemm_v.push_back(&out_gemm);
-    cudaDeviceSynchronize();
-    test_conv_fp32_speed(input_v, output_v,
-                         weights, kernel, stride, pad,
-            in_channels, out_channels, bias,
-            SABER_IMPL);
-    cudaDeviceSynchronize();
-    caffe_gemm(out_channels, img_h * img_w, in_channels,\
-					 1.f, weights.data(),\
-					 img.data(), 0.f, out_gemm.mutable_data());
-    cudaDeviceSynchronize();
-    SaberTimer<BM> t1;
-    int ts = 100;
-
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        caffe_gemm(out_channels, img_h * img_w, in_channels,\
-					 1.f, weights.data(),\
-					 img.data(), 0.f, out_gemm.mutable_data());
-        out_gemm.record_event(ctx1.get_compute_stream());
-        out_gemm.sync();
-        t1.end(ctx1);
-    }
-    LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
-
-    cudaDeviceSynchronize();
-//    print_tensor_device(out);
-//    print_tensor_device(out_gemm);
-    TensorHf4 out_host;
-    TensorHf4 out_gemm_host;
-    out_host.re_alloc(out.shape());
-    out_host.copy_from(out);
-
-    out_gemm_host.re_alloc(out_gemm.shape());
-    out_gemm_host.copy_from(out_gemm);
-    double max_r, max_d;
-    tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d);
-    LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d;
-}
-
-int main(int argc, const char** argv){
-    anakin::saber::Env<BM>::env_init();
-
-    // initial logger
-    //logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
deleted file mode 100644
index 869ff1bfd..000000000
--- a/test/saber/bm/test_saber_func_fc_BM.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-#include "core/context.h"
-#include "funcs/fc.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-typedef TargetWrapper<BM> API;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef TensorDf4::Dtype ftype;
-
-void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
-                const TensorHf4& bias, TensorHf4& tout) {
-
-    int m = tin.num();
-    int k = tin.valid_size() / m;
-    int n = weight.valid_size() / k;
-    bool bias_term = bias.valid_size() > 0;
-
-    const float* din = tin.data();
-    const float* w = weight.data();
-    float* dout = tout.mutable_data();
-
-    for (int i = 0; i < m; ++i) {
-        float* pdout = dout + i * n;
-        const float* pdin = din + i * k;
-
-        for (int j = 0; j < n; ++j) {
-            if (bias_term) {
-                pdout[j] = bias.data()[j];
-            } else {
-                pdout[j] = 0;
-            }
-
-            for (int l = 0; l < k; ++l) {
-                pdout[j] += pdin[l] * w[l * n + j];
-            }
-        }
-    }
-}
-
-TEST(TestSaberFuncBM, test_func_fc) {
-
-    int test_iter = 100;
-    int w_in = 7;
-    int h_in = 7;
-    int ch_in = 512;
-    int num_in = 1;
-
-    int num_out = 4096;
-    int axis = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_out = {num_in, num_out, 1, 1};
-
-    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
-    TensorDf4 weight(sh_w);
-    Shape sh_b{1, 1, 1, num_out};
-    TensorDf4 bias(sh_b);
-    fill_tensor_device_const(weight, 1.f);
-    fill_tensor_device_const(bias, 1.f);
-
-    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
-              ch_in << ", height=" << h_in << ", width=" << w_in;
-
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-
-    TensorDf4 tdin;
-    TensorDf4 tdout;
-    tdin.re_alloc(shape_in);
-    fill_tensor_device_const(tdin, 1.f);
-    input_dev_4d.push_back(&tdin);
-    output_dev_4d.push_back(&tdout);
-
-    // start Reshape & doInfer
-    Context<BM> ctx_dev(0, 1, 1);
-
-    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
-
-    Fc<BM, AK_FLOAT> fc;
-
-    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
-              shape_out[2] << ", " << shape_out[3];
-
-    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
-
-    LOG(INFO) << "re-alloc tensor buffer";
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
-    Shape va_sh = tdout.valid_shape();
-    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
-              va_sh[2] << ", " << va_sh[3];
-    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
-
-    LOG(INFO) << "FC initialization";
-    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
-
-    LOG(INFO) << "FC compute";
-    SaberTimer<BM> t1;
-    t1.clear();
-    t1.start(ctx_dev);
-
-    for (int i = 0; i < test_iter; ++i) {
-        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
-        //cudaDeviceSynchronize();
-    }
-
-    t1.end(ctx_dev);
-    float ts = t1.get_average_ms();
-    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
-    //print_tensor_device(*output_dev_4d[0]);
-
-    //! check result
-    TensorHf4 thin(shape_in);
-    TensorHf4 thout(shape_out);
-    TensorHf4 thw(sh_w);
-    TensorHf4 thb(sh_b);
-    thin.copy_from(tdin);
-    thw.copy_from(weight);
-    thb.copy_from(bias);
-    fc_compute(thin, thw, thb, thout);
-    //print_tensor_host(thout);
-
-    TensorHf4 thout_d(shape_out);
-    thout_d.copy_from(tdout);
-    double max_ratio = 0;
-    double max_diff = 0;
-    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
-    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
-    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
-
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    Env<BM>::env_init();
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
deleted file mode 100644
index 04b963675..000000000
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-#include "core/context.h"
-#include "funcs/pooling.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include "funcs/timer.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-TEST(TestSaberFuncBM, test_func_pooling) {
-
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
-
-    typedef TargetWrapper<X86> X86_API;
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 4;
-    int img_h = 800;
-    int img_w = 1440;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorHf4 output_host;
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-
-    Context<BM> ctx1(0, 1, 1);
-    int window_h = 2;
-    int window_w = 2;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    LOG(INFO) << " img_num: " << img_num;
-    LOG(INFO) << " in_channels: " << in_channels;
-    LOG(INFO) << " img_h: " << img_h;
-    LOG(INFO) << " img_w: " << img_w;
-    LOG(INFO) << " window_h: " << window_h;
-    LOG(INFO) << " window_w: " << window_w;
-    LOG(INFO) << " pad_h: " << pad_h;
-    LOG(INFO) << " pad_w: " << pad_w;
-    LOG(INFO) << " stride_h: " << stride_h;
-    LOG(INFO) << " stride_w: " << stride_w;
-
-    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
-                                  , stride_h, stride_w, Pooling_max);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Pooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> pooling;
-    pooling.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    output_host.re_alloc(output[0]->shape());
-
-    // init assume output tensor has been reshpaed by user.
-    pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-    pooling(input, output, param, ctx1);
-
-    SaberTimer<BM> t1;
-    int ts = 1000;
-
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        pooling(input, output, param, ctx1);
-        output[0]->sync();
-        t1.end(ctx1);
-    }
-
-    output_dev.sync();
-    cudaDeviceSynchronize();
-    LOG(INFO) << " average time: " << t1.get_average_ms() << " ms";
-    LOG(INFO) << " tile 10% time: " << t1.get_tile_time(10) << " ms";
-    LOG(INFO) << " tile 50% time: " << t1.get_tile_time(50) << " ms";
-    LOG(INFO) << " tile 90% time: " << t1.get_tile_time(90) << " ms";
-    LOG(INFO) << " tile 95% time: " << t1.get_tile_time(95) << " ms";
-    LOG(INFO) << " tile 99% time: " << t1.get_tile_time(99) << " ms";
-
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-TEST(TestSaberFuncBM, test_pooling_result) {
-
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
-
-    typedef TargetWrapper<X86> X86_API;
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-
-    Context<BM> ctx1(0, 1, 1);
-    int window_h = 2;
-    int window_w = 2;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-
-    LOG(INFO) << " img_num: " << img_num;
-    LOG(INFO) << " in_channels: " << in_channels;
-    LOG(INFO) << " img_h: " << img_h;
-    LOG(INFO) << " img_w: " << img_w;
-    LOG(INFO) << " window_h: " << window_h;
-    LOG(INFO) << " window_w: " << window_w;
-    LOG(INFO) << " pad_h: " << pad_h;
-    LOG(INFO) << " pad_w: " << pad_w;
-    LOG(INFO) << " stride_h: " << stride_h;
-    LOG(INFO) << " stride_w: " << stride_w;
-
-    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
-                                  , stride_h, stride_w, Pooling_max);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Pooling<BM, AK_FLOAT> pooling;
-    pooling.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-
-    // init assume output tensor has been reshpaed by user.
-    pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-    pooling(input, output, param, ctx1);
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-
-    output_dev.sync();
-    print_tensor_device(output_dev);
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
-
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
-
-    typedef TargetWrapper<X86> X86_API;
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorDf4 t0;
-    TensorDf4 t1;
-    Shape img_s_sub(img_num, in_channels, img_h / 2, img_w / 2);
-
-    t0.share_sub_buffer(img_dev, img_s_sub, {0, 0, 0, 0});
-    t1.share_sub_buffer(img_dev, img_s_sub, {0, 0, 4, 4});
-
-    TensorDf4 output_dev;
-
-    TensorDf4 out0;
-    TensorDf4 out1;
-
-    // start Reshape & doInfer
-
-    Context<BM> ctx1(0, 1, 1);
-    int window_h = 2;
-    int window_w = 2;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-
-    LOG(INFO) << " img_num: " << img_num;
-    LOG(INFO) << " in_channels: " << in_channels;
-    LOG(INFO) << " img_h: " << img_h;
-    LOG(INFO) << " img_w: " << img_w;
-    LOG(INFO) << " window_h: " << window_h;
-    LOG(INFO) << " window_w: " << window_w;
-    LOG(INFO) << " pad_h: " << pad_h;
-    LOG(INFO) << " pad_w: " << pad_w;
-    LOG(INFO) << " stride_h: " << stride_h;
-    LOG(INFO) << " stride_w: " << stride_w;
-
-    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
-                                  , stride_h, stride_w, Pooling_max);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Pooling<BM, AK_FLOAT> pooling;
-    Pooling<BM, AK_FLOAT> pooling0;
-    Pooling<BM, AK_FLOAT> pooling1;
-
-    pooling.compute_output_shape(input,output,  param);
-
-    Shape total_shape = output[0]->shape();
-
-    output_dev.re_alloc(total_shape);
-    Shape out_sub_shape = {total_shape[0], total_shape[1], total_shape[2] / 2, total_shape[3] / 2};
-
-    out0.share_sub_buffer(output_dev, out_sub_shape, {0, 0, 0, 0});
-    out1.share_sub_buffer(output_dev, out_sub_shape, {0, 0, out_sub_shape[2], out_sub_shape[3]});
-
-    std::vector<TensorDf4*> input0, input1;
-    std::vector<TensorDf4*> output0, output1;
-
-    input0.push_back(&t0);
-    input1.push_back(&t1);
-    output0.push_back(&out0);
-    output1.push_back(&out1);
-
-    // init assume output tensor has been reshpaed by user.
-    pooling0.init(input0, output0, param, SPECIFY, VENDER_IMPL, ctx1);
-    pooling0(input0, output0, param, ctx1);
-
-    pooling1.init(input1, output1, param, SPECIFY, VENDER_IMPL, ctx1);
-    pooling1(input1, output1, param, ctx1);
-
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    out0.record_event(cuda_stream);
-
-    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
-    out1.record_event(cuda_stream1);
-
-    out0.sync();
-    out1.sync();
-
-    print_tensor_device(output_dev);
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp
deleted file mode 100644
index 18479cd18..000000000
--- a/test/saber/bm/test_saber_shape_BM.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "test_saber_shape_BM.h"
-#include "shape.h"
-#include "anakin_config.h"
-
-#ifdef USE_OPENMP
-#include <omp.h>
-#include <core/shape.h>
-#endif
-
-using namespace anakin;
-using namespace saber;
-
-
-TEST(TestSaberShapeBM, test_saber_shape) {
-
-    int dim = 4;
-    Shape sh4d0{0, 0, 0, 0};
-    CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error";
-
-    for (int i = 0; i < dim; ++i) {
-        CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error";
-    }
-
-    CHECK_EQ(sh4d0.count(), 0) << "check shape count error";
-
-    int N = 1;
-    int C = 3;
-    int H = 11;
-    int W = 11;
-    std::vector<int> sh_size = {N, C, H, W};
-    //Shape sh4d1(sh_size);
-    Shape sh4d1(N, C, H, W);
-    LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size();
-    CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!";
-    //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!";
-
-    CHECK_EQ(sh4d1[0], N) << "get shape size error";
-    CHECK_EQ(sh4d1[1], C) << "get shape size error";
-    CHECK_EQ(sh4d1[2], H) << "get shape size error";
-    CHECK_EQ(sh4d1[3], W) << "get shape size error";
-
-    //CHECK_EQ(sh4d2[0], N) << "get shape size error";
-    //CHECK_EQ(sh4d2[1], C) << "get shape size error";
-    //CHECK_EQ(sh4d2[2], H) << "get shape size error";
-    //CHECK_EQ(sh4d2[3], W) << "get shape size error";
-
-    CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed";
-
-    C = 10;
-    sh4d1[1] = C;
-    CHECK_EQ(sh4d1[1], C) << "set shape size error";
-
-    bool is_equal = (sh4d0 == sh4d1);
-    CHECK_EQ(is_equal, false) << "check shape is_equal failed";
-
-    sh4d0 = sh4d1;
-    CHECK_EQ(sh4d1[0], N) << "constructor failed";
-    CHECK_EQ(sh4d1[1], C) << "get shape size error";
-    CHECK_EQ(sh4d1[2], H) << "get shape size error";
-    CHECK_EQ(sh4d1[3], W) << "get shape size error";
-
-    Shape sh4d3 = sh4d1;
-    CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error";
-
-    Shape sh4d4(sh4d1);
-    CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error";
-
-    Shape sh1d0{0};
-    //std::vector<int> sh1d_size = {W};
-
-    //Shape sh1d1(sh1d_size);
-    //Shape sh1d0{W};
-    Shape sh1d1(W);
-
-    Shape sh1d3 = sh1d1;
-    Shape sh1d4(sh1d1);
-
-    CHECK_EQ(sh1d0.dims(), 1) << "shape dim error";
-
-    CHECK_EQ(sh1d0.count(), 0) << "shape size error";
-
-    CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error";
-
-    CHECK_EQ(sh1d1[0], W) << "get shape size error";
-
-    //CHECK_EQ(sh1d2.count(0), W) << "shape dim error";
-
-    CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error";
-
-    CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error";
-
-    CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error";
-
-    Shape sh0{2, 2, 3, 4};
-    Shape sh1{2, 1, 1, 24};
-    Shape sh2{2, 2, 3, 4};
-    Shape sh3{1, 1, 2, 3};
-
-    CHECK_EQ(sh0 == sh2, true) << "error ==";
-    CHECK_EQ(sh3 < sh0, true) << "error <";
-    CHECK_EQ(sh3 >= sh0, false) << "error >=";
-    CHECK_EQ(sh3 > sh0, false) << "error >";
-    CHECK_EQ(sh0 > sh3, true) << "error >";
-    CHECK_EQ(sh0 < sh1, false) << "error <";
-    CHECK_EQ(sh0 <= sh2, true) << "error <=";
-    CHECK_EQ(sh0 >= sh2, true) << "error >=";
-
-    Shape sh001 = Shape::zero(2);
-    Shape sh002 = Shape::zero(3);
-
-    if (sh001 > sh002) {
-        LOG(ERROR) << "error <";
-    }
-
-}
-
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
-
diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h
deleted file mode 100644
index a2ca02c9b..000000000
--- a/test/saber/bm/test_saber_shape_BM.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "saber/core/shape.h"
-
-using namespace anakin::test;
-
-class TestSaberShapeBM : public Test {
-public:
-    TestSaberShapeBM() {}
-    ~TestSaberShapeBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-protected:
-    std::string name;
-    std::string _test;
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
-
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index ed3ff0503..69b1ccbfc 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -9,7 +9,9 @@ typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
 typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
+static bm_handle_t handle;
 TEST(TestSaberTensorBM, test_tensor_constructor) {
+    bmdnn_init(&handle);
 
     //! test empty constructor
     LOG(INFO) << "test default (empty) constructor";
@@ -28,13 +30,13 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     //! test tensor re_alloc function on tensor with data
     LOG(INFO) << "|--test tensor re_alloc function on tensor with data";
-    Shape sh1(1, 2, 4, 4);
+    Shape sh1(2, 4, 4, 2);
     thost0.re_alloc(sh1);
     tdev0.re_alloc(sh1);
     LOG(INFO) << "|--tensor size of host: " << thost0.size();
     LOG(INFO) << "|--tensor size of device: " << tdev0.size();
-    CHECK_EQ(thost0.size(), 32) << "error with tensor size";
-    CHECK_EQ(tdev0.size(), 32) << "error with tensor size";
+    CHECK_EQ(thost0.size(), 64) << "error with tensor size";
+    CHECK_EQ(tdev0.size(), 64) << "error with tensor size";
 
     //! test tensor shape() function
     LOG(INFO) << "|--test tensor shape() function";
@@ -45,9 +47,9 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
               << thost0.height() << ", width = " << thost0.width();
 
     //! test tensor mutable_data() function
-    LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 1.f";
-    fill_tensor_host_const(thost0, 1.f);
-    LOG(INFO) << "|--test tensor data() function, show the const data, 1.f";
+    LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 2.f";
+    fill_tensor_host_const(thost0, 2.f);
+    LOG(INFO) << "|--test tensor data() function, show the const data, 2.f";
     print_tensor_host(thost0);
 
     //! test tensor constructor with shape
@@ -72,7 +74,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     thost1.copy_from(tdev1);
     print_tensor_host(thost1);
 
-    /*
+    
     // device to device
     tdev1.copy_from(tdev0);
 
@@ -98,22 +100,35 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
     dev_data_ptr = static_cast<dtype*>(tmp_pt_dev);
-    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
+//    bm_memcpy_d2s(handle,host_data_ptr,dev_data_ptr)
+//    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
     LOG(INFO) << "|--construct host tensor from host data ptr";
     TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from host data ptr";
-    TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+
+//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+
+    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+
     print_tensor_host(thost3);
-    print_tensor_device(tdev3);
-    //cudaDeviceSynchronize();
 
+    TensorHf4 thost_lian(sh1);
+    thost_lian.copy_from(tdev3);
+    print_tensor_host(thost_lian);
+
+    thost_lian.copy_from(thost3);
+    print_tensor_host(thost_lian);
+
+    //cudaDeviceSynchronize();
+    //
+/*
     LOG(INFO) << "|--construct host tensor from device data ptr";
     TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from device data ptr";
     TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     print_tensor_host(thost4);
     print_tensor_device(tdev4);
-
+/*
     //BM_API::stream_t dev_stream0;
     //BM_API::create_stream_with_flag(dev_stream0, 1);
     //cudaDeviceSynchronize();
@@ -203,6 +218,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
     print_tensor_host(thost4);
      */
+//    bmdnn_deinit(handle);
 }
 
 /*

From 683969cd6c2b99b723e896aacada065f8330be04 Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 19:55:32 +0800
Subject: [PATCH 069/318] Return correct size for AK_BM

---
 saber/core/tensor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index af3495b1f..6824869dd 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -944,7 +944,7 @@ class Tensor : public TensorBase {
 
 template<> inline
 size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
-    return 1;
+    return 4;
 }
 
 template<>

From adcac0eef01ea6144454039b8d53d59a74fb3c17 Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 21:12:57 +0800
Subject: [PATCH 070/318] Implement conv for BM

---
 saber/funcs/conv.h                        | 11 ++++
 saber/funcs/impl/bm/vender_conv.h         | 41 +++++++++++--
 test/saber/bm/test_saber_func_conv_BM.cpp | 71 ++++++++++++-----------
 3 files changed, 86 insertions(+), 37 deletions(-)

diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h
index 4e5ca762f..fd5ebc2de 100644
--- a/saber/funcs/conv.h
+++ b/saber/funcs/conv.h
@@ -30,6 +30,16 @@
 namespace anakin {
 namespace saber {
 
+#ifdef USE_BM
+template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_BM,
+        DataType outDtype = AK_BM,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#else
 template<typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -38,6 +48,7 @@ template<typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
+#endif
 class Conv : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index a0a3b3fb5..778094886 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -62,10 +62,43 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         int pad_w = param.pad_w;
         int stride_h = param.stride_h;
         int stride_w = param.stride_w;
-        BMDNN_CHECK(bmdnn_conv_forward(_handle, in_data, weights, bias,
-                                    input_n, input_c, input_h, input_w, group, output_c,
-                                    kh, kw, pad_h, pad_w, stride_h, stride_w, 1, 0, 0, 
-                                    out_data, NULL));
+
+        bm_tensor_4d_t input_shape = {
+            input_n,
+            input_c,
+            input_h,
+            input_w
+        };
+
+        bm_tensor_4d_t output_shape = {
+            input_n,
+            output_c,
+            input_h,
+            input_w
+        };
+
+        bm_kernel_param_t kernel_param = {
+            group,
+            output_c,
+            input_c,
+            kh,
+            kw
+        };
+
+        bm_conv_param_t conv_param = {
+            stride_h,
+            stride_w,
+            pad_h,
+            pad_w,
+            kh,
+            kw,
+            0
+        };
+
+        _handle = get_bm_handle();
+        BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape, 
+                                    kernel_param, output_shape, conv_param, 1, *out_data));
+                                    
         return SaberSuccess;
     }
 
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
index 7881cdb97..025a1074c 100644
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -9,7 +9,7 @@
 using namespace anakin::saber;
 
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
 template <typename Tensor>
 void print_tensor_shape(std::string name, Tensor &t0) {
@@ -33,7 +33,11 @@ void print_tensor_shape(std::string name, Tensor &t0) {
                       << t0.offset()[3] << "].";
 }
 
-
+//Round a / b to nearest higher integer value
+inline int i_div_up(int a, int b)
+{
+    return (a % b != 0) ? (a / b + 1) : (a / b);
+}
 
 #if 1
 TEST(TestSaberFuncBM, test_depthwise_conv) {
@@ -126,7 +130,7 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
     conv.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -138,10 +142,10 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
 
     conv(input, output, param, ctx1);
 
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
+    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    //output[0]->record_event(cuda_stream);
 
-    output_dev.sync();
+    //output_dev.sync();
     print_tensor_device(output_dev);
 
 //    param.group = 1;
@@ -153,8 +157,8 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
 //
 //    output_dev.sync();
 //    print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
+    //cudaDeviceSynchronize();
+    //CUDA_CHECK(cudaPeekAtLastError());
 }
 
 TEST(TestSaberFuncBM, test_conv_param_change) {
@@ -247,7 +251,7 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
     conv.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -258,7 +262,7 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
     conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
 
     conv(input, output, param, ctx1);
-    output_dev.sync();
+    //output_dev.sync();
 //    print_tensor_device(output_dev);
 
     param.group = 1;
@@ -268,13 +272,13 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
     LOG(INFO)<<" param changed start with group = "<<param.group;
     conv(input, output, param, ctx1);
 
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
+    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    //output[0]->record_event(cuda_stream);
 
-    output_dev.sync();
+    //output_dev.sync();
 //    print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
+    //cudaDeviceSynchronize();
+    //CUDA_CHECK(cudaPeekAtLastError());
 }
 
 TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
@@ -392,8 +396,8 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     // FIXME ? where do i get output shape
     output_dev.re_alloc(img_s);
 
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv0;
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv1;
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv0;
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv1;
 
     conv0.compute_output_shape(input0, output0, param0);
     conv1.compute_output_shape(input1, output1, param1);
@@ -407,6 +411,7 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     conv0(input0, output0, param0, ctx1);
     conv1(input1, output1, param1, ctx2);
 
+    /*
     cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
     output0[0]->record_event(cuda_stream1);
 
@@ -415,13 +420,13 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
 
     out0.sync();
     out1.sync();
-
+    */
     print_tensor_device(output_dev);
 
 //    print_tensor_device(output_dev);
 
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
+    //cudaDeviceSynchronize();
+    //CUDA_CHECK(cudaPeekAtLastError());
 }
 #endif
 
@@ -513,7 +518,7 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Conv<BM, AK_FLOAT> conv;
+    Conv<BM, AK_BM> conv;
     conv.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -539,10 +544,10 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
     LOG(INFO) << "saber conv dispatch";
     conv(input, output, param, ctx1);
 
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
+    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    //output[0]->record_event(cuda_stream);
 
-    output_dev.sync();
+    //output_dev.sync();
 
     SaberTimer<BM> t1;
     int ts = 1;
@@ -556,8 +561,8 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
 
     LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
 
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
+    //cudaDeviceSynchronize();
+    //CUDA_CHECK(cudaPeekAtLastError());
 }
 
 void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
@@ -569,7 +574,7 @@ void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4
                                     stride, stride,
                                     1, 1,
                                     &weights, &bias);
-    Conv<BM, AK_FLOAT> conv;
+    Conv<BM, AK_BM> conv;
     conv.compute_output_shape(inputs, outputs, conv_param);
     outputs[0]->re_alloc(outputs[0]->shape());
     Context<BM> ctx1(0, 1, 1);
@@ -580,7 +585,7 @@ void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4
     outputs[0]->record_event(ctx1.get_compute_stream());
     outputs[0]->sync();
 
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     SaberTimer<BM> t1;
     int ts = 100;
@@ -593,7 +598,7 @@ void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4
     }
             LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
 
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 }
 
 
@@ -674,16 +679,16 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     input_v.push_back(&img);
     output_v.push_back(&out);
     output_gemm_v.push_back(&out_gemm);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     test_conv_fp32_speed(input_v, output_v,
                          weights, kernel, stride, pad,
             in_channels, out_channels, bias,
             SABER_IMPL);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     caffe_gemm(out_channels, img_h * img_w, in_channels,\
 					 1.f, weights.data(),\
 					 img.data(), 0.f, out_gemm.mutable_data());
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     SaberTimer<BM> t1;
     int ts = 100;
 
@@ -698,7 +703,7 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     }
     LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
 
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 //    print_tensor_device(out);
 //    print_tensor_device(out_gemm);
     TensorHf4 out_host;

From a4ed82ebd5095ab6f75d865c4b8ab34d3c6ad760 Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 21:21:24 +0800
Subject: [PATCH 071/318] Comment out last conv test for now

---
 test/saber/bm/test_saber_func_conv_BM.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
index 025a1074c..9a25d00b3 100644
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -601,7 +601,7 @@ void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4
     //cudaDeviceSynchronize();
 }
 
-
+/*
 cublasHandle_t  cublas_handle;
 
 void caffe_gemm(const int M, const int N, const int K,\
@@ -717,7 +717,7 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d);
     LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d;
 }
-
+*/
 int main(int argc, const char** argv){
     anakin::saber::Env<BM>::env_init();
 

From d4aa3eb01da8c6f1d63ee502db99ef808b1c35a9 Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Tue, 26 Jun 2018 13:42:52 +0000
Subject: [PATCH 072/318] Modify sync_memcpy & add bm_mem_from_device

---
 saber/core/impl/bm/bm_impl.cpp                   | 16 ++++++++++------
 saber/core/target_wrapper.h                      |  2 +-
 .../impl/bm/base/include/bmlib/bmlib_runtime.h   |  3 +++
 test/saber/bm/test_saber_buffer_BM.cpp           | 10 ++++++----
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 60e52088e..ef26884b2 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -81,16 +81,20 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
 //static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
 //    size_t count, __DtoD) {};
 
-//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-//    size_t count, __HtoD) {};
+void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+    size_t count, __HtoD) {
+    handle = get_bm_handle(); 
+    BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), bm_mem_from_system(src)));
+    LOG(INFO) << "BM sync_memcpy: host to device, finished";
+};
 
 void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __DtoH) {
     handle = get_bm_handle(); 
-    //auto* dev_ptr = const_cast<bm_device_mem_t *>(src);
-    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
-    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *src));
-    LOG(INFO) << "End sync_memcpy process";
+    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), bm_mem_from_device(src)));
+    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
+    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(reinterpret_cast<struct bm_mem_desc *>(src))));
+    LOG(INFO) << "BM sync_memcpy: device to host, finished";
 };
 
 //static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 6e6f67b55..925f4dd39 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -401,7 +401,7 @@ struct TargetWrapper<BM, __device_target> {
         size_t count, __DtoD) {};
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __HtoD) {};
+        size_t count, __HtoD);
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
         size_t count, __DtoH);
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
index 932b17138..7d537401c 100644
--- a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
+++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
@@ -148,6 +148,9 @@ bm_status_t bm_memset_device(
 bm_device_mem_t bm_mem_from_system(
     void *              system_addr);
 
+bm_device_mem_t bm_mem_from_device(
+    void *              device_addr);
+	
 /*
 *brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to
 device mem if need_copy is true
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index dce1fae15..555e22675 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -5,7 +5,7 @@
 using namespace anakin::saber;
 
 int get_bm_size() {
-    return 1;
+    return 4;
 }
 
 template <DataType Ddatatype, DataType Hdatatype>
@@ -27,7 +27,7 @@ void test_buffer() {
     x86_ptr = static_cast<Hdtype*>(tmp_x86);
 
     for (int i = 0; i < n0; i++) {
-        x86_ptr[i] = static_cast<Hdtype>(i);
+        x86_ptr[i] = static_cast<Hdtype>(100);
     }
 
     void* tmp_bm;
@@ -105,6 +105,7 @@ void test_buffer() {
     for (int i = 0; i < 10; i++) {
 	std::cout << "x86: " << x86_buf2_ptr[i] << std::endl;
     }
+    */
 
     const Hdtype* bm_buf1_ptr = static_cast<const Hdtype*>(bm_buf1.get_data());
     for (int i = 0; i < 10; i++) {
@@ -115,16 +116,17 @@ void test_buffer() {
     LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); 
     LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype);
     LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype);
-    */
+    
 
     x86_buf1.re_alloc(bm_buf1.get_capacity());
     x86_buf1.sync_copy_from(bm_buf1);
     LOG(INFO) << "deep copy from device buffer to host buffer: ";
     ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
 
-    for (int i = 0; i < 30; i++) {
+    for (int i = 0; i < 10; i++) {
         std::cout << ptr1[i] << std::endl;
     }
+
 }
 
 TEST(TestSaberBufferBM, test_buffer_memcpy) {

From 19b5ace798f5c49b26868d7947783fd647ab971a Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 09:39:38 +0800
Subject: [PATCH 073/318] Update BM conv params

---
 saber/funcs/impl/bm/vender_conv.h | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 778094886..530eef528 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -36,6 +36,8 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             ConvParam<OpTensor>& param, Context<BM>& ctx) {
+
+        _handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -50,18 +52,26 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         const InDataType *weight = (const InDataType *) param.weight()->data();
         const InDataType *bias = (const InDataType *) param.bias()->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+
         int input_n = inputs[0]->num();
         int input_c = inputs[0]->channel();
         int input_h = inputs[0]->height();
         int input_w = inputs[0]->width();
-        int group = param.group;
+
+        int output_n = outputs[0]->num();
         int output_c = outputs[0]->channel();
+        int output_h = outputs[0]->height();
+        int output_w = outputs[0]->width();
+
+        int group = param.group;
         int kh = param.weight()->height();
         int kw = param.weight()->width();
         int pad_h = param.pad_h;
         int pad_w = param.pad_w;
         int stride_h = param.stride_h;
         int stride_w = param.stride_w;
+        int dilation_h = param.dilation_h;
+        int dilation_w = param.dilation_w;
 
         bm_tensor_4d_t input_shape = {
             input_n,
@@ -71,10 +81,10 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         };
 
         bm_tensor_4d_t output_shape = {
-            input_n,
+            output_n,
             output_c,
-            input_h,
-            input_w
+            output_h,
+            output_w
         };
 
         bm_kernel_param_t kernel_param = {
@@ -90,12 +100,11 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
             stride_w,
             pad_h,
             pad_w,
-            kh,
-            kw,
+            dilation_h,
+            dilation_w,
             0
         };
 
-        _handle = get_bm_handle();
         BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape, 
                                     kernel_param, output_shape, conv_param, 1, *out_data));
                                     
@@ -103,7 +112,7 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
     }
 
 private:
-    cudnnHandle_t _handle;
+    bm_handle_t _handle;
 };
 
 }

From 81e33aa03fd74faf6eba4616bec20e2d63b703bb Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 09:41:52 +0800
Subject: [PATCH 074/318] Init handle in init function

---
 saber/funcs/impl/bm/vender_pooling.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index 108a70708..6e5de79a4 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -35,6 +35,8 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
     virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
                   std::vector<DataTensor_out*>& outputs,
                   PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
+
+        _handle = get_bm_handle();
         return create(inputs, outputs, pooling_param, ctx);
     }
 
@@ -64,7 +66,7 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
         } else {
             is_avg_pooling = 1;
         }
-        _handle = get_bm_handle();
+
         BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, 
                             input_n, input_c, input_h, input_w, kh, kw, pad_h, pad_w, 
                             stride_h, stride_w, is_avg_pooling, out_data));

From 630cabcbd1bb74480ebbffd932dd13bb2ef03ad5 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 10:07:03 +0800
Subject: [PATCH 075/318] Include BM conv implementation

---
 saber/funcs/conv.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h
index fd5ebc2de..596964dbe 100644
--- a/saber/funcs/conv.h
+++ b/saber/funcs/conv.h
@@ -27,6 +27,10 @@
 #include "saber/funcs/impl/impl_conv.h"
 #endif
 
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_conv.h"
+#endif
+
 namespace anakin {
 namespace saber {
 

From e1c82c4557089fd6d0985c5e4cfb2148d0bb88cb Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 10:12:18 +0800
Subject: [PATCH 076/318] remove unecessary include

---
 saber/funcs/impl/bm/vender_conv.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 530eef528..924bf736c 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -1,8 +1,7 @@
 #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 
-#include "saber/funcs/impl/impl_conv.h"
-#include "saber/funcs/impl/bm/bmdnn_api.h"   
+#include "saber/funcs/impl/impl_conv.h" 
 
 namespace anakin{
 

From 6905020377a0f1f9337be76dcdf7f5b296faad67 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 10:26:09 +0800
Subject: [PATCH 077/318] empty create function

---
 saber/funcs/impl/bm/vender_conv.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 924bf736c..14e52af8e 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -42,7 +42,9 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
 
     virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
-                            ConvParam<OpTensor>& param, Context<BM>& ctx);
+                            ConvParam<OpTensor>& param, Context<BM>& ctx) {
+        
+    }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
                           std::vector<DataTensor_out*>& outputs,

From 59dba0558133103131df77ccf4acca4f901c582b Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 11:18:00 +0800
Subject: [PATCH 078/318] unit test for BM conv

---
 saber/funcs/impl/bm/vender_conv.h         |  6 +-
 test/saber/bm/test_saber_func_conv_BM.cpp | 88 ++---------------------
 2 files changed, 8 insertions(+), 86 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 14e52af8e..220b8a14e 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -1,7 +1,7 @@
 #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 
-#include "saber/funcs/impl/impl_conv.h" 
+#include "saber/funcs/impl/impl_conv.h"
 
 namespace anakin{
 
@@ -74,6 +74,8 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         int dilation_h = param.dilation_h;
         int dilation_w = param.dilation_w;
 
+        bool with_bias = param.bias()->size() > 0;
+
         bm_tensor_4d_t input_shape = {
             input_n,
             input_c,
@@ -107,7 +109,7 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         };
 
         BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape, 
-                                    kernel_param, output_shape, conv_param, 1, *out_data));
+                                    kernel_param, output_shape, conv_param, with_bias, *out_data));
                                     
         return SaberSuccess;
     }
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
index 9a25d00b3..554bcf843 100644
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -147,18 +147,6 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
 
     //output_dev.sync();
     print_tensor_device(output_dev);
-
-//    param.group = 1;
-//    param.pad_h = 1;
-//    param.pad_w = 1;
-//
-//    LOG(INFO) << " param changed start with group = "<<param.group;
-//    conv(input, output, param, ctx1);
-//
-//    output_dev.sync();
-//    print_tensor_device(output_dev);
-    //cudaDeviceSynchronize();
-    //CUDA_CHECK(cudaPeekAtLastError());
 }
 
 TEST(TestSaberFuncBM, test_conv_param_change) {
@@ -263,7 +251,6 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
 
     conv(input, output, param, ctx1);
     //output_dev.sync();
-//    print_tensor_device(output_dev);
 
     param.group = 1;
     param.pad_h = 1;
@@ -272,15 +259,11 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
     LOG(INFO)<<" param changed start with group = "<<param.group;
     conv(input, output, param, ctx1);
 
-    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    //output[0]->record_event(cuda_stream);
+    //print_tensor_device(output_dev);
 
-    //output_dev.sync();
-//    print_tensor_device(output_dev);
-    //cudaDeviceSynchronize();
-    //CUDA_CHECK(cudaPeekAtLastError());
 }
 
+/*
 TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
 
     int group = 1;
@@ -411,16 +394,6 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     conv0(input0, output0, param0, ctx1);
     conv1(input1, output1, param1, ctx2);
 
-    /*
-    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
-    output0[0]->record_event(cuda_stream1);
-
-    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
-    output1[0]->record_event(cuda_stream2);
-
-    out0.sync();
-    out1.sync();
-    */
     print_tensor_device(output_dev);
 
 //    print_tensor_device(output_dev);
@@ -428,6 +401,7 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     //cudaDeviceSynchronize();
     //CUDA_CHECK(cudaPeekAtLastError());
 }
+*/
 #endif
 
 TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
@@ -561,8 +535,6 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
 
     LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
 
-    //cudaDeviceSynchronize();
-    //CUDA_CHECK(cudaPeekAtLastError());
 }
 
 void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
@@ -601,23 +573,6 @@ void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4
     //cudaDeviceSynchronize();
 }
 
-/*
-cublasHandle_t  cublas_handle;
-
-void caffe_gemm(const int M, const int N, const int K,\
-					 const float alpha, const float* A,\
-					 const float* B, const float beta, float* C) {
-    int lda = K;
-    int ldb = N;
-    CUBLAS_CHECK(cublasSgemm(cublas_handle,
-                             CUBLAS_OP_N,
-                             CUBLAS_OP_N,
-                             N, M, K,
-                             &alpha, B,
-                             ldb, A,
-                             lda, &beta,
-                             C, N));
-}
 
 TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     int img_num = 1;
@@ -645,9 +600,6 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     int stride = 1;
     Context<BM> ctx1(0, 1, 1);
 
-    CUBLAS_CHECK(cublasCreate(&cublas_handle));
-    CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream()));
-
     TensorDf4 weights;
     weights.re_alloc({out_channels, in_channels, 1, 1});
 
@@ -684,40 +636,8 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
                          weights, kernel, stride, pad,
             in_channels, out_channels, bias,
             SABER_IMPL);
-    //cudaDeviceSynchronize();
-    caffe_gemm(out_channels, img_h * img_w, in_channels,\
-					 1.f, weights.data(),\
-					 img.data(), 0.f, out_gemm.mutable_data());
-    //cudaDeviceSynchronize();
-    SaberTimer<BM> t1;
-    int ts = 100;
-
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        caffe_gemm(out_channels, img_h * img_w, in_channels,\
-					 1.f, weights.data(),\
-					 img.data(), 0.f, out_gemm.mutable_data());
-        out_gemm.record_event(ctx1.get_compute_stream());
-        out_gemm.sync();
-        t1.end(ctx1);
-    }
-    LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
-
-    //cudaDeviceSynchronize();
-//    print_tensor_device(out);
-//    print_tensor_device(out_gemm);
-    TensorHf4 out_host;
-    TensorHf4 out_gemm_host;
-    out_host.re_alloc(out.shape());
-    out_host.copy_from(out);
-
-    out_gemm_host.re_alloc(out_gemm.shape());
-    out_gemm_host.copy_from(out_gemm);
-    double max_r, max_d;
-    tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d);
-    LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d;
 }
-*/
+
 int main(int argc, const char** argv){
     anakin::saber::Env<BM>::env_init();
 

From c27573a42bc43a670a347a2fcefe36ac752791cd Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 11:26:06 +0800
Subject: [PATCH 079/318] Update BM tensor print function

---
 saber/core/tensor_op.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 841c9c208..3d6494b9d 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -440,7 +440,7 @@ void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tenso
     for (int i = 0; i < tensor.size(); ++i) {
         printf("%.2f ", host_mem[i]);
 
-        if ((i + 1) % (4 * tensor.width()) == 0) {
+        if ((i + 1) % tensor.width() == 0){
             printf("\n");
         }
     }

From 679ae3fca424df5a7e92f2b4138616b062c13d50 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Wed, 27 Jun 2018 05:17:48 +0000
Subject: [PATCH 080/318] modify activation op, test pass

---
 saber/funcs/activation.h                      | 17 ++++++++++++-
 saber/funcs/impl/bm/vender_activation.h       | 15 ++++++------
 saber/saber_funcs_param.h                     | 24 +++++++++++++++++++
 .../bm/test_saber_func_activation_BM.cpp      | 13 ++++++----
 test/saber/bm/test_saber_func_pooling_BM.cpp  |  2 +-
 5 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h
index f39747a27..e1167bc9a 100644
--- a/saber/funcs/activation.h
+++ b/saber/funcs/activation.h
@@ -29,9 +29,23 @@
 #include "saber/funcs/impl/x86/saber_activation.h"
 #endif
 
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_activation.h"
+#endif
+
 namespace anakin {
 namespace saber {
 
+#ifdef USE_BM
+template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_BM,
+        DataType outDtype = AK_BM,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#else
 template<typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -40,6 +54,7 @@ template<typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
+#endif
 class Activation : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
@@ -110,4 +125,4 @@ class Activation : public BaseFunc<
 } // namespace saber
 } // namespace anakin
 
-#endif
\ No newline at end of file
+#endif
diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
index c4baf8365..ec27ac054 100644
--- a/saber/funcs/impl/bm/vender_activation.h
+++ b/saber/funcs/impl/bm/vender_activation.h
@@ -27,7 +27,7 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderActivation(): _handle(NULL), _active_type(NULL) {}
+    VenderActivation(): _handle(NULL), _active_type(Active_relu) {}
 
     ~VenderActivation() {}
 
@@ -35,6 +35,7 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
                             std::vector<DataTensor_out *>& outputs,
                             ActivationParam<OpTensor>& param, Context<BM>& ctx) {
         // not sure
+	_handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -49,14 +50,15 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             ActivationParam<OpTensor>& param) {
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        const InDataType in_data = *(inputs[0]->data());
+        OutDataType out_data = *(outputs[0]->mutable_data());
         int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width();
         int input_n = inputs[0]->num();
 
+        _active_type = param.active;
         switch (_active_type) {
             case Active_relu:
-                BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, input_n, input_dim, out_data));
+                BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, 0.0, input_n, input_dim, out_data));
                 break;
             case Active_sigmoid:
                 BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data));
@@ -64,9 +66,6 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
             case Active_tanh:
                 BMDNN_CHECK(bmdnn_tanh_forward(_handle, in_data, input_n, input_dim, out_data));
                 break;
-            case Active_elu:
-                BMDNN_CHECK(bmdnn_elu_forward(_handle, 1.0, in_data, input_n, input_dim, out_data));
-                break;
         }
         return SaberSuccess;
     }
@@ -76,7 +75,7 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     ActiveType _active_type;
 };
 
-template class VenderActivation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+template class VenderActivation<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
 } // namespace saber
 
 } // namespace anakin
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index 6a109540e..284fbcbc5 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -18,6 +18,7 @@
 #include "anakin_config.h"
 #include <vector>
 #include <string>
+#include <type_traits>
 #include "saber/core/shape.h"
 #include "saber/core/tensor.h"
 #include "saber/saber_types.h"
@@ -858,6 +859,29 @@ struct ActivationParam {
     DataDtype negative_slope;
     DataDtype coef;
 };
+
+#ifdef USE_BM
+template <>
+struct ActivationParam<Tensor<BM, AK_BM, NCHW> > {
+    ActivationParam(): active(Active_unknow) {}
+    ActivationParam(ActiveType act): active(act) {}
+    ActivationParam(const ActivationParam &right): active(right.active) {}
+    ActivationParam &operator=(const ActivationParam &right) {
+        active = right.active;
+        return *this;
+    }
+    bool operator==(const ActivationParam &right) {
+        bool comp_eq = true;
+        comp_eq = comp_eq && (active == right.active);
+        return comp_eq;
+    }
+    bool has_negative_slope(){
+        return (active == Active_relu);
+    }
+    ActiveType active;
+};
+#endif
+
 template <typename opTensor>
 struct ScaleParam {
     typedef typename opTensor::Dtype DataDtype;
diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp
index 523e94121..42f33e58d 100644
--- a/test/saber/bm/test_saber_func_activation_BM.cpp
+++ b/test/saber/bm/test_saber_func_activation_BM.cpp
@@ -32,10 +32,10 @@ void print_tensor_shape(std::string name, Tensor& t0) {
 TEST(TestSaberFuncBM, test_func_constructor) {
 
     typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
     int img_num = 1;
-    int in_channels = 2;
+    int in_channels = 1;
     int img_h = 8;
     int img_w = 8;
 
@@ -47,18 +47,21 @@ TEST(TestSaberFuncBM, test_func_constructor) {
     img_host.re_alloc(img_s);
     img_dev.re_alloc(img_s);
 
+    int sign = -1;
     for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1);
+	sign = i % 2 ? -1 : 1;
+        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * sign);
     }
 
     img_dev.copy_from(img_host);
     TensorDf4 output_dev;
+    print_tensor_device(img_dev);
 
     // start Reshape & doInfer
 
     Context<BM> ctx1(0, 1, 1);
 
-    ActivationParam<TensorDf4> param(Active_relu, 0.1f, 0.1f);
+    ActivationParam<TensorDf4> param(Active_relu);
 
     std::vector<TensorDf4*> input;
     std::vector<TensorDf4*> output;
@@ -66,7 +69,7 @@ TEST(TestSaberFuncBM, test_func_constructor) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act;
+    Activation<BM, AK_BM, AK_BM, AK_BM, NCHW> act;
     act.compute_output_shape(input, output, param);
     output_dev.re_alloc(output[0]->shape());
 
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index e988bc573..fb1a7398d 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -80,7 +80,7 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     pooling(input, output, param, ctx1);
 
     SaberTimer<BM> t1;
-    int ts = 1000;
+    int ts = 100;
 
     for (int i = 0; i < ts; ++i) {
         t1.start(ctx1);

From 1ab43e0aae2eb7dc532166b7948f3fa717418380 Mon Sep 17 00:00:00 2001
From: hlzy <327842846@qq.com>
Date: Wed, 27 Jun 2018 01:28:34 -0400
Subject: [PATCH 081/318] tensor_test

---
 test/saber/bm/test_saber_tensor_BM.cpp | 49 ++++++++++++++------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 69b1ccbfc..de787908b 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -8,6 +8,8 @@ typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
 typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
+typedef TensorDf4::Dtype dtype2;
+
 
 static bm_handle_t handle;
 TEST(TestSaberTensorBM, test_tensor_constructor) {
@@ -47,7 +49,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
               << thost0.height() << ", width = " << thost0.width();
 
     //! test tensor mutable_data() function
-    LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 2.f";
+    LOG(INFO) << "|--xxxxxxxxtest tensor mutable_data() function, write tensor data buffer with 2.f";
     fill_tensor_host_const(thost0, 2.f);
     LOG(INFO) << "|--test tensor data() function, show the const data, 2.f";
     print_tensor_host(thost0);
@@ -88,7 +90,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) <<
               "test tensor constructor with data, if target is different, create buffer, and copy the data";
     dtype* host_data_ptr;
-    dtype* dev_data_ptr;
+//    dtype2* dev_data_ptr;
     void* tmp_pt_host;
     void* tmp_pt_dev;
     X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count());
@@ -98,26 +100,28 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
         host_data_ptr[i] = i;
     }
 
-    BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
-    dev_data_ptr = static_cast<dtype*>(tmp_pt_dev);
-//    bm_memcpy_d2s(handle,host_data_ptr,dev_data_ptr)
-//    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
+    BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype2) * sh1.count());
+//    dev_data_ptr = static_cast<dtype2*>(tmp_pt_dev);
+//    bm_memcpy_d2s(handle,*dev_data_ptr,bm_mem_from_system(const_cast<float *>(host_data_ptr)));
+
+//---    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
+
     LOG(INFO) << "|--construct host tensor from host data ptr";
     TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from host data ptr";
 
-//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
-
     TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
 
     print_tensor_host(thost3);
 
-    TensorHf4 thost_lian(sh1);
-    thost_lian.copy_from(tdev3);
-    print_tensor_host(thost_lian);
+    print_tensor_device(tdev3);
 
-    thost_lian.copy_from(thost3);
-    print_tensor_host(thost_lian);
+//    TensorHf4 thost_lian(sh1);
+//    thost_lian.copy_from(tdev3);
+//    print_tensor_host(thost_lian);
+//
+//    thost_lian.copy_from(thost3);
+//    print_tensor_host(thost_lian);
 
     //cudaDeviceSynchronize();
     //
@@ -128,16 +132,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     print_tensor_host(thost4);
     print_tensor_device(tdev4);
-/*
+*/
+
     //BM_API::stream_t dev_stream0;
     //BM_API::create_stream_with_flag(dev_stream0, 1);
     //cudaDeviceSynchronize();
-
+/*
     //! test tensor copy constructor
     LOG(INFO) << "test tensor copy constructor";
     LOG(INFO) << "|--normal copy constructor";
-    TensorHf4 thost5(thost4);
-    TensorDf4 tdev5(tdev4);
+//    TensorHf4 thost5(thost4);
+//    TensorDf4 tdev5(tdev4);
 
     LOG(INFO) << "|--push back to vector";
     std::vector<TensorHf4> vthost;
@@ -146,18 +151,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     vthost.push_back(thost1);
     vthost.push_back(thost2);
     vthost.push_back(thost3);
-    vthost.push_back(thost4);
-    vthost.push_back(thost5);
+//    vthost.push_back(thost4);
+//    vthost.push_back(thost5);
     vtdev.push_back(tdev0);
     vtdev.push_back(tdev1);
     vtdev.push_back(tdev2);
     vtdev.push_back(tdev3);
-    vtdev.push_back(tdev4);
-    vtdev.push_back(tdev5);
+//   vtdev.push_back(tdev4);
+//    vtdev.push_back(tdev5);
     print_tensor_host(vthost[5]);
     print_tensor_device(vtdev[5]);
     //cudaDeviceSynchronize();
-
+/*
     //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied
     LOG(INFO) << "test share_from function";
     TensorHf4 thost6, thost7;

From 80f57fb390b27fac834b6ac5e4ec5f1971e9d612 Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Wed, 27 Jun 2018 06:14:17 +0000
Subject: [PATCH 082/318] Fix sync_memcpy functions & test_saber_buffer_BM all
 passes

---
 saber/core/impl/bm/bm_impl.cpp         | 28 ++++++++++++++++++--------
 saber/core/target_wrapper.h            |  4 ++--
 test/saber/bm/test_saber_buffer_BM.cpp | 24 ++++++----------------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index ef26884b2..a50994a60 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -78,27 +78,39 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
     //BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
-//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-//    size_t count, __DtoD) {};
+void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+    size_t count, __DtoD) {
+    handle = get_bm_handle(); 
+    //BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
+    BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count));
+    LOG(INFO) << "BM sync_memcpy: device to device, finished";
+};
 
 void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __HtoD) {
     handle = get_bm_handle(); 
-    BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), bm_mem_from_system(src)));
+    BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src)));
+    for(int i=0; i<10; i++)
+	std::cout << "HtoD src: " << *((float *)(src)+i) << std::endl;
+    
     LOG(INFO) << "BM sync_memcpy: host to device, finished";
 };
 
 void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __DtoH) {
     handle = get_bm_handle(); 
-    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), bm_mem_from_device(src)));
-    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
-    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(reinterpret_cast<struct bm_mem_desc *>(src))));
+    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
+    for(int i=0; i<10; i++)
+        std::cout << "DtoH dst: " << *((float *)(dst)+i) << std::endl;
+
     LOG(INFO) << "BM sync_memcpy: device to host, finished";
 };
 
-//static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-//    int src_dev, size_t count) {};
+void BM_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+    int src_dev, size_t count) { 
+
+    LOG(INFO) << "BM sync_memcpy_p2p: temporarily no used";
+};
 
 
 //! target wrapper
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 925f4dd39..d87b2ae03 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -398,7 +398,7 @@ struct TargetWrapper<BM, __device_target> {
     // brief create event, empty function for bitmain target
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoD) {};
+        size_t count, __DtoD);
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
         size_t count, __HtoD);
@@ -407,7 +407,7 @@ struct TargetWrapper<BM, __device_target> {
         size_t count, __DtoH);
 
     static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-        int src_dev, size_t count) {};
+        int src_dev, size_t count);
 
     /**
      * \brief device target return currently used device id
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 555e22675..f8c8f46bb 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -27,7 +27,7 @@ void test_buffer() {
     x86_ptr = static_cast<Hdtype*>(tmp_x86);
 
     for (int i = 0; i < n0; i++) {
-        x86_ptr[i] = static_cast<Hdtype>(100);
+        x86_ptr[i] = static_cast<Hdtype>(i);
     }
 
     void* tmp_bm;
@@ -97,25 +97,13 @@ void test_buffer() {
     }
 
     CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect";
+    bm_buf1.sync_copy_from(x86_buf2); 
     LOG(INFO) << "deep copy from host buffer to device buffer";
-    bm_buf1.sync_copy_from(x86_buf2);
-    
-    /*
-    const Hdtype* x86_buf2_ptr = static_cast<const Hdtype*>(x86_buf2.get_data());
-    for (int i = 0; i < 10; i++) {
-	std::cout << "x86: " << x86_buf2_ptr[i] << std::endl;
-    }
-    */
-
-    const Hdtype* bm_buf1_ptr = static_cast<const Hdtype*>(bm_buf1.get_data());
-    for (int i = 0; i < 10; i++) {
-	std::cout << "bm: " << bm_buf1_ptr[i] << std::endl;
-    }
 
-    LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count();
-    LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); 
-    LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype);
-    LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype);
+    //LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count();
+    //LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); 
+    //LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype);
+    //LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype);
     
 
     x86_buf1.re_alloc(bm_buf1.get_capacity());

From a1bd3fdcbaf82268a6f60a56cdbebd34a17ffa11 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 14:22:38 +0800
Subject: [PATCH 083/318] Implement BM softmax

---
 saber/funcs/impl/bm/vender_softmax.h          | 106 ++++++++++
 saber/funcs/softmax.h                         |  15 ++
 test/saber/bm/test_saber_func_softmax_BM.cpp  | 194 ++++++++++++++++++
 test/saber/bm/test_saber_func_softmax_BM.h    |  21 ++
 .../saber/x86/test_saber_func_softmax_x86.cpp |   2 +-
 5 files changed, 337 insertions(+), 1 deletion(-)
 create mode 100644 saber/funcs/impl/bm/vender_softmax.h
 create mode 100644 test/saber/bm/test_saber_func_softmax_BM.cpp
 create mode 100644 test/saber/bm/test_saber_func_softmax_BM.h

diff --git a/saber/funcs/impl/bm/vender_softmax.h b/saber/funcs/impl/bm/vender_softmax.h
new file mode 100644
index 000000000..fb2595e87
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_softmax.h
@@ -0,0 +1,106 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H
+
+#include "saber/funcs/impl/impl_softmax.h"
+#include "saber/saber_funcs_param.h"
+#include "saber/saber_types.h"
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderSoftmax<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        SoftmaxParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderSoftmax(): _handle(NULL) {}
+    ~VenderSoftmax() {}
+
+    /**
+     * \brief initial all bmdnn resources here
+     * @param inputs
+     * @param outputs
+     * @param param
+     * @param ctx
+     */
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            SoftmaxParam<OpTensor>& param, Context<BM>& ctx) {
+
+        _handle = get_bm_handle();
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            SoftmaxParam<OpTensor>& param, Context<BM> &ctx) {
+
+    }
+
+    //call cudnnConvolutionForward here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          SoftmaxParam<OpTensor> &param){
+
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+
+        /*
+        int outer_num = inputs[0]->count(0, param.axis);
+        int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims());
+
+        int N = outer_num;
+        int K = inputs[0]->valid_shape()[param.axis];
+        int H = inner_num;
+        int W = 1;
+
+        const int stride_w = 1;
+        const int stride_h = W * stride_w;
+        const int stride_c = H * stride_h;
+        const int stride_n = K * stride_c;
+         */
+
+        bmdnn_softmax_forward(
+                _handle,
+                *in_data,
+                input_n,
+                input_c,
+                input_h * input_w,
+                *out_data
+        );
+
+        return SaberSuccess;
+    }
+
+private:
+    bm_handle_t _handle;
+};
+
+} //namespace saber
+
+} //namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H
diff --git a/saber/funcs/softmax.h b/saber/funcs/softmax.h
index 3fa5d850e..4a1e631f0 100644
--- a/saber/funcs/softmax.h
+++ b/saber/funcs/softmax.h
@@ -27,10 +27,24 @@
 #include "saber/funcs/impl/x86/saber_softmax.h"
 #endif
 
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_softmax.h"
+#endif
+
 namespace anakin{
 
 namespace saber{
 
+#ifdef USE_BM
+template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_BM,
+        DataType outDtype = AK_BM,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#else
 template <typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -39,6 +53,7 @@ template <typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
+#endif
 class Softmax : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
new file mode 100644
index 000000000..2da0d2e62
--- /dev/null
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -0,0 +1,194 @@
+#include "core/context.h"
+#include "funcs/softmax.h"
+#include "test_saber_func_softmax_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    typedef TensorDf4::Dtype dtype;
+
+    int test_iter = 1000;
+
+    int softmax_axis = 3; // channel
+    int w_in = 3;
+    int h_in = 225;
+    int ch_in = 40;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = shape_in;
+
+    SoftmaxParam<TensorDf4> param(softmax_axis);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    LOG(INFO) << "softmax axis= " << param.axis;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i % 4;
+    }
+
+    TensorDf4 tdin, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    input_dev_4d.push_back(&tdin);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    Softmax<BM, AK_BM> softmax_dev;
+
+    typedef std::vector<Shape> Shape_v;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    output_dev_4d.push_back(&tdout);
+    softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "softmax initialized to cudnn impl";
+    softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "cudnn softmax compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+    }
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter);
+
+    LOG(INFO) << "softmax initialized to saber impl";
+    softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev);
+
+    LOG(INFO) << "saber softmax compute";
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+    }
+
+    t1.end(ctx_dev);
+    ts = t1.get_average_ms();
+    printf("saber softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter);
+    //print_tensor_device(*output_dev_4d[0]);
+}
+
+TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    typedef TensorDf4::Dtype dtype;
+
+    int test_iter = 1;
+
+    int softmax_axis = 3; // channel
+    int w_in = 3;
+    int h_in = 10;
+    int ch_in = 10;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in / 2, h_in / 2, w_in};
+    Shape shape_out = shape_in_roi;
+
+    SoftmaxParam<TensorDf4> param(softmax_axis);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    LOG(INFO) << "softmax axis= " << param.axis;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = (i % 3);
+    }
+
+    TensorDf4 tdin, tdin_roi, tdout, tdout_roi;
+    tdin.re_alloc(shape_in);
+    tdout.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi.share_sub_buffer(tdin, shape_in_roi, Shape(0, 0, 0, 0));
+    input_dev_4d.push_back(&tdin_roi);
+    output_dev_4d.push_back(&tdout_roi);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    Softmax<BM, AK_BM> softmax_dev;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->share_sub_buffer(tdout, shape_in_roi, Shape(0, 0, 0, 0));
+    //output_dev_4d[0]->reshape(output_dev_4d[0]->valid_shape());
+
+    LOG(INFO) << "softmax initialization";
+    softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev);
+
+    LOG(INFO) << "softmax compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+    }
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    printf("total time : %.4f, avg time : %.4f\n", ts, ts / test_iter);
+    print_tensor_device(*output_dev_4d[0]);
+
+    TensorDf4 troi(output_dev_4d[0]->valid_shape());
+    troi.copy_from(*output_dev_4d[0]);
+    print_tensor_device(troi);
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_func_softmax_BM.h b/test/saber/bm/test_saber_func_softmax_BM.h
new file mode 100644
index 000000000..d5c5b6986
--- /dev/null
+++ b/test/saber/bm/test_saber_func_softmax_BM.h
@@ -0,0 +1,21 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/tensor.h"
+
+using namespace anakin::test;
+
+class TestSaberFuncSoftmaxBM : public Test {
+public:
+    TestSaberFuncSoftmaxBM() {}
+    ~TestSaberFuncSoftmaxBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H
diff --git a/test/saber/x86/test_saber_func_softmax_x86.cpp b/test/saber/x86/test_saber_func_softmax_x86.cpp
index c4942c302..179806e02 100644
--- a/test/saber/x86/test_saber_func_softmax_x86.cpp
+++ b/test/saber/x86/test_saber_func_softmax_x86.cpp
@@ -63,7 +63,7 @@ void test(int num, int channel) {
     dst_saber.re_alloc(shape_out);
     output_softmax.push_back(&dst_saber);
 
-    Softmax<X86, AK_FLOAT> op_softmax;
+    Softmax<X86, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW> op_softmax;
     SoftmaxParam<Tensor4f> smx_pm;
 
     op_softmax.init(input_softmax, output_softmax, smx_pm, SPECIFY, SABER_IMPL, ctx_host);

From 7c0a0f0118475617a60995314370759dfeea032c Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 14:53:56 +0800
Subject: [PATCH 084/318] only print in DEBUG

---
 saber/core/impl/bm/bm_impl.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index a50994a60..4d24dedf0 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -90,8 +90,11 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __HtoD) {
     handle = get_bm_handle(); 
     BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src)));
+
+    #ifdef DEBUG
     for(int i=0; i<10; i++)
-	std::cout << "HtoD src: " << *((float *)(src)+i) << std::endl;
+	    LOG(INFO) << "HtoD src: " << *((float *)(src)+i);
+    #endif
     
     LOG(INFO) << "BM sync_memcpy: host to device, finished";
 };
@@ -100,8 +103,11 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __DtoH) {
     handle = get_bm_handle(); 
     BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
+
+    #ifdef DEBUG
     for(int i=0; i<10; i++)
-        std::cout << "DtoH dst: " << *((float *)(dst)+i) << std::endl;
+        LOG(INFO) << "DtoH dst: " << *((float *)(dst)+i);
+    #endif
 
     LOG(INFO) << "BM sync_memcpy: device to host, finished";
 };

From 635ff4260496f98657440461c7f251c2b6a4c907 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 15:05:37 +0800
Subject: [PATCH 085/318] reduce iteration

---
 test/saber/bm/test_saber_func_softmax_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index 2da0d2e62..8176a9e51 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -16,7 +16,7 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
     typedef TensorDf4::Dtype dtype;
 
-    int test_iter = 1000;
+    int test_iter = 10;
 
     int softmax_axis = 3; // channel
     int w_in = 3;

From 4a9863f59da04a26ef151208ec84bc31a1386d8e Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 15:11:57 +0800
Subject: [PATCH 086/318] Revert "reduce iteration"

This reverts commit 635ff4260496f98657440461c7f251c2b6a4c907.
---
 test/saber/bm/test_saber_func_softmax_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index 8176a9e51..2da0d2e62 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -16,7 +16,7 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
     typedef TensorDf4::Dtype dtype;
 
-    int test_iter = 10;
+    int test_iter = 1000;
 
     int softmax_axis = 3; // channel
     int w_in = 3;

From 2997faf062e8ef4bf6310c425ab369059fec335d Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Wed, 27 Jun 2018 08:19:32 +0000
Subject: [PATCH 087/318] modify fc op, compile error

---
 saber/funcs/fc.h                        | 18 ++++++++++++++++++
 saber/funcs/impl/bm/vender_fc.h         | 12 ++++++------
 test/saber/bm/test_saber_func_fc_BM.cpp |  4 ++--
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/saber/funcs/fc.h b/saber/funcs/fc.h
index 06dc8695a..035d85934 100644
--- a/saber/funcs/fc.h
+++ b/saber/funcs/fc.h
@@ -27,10 +27,24 @@
 #include "saber/funcs/impl/x86/vender_fc.h"
 #endif
    
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_fc.h"
+#endif
+
 namespace anakin{
 
 namespace saber{
 
+#ifdef USE_BM
+template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_BM,
+        DataType outDtype = AK_BM,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#else
 template<typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -39,6 +53,7 @@ template<typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
+#endif
 class Fc : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
@@ -125,6 +140,9 @@ class Fc : public BaseFunc<
 #endif
 #ifdef USE_X86_PLACE
         this->_best_impl = this->_impl[0];
+#endif
+#ifdef USE_BM
+        this->_best_impl = this->_impl[0];
 #endif
     }
 
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 82dd6000c..5004ad349 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -1,6 +1,5 @@
 #ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H
 #define ANAKIN_SABER_FUNCS_BMDNN_FC_H
-
 #include "saber/funcs/impl/impl_fc.h"
 
 namespace anakin{
@@ -34,6 +33,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
+        _handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -46,10 +46,10 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param){
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data();
-        const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        const InDataType in_data = *(inputs[0]->data());
+        const InDataType weights = *(InDataType*)(param.weights->get_buf()->get_data());
+        const InDataType bias = *(InDataType*)(param.bias->get_buf()->get_data());
+        OutDataType out_data = *(outputs[0]->mutable_data());
         int batch_size = inputs[0]->num();
         int input_len = inputs[0]->channel();
         int output_len = param.num_output;
@@ -64,7 +64,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     bm_handle_t _handle;
 };
 
-template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+template class VenderFc<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
 } //namespace saber
 
 } //namespace anakin
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
index 869ff1bfd..5acbc453e 100644
--- a/test/saber/bm/test_saber_func_fc_BM.cpp
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -7,7 +7,7 @@
 
 using namespace anakin::saber;
 typedef TargetWrapper<BM> API;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
 typedef TensorDf4::Dtype ftype;
 
@@ -80,7 +80,7 @@ TEST(TestSaberFuncBM, test_func_fc) {
 
     FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
 
-    Fc<BM, AK_FLOAT> fc;
+    Fc<BM, AK_BM> fc;
 
     LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
               shape_out[2] << ", " << shape_out[3];

From ff5039ff63bb89a32f57c48a14ef0a5e8e0061c7 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 16:53:20 +0800
Subject: [PATCH 088/318] Update for BM softmax

---
 saber/funcs/impl/bm/vender_softmax.h         | 14 +++++++-----
 test/saber/bm/test_saber_func_softmax_BM.cpp | 23 ++++++++++----------
 test/saber/bm/test_saber_func_softmax_BM.h   | 21 ------------------
 3 files changed, 20 insertions(+), 38 deletions(-)
 delete mode 100644 test/saber/bm/test_saber_func_softmax_BM.h

diff --git a/saber/funcs/impl/bm/vender_softmax.h b/saber/funcs/impl/bm/vender_softmax.h
index fb2595e87..55612f66a 100644
--- a/saber/funcs/impl/bm/vender_softmax.h
+++ b/saber/funcs/impl/bm/vender_softmax.h
@@ -63,12 +63,13 @@ class VenderSoftmax<BM, OpDtype, inDtype, outDtype,\
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
 
+        /*
         int input_n = inputs[0]->num();
         int input_c = inputs[0]->channel();
         int input_h = inputs[0]->height();
         int input_w = inputs[0]->width();
+        */
 
-        /*
         int outer_num = inputs[0]->count(0, param.axis);
         int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims());
 
@@ -77,18 +78,19 @@ class VenderSoftmax<BM, OpDtype, inDtype, outDtype,\
         int H = inner_num;
         int W = 1;
 
+        /*
         const int stride_w = 1;
         const int stride_h = W * stride_w;
         const int stride_c = H * stride_h;
         const int stride_n = K * stride_c;
-         */
-
+        */
+        
         bmdnn_softmax_forward(
                 _handle,
                 *in_data,
-                input_n,
-                input_c,
-                input_h * input_w,
+                N,
+                K,
+                H * W,
                 *out_data
         );
 
diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index 2da0d2e62..6c38c7534 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -1,15 +1,16 @@
 #include "core/context.h"
 #include "funcs/softmax.h"
-#include "test_saber_func_softmax_BM.h"
+#include "test_saber_func_BM.h"
 #include "tensor_op.h"
 #include "saber_types.h"
 #include <vector>
 
 using namespace anakin::saber;
 
-TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
-    Env<BM>::env_init();
+TEST(TestSaberFuncBM, test_func_softmax_BM) {
+
+    //Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
@@ -74,8 +75,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
     for (int i = 0; i < test_iter; ++i) {
         softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
+        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        //output_dev_4d[0]->sync();
     }
 
     t1.end(ctx_dev);
@@ -91,8 +92,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
     for (int i = 0; i < test_iter; ++i) {
         softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
+        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        //output_dev_4d[0]->sync();
     }
 
     t1.end(ctx_dev);
@@ -101,9 +102,9 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
     //print_tensor_device(*output_dev_4d[0]);
 }
 
-TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) {
+TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) {
 
-    Env<BM>::env_init();
+    //Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
@@ -170,8 +171,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) {
 
     for (int i = 0; i < test_iter; ++i) {
         softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
+        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        //output_dev_4d[0]->sync();
     }
 
     t1.end(ctx_dev);
diff --git a/test/saber/bm/test_saber_func_softmax_BM.h b/test/saber/bm/test_saber_func_softmax_BM.h
deleted file mode 100644
index d5c5b6986..000000000
--- a/test/saber/bm/test_saber_func_softmax_BM.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/tensor.h"
-
-using namespace anakin::test;
-
-class TestSaberFuncSoftmaxBM : public Test {
-public:
-    TestSaberFuncSoftmaxBM() {}
-    ~TestSaberFuncSoftmaxBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H

From ebb12b4bde4f87a1087a51e53f43d3866694f7c1 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Wed, 27 Jun 2018 17:39:42 +0800
Subject: [PATCH 089/318] xRevert "modify fc op, compile error"

This reverts commit 2997faf062e8ef4bf6310c425ab369059fec335d.
---
 saber/funcs/fc.h                        | 18 ------------------
 saber/funcs/impl/bm/vender_fc.h         | 12 ++++++------
 test/saber/bm/test_saber_func_fc_BM.cpp |  4 ++--
 3 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/saber/funcs/fc.h b/saber/funcs/fc.h
index 035d85934..06dc8695a 100644
--- a/saber/funcs/fc.h
+++ b/saber/funcs/fc.h
@@ -27,24 +27,10 @@
 #include "saber/funcs/impl/x86/vender_fc.h"
 #endif
    
-#ifdef USE_BM
-#include "saber/funcs/impl/bm/vender_fc.h"
-#endif
-
 namespace anakin{
 
 namespace saber{
 
-#ifdef USE_BM
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_BM,
-        DataType outDtype = AK_BM,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW
->
-#else
 template<typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -53,7 +39,6 @@ template<typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
-#endif
 class Fc : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
@@ -140,9 +125,6 @@ class Fc : public BaseFunc<
 #endif
 #ifdef USE_X86_PLACE
         this->_best_impl = this->_impl[0];
-#endif
-#ifdef USE_BM
-        this->_best_impl = this->_impl[0];
 #endif
     }
 
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 5004ad349..82dd6000c 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -1,5 +1,6 @@
 #ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H
 #define ANAKIN_SABER_FUNCS_BMDNN_FC_H
+
 #include "saber/funcs/impl/impl_fc.h"
 
 namespace anakin{
@@ -33,7 +34,6 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
-        _handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -46,10 +46,10 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param){
-        const InDataType in_data = *(inputs[0]->data());
-        const InDataType weights = *(InDataType*)(param.weights->get_buf()->get_data());
-        const InDataType bias = *(InDataType*)(param.bias->get_buf()->get_data());
-        OutDataType out_data = *(outputs[0]->mutable_data());
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data();
+        const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
         int batch_size = inputs[0]->num();
         int input_len = inputs[0]->channel();
         int output_len = param.num_output;
@@ -64,7 +64,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     bm_handle_t _handle;
 };
 
-template class VenderFc<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
+template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
 } //namespace saber
 
 } //namespace anakin
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
index 5acbc453e..869ff1bfd 100644
--- a/test/saber/bm/test_saber_func_fc_BM.cpp
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -7,7 +7,7 @@
 
 using namespace anakin::saber;
 typedef TargetWrapper<BM> API;
-typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
 typedef TensorDf4::Dtype ftype;
 
@@ -80,7 +80,7 @@ TEST(TestSaberFuncBM, test_func_fc) {
 
     FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
 
-    Fc<BM, AK_BM> fc;
+    Fc<BM, AK_FLOAT> fc;
 
     LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
               shape_out[2] << ", " << shape_out[3];

From 56f6122e47ae6d4286a353e8a5b01dc199913e73 Mon Sep 17 00:00:00 2001
From: hlzy <327842846@qq.com>
Date: Wed, 27 Jun 2018 07:46:39 -0400
Subject: [PATCH 090/318] change tensor_test_bm

---
 saber/core/target_wrapper.h            | 5 +----
 test/saber/bm/test_saber_tensor_BM.cpp | 6 ------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 8f84ca759..aafbf3648 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -423,13 +423,10 @@ struct TargetWrapper<BM, __device_target> {
      * @return          currently activated device id
      */
     static int get_device_id();
-<<<<<<< HEAD
 
-    static bm_handle_t get_handler();
+//    static bm_handle_t get_handler();
     
 //    bm_handle_t handle;
-=======
->>>>>>> c0edd55a1bdd22e12dc62c9463d229285e5f5d80
 };
 
 #endif //USE_BM
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index f720581ef..9fb62d989 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -72,12 +72,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     thost1.copy_from(tdev1);
     print_tensor_host(thost1);
 
-<<<<<<< HEAD
-    
-    // device to device
-=======
-    //device to device
->>>>>>> c0edd55a1bdd22e12dc62c9463d229285e5f5d80
     tdev1.copy_from(tdev0);
     print_tensor_device(tdev1);
 

From 571e3a43f3dfe3ec05ceae290624e98d2941718b Mon Sep 17 00:00:00 2001
From: hlzy <327842846@qq.com>
Date: Wed, 27 Jun 2018 20:19:31 -0400
Subject: [PATCH 091/318] tensor test update

---
 saber/core/tensor.h                    | 37 ++++++++++++++--
 test/saber/bm/test_saber_tensor_BM.cpp | 61 +++++++++++++++-----------
 2 files changed, 69 insertions(+), 29 deletions(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 0beaa7b04..7c1d00052 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -19,7 +19,7 @@
 #include "core/shape.h"
 #include "core/events.h"
 #include "core/tensor_traits.h"
-
+#include <typeinfo>
 namespace anakin{
 
 namespace saber{
@@ -117,20 +117,49 @@ class Tensor : public TensorBase {
     /**
      * \brief Constructor with allocated data ptr and entire memory shape.
      */
-    template <typename TargetType_t>
-    Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape) {
+//    template <typename TargetType_t>
+//    Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape) {
+//
+//        CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \
+//            "shape dims is not matched to layout type";
+//        _shape = shape;
+//        _valid_shape = shape;
+//        _offset = Shape::zero(shape.dims());
+//        std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
+//            std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len(), id);
+//        BufferMemShare(_buf, buf_from_date);
+//        _is_subbuf = false;
+//    }
 
+#ifdef USE_BM
+    /**
+     * \brief Constructor with allocated data ptr and entire memory shape. only for BM
+    */ 
+    template <typename Dtype_s,typename TargetType_t>
+    Tensor(Dtype_s* data_ptr, TargetType_t target, int id, Shape shape) {
         CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \
             "shape dims is not matched to layout type";
         _shape = shape;
         _valid_shape = shape;
         _offset = Shape::zero(shape.dims());
+
+        if(typeid(Dtype_s) == typeid(AK_FLOAT))
+        {
+        std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
+            std::make_shared<Buffer<TargetType_t>>(&bm_mem_from_system(const_cast<Dtype_s *>(data_ptr)), shape.count() * _type_len(), id);
+
+        BufferMemShare(_buf, buf_from_date);
+        }
+        else
+        {
         std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
             std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len(), id);
+
         BufferMemShare(_buf, buf_from_date);
+        }
         _is_subbuf = false;
     }
-
+#endif
     /**
      * \brief Copy constructor, shallow copy.
      */
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 429f61673..423ffe221 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -32,7 +32,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     //! test tensor re_alloc function on tensor with data
     LOG(INFO) << "|--test tensor re_alloc function on tensor with data";
-    Shape sh1(2, 4, 4, 2);
+    Shape sh1(1, 4, 4, 4);
     thost0.re_alloc(sh1);
     tdev0.re_alloc(sh1);
     LOG(INFO) << "|--tensor size of host: " << thost0.size();
@@ -74,10 +74,12 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     thost1.copy_from(tdev1);
     print_tensor_host(thost1);
 
+    LOG(INFO) << "test copy_from() function device to device";
+
     tdev1.copy_from(tdev0);
     print_tensor_device(tdev1);
 
-    /*
+    
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed
@@ -88,7 +90,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) <<
               "test tensor constructor with data, if target is different, create buffer, and copy the data";
     dtype* host_data_ptr;
-//    dtype2* dev_data_ptr;
+    dtype2* dev_data_ptr;
     void* tmp_pt_host;
     void* tmp_pt_dev;
     X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count());
@@ -99,16 +101,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     }
 
     BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype2) * sh1.count());
-//    dev_data_ptr = static_cast<dtype2*>(tmp_pt_dev);
-//    bm_memcpy_d2s(handle,*dev_data_ptr,bm_mem_from_system(const_cast<float *>(host_data_ptr)));
-
+    dev_data_ptr = static_cast<dtype2*>(tmp_pt_dev);
 //---    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
-
+    BM_API::sync_memcpy(dev_data_ptr,0,host_data_ptr,0,0,__HtoD());
     LOG(INFO) << "|--construct host tensor from host data ptr";
     TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from host data ptr";
 
-    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+
+    TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+
 
     print_tensor_host(thost3);
 
@@ -123,24 +126,30 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     //cudaDeviceSynchronize();
     //
-/*
+
     LOG(INFO) << "|--construct host tensor from device data ptr";
-    TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
-    LOG(INFO) << "|--constructor device tensor from device data ptr";
-    TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
-    print_tensor_host(thost4);
-    print_tensor_device(tdev4);
-*/
+    TensorHf4 thost4(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+
+    TensorDf4 tdev4(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+
+//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+
+//    TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
+//    LOG(INFO) << "|--constructor device tensor from device data ptr";
+//    TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
+//    print_tensor_host(thost4);
+//    print_tensor_device(tdev4);
+
 
     //BM_API::stream_t dev_stream0;
     //BM_API::create_stream_with_flag(dev_stream0, 1);
     //cudaDeviceSynchronize();
-/*
+
     //! test tensor copy constructor
     LOG(INFO) << "test tensor copy constructor";
     LOG(INFO) << "|--normal copy constructor";
-//    TensorHf4 thost5(thost4);
-//    TensorDf4 tdev5(tdev4);
+    TensorHf4 thost5(thost4);
+    TensorDf4 tdev5(tdev4);
 
     LOG(INFO) << "|--push back to vector";
     std::vector<TensorHf4> vthost;
@@ -149,18 +158,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     vthost.push_back(thost1);
     vthost.push_back(thost2);
     vthost.push_back(thost3);
-//    vthost.push_back(thost4);
-//    vthost.push_back(thost5);
+    vthost.push_back(thost4);
+    vthost.push_back(thost5);
     vtdev.push_back(tdev0);
     vtdev.push_back(tdev1);
     vtdev.push_back(tdev2);
     vtdev.push_back(tdev3);
-//   vtdev.push_back(tdev4);
-//    vtdev.push_back(tdev5);
+    vtdev.push_back(tdev4);
+    vtdev.push_back(tdev5);
     print_tensor_host(vthost[5]);
     print_tensor_device(vtdev[5]);
     //cudaDeviceSynchronize();
-/*
+
     //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied
     LOG(INFO) << "test share_from function";
     TensorHf4 thost6, thost7;
@@ -172,7 +181,9 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     Shape sh2(1, 2, 2, 2);
     Shape offset(0, 0, 1, 1);
     LOG(INFO) << "|--shared host";
+
     thost6.share_sub_buffer(thost4, sh2, offset);
+
     LOG(INFO) << "|--copied host";
     tdev6.share_from(thost4);
     LOG(INFO) << "|--copied device";
@@ -180,6 +191,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) << "|--shared device";
     tdev7.share_from(tdev4);
 
+
     LOG(INFO) << "|--change data in shared tensor";
 
     //Shape sh_real = thost6.shape();
@@ -220,8 +232,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
     print_tensor_host(thost4);
-     */
-//    bmdnn_deinit(handle);
+    bmdnn_deinit(handle);
 }
 
 /*

From 62a04c8f1994447bfded4b9d2e3e03db7fb07b6d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 09:42:35 +0800
Subject: [PATCH 092/318] Add back missing files

---
 test/saber/bm/test_saber_buffer_BM.h    |  20 ++++
 test/saber/bm/test_saber_context_BM.h   |  21 ++++
 test/saber/bm/test_saber_device_BM.cpp  |  20 ++++
 test/saber/bm/test_saber_device_BM.h    |  21 ++++
 test/saber/bm/test_saber_func_BM.h      |  38 ++++++
 test/saber/bm/test_saber_func_fc_BM.cpp | 146 ++++++++++++++++++++++++
 test/saber/bm/test_saber_shape_BM.cpp   | 126 ++++++++++++++++++++
 test/saber/bm/test_saber_shape_BM.h     |  25 ++++
 8 files changed, 417 insertions(+)
 create mode 100644 test/saber/bm/test_saber_buffer_BM.h
 create mode 100644 test/saber/bm/test_saber_context_BM.h
 create mode 100644 test/saber/bm/test_saber_device_BM.cpp
 create mode 100644 test/saber/bm/test_saber_device_BM.h
 create mode 100644 test/saber/bm/test_saber_func_BM.h
 create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp
 create mode 100644 test/saber/bm/test_saber_shape_BM.cpp
 create mode 100644 test/saber/bm/test_saber_shape_BM.h

diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h
new file mode 100644
index 000000000..8bbbe4511
--- /dev/null
+++ b/test/saber/bm/test_saber_buffer_BM.h
@@ -0,0 +1,20 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+
+using namespace anakin::test;
+
+class TestSaberBufferBM : public Test {
+public:
+    TestSaberBufferBM() {}
+    ~TestSaberBufferBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h
new file mode 100644
index 000000000..653ee11fd
--- /dev/null
+++ b/test/saber/bm/test_saber_context_BM.h
@@ -0,0 +1,21 @@
+#ifndef SABER_TEST_SABER_CONTEXT_BM_H
+#define SABER_TEST_SABER_CONTEXT_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/context.h"
+
+using namespace anakin::test;
+
+class TestSaberContextBM : public Test {
+public:
+    TestSaberContextBM() {}
+    ~TestSaberContextBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //SABER_TEST_SABER_CONTEXT_BM_H
diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp
new file mode 100644
index 000000000..1c7086cf1
--- /dev/null
+++ b/test/saber/bm/test_saber_device_BM.cpp
@@ -0,0 +1,20 @@
+#include "test_saber_device_BM.h"
+
+#ifdef USE_BM
+
+using namespace anakin::saber;
+
+TEST(TestSaberDeviceBM, test_BM_device) {
+    Device<BM> dev_BM;
+}
+
+#endif
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h
new file mode 100644
index 000000000..3a6d61236
--- /dev/null
+++ b/test/saber/bm/test_saber_device_BM.h
@@ -0,0 +1,21 @@
+#ifndef SABER_TEST_SABER_DEVICE_BM_H
+#define SABER_TEST_SABER_DEVICE_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/device.h"
+
+using namespace anakin::test;
+
+class TestSaberDeviceBM : public Test {
+public:
+    TestSaberDeviceBM() {}
+    ~TestSaberDeviceBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //SABER_TEST_SABER_DEVICE_BM_H
diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h
new file mode 100644
index 000000000..61d27d6f9
--- /dev/null
+++ b/test/saber/bm/test_saber_func_BM.h
@@ -0,0 +1,38 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/tensor.h"
+#include <fstream>
+#include <vector>
+
+using namespace anakin::test;
+
+int read_file(std::vector<float> &results, const char* file_name) {
+
+    std::ifstream infile(file_name);
+    if (!infile.good()) {
+        std::cout << "Cannot open " << std::endl;
+        return false;
+    }
+    LOG(INFO)<<"found filename: "<<file_name;
+    std::string line;
+    while (std::getline(infile, line)) {
+        results.push_back((float)atof(line.c_str()));
+    }
+    return 0;
+}
+
+class TestSaberFuncBM : public Test {
+public:
+    TestSaberFuncBM() {}
+    ~TestSaberFuncBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
new file mode 100644
index 000000000..869ff1bfd
--- /dev/null
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -0,0 +1,146 @@
+#include "core/context.h"
+#include "funcs/fc.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+typedef TargetWrapper<BM> API;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef TensorDf4::Dtype ftype;
+
+void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
+                const TensorHf4& bias, TensorHf4& tout) {
+
+    int m = tin.num();
+    int k = tin.valid_size() / m;
+    int n = weight.valid_size() / k;
+    bool bias_term = bias.valid_size() > 0;
+
+    const float* din = tin.data();
+    const float* w = weight.data();
+    float* dout = tout.mutable_data();
+
+    for (int i = 0; i < m; ++i) {
+        float* pdout = dout + i * n;
+        const float* pdin = din + i * k;
+
+        for (int j = 0; j < n; ++j) {
+            if (bias_term) {
+                pdout[j] = bias.data()[j];
+            } else {
+                pdout[j] = 0;
+            }
+
+            for (int l = 0; l < k; ++l) {
+                pdout[j] += pdin[l] * w[l * n + j];
+            }
+        }
+    }
+}
+
+TEST(TestSaberFuncBM, test_func_fc) {
+
+    int test_iter = 100;
+    int w_in = 7;
+    int h_in = 7;
+    int ch_in = 512;
+    int num_in = 1;
+
+    int num_out = 4096;
+    int axis = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = {num_in, num_out, 1, 1};
+
+    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
+    TensorDf4 weight(sh_w);
+    Shape sh_b{1, 1, 1, num_out};
+    TensorDf4 bias(sh_b);
+    fill_tensor_device_const(weight, 1.f);
+    fill_tensor_device_const(bias, 1.f);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    TensorDf4 tdin;
+    TensorDf4 tdout;
+    tdin.re_alloc(shape_in);
+    fill_tensor_device_const(tdin, 1.f);
+    input_dev_4d.push_back(&tdin);
+    output_dev_4d.push_back(&tdout);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
+
+    Fc<BM, AK_FLOAT> fc;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
+    Shape va_sh = tdout.valid_shape();
+    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
+              va_sh[2] << ", " << va_sh[3];
+    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
+
+    LOG(INFO) << "FC initialization";
+    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
+
+    LOG(INFO) << "FC compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+        //cudaDeviceSynchronize();
+    }
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
+    //print_tensor_device(*output_dev_4d[0]);
+
+    //! check result
+    TensorHf4 thin(shape_in);
+    TensorHf4 thout(shape_out);
+    TensorHf4 thw(sh_w);
+    TensorHf4 thb(sh_b);
+    thin.copy_from(tdin);
+    thw.copy_from(weight);
+    thb.copy_from(bias);
+    fc_compute(thin, thw, thb, thout);
+    //print_tensor_host(thout);
+
+    TensorHf4 thout_d(shape_out);
+    thout_d.copy_from(tdout);
+    double max_ratio = 0;
+    double max_diff = 0;
+    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
+    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
+    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
+
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp
new file mode 100644
index 000000000..18479cd18
--- /dev/null
+++ b/test/saber/bm/test_saber_shape_BM.cpp
@@ -0,0 +1,126 @@
+#include "test_saber_shape_BM.h"
+#include "shape.h"
+#include "anakin_config.h"
+
+#ifdef USE_OPENMP
+#include <omp.h>
+#include <core/shape.h>
+#endif
+
+using namespace anakin;
+using namespace saber;
+
+
+TEST(TestSaberShapeBM, test_saber_shape) {
+
+    int dim = 4;
+    Shape sh4d0{0, 0, 0, 0};
+    CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error";
+
+    for (int i = 0; i < dim; ++i) {
+        CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error";
+    }
+
+    CHECK_EQ(sh4d0.count(), 0) << "check shape count error";
+
+    int N = 1;
+    int C = 3;
+    int H = 11;
+    int W = 11;
+    std::vector<int> sh_size = {N, C, H, W};
+    //Shape sh4d1(sh_size);
+    Shape sh4d1(N, C, H, W);
+    LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size();
+    CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!";
+    //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!";
+
+    CHECK_EQ(sh4d1[0], N) << "get shape size error";
+    CHECK_EQ(sh4d1[1], C) << "get shape size error";
+    CHECK_EQ(sh4d1[2], H) << "get shape size error";
+    CHECK_EQ(sh4d1[3], W) << "get shape size error";
+
+    //CHECK_EQ(sh4d2[0], N) << "get shape size error";
+    //CHECK_EQ(sh4d2[1], C) << "get shape size error";
+    //CHECK_EQ(sh4d2[2], H) << "get shape size error";
+    //CHECK_EQ(sh4d2[3], W) << "get shape size error";
+
+    CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed";
+
+    C = 10;
+    sh4d1[1] = C;
+    CHECK_EQ(sh4d1[1], C) << "set shape size error";
+
+    bool is_equal = (sh4d0 == sh4d1);
+    CHECK_EQ(is_equal, false) << "check shape is_equal failed";
+
+    sh4d0 = sh4d1;
+    CHECK_EQ(sh4d1[0], N) << "constructor failed";
+    CHECK_EQ(sh4d1[1], C) << "get shape size error";
+    CHECK_EQ(sh4d1[2], H) << "get shape size error";
+    CHECK_EQ(sh4d1[3], W) << "get shape size error";
+
+    Shape sh4d3 = sh4d1;
+    CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error";
+
+    Shape sh4d4(sh4d1);
+    CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error";
+
+    Shape sh1d0{0};
+    //std::vector<int> sh1d_size = {W};
+
+    //Shape sh1d1(sh1d_size);
+    //Shape sh1d0{W};
+    Shape sh1d1(W);
+
+    Shape sh1d3 = sh1d1;
+    Shape sh1d4(sh1d1);
+
+    CHECK_EQ(sh1d0.dims(), 1) << "shape dim error";
+
+    CHECK_EQ(sh1d0.count(), 0) << "shape size error";
+
+    CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error";
+
+    CHECK_EQ(sh1d1[0], W) << "get shape size error";
+
+    //CHECK_EQ(sh1d2.count(0), W) << "shape dim error";
+
+    CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error";
+
+    CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error";
+
+    CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error";
+
+    Shape sh0{2, 2, 3, 4};
+    Shape sh1{2, 1, 1, 24};
+    Shape sh2{2, 2, 3, 4};
+    Shape sh3{1, 1, 2, 3};
+
+    CHECK_EQ(sh0 == sh2, true) << "error ==";
+    CHECK_EQ(sh3 < sh0, true) << "error <";
+    CHECK_EQ(sh3 >= sh0, false) << "error >=";
+    CHECK_EQ(sh3 > sh0, false) << "error >";
+    CHECK_EQ(sh0 > sh3, true) << "error >";
+    CHECK_EQ(sh0 < sh1, false) << "error <";
+    CHECK_EQ(sh0 <= sh2, true) << "error <=";
+    CHECK_EQ(sh0 >= sh2, true) << "error >=";
+
+    Shape sh001 = Shape::zero(2);
+    Shape sh002 = Shape::zero(3);
+
+    if (sh001 > sh002) {
+        LOG(ERROR) << "error <";
+    }
+
+}
+
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
+
diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h
new file mode 100644
index 000000000..a2ca02c9b
--- /dev/null
+++ b/test/saber/bm/test_saber_shape_BM.h
@@ -0,0 +1,25 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "saber/core/shape.h"
+
+using namespace anakin::test;
+
+class TestSaberShapeBM : public Test {
+public:
+    TestSaberShapeBM() {}
+    ~TestSaberShapeBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+protected:
+    std::string name;
+    std::string _test;
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+

From bff601c294d62502ee92754df621a2f557c2760f Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 09:43:19 +0800
Subject: [PATCH 093/318] Add back missing files

---
 test/framework/core/base_types_test.cpp       | 143 +++++++++
 test/framework/core/core_test.h               |  46 +++
 test/framework/graph/graph_base_test.cpp      |  82 ++++++
 .../graph/graph_parser_from_model_test.cpp    |  88 ++++++
 test/framework/graph/graph_test.h             |  47 +++
 test/framework/net/benchmark.cpp              | 162 +++++++++++
 test/framework/net/chinese_ner_test.cpp       | 213 ++++++++++++++
 test/framework/net/model_test.cpp             | 175 +++++++++++
 .../net/net_exec_multi_thread_test.cpp        | 149 ++++++++++
 test/framework/net/net_exec_test.cpp          | 273 ++++++++++++++++++
 test/framework/net/net_test.h                 |  98 +++++++
 test/framework/net/padde_api_test.cpp         | 121 ++++++++
 test/framework/net/paddle_api.h               |  87 ++++++
 test/framework/operators/operator_tests.h     |  47 +++
 test/framework/operators/pooling_test.cpp     |  43 +++
 15 files changed, 1774 insertions(+)
 create mode 100644 test/framework/core/base_types_test.cpp
 create mode 100644 test/framework/core/core_test.h
 create mode 100644 test/framework/graph/graph_base_test.cpp
 create mode 100644 test/framework/graph/graph_parser_from_model_test.cpp
 create mode 100644 test/framework/graph/graph_test.h
 create mode 100644 test/framework/net/benchmark.cpp
 create mode 100644 test/framework/net/chinese_ner_test.cpp
 create mode 100644 test/framework/net/model_test.cpp
 create mode 100644 test/framework/net/net_exec_multi_thread_test.cpp
 create mode 100644 test/framework/net/net_exec_test.cpp
 create mode 100644 test/framework/net/net_test.h
 create mode 100644 test/framework/net/padde_api_test.cpp
 create mode 100644 test/framework/net/paddle_api.h
 create mode 100644 test/framework/operators/operator_tests.h
 create mode 100644 test/framework/operators/pooling_test.cpp

diff --git a/test/framework/core/base_types_test.cpp b/test/framework/core/base_types_test.cpp
new file mode 100644
index 000000000..0109493bf
--- /dev/null
+++ b/test/framework/core/base_types_test.cpp
@@ -0,0 +1,143 @@
+#include "core_test.h"
+#include "any.h"
+#include "singleton.h"
+#include "tls.h"
+#include "parameter.h"
+#include "thread_pool.h"
+
+#ifdef USE_CUDA
+#include "cuda_funcs.h"
+#include "sass_funcs.h"
+#endif
+
+#include "tensor.h"
+
+#ifdef USE_CUDA
+TEST(CoreComponentsTest, sass_test) {
+    LOG(INFO) << "test for cuda code function";
+    //anakin::saber::Tensor<3, RTCUDA, float, NCHW> ts;
+    //LOG(WARNING) << " tensor num " << ts.num();
+    //ts.set_offset(8);
+    //my_print();
+    LOG(INFO) << "test for sass code function 1";
+    invoke_test();
+    LOG(INFO) << "test for sass code function 2";
+    invoke_test_2();
+}
+#endif
+
+TEST(CoreComponentsTest, core_base_types_any_test) {
+    LOG(INFO) << "test for any class .";
+    LOG(WARNING) << " level 1 : base type int (set 42 to any)";
+    const int a = 42;
+    any any_a(42);
+    int result_a = any_cast<int>(any_a);
+
+    LOG(INFO) << "casted result : " <<  result_a;
+    LOG(WARNING) << " level 2 : base type float (set 42.8 to any)";
+    float b = 42.8;
+    any any_b = b;
+    float result_b = any_cast<float>(any_b);
+    LOG(INFO) << "casted result : " <<  result_b << " decide: ";
+
+    LOG(WARNING) << " level 3 : ptuple type (set PTuple<float> to any)";
+    PTuple<float> p_tuple_float(3.2f, 3.3f, 3.5f);
+    p_tuple_float.push_back(4.3); // push_back
+
+    any p_tuple_float_any = p_tuple_float;
+    auto result_p_tuple_float_any = any_cast<PTuple<float>>(p_tuple_float_any);
+
+    for (int i = 0; i < result_p_tuple_float_any.size(); i++) {
+        LOG(INFO) << " any casted PTuple<float>[" << i << "]: " << result_p_tuple_float_any[i];
+    }
+
+    struct target {
+        void print() {
+            LOG(INFO) << " target struct Successfully recovered.";
+        }
+    };
+
+    LOG(WARNING) << " level 5 : struct type";
+
+    target tg;
+
+    any any_tg = tg;
+
+    target result_tg = any_cast<target>(any_tg);
+
+    result_tg.print();
+
+    LOG(WARNING) << " level other : struct type";
+
+    any any_tg_copy = any_tg;
+
+    target result_tg_copy = any_cast<target>(any_tg);
+
+    result_tg_copy.print();
+}
+
+void at_exit_in_test() {
+    LOG(WARNING) << "core_base_types_singleton_test exit successfully!";
+}
+
+TEST(CoreComponentsTest, core_base_types_singleton_test) {
+    struct target {
+        target() {
+            LOG(INFO) << " singleton target constructed";
+        }
+    };
+    typedef Singleton<target, at_exit_in_test> sg_target;
+    sg_target::Global();
+}
+
+typedef AnakinThreadLocalVar<int> sg_tls;
+void thread_func_0() {
+    int* tmp = sg_tls::value();
+    *tmp = 3;
+    LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value());
+}
+void thread_func_1() {
+    int* tmp = sg_tls::value();
+    *tmp = 4;
+
+    LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value());
+}
+TEST(CoreComponentsTest, core_base_types_tls_test) {
+    LOG(INFO) << " Create tls var 0 , check in two thread.";
+    std::thread first(thread_func_0);
+    std::thread sec(thread_func_1);
+    first.join();
+    sec.join();
+    LOG(INFO) << " main thread var: " << *(sg_tls::value());
+}
+
+int thread_pool_func(int i) {
+    LOG(INFO) << " thread_pool_func input : " << i;
+    //std::this_thread::sleep_for(std::chrono::seconds(0));
+    return i;
+}
+
+TEST(CoreComponentsTest, core_base_types_thread_pool_test) {
+    LOG(INFO) << " Create thread pool with thread num = 12 ";
+    ThreadPool thread_pool_test(100);
+    thread_pool_test.launch();
+    std::function<int(int)> test = thread_pool_func;
+
+    for (int i = 0; i < 50; i++) {
+        // run async
+        auto ret = thread_pool_test.RunAsync(test, i);
+        LOG(INFO) << " return : " << ret.get();
+
+        // run sync
+        //auto sync_ret = thread_pool_test.RunSync(test, i);
+    }
+}
+
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
diff --git a/test/framework/core/core_test.h b/test/framework/core/core_test.h
new file mode 100644
index 000000000..6107eef4b
--- /dev/null
+++ b/test/framework/core/core_test.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_GRAPH_TEST_H
+#define ANAKIN_GRAPH_TEST_H
+
+#include <iostream>
+#include <string>
+#include <thread>
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+
+using namespace anakin;
+using ::anakin::test::Test;
+
+class CoreComponentsTest : public Test {
+public:
+    CoreComponentsTest(){}
+
+    void SetUp(){}
+
+    void TearDown(){}
+
+protected:
+};
+
+
+
+
+
+
+#endif
+
+
diff --git a/test/framework/graph/graph_base_test.cpp b/test/framework/graph/graph_base_test.cpp
new file mode 100644
index 000000000..d42e86c02
--- /dev/null
+++ b/test/framework/graph/graph_base_test.cpp
@@ -0,0 +1,82 @@
+#include <string>
+#include "graph_test.h"
+#include "graph_base.h"
+
+using namespace anakin;
+using namespace anakin::graph;
+
+//! Usage sample
+class GraphTestClass : public GraphBase<std::string, int, int> {
+public:
+    GraphTestClass() {}
+    ~GraphTestClass() {}
+    virtual bool directed() {
+        return true;
+    };
+};
+class edge : public Arc<std::string, int> {
+public:
+    edge(std::string btm, std::string top, int weight): Arc<std::string, int>(btm, top, weight) {}
+    ~edge() {}
+};
+
+TEST(GraphTest, graph_base_test) {
+    LOG(INFO) << "test for graph base .";
+
+    GraphTestClass graph;
+    graph.add_vertex("a", 42);
+    graph.add_vertex("b", 43);
+    graph.add_vertex("c", 44);
+    graph.add_vertex("d", 45);
+    graph.add_vertex("e", 46);
+    graph.add_vertex("f", 47);
+
+    edge arc0("a", "b", 0);
+    edge arc1("b", "c", 1);
+    edge arc2("c", "d", 2);
+    edge arc3("d", "e", 3);
+    edge arc4("e", "f", 4);
+    edge arc5("f", "a", 5);
+
+    graph.add_in_arc(arc0);
+    graph.add_in_arc(arc1);
+    graph.add_in_arc(arc2);
+    graph.add_in_arc(arc3);
+    graph.add_in_arc(arc4);
+    graph.add_in_arc(arc5);
+    graph.add_out_arc(arc0);
+    graph.add_out_arc(arc1);
+    graph.add_out_arc(arc2);
+    graph.add_out_arc(arc3);
+    graph.add_out_arc(arc4);
+    graph.add_out_arc(arc5);
+
+    LOG(WARNING) << "Construction of graph.";
+    LOG(INFO) << graph.to_string();
+
+    LOG(WARNING) << "Remove a from graph.";
+    graph.remove("a");
+    LOG(INFO) << graph.to_string();
+
+    LOG(WARNING) << "Add arc: f->b to graph.";
+    edge arc_f_b("f", "b", 10);
+    graph.add_in_arc(arc_f_b);
+    graph.add_out_arc(arc_f_b);
+    LOG(INFO) << graph.to_string();
+
+    LOG(WARNING) << "Add vertex:a and arc: a->e to graph.";
+    graph.add_vertex("a", 47);
+    edge arc_a_e("a", "e", 10);
+    graph.add_out_arc(arc_a_e);
+    graph.add_in_arc(arc_a_e);
+    LOG(INFO) << graph.to_string();
+}
+
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
diff --git a/test/framework/graph/graph_parser_from_model_test.cpp b/test/framework/graph/graph_parser_from_model_test.cpp
new file mode 100644
index 000000000..883a12858
--- /dev/null
+++ b/test/framework/graph/graph_parser_from_model_test.cpp
@@ -0,0 +1,88 @@
+#include <string>
+#include "graph_test.h"
+#include "graph_base.h"
+#include "graph.h"
+#include "scheduler.h"
+
+using namespace anakin;
+using namespace anakin::graph;
+
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/google_net/googlenet.anakin.bin";
+std::string model_path = "/home/chaowen/anakin_v2/model_v2/yolo/yolo.anakin.bin";
+
+
+TEST(GraphTest, graph_load_model) {
+    /*Graph<ARM, float, Precision::FP32>* graph = new Graph<ARM, float, Precision::FP32>();
+    LOG(WARNING) << "load anakin model file from " << model_path << " ...";
+    // load anakin model files.
+    graph->load(model_path);
+
+    DLOG(INFO) << graph->to_string();
+    // exec optimization
+    graph->Optimize();  */
+}
+
+#ifdef USE_CUDA
+TEST(GraphTest, nvidia_graph_save_model) {
+    Graph<NV, AK_FLOAT, Precision::FP32>* graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+    // load anakin model files.
+    LOG(INFO) << "load anakin model file from " << model_path << " ...";
+    graph->load(model_path);
+
+    // regisiter output tensor
+    //graph->RegistOut("data_perm",  "data_scale");
+
+    //  exec optimization
+    graph->Optimize();
+
+    // save the optimized model to disk.
+    std::string save_model_path = model_path + std::string(".saved");
+    Status status = graph->save(save_model_path);
+}
+#endif
+
+#ifdef USE_X86_PLACE
+TEST(GraphTest, x86_graph_save_model) {
+    Graph<X86, AK_FLOAT, Precision::FP32>* graph = new Graph<X86, AK_FLOAT, Precision::FP32>();
+    // load anakin model files.
+    LOG(INFO) << "load anakin model file from " << model_path << " ...";
+    graph->load(model_path);
+
+    // regisiter output tensor
+    //graph->RegistOut("data_perm",  "data_scale");
+
+    //  exec optimization
+    graph->Optimize();
+
+    // save the optimized model to disk.
+    std::string save_model_path = model_path + std::string(".saved");
+    Status status = graph->save(save_model_path);
+}
+#endif
+
+#ifdef USE_ARM_PLACE
+TEST(GraphTest, arm_graph_save_model) {
+    Graph<ARM, AK_FLOAT, Precision::FP32>* graph = new Graph<ARM, AK_FLOAT, Precision::FP32>();
+    // load anakin model files.
+    LOG(INFO) << "load anakin model file from " << model_path << " ...";
+    graph->load(model_path);
+
+    // regisiter output tensor
+    //graph->RegistOut("data_perm",  "data_scale");
+
+    //  exec optimization
+    graph->Optimize();
+
+    // save the optimized model to disk.
+    std::string save_model_path = model_path + std::string(".saved");
+    Status status = graph->save(save_model_path);
+}
+#endif
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
diff --git a/test/framework/graph/graph_test.h b/test/framework/graph/graph_test.h
new file mode 100644
index 000000000..db837c84a
--- /dev/null
+++ b/test/framework/graph/graph_test.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_GRAPH_TEST_H
+#define ANAKIN_GRAPH_TEST_H
+
+#include <iostream>
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+
+using namespace anakin;
+using ::anakin::test::Test;
+
+/**
+ * \brief Graph test is base Test class for anakin graph funciton.  
+ */
+class GraphTest: public Test {
+public:
+    GraphTest(){}
+
+    void SetUp(){}
+
+    void TearDown(){}
+
+protected:
+};
+
+
+
+
+
+
+#endif
+
+
diff --git a/test/framework/net/benchmark.cpp b/test/framework/net/benchmark.cpp
new file mode 100644
index 000000000..41c31c83e
--- /dev/null
+++ b/test/framework/net/benchmark.cpp
@@ -0,0 +1,162 @@
+#include <string>
+#include "net_test.h"
+#include "saber/funcs/timer.h"
+#include <chrono>
+#include "saber/core/tensor_op.h"
+#include <dirent.h> 
+#include <sys/stat.h> 
+#include <sys/types.h> 
+#include <unistd.h>  
+#include <fcntl.h>
+#include <map>
+
+#ifdef USE_GFLAGS
+#include <gflags/gflags.h>
+
+DEFINE_string(model_dir, "", "model dir");
+DEFINE_string(model_file, "", "model file");
+DEFINE_int32(num, 1, "batchSize");
+DEFINE_int32(warmup_iter, 10, "warm up iterations");
+DEFINE_int32(epoch, 1000, "time statistic epoch");
+#else
+std::string FLAGS_model_dir;
+std::string FLAGS_model_file;
+int FLAGS_num = 1;
+int FLAGS_warmup_iter = 10;
+int FLAGS_epoch = 1000;
+#endif
+
+#ifdef USE_CUDA
+typedef NV Target;
+#elif defined(USE_X86_PLACE)
+typedef X86 Target;
+#else
+typedef ARM Target;
+#endif
+
+void getModels(std::string path, std::vector<std::string>& files) {
+    DIR *dir;
+    struct dirent *ptr;
+    if ((dir = opendir(path.c_str())) == NULL) {
+        perror("Open dri error...");
+        exit(1);
+    }
+    while((ptr = readdir(dir)) != NULL) {
+        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0)
+            continue;
+        else if (ptr->d_type == 8)//file
+            files.push_back(path + "/" + ptr->d_name);
+        else if (ptr->d_type == 4) {
+            getModels(path + "/" + ptr->d_name, files);
+        }
+    }
+    closedir(dir);
+}
+TEST(NetTest, net_execute_base_test) {
+    std::vector<std::string> models;
+    if (FLAGS_model_file == "") {
+        getModels(FLAGS_model_dir, models);
+    } else {
+        models.push_back(FLAGS_model_dir + FLAGS_model_file);
+    }
+    for (auto iter = models.begin(); iter < models.end(); iter++)
+    {
+        LOG(WARNING) << "load anakin model file from " << *iter << " ...";
+        Graph<Target, AK_FLOAT, Precision::FP32> graph;   
+        auto status = graph.load(*iter);
+        if (!status) {
+            LOG(FATAL) << " [ERROR] " << status.info();
+        }
+        graph.ResetBatchSize("input_0", FLAGS_num);        
+        graph.Optimize();
+        // constructs the executer net
+        Net<Target, AK_FLOAT, Precision::FP32> net_executer(graph, true);
+        // get in
+        auto d_tensor_in_p = net_executer.get_in("input_0");
+        Tensor4d<X86, AK_FLOAT> h_tensor_in;
+        auto valid_shape_in = d_tensor_in_p->valid_shape();
+        for (int i = 0; i < valid_shape_in.size(); i++) {
+            LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i];
+        }
+        h_tensor_in.re_alloc(valid_shape_in);
+        fill_tensor_host_rand(h_tensor_in, -1.0f,1.0f);
+        d_tensor_in_p->copy_from(h_tensor_in);
+        // do inference
+        Context<Target> ctx(0, 0, 0);
+        saber::SaberTimer<Target> my_time;
+        LOG(WARNING) << "EXECUTER !!!!!!!! ";
+        for (int i = 0; i < FLAGS_warmup_iter; i++) {
+            net_executer.prediction();
+        }
+#ifdef ENABLE_OP_TIMER
+        net_executer.reset_op_time();
+#endif
+        my_time.start(ctx);
+        //auto start = std::chrono::system_clock::now();
+        for (int i = 0; i < FLAGS_epoch; i++) {
+        //DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") ";
+            net_executer.prediction();
+        }
+        my_time.end(ctx);
+#ifdef ENABLE_OP_TIMER
+        std::vector<float> op_time = net_executer.get_op_time();
+        auto exec_funcs = net_executer.get_exec_funcs();
+        auto op_param = net_executer.get_op_param();
+        for (int i = 0; i <  op_time.size(); i++) {
+            LOG(INFO) << "name: " << exec_funcs[i].name << " op_type: " << exec_funcs[i].op_name << " op_param: " << op_param[i] << " time " << op_time[i]/FLAGS_epoch;
+        }
+        std::map<std::string, float> op_map;
+        for (int i = 0; i < op_time.size(); i++) {
+            auto it = op_map.find(op_param[i]);
+            if (it != op_map.end())
+                op_map[op_param[i]] += op_time[i];
+            else
+                op_map.insert(std::pair<std::string, float>(op_param[i], op_time[i]));
+        }
+        for (auto it = op_map.begin(); it != op_map.end(); ++it) {
+            LOG(INFO)<< it->first << "  " << (it->second) / FLAGS_epoch<< " ms";
+        }
+#endif
+        size_t end = (*iter).find(".anakin.bin");
+        size_t start = FLAGS_model_dir.length();
+        std::string model_name = (*iter).substr(start, end-start);
+        
+        LOG(INFO) << model_name << " batch_size " << FLAGS_num << " average time "<< my_time.get_average_ms() / FLAGS_epoch << " ms";
+    }
+}
+int main(int argc, const char** argv){
+    // initial logger
+    logger::init(argv[0]);
+
+#ifdef USE_GFLAGS
+    google::ParseCommandLineFlags(&argc, &argv, true);
+#else 
+    LOG(INFO)<< "BenchMark usage:";
+    LOG(INFO)<< "   $benchmark <model_dir> <model_file> <num> <warmup_iter> <epoch>";
+    LOG(INFO)<< "   model_dir:      model directory";
+    LOG(INFO)<< "   model_file:     path to model";
+    LOG(INFO)<< "   num:            batchSize default to 1";
+    LOG(INFO)<< "   warmup_iter:    warm up iterations default to 10";
+    LOG(INFO)<< "   epoch:          time statistic epoch default to 1000";
+    if(argc < 3) {
+        LOG(ERROR) << "You should fill in the variable model_dir and model_file at least.";
+        return 0;
+    }
+    FLAGS_model_dir = argv[1];
+    if(argc > 2) {
+        FLAGS_model_file = argv[2];
+    }
+    if(argc > 3) {
+        FLAGS_num = atoi(argv[3]);
+    }
+    if(argc > 4) {
+        FLAGS_warmup_iter = atoi(argv[4]);
+    }
+    if(argc > 5) {
+        FLAGS_epoch = atoi(argv[5]);
+    }
+#endif
+    InitTest();
+    RUN_ALL_TESTS(argv[0]); 
+    return 0;
+}
diff --git a/test/framework/net/chinese_ner_test.cpp b/test/framework/net/chinese_ner_test.cpp
new file mode 100644
index 000000000..37785f721
--- /dev/null
+++ b/test/framework/net/chinese_ner_test.cpp
@@ -0,0 +1,213 @@
+#include "anakin_config.h"
+#include <string>
+#include <fstream>
+#include "net_test.h"
+#include "saber/funcs/timer.h"
+#include <chrono>
+#include "saber/core/tensor_op.h"
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <map>
+
+#define DEFINE_GLOBAL(type, var, value) \
+        type (GLB_##var) = (value)
+DEFINE_GLOBAL(std::string, model_dir, "");
+DEFINE_GLOBAL(std::string, input_file, "");
+
+//#define WITH_MENTION
+
+void getModels(std::string path, std::vector<std::string>& files) {
+    DIR* dir= nullptr;
+    struct dirent* ptr;
+
+    if ((dir = opendir(path.c_str())) == NULL) {
+        perror("Open dri error...");
+        exit(1);
+    }
+
+    while ((ptr = readdir(dir)) != NULL) {
+        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) {
+            continue;
+        } else if (ptr->d_type == 8) { //file
+            files.push_back(path + "/" + ptr->d_name);
+        } else if (ptr->d_type == 4) {
+            //files.push_back(ptr->d_name);//dir
+            getModels(path + "/" + ptr->d_name, files);
+        }
+    }
+    closedir(dir);
+}
+void SplitString(const std::string& s,
+                 std::vector<std::string>& v, const std::string& c)
+{
+    std::string::size_type pos1, pos2;
+    pos2 = s.find(c);
+    pos1 = 0;
+    while(std::string::npos != pos2)
+    {
+        v.push_back(s.substr(pos1, pos2-pos1));
+
+        pos1 = pos2 + c.size();
+        pos2 = s.find(c, pos1);
+    }
+    if(pos1 != s.length())
+        v.push_back(s.substr(pos1));
+}
+
+bool split_word_mention_idx_from_file(
+        std::vector<std::vector<float> > &word_idx,
+        std::vector<std::vector<float> > &mention_idx,
+        const std::string input_file_path) {
+
+    std::ifstream infile(input_file_path.c_str());
+    if (!infile.good()) {
+        std::cout << "Cannot open " << std::endl;
+        return false;
+    }
+    LOG(INFO)<<"found filename: "<<input_file_path;
+    std::string line;
+    std::vector<std::string> split_v;
+    std::vector<std::string> split_w;
+    std::vector<std::string> split_m;
+    while (std::getline(infile, line)) {
+        split_v.clear();
+        SplitString(line, split_v, ";");
+        CHECK_GE(split_v.size(), 4) << " file need ; split";
+        std::vector<float> word;
+        std::vector<float> mention;
+        split_w.clear();
+        SplitString(split_v[1], split_w, " ");
+        split_m.clear();
+        SplitString(split_v[3], split_m, " ");
+        for (auto w : split_w) {
+            word.push_back(atof(w.c_str()));
+        }
+        for (auto m : split_m) {
+            mention.push_back(atof(m.c_str()));
+        }
+        word_idx.push_back(word);
+        mention_idx.push_back(mention);
+    }
+    return true;
+}
+
+int get_batch_data_offset(
+        std::vector<float> &out_data,
+        const std::vector<std::vector<float> > &seq_data,
+        std::vector<int> &seq_offset,
+        const int start_idx,
+        const int batch_num) {
+    seq_offset.clear();
+    out_data.clear();
+    seq_offset.push_back(0);
+    int len = 0;
+    for (int i = 0; i < batch_num; ++i) {
+        for (auto d : seq_data[i + start_idx]) {
+            len += 1;
+            out_data.push_back(d);
+        }
+        seq_offset.push_back(len);
+    }
+    return len;
+}
+
+#ifdef USE_X86_PLACE
+TEST(NetTest, chinese_ner_executor) {
+    std::vector<std::string> models;
+    getModels(GLB_model_dir, models);
+    std::vector<std::vector<float> > word_idx;
+    std::vector<std::vector<float> > mention_idx;
+    split_word_mention_idx_from_file(word_idx, mention_idx, GLB_input_file);
+    std::vector<float> word_idx_data;
+    std::vector<float> mention_idx_data;
+    std::vector<int> word_seq_offset;
+    std::vector<int> mention_seq_offset;
+    int batch_num = 6;
+
+    Graph<X86, AK_FLOAT, Precision::FP32>* graph = new Graph<X86, AK_FLOAT, Precision::FP32>();
+    LOG(WARNING) << "load anakin model file from " << models[0] << " ...";
+    // load anakin model files.
+    auto status = graph->load(models[0]);
+    if(!status ) {
+        LOG(FATAL) << " [ERROR] " << status.info();
+    }
+    graph->Reshape("input_0", {1000, 1, 1, 1});
+#ifdef WITH_MENTION
+    graph->Reshape("input_1", {1000, 1, 1, 1});
+#endif
+    //anakin graph optimization
+    graph->Optimize();
+    Net<X86, AK_FLOAT, Precision::FP32> net_executer(*graph, true);
+    SaberTimer<X86> timer;
+    Context<X86> ctx;
+    for (int i = 0; i < word_idx.size(); i += batch_num) {
+//    {
+//        int i = 0;
+        int word_len = get_batch_data_offset(word_idx_data, word_idx, word_seq_offset, i, batch_num);
+#ifdef WITH_MENTION
+        int mention_len = get_batch_data_offset(mention_idx_data, mention_idx, mention_seq_offset, i, batch_num);
+#endif
+//        for (auto w : word_idx_data) {
+//            std::cout << w << ",";
+//        }
+//        std::cout << std::endl;
+//        for (auto s : word_seq_offset) {
+//            std::cout << s << ", ";
+//        }
+//        std::cout << std::endl << std::endl << std::endl;
+//        word_idx_data = {20, 21, 22, 23, 24, 25, 26};
+//        word_seq_offset = {0, 5, 7};
+//        int word_len = 7;
+//        mention_idx_data = {2, 1, 22, 23, 24, 25, 26};
+//        mention_seq_offset = {0, 5, 7};
+//        int mention_len = 7;
+
+        auto word_in_p = net_executer.get_in("input_0");
+        word_in_p->reshape({word_len, 1, 1, 1});
+        for (int j = 0; j < word_idx_data.size(); ++j) {
+            word_in_p->mutable_data()[j] = word_idx_data[j];
+        }
+        word_in_p->set_seq_offset(word_seq_offset);
+#ifdef WITH_MENTION
+        auto mention_in_p = net_executer.get_in("input_1");
+        mention_in_p->reshape({mention_len, 1, 1, 1});
+        for (int j = 0; j < mention_idx_data.size(); ++j) {
+            mention_in_p->mutable_data()[j] = mention_idx_data[j];
+        }
+        mention_in_p->set_seq_offset(mention_seq_offset);
+#endif
+        timer.start(ctx);
+        net_executer.prediction();
+        timer.end(ctx);
+//        auto tensor_out_5_p = net_executer.get_out("crf_decoding_0.tmp_0_out");
+//        int v_size = tensor_out_5_p->valid_size();
+//        for (int j = 0; j < v_size; ++j) {
+//            std::cout << tensor_out_5_p->data()[j]<<" ";
+//        }
+//        std::cout << std::endl;
+    }
+    LOG(INFO)<<"elapse time: "<<timer.get_average_ms()<<" ms";
+}
+#endif
+
+int main(int argc, const char** argv) {
+    // initial logger
+    LOG(INFO) << "argc " << argc;
+
+    if (argc < 3) {
+        LOG(INFO) << "Example of Usage:\n \
+        ./output/unit_test/model_test\n \
+            anakin_models\n input file\n";
+        exit(0);
+    } else if (argc == 3) {
+        GLB_model_dir = std::string(argv[1]);
+        GLB_input_file = std::string(argv[2]);
+    }
+//    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
diff --git a/test/framework/net/model_test.cpp b/test/framework/net/model_test.cpp
new file mode 100644
index 000000000..1f8055dbe
--- /dev/null
+++ b/test/framework/net/model_test.cpp
@@ -0,0 +1,175 @@
+#include <string>
+#include "net_test.h"
+#include "saber/funcs/timer.h"
+#include <chrono>
+#include "saber/core/tensor_op.h"
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <map>
+#define DEFINE_GLOBAL(type, var, value) \
+        type (GLB_##var) = (value)
+DEFINE_GLOBAL(std::string, model_dir, "");
+DEFINE_GLOBAL(int, num, 1);
+DEFINE_GLOBAL(int, channel, 8);
+DEFINE_GLOBAL(int, height, 640);
+DEFINE_GLOBAL(int, width, 640);
+DEFINE_GLOBAL(bool, is_input_shape, false);
+
+void getModels(std::string path, std::vector<std::string>& files) {
+    DIR* dir= nullptr;
+    struct dirent* ptr;
+
+    if ((dir = opendir(path.c_str())) == NULL) {
+        perror("Open dri error...");
+        exit(1);
+    }
+
+    while ((ptr = readdir(dir)) != NULL) {
+        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) {
+            continue;
+        } else if (ptr->d_type == 8) { //file
+            files.push_back(path + "/" + ptr->d_name);
+        } else if (ptr->d_type == 4) {
+            //files.push_back(ptr->d_name);//dir
+            getModels(path + "/" + ptr->d_name, files);
+        }
+    }
+
+    closedir(dir);
+}
+
+#ifdef USE_CUDA
+TEST(NetTest, nv_net_execute_base_test) {
+    std::vector<std::string> models;
+    getModels(GLB_model_dir, models);
+
+    for (auto iter = models.begin(); iter < models.end(); iter++) {
+        LOG(WARNING) << "load anakin model file from " << *iter << " ...";
+#if 1
+        Graph<NV, AK_FLOAT, Precision::FP32> graph;
+        auto status = graph.load(*iter);
+
+        if (!status) {
+            LOG(FATAL) << " [ERROR] " << status.info();
+        }
+
+        if (GLB_is_input_shape) {
+            graph.Reshape("input_0", {GLB_num, GLB_channel, GLB_height, GLB_width});
+        } else {
+            graph.ResetBatchSize("input_0", GLB_num);
+        }
+
+        graph.Optimize();
+        // constructs the executer net
+        Net<NV, AK_FLOAT, Precision::FP32> net_executer(graph, true);
+        // get in
+        auto d_tensor_in_p = net_executer.get_in("input_0");
+        Tensor4d<X86, AK_FLOAT> h_tensor_in;
+        auto valid_shape_in = d_tensor_in_p->valid_shape();
+
+        for (int i = 0; i < valid_shape_in.size(); i++) {
+            LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i];
+        }
+
+        h_tensor_in.re_alloc(valid_shape_in);
+        fill_tensor_host_rand(h_tensor_in, -1.0f, 1.0f);
+        d_tensor_in_p->copy_from(h_tensor_in);
+        int warmup_iter = 10;
+        int epoch = 1000;
+        // do inference
+        Context<NV> ctx(0, 0, 0);
+        saber::SaberTimer<NV> my_time;
+        LOG(WARNING) << "EXECUTER !!!!!!!! ";
+
+        for (int i = 0; i < warmup_iter; i++) {
+            net_executer.prediction();
+        }
+
+#ifdef ENABLE_OP_TIMER
+        net_executer.reset_op_time();
+#endif
+        my_time.start(ctx);
+
+        //auto start = std::chrono::system_clock::now();
+        for (int i = 0; i < epoch; i++) {
+            //DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") ";
+            net_executer.prediction();
+        }
+
+        my_time.end(ctx);
+#ifdef ENABLE_OP_TIMER
+        std::vector<float> op_time = net_executer.get_op_time();
+        auto exec_funcs = net_executer.get_exec_funcs();
+        auto op_param = net_executer.get_op_param();
+
+        for (int i = 0; i <  op_time.size(); i++) {
+            LOG(INFO) << "name: " << exec_funcs[i].name << " op_type: " << exec_funcs[i].op_name <<
+                      " op_param: " << op_param[i] << " time " << op_time[i] / epoch;
+        }
+
+        std::map<std::string, float> op_map;
+
+        for (int i = 0; i < op_time.size(); i++) {
+            auto it = op_map.find(op_param[i]);
+
+            if (it != op_map.end()) {
+                op_map[op_param[i]] += op_time[i];
+            } else {
+                op_map.insert(std::pair<std::string, float>(op_param[i], op_time[i]));
+            }
+        }
+
+        for (auto it = op_map.begin(); it != op_map.end(); ++it) {
+            LOG(INFO) << it->first << "  " << (it->second) / epoch << " ms";
+        }
+
+#endif
+        LOG(INFO) << *iter << " aveage time " << my_time.get_average_ms() / epoch << " ms";
+        // save the optimized model to disk.
+        //        std::string save_model_path = GLB_model_dir + std::string("opt.saved");
+        //        status = graph.save(save_model_path);
+        //        if (!status ) {
+        //            LOG(FATAL) << " [ERROR] " << status.info();
+        //        }
+#endif
+    }
+}
+#endif
+
+int main(int argc, const char** argv) {
+    // initial logger
+    LOG(INFO) << "argc " << argc;
+
+    if (argc < 1) {
+        LOG(INFO) << "Example of Usage:\n \
+        ./output/unit_test/model_test\n \
+            anakin_models\n \
+            num\n \
+            channel\n \
+            height\n \
+            width\n ";
+        exit(0);
+    } else if (argc == 2) {
+        GLB_model_dir = std::string(argv[1]);
+        GLB_is_input_shape = false;
+    } else if (argc == 3) {
+        GLB_model_dir = std::string(argv[1]);
+        GLB_num = atoi(argv[2]);
+        GLB_is_input_shape = false;
+    } else {
+        GLB_model_dir = std::string(argv[1]);
+        GLB_num = atoi(argv[2]);
+        GLB_channel = atoi(argv[3]);
+        GLB_height = atoi(argv[4]);
+        GLB_width = atoi(argv[5]);
+        GLB_is_input_shape = true;
+    }
+
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
diff --git a/test/framework/net/net_exec_multi_thread_test.cpp b/test/framework/net/net_exec_multi_thread_test.cpp
new file mode 100644
index 000000000..7a8bf5401
--- /dev/null
+++ b/test/framework/net/net_exec_multi_thread_test.cpp
@@ -0,0 +1,149 @@
+#include <string>
+#include "net_test.h"
+#include "saber/funcs/timer.h"
+#include <chrono>
+
+std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_lane_v2.anakin.bin";
+
+#ifdef USE_CUDA
+#if 1
+TEST(NetTest, nv_net_execute_muti_thread_sync_test) {
+#if 1 // use host input
+    //Env<NV>::env_init(1);
+    LOG(WARNING) << "Sync Runing multi_threads for model: " << model_path;
+    Worker<NV, AK_FLOAT, Precision::FP32>  workers(model_path, 10); 
+    workers.register_inputs({"input_0"});
+    workers.register_outputs({"softmax_out"});    
+    workers.Reshape("input_0", {1, 384, 960, 3});
+
+    workers.launch();
+
+    std::vector<Tensor4dPtr<target_host<NV>::type, AK_FLOAT> > host_tensor_p_in_list;
+    // get in
+    saber::Shape valid_shape_in({1, 384, 960, 3});
+    Tensor4dPtr<target_host<NV>::type, AK_FLOAT> h_tensor_in = new Tensor4d<target_host<NV>::type, AK_FLOAT>(valid_shape_in);
+    float* h_data = h_tensor_in->mutable_data();
+    for (int i=0; i<h_tensor_in->size(); i++) {
+        h_data[i] = 1.0f;
+    }
+    host_tensor_p_in_list.push_back(h_tensor_in);
+
+    int epoch = 1000;
+
+    // Running 
+    for(int i=0; i<epoch; i++) {
+        auto  d_tensor_p_out_list = workers.sync_prediction(host_tensor_p_in_list);
+
+        // get the output
+        auto d_tensor_p = d_tensor_p_out_list[0];
+    }
+
+    // get exec times
+#ifdef ENABLE_OP_TIMER
+    auto& times_map = workers.get_task_exe_times_map_of_sync_api();
+    for (auto it = times_map.begin(); it!=times_map.end(); it++) {
+        LOG(WARNING) << " threadId: " << it->first << " processing " << it->second.size() << " tasks";
+        for (auto time_in_ms : it->second) { 
+            LOG(INFO) << "      \\__task avg time: " << time_in_ms;
+        }
+    }
+#endif
+
+#endif
+
+#if 0 // use device input
+    Env<NV>::env_init(1);
+    LOG(WARNING) << "Sync Runing multi_threads for model: " << model_path;
+    Worker<NV, AK_FLOAT, Precision::FP32>  workers(model_path, 1); 
+    workers.register_inputs({"input_0"});
+    workers.register_outputs({"softmax_out"});    
+    workers.Reshape("input_0", {1, 384, 960, 3});
+
+    workers.launch();
+
+    std::vector<Tensor4dPtr<target_host<NV>::type, AK_FLOAT> > host_tensor_p_in_list;
+    // get in
+    saber::Shape valid_shape_in({1, 384, 960, 3});
+    Tensor4dPtr<target_host<NV>::type, AK_FLOAT> h_tensor_in = new Tensor4d<target_host<NV>::type, AK_FLOAT>(valid_shape_in);
+    float* h_data = h_tensor_in->mutable_data();
+    for (int i=0; i<h_tensor_in->size(); i++) {
+        h_data[i] = 1.0f;
+    }
+    host_tensor_p_in_list.push_back(h_tensor_in);
+
+    std::vector<Tensor4dPtr<NV, AK_FLOAT> > device_tensor_p_in_list;
+    for (int i=0; i<host_tensor_p_in_list.size(); i++) {
+        Tensor4dPtr<NV, AK_FLOAT> d_tensor_in = new Tensor4d<NV, AK_FLOAT>(host_tensor_p_in_list[i]->valid_shape());
+        d_tensor_in->copy_from(*(host_tensor_p_in_list[i]));
+        device_tensor_p_in_list.push_back(d_tensor_in);
+    }
+
+    int epoch = 10;
+
+    // Running 
+    for (int i=0; i<epoch; i++) {
+        Context<NV> ctx(0, 0, 0);
+        saber::SaberTimer<NV> my_time;
+
+        my_time.start(ctx);
+        auto  d_tensor_p_out_list = workers.sync_prediction_device(device_tensor_p_in_list);
+        my_time.end(ctx);
+        LOG(INFO)<<"muti thread single task exec time: "<<my_time.get_average_ms()/epoch << " ms";
+
+        // get the output
+        auto d_tensor_p = d_tensor_p_out_list[0];
+    }
+#endif
+
+}
+#endif
+
+#if 0
+TEST(NetTest, net_execute_muti_thread_async_test) {
+    LOG(WARNING) << "Async Runing multi_threads for model: " << model_path;
+    Worker<NV, AK_FLOAT, Precision::FP32>  workers(model_path, 10); 
+    workers.register_inputs({"input_0"});
+    workers.register_outputs({"softmax_out"});    
+    workers.Reshape("input_0", {1, 384, 960, 3});
+
+    workers.launch();
+
+    std::vector<Tensor4dPtr<target_host<NV>::type, AK_FLOAT> > host_tensor_p_in_list;
+    // get in
+    saber::Shape valid_shape_in({1, 384, 960, 3});
+    Tensor4dPtr<target_host<NV>::type, AK_FLOAT> h_tensor_in = new Tensor4d<target_host<NV>::type, AK_FLOAT>(valid_shape_in);
+    float* h_data = h_tensor_in->mutable_data();
+    for (int i=0; i<h_tensor_in->size(); i++) {
+        h_data[i] = 1.0f;
+    }
+    host_tensor_p_in_list.push_back(h_tensor_in);
+
+    int epoch = 10000;
+
+    // Running 
+    for(int i=0; i<epoch; i++) {
+        workers.async_prediction(host_tensor_p_in_list);
+    }
+
+    // get the output
+    int iterator = epoch;
+    while(iterator) {
+        if(!workers.empty()) {
+            auto d_tensor_p = workers.async_get_result()[0];
+            // get hte data of d_tensor_p
+            
+            iterator--;
+        }
+    }
+
+}
+#endif 
+#endif
+
+int main(int argc, const char** argv){
+    // initial logger
+    logger::init(argv[0]);
+	InitTest();
+	RUN_ALL_TESTS(argv[0]);	
+	return 0;
+}
diff --git a/test/framework/net/net_exec_test.cpp b/test/framework/net/net_exec_test.cpp
new file mode 100644
index 000000000..3f40fc341
--- /dev/null
+++ b/test/framework/net/net_exec_test.cpp
@@ -0,0 +1,273 @@
+#include <string>
+#include "net_test.h"
+#include "saber/funcs/timer.h"
+#include <chrono>
+
+//#define USE_DIEPSE
+
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/diepsie_light_head.anakin.bin";
+
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/diepsie_light_head_base.anakin.bin";
+
+
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/densebox.anakin.bin";
+
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/cnn_seg.anakin.bin";
+
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_camera_detector.anakin.bin";
+
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_lane_v2.anakin.bin";
+
+// alignment of face
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/net_deploy_stageI.anakin.bin";
+
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/net_deploy_stageII.anakin.bin";
+
+// residual 7 patch of face
+//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/residual_net_7patch_3hc.anakin.bin";
+
+// resnet 50
+//std::string model_path = "/home/cuichaowen/anakin2/anakin2/benchmark/CNN/mobilenet_v2.anakin.bin";
+
+// vgg16
+std::string model_path = "/home/cuichaowen/anakin2/anakin2/benchmark/CNN/models/vgg16.anakin.bin";
+
+#ifdef USE_CUDA
+#if 1
+TEST(NetTest, net_execute_base_test) {
+    Graph<NV, AK_FLOAT, Precision::FP32>* graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+    LOG(WARNING) << "load anakin model file from " << model_path << " ...";
+    // load anakin model files.
+    auto status = graph->load(model_path);
+    if(!status ) {
+        LOG(FATAL) << " [ERROR] " << status.info();
+    }
+
+    // reshape the input_0 's shape for graph model
+    //graph->Reshape("input_0", {1, 8, 640, 640});
+
+    // register all tensor inside graph
+    //graph->RegistAllOut();
+	
+    // register edge
+    // graph->RegistOut("conv2_2/expand/scale", "relu2_2/expand");
+
+    //anakin graph optimization
+    graph->Optimize();
+
+    // constructs the executer net
+	{ // inner scope
+#ifdef USE_DIEPSE
+    Net<NV, AK_FLOAT, Precision::FP32, OpRunType::SYNC> net_executer(*graph, true);
+#else
+    Net<NV, AK_FLOAT, Precision::FP32> net_executer(*graph, true);
+#endif
+
+    // get in
+    auto d_tensor_in_p = net_executer.get_in("input_0");
+    Tensor4d<X86, AK_FLOAT> h_tensor_in;
+
+    auto valid_shape_in = d_tensor_in_p->valid_shape();
+    for (int i=0; i<valid_shape_in.size(); i++) {
+        LOG(INFO) << "detect input_0 dims[" << i << "]" << valid_shape_in[i];
+    }
+
+    h_tensor_in.re_alloc(valid_shape_in);
+    float* h_data = h_tensor_in.mutable_data();
+
+    for (int i=0; i<h_tensor_in.size(); i++) {
+        h_data[i] = 1.0f;
+    }
+
+    d_tensor_in_p->copy_from(h_tensor_in);
+
+#ifdef USE_DIEPSE
+    // for diepse model
+    auto d_tensor_in_1_p = net_executer.get_in("input_1");
+    Tensor4d<X86, AK_FLOAT> h_tensor_in_1;
+
+    h_tensor_in_1.re_alloc(d_tensor_in_1_p->valid_shape());
+    for (int i=0; i<d_tensor_in_1_p->valid_shape().size(); i++) {
+        LOG(INFO) << "detect input_1 dims[" << i << "]" << d_tensor_in_1_p->valid_shape()[i];
+    }
+    h_data = h_tensor_in_1.mutable_data();
+    h_data[0] = 1408;
+    h_data[1] = 800;
+    h_data[2] = 0.733333;
+    h_data[3] = 0.733333;
+    h_data[4] = 0;
+    h_data[5] = 0;
+    d_tensor_in_1_p->copy_from(h_tensor_in_1);
+
+    auto d_tensor_in_2_p = net_executer.get_in("input_2");
+    Tensor4d<X86, AK_FLOAT> h_tensor_in_2;
+
+    h_tensor_in_2.re_alloc(d_tensor_in_2_p->valid_shape());
+    for (int i=0; i<d_tensor_in_2_p->valid_shape().size(); i++) {
+        LOG(INFO) << "detect input_2 dims[" << i << "]" << d_tensor_in_2_p->valid_shape()[i];
+    }
+    h_data = h_tensor_in_2.mutable_data();
+    h_data[0] = 2022.56;
+    h_data[1] = 989.389;
+    h_data[2] = 2014.05;
+    h_data[3] = 570.615;
+    h_data[4] = 1.489;
+    h_data[5] = -0.02;
+    d_tensor_in_2_p->copy_from(h_tensor_in_2);
+#endif
+
+    int epoch = 1;
+    // do inference
+    Context<NV> ctx(0, 0, 0);
+    saber::SaberTimer<NV> my_time;
+    LOG(WARNING) << "EXECUTER !!!!!!!! ";
+	// warm up
+	/*for(int i=0; i<10; i++) {
+		net_executer.prediction();
+	}*/
+
+    my_time.start(ctx);
+
+
+    //auto start = std::chrono::system_clock::now();
+    for(int i=0; i<epoch; i++) {
+		//DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") ";
+        net_executer.prediction();
+    }
+   /* // running part of model
+    net_executer.execute_stop_at_node("relu2_2/expand");
+#ifdef USE_CUDA
+    cudaDeviceSynchronize();
+#endif
+
+	// get inner tensor after stop
+    auto tensor_out_inner_p = net_executer.get_tensor_from_edge("conv2_2/expand", "relu2_2/expand");
+    LOG(WARNING) << "inner tensor avg value : " << tensor_average(tensor_out_inner_p);
+#ifdef USE_CUDA
+	cudaDeviceSynchronize();
+#endif
+    
+    for (int i = 0; i < 3; i++) {
+    	net_executer.execute_start_from_node("relu2_2/expand");
+    }
+
+#ifdef USE_CUDA
+    cudaDeviceSynchronize();
+#endif*/
+
+    //auto end = std::chrono::system_clock::now();
+
+    //double time = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+    //LOG(WARNING) << "avg time : " << time/epoch <<" ms";
+
+    my_time.end(ctx);
+    LOG(INFO)<<"aveage time "<<my_time.get_average_ms()/epoch << " ms";
+
+	} // inner scope over
+
+	LOG(ERROR) << "inner net exe over !";
+
+    //auto& tensor_out_inner_p = net_executer.get_tensor_from_edge("data_perm", "conv1");
+
+    // get out yolo_v2
+    /*auto tensor_out_0_p = net_executer.get_out("loc_pred_out");
+    auto tensor_out_1_p = net_executer.get_out("obj_pred_out");
+    auto tensor_out_2_p = net_executer.get_out("cls_pred_out");
+    auto tensor_out_3_p = net_executer.get_out("ori_pred_out");
+    auto tensor_out_4_p = net_executer.get_out("dim_pred_out");*/
+
+	// get outs cnn_seg 
+	/*auto tensor_out_0_p = net_executer.get_out("slice_[dump, mask]_out");
+	auto tensor_out_1_p = net_executer.get_out("category_score_out");
+	auto tensor_out_2_p = net_executer.get_out("instance_pt_out");
+   	auto tensor_out_3_p = net_executer.get_out("confidence_score_out");
+	auto tensor_out_4_p = net_executer.get_out("class_score_out");
+	auto tensor_out_5_p = net_executer.get_out("heading_pt_out");
+	auto tensor_out_6_p = net_executer.get_out("height_pt_out");*/
+    // get out result
+    //test_print<NV>(tensor_out_4_p);
+
+
+    // save the optimized model to disk.
+    /*std::string save_model_path = model_path + std::string(".saved");
+    status = graph->save(save_model_path);
+    if (!status ) { 
+        LOG(FATAL) << " [ERROR] " << status.info(); 
+    }*/
+}
+#endif 
+#endif
+
+#if 0
+TEST(NetTest, net_execute_reconstruction_test) {
+    graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+    LOG(WARNING) << "load anakin model file from optimized model " << model_saved_path << " ...";
+    // load anakin model files.
+    auto status = graph->load(model_saved_path);
+    if (!status ) {
+        LOG(FATAL) << " [ERROR] " << status.info();
+    }
+
+    // regisiter output tensor
+    //graph->RegistOut("data_perm",  "data_scale");
+    graph->RegistOut("data_perm",  "conv1");
+
+    //anakin graph optimization
+    graph->Optimize();
+
+    // constructs the executer net
+    Net<NV, AK_FLOAT, Precision::FP32> net_executer(*graph);
+
+    // get in
+    auto d_tensor_in_p = net_executer.get_in("input_0");
+    Tensor4d<X86, AK_FLOAT> h_tensor_in;
+
+    auto valid_shape_in = d_tensor_in_p->valid_shape();
+    for (int i=0; i<valid_shape_in.size(); i++) {
+        LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i];
+    }
+
+    h_tensor_in.re_alloc(valid_shape_in);
+    float* h_data = h_tensor_in.mutable_data();
+
+    for (int i=0; i<h_tensor_in.size(); i++) {
+        h_data[i] = 1.0f;
+    }
+
+    d_tensor_in_p->copy_from(h_tensor_in);
+
+    // do inference
+    Context<NV> ctx(0, 0, 0);
+    saber::SaberTimer<NV> my_time;
+    my_time.start(ctx);
+
+    LOG(WARNING) << "EXECUTER !!!!!!!! ";
+    for (int i=0; i<1; i++) {
+        net_executer.prediction();
+
+    }
+    my_time.end(ctx);
+    LOG(INFO)<<"aveage time "<<my_time.get_average_ms()/1 << " ms";
+
+    auto tensor_out_inner_p = net_executer.get_tensor_from_edge("data_perm",  "conv1");
+
+    // get out
+    auto tensor_out_0_p = net_executer.get_out("loc_pred_out");
+    auto tensor_out_1_p = net_executer.get_out("obj_pred_out");
+    auto tensor_out_2_p = net_executer.get_out("cls_pred_out");
+    auto tensor_out_3_p = net_executer.get_out("ori_pred_out");
+    auto tensor_out_4_p = net_executer.get_out("dim_pred_out");
+
+    
+    // get out result
+    test_print<NV>(tensor_out_inner_p);
+}
+#endif
+
+int main(int argc, const char** argv){
+    // initial logger
+    logger::init(argv[0]);
+	InitTest();
+	RUN_ALL_TESTS(argv[0]);	
+	return 0;
+}
diff --git a/test/framework/net/net_test.h b/test/framework/net/net_test.h
new file mode 100644
index 000000000..c240afbf0
--- /dev/null
+++ b/test/framework/net/net_test.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_NET_TEST_H
+#define ANAKIN_NET_TEST_H
+
+#include <iostream>
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "graph_base.h"
+#include "graph.h"
+#include "scheduler.h"
+#include "net.h"
+#include "worker.h"
+
+using namespace anakin;
+using ::anakin::test::Test;
+
+using namespace anakin::graph;
+
+/**
+ * \brief Graph test is base Test class for anakin graph funciton.  
+ */
+class NetTest: public Test {
+public:
+    NetTest(){}
+
+    void SetUp(){}
+
+    void TearDown(){}
+
+protected:
+};
+
+#ifdef USE_CUDA
+void test_print(Tensor4dPtr<NV, AK_FLOAT>& out_tensor_p) {
+    Tensor4d<target_host<NV>::type, AK_FLOAT> h_tensor_result;
+    h_tensor_result.re_alloc(out_tensor_p->valid_shape());
+    LOG(ERROR) << "result count : " << h_tensor_result.valid_shape().count();
+    h_tensor_result.copy_from(*out_tensor_p);
+    for (int i = 0; i < h_tensor_result.valid_size(); i++) {
+        LOG(INFO) << " GET OUT (" << i << ") " << h_tensor_result.mutable_data()[i];
+    }
+}
+#endif
+
+template<typename Ttype, DataType Dtype>
+double tensor_average(Tensor4dPtr<Ttype, Dtype>& out_tensor_p) {
+    double sum = 0.0f;
+#ifdef USE_CUDA
+    float* h_data = new float[out_tensor_p->valid_size()];
+    const float* d_data = out_tensor_p->data();
+    CUDA_CHECK(cudaMemcpy(h_data, d_data, out_tensor_p->valid_size()*sizeof(float), cudaMemcpyDeviceToHost));
+#else
+	float* h_data = out_tensor_p->data();
+#endif
+    for (int i=0; i<out_tensor_p->valid_size(); i++) {
+		sum+=h_data[i];
+    }
+    return sum/out_tensor_p->valid_size();
+}
+
+
+#ifdef USE_X86_PLACE
+static int record_dev_tensorfile(const Tensor4d<X86, AK_FLOAT>* dev_tensor, const char* locate) {
+    Tensor<target_host<X86>::type, AK_FLOAT, NCHW> host_temp;
+    host_temp.re_alloc(dev_tensor->valid_shape());
+    host_temp.copy_from(*dev_tensor);
+    FILE* fp = fopen(locate, "w+");
+    int size = host_temp.valid_shape().count();
+    if (fp == 0) {
+        LOG(ERROR) << "[ FAILED ] file open target txt: " << locate;
+    } else {
+        for (int i = 0; i < size; ++i) {
+            fprintf(fp, "%.18f \n", i, (host_temp.data()[i]));
+        }
+        fclose(fp);
+    }
+    LOG(INFO) << "[ SUCCESS ] Write " << size << " data to: " << locate;
+    return 0;
+}
+#endif
+
+#endif
+
+
diff --git a/test/framework/net/padde_api_test.cpp b/test/framework/net/padde_api_test.cpp
new file mode 100644
index 000000000..6e0dfe878
--- /dev/null
+++ b/test/framework/net/padde_api_test.cpp
@@ -0,0 +1,121 @@
+#include <string>
+#include "net_test.h"
+#include "saber/funcs/timer.h"
+#include <chrono>
+#include "saber/core/tensor_op.h"
+#include <dirent.h> 
+#include <sys/stat.h> 
+#include <sys/types.h> 
+#include <unistd.h>  
+#include <fcntl.h>
+#include <map>
+#include "paddle_api.h"
+#define DEFINE_GLOBAL(type, var, value) \
+        type (GLB_##var) = (value)
+DEFINE_GLOBAL(std::string, model_dir, "");
+DEFINE_GLOBAL(int, num, 1);
+DEFINE_GLOBAL(int, channel, 8);
+DEFINE_GLOBAL(int, height, 640);
+DEFINE_GLOBAL(int, width, 640);
+DEFINE_GLOBAL(bool, is_input_shape, false);
+
+#ifdef USE_CUDA
+typedef NV Target;
+#elif defined(USE_X86_PLACE)
+typedef X86 Target;
+#else
+typedef ARM Target;
+#endif
+
+void getModels(std::string path, std::vector<std::string>& files)
+{
+    DIR *dir;
+    struct dirent *ptr;
+    if((dir=opendir(path.c_str()))==NULL){
+        perror("Open dri error...");
+        exit(1);
+    }
+    while((ptr=readdir(dir))!=NULL){
+        if(strcmp(ptr->d_name,".")==0||strcmp(ptr->d_name,"..")==0)
+            continue;
+        else if(ptr->d_type==8)//file
+            files.push_back(path+"/"+ptr->d_name);
+        else if(ptr->d_type==4){
+            //files.push_back(ptr->d_name);//dir
+            getModels(path+"/"+ptr->d_name,files);
+        }
+    }
+    closedir(dir);
+}
+
+
+TEST(NetTest, net_execute_base_test) {
+    std::vector<std::string> models;
+    getModels(GLB_model_dir, models);
+    for (auto iter = models.begin(); iter < models.end(); iter++)
+    {
+        AnakinEngine<Target, AK_FLOAT, Precision::FP32> anakin_engine;
+        LOG(WARNING) << "load anakin model file from " << *iter << " ...";
+        std::vector<int> shape{GLB_num, GLB_channel, GLB_height, GLB_width};
+        //anakin_engine.Build(*iter, shape);
+        anakin_engine.Build(*iter);
+
+        printf("Args = %d %d %d %d\n",GLB_num, GLB_channel, GLB_height, GLB_width);
+        //fill input
+        Tensor4d<X86, AK_FLOAT> h_tensor_in;
+        h_tensor_in.re_alloc({GLB_num, GLB_channel, GLB_height, GLB_width});
+        fill_tensor_host_rand(h_tensor_in, -1.0f,1.0f);
+
+        anakin_engine.SetInputFromCPU("input_0", h_tensor_in.data(), h_tensor_in.valid_size());
+
+        int warmup_iter = 10;
+        int epoch = 1000;
+        // do inference
+        Context<Target> ctx(0, 0, 0);
+        saber::SaberTimer<Target> my_time;
+        LOG(WARNING) << "EXECUTER !!!!!!!! ";
+        for (int i = 0; i < warmup_iter; i++) {
+            anakin_engine.Execute();
+        }
+        my_time.start(ctx);
+        //auto start = std::chrono::system_clock::now();
+        for (int i = 0; i < epoch; i++) {
+            anakin_engine.Execute();
+        }
+        my_time.end(ctx);
+        LOG(INFO) << *iter << " aveage time "<< my_time.get_average_ms() / epoch << " ms";            
+    }
+}
+
+int main(int argc, const char** argv){
+    // initial logger
+    LOG(INFO)<<"argc"<<argc;
+    if (argc < 1) {
+        LOG(INFO) << "Example of Usage:\n \
+        ./output/unit_test/model_test\n \
+            anakin_models\n \
+            num\n \
+            channel\n \
+            height\n \
+            width\n ";
+        exit(0);
+    } else if (argc == 2){
+        GLB_model_dir = std::string(argv[1]);
+        GLB_is_input_shape = false;
+    } else if (argc == 3){
+        GLB_model_dir = std::string(argv[1]);
+        GLB_num = atoi(argv[2]);
+        GLB_is_input_shape = false;
+    } else {
+        GLB_model_dir = std::string(argv[1]);
+        GLB_num = atoi(argv[2]);
+        GLB_channel = atoi(argv[3]);
+        GLB_height = atoi(argv[4]);
+        GLB_width = atoi(argv[5]);
+        GLB_is_input_shape = true;
+    }
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]); 
+    return 0;
+}
diff --git a/test/framework/net/paddle_api.h b/test/framework/net/paddle_api.h
new file mode 100644
index 000000000..59cb92cd9
--- /dev/null
+++ b/test/framework/net/paddle_api.h
@@ -0,0 +1,87 @@
+
+#include <string>
+#include "saber/funcs/timer.h"
+#include <chrono>
+#include "saber/core/tensor_op.h"
+#include "saber/saber_types.h"
+#include <dirent.h> 
+#include <sys/stat.h> 
+#include <sys/types.h> 
+#include <unistd.h>  
+#include <fcntl.h>
+#include <map>
+
+class EngineBase {
+ public:
+  // Build the model and do some preparation, for example, in TensorRT, run
+  // createInferBuilder, buildCudaEngine.
+  virtual void Build(const std::string& model_file, int batch_size = 1) = 0;
+  virtual void Build(const std::string& model_file, const std::vector<int>& shape) = 0;
+  // Execute the engine, that will run the inference network.
+  virtual void Execute() = 0;
+
+  virtual ~EngineBase() {}
+};  // class EngineBase
+
+template <typename Ttype, DataType Dtype, Precision Ptype>
+class AnakinEngine : public EngineBase {
+public:
+  typedef typename anakin::saber::DataTrait<Dtype>::dtype Dtype_t;
+  typedef anakin::saber::TargetWrapper<X86> X86_API;
+  typedef anakin::saber::TargetWrapper<Ttype> NV_API;
+  AnakinEngine(){}
+
+  ~AnakinEngine(){};
+
+  void Build(const std::string& model_file, int batch_size = 1) override
+  {
+    _graph.load(model_file);
+    _graph.ResetBatchSize("input_0", batch_size);
+    _graph.Optimize();
+    _net_executer.init(_graph);
+  };
+
+  void Build(const std::string& model_file, const std::vector<int>& shape) override
+  {
+    _graph.load(model_file);
+    _graph.Reshape("input_0", shape);
+    _graph.Optimize();
+    _net_executer.init(_graph);
+  };
+
+  void Execute() override
+  {
+    _net_executer.prediction();    
+  };
+
+  // Fill an input from CPU memory with name and size.
+  void SetInputFromCPU(const std::string name, Dtype_t* data, size_t size)
+  {
+    auto input_tensor = _net_executer.get_in(name);
+    anakin::Tensor<Ttype, Dtype> tmp_tensor(data, anakin::saber::X86(), X86_API::get_device_id(), input_tensor->valid_shape());
+    *input_tensor = tmp_tensor;
+  };
+
+  // accessed directly. Fill an input from GPU memory with name and size.
+  void SetInputFromGPU(const std::string& name, Dtype_t* data, size_t size)
+  {
+    auto input_tensor = _net_executer.get_in(name);
+    CHECK_EQ(size, input_tensor->valid_size());
+    anakin::Tensor<Ttype, Dtype> tmp_tensor(data, NV(), NV_API::get_device_id(), input_tensor->valid_shape());
+    *input_tensor = tmp_tensor;
+  };
+  // Get an output called name, the output of tensorrt is in GPU, so this method
+  // will just return the output's GPU memory address.
+  anakin::Tensor<Ttype, Dtype>* GetOutputInGPU(const std::string& name)
+  {
+    return _net_executer.get_out(name);
+  }
+
+private:
+    anakin::graph::Graph<Ttype, Dtype, Ptype> _graph;
+    anakin::Net<Ttype, Dtype, Ptype> _net_executer;
+};  // class TensorRTEngine
+template 
+class AnakinEngine<NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>;
+
+
diff --git a/test/framework/operators/operator_tests.h b/test/framework/operators/operator_tests.h
new file mode 100644
index 000000000..38f16b87d
--- /dev/null
+++ b/test/framework/operators/operator_tests.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_OPERATOR_TESTS_H
+#define ANAKIN_OPERATOR_TESTS_H
+
+#include <iostream>
+#include <string>
+#include <thread>
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "framework/operators/ops.h"
+
+using namespace anakin;
+using ::anakin::test::Test;
+
+class OperatorsTest : public Test {
+public:
+    OperatorsTest(){}
+
+    void SetUp(){}
+
+    void TearDown(){}
+
+protected:
+};
+
+
+
+
+
+
+#endif
+
+
diff --git a/test/framework/operators/pooling_test.cpp b/test/framework/operators/pooling_test.cpp
new file mode 100644
index 000000000..47b66be23
--- /dev/null
+++ b/test/framework/operators/pooling_test.cpp
@@ -0,0 +1,43 @@
+#include "operator_tests.h"
+#include "thread_pool.h"
+
+#ifdef USE_CUDA
+using Target = NV;
+#elif defined(USE_X86_PLACE)
+using Target = X86;
+#else
+using Target = ARM;
+#endif
+
+TEST(OperatorsTest, PoolingFactoryTest) {
+    OpContext<Target> opctx;
+    std::vector<Tensor4dPtr<Target, AK_FLOAT> > in;
+    std::vector<Tensor4dPtr<Target, AK_FLOAT> > out;
+
+
+    /*Operator<RTCUDA, float>*/ auto* Op_name1 =
+        OpFactory<Target, AK_FLOAT, Precision::FP32>::Global()["pooling"];
+    /*Operator<RTCUDA, float>**/auto* Op_name2 =
+        OpFactory<Target, AK_FLOAT, Precision::FP32>::Global()["pool"];
+    auto& op_list = OpFactory<Target, AK_FLOAT, Precision::FP32>::Global().get_list_op_name();
+
+    for (auto& item : op_list) {
+        LOG(INFO) << " op: " << item;
+    }
+
+    LOG(WARNING) << " op name alias 1 : pooling";
+    LOG(INFO) << "  run forward function";
+    (*Op_name1)(opctx, in, out);
+    LOG(WARNING) << " op name alias 2 : pool";
+    LOG(INFO) << "  run forward function";
+    (*Op_name2)(opctx, in, out);
+}
+
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}

From 19413c53d66ac9cb4e8fa2dce74408c46b21f8f9 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 11:01:56 +0800
Subject: [PATCH 094/318] Implement BM scale

---
 saber/funcs/impl/bm/vender_scale.h | 114 +++++++++++++++++++++++++++++
 saber/funcs/scale.h                |  19 +++++
 2 files changed, 133 insertions(+)
 create mode 100644 saber/funcs/impl/bm/vender_scale.h

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
new file mode 100644
index 000000000..e019f1b21
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -0,0 +1,114 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
+
+#include "saber/funcs/impl/impl_scale.h"
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class SaberScale<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ScaleParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    SaberScale()
+    {}
+
+    ~SaberScale() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ScaleParam<OpTensor>& param, Context<BM>& ctx) {
+
+        _handle = get_bm_handle();
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ScaleParam<OpTensor>& param, Context<BM> &ctx) {
+
+    }
+    
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          ScaleParam<OpTensor>& param) {
+
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+
+        int axis = (param.num_axes == 0) ? 0 : param.axis;
+        int num_axes = param.num_axes >=0 ? param.num_axes : inputs[0]->shape().dims() - axis;
+
+        int outer_dim = inputs[0]->count(0, axis);
+        int inner_dim = inputs[0]->count(axis + num_axes, inputs[0]->shape().dims());
+        int scale_dim = inputs[0]->count(axis, axis + num_axes);
+        if (inputs.size() == 1) {
+            CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid";
+        }
+
+        OpDataType* scale_data = param.scale_w[0];
+        bmdnn_scale_forward(
+                _handle,
+                //input
+                *in_data,
+                *scale_data,
+                input_n,
+                input_c,
+                input_h,
+                input_w,
+                scale_dim,
+                inner_dim,
+                0,
+                //output
+                new bm_device_mem_t(),
+                *out_data
+        );
+
+        if (param.bias_term) {
+            OpDataType* bias_data = param.scale_b[0];
+            bmdnn_bias_forward(
+                    _handle,
+                    //input
+                    *out_data,
+                    *bias_data,
+                    outer_dim,
+                    inner_dim,
+                    //output
+                    *out_data
+            );
+        }
+
+        return SaberSuccess;
+    }
+private:
+    bm_handle_t _handle;
+};
+
+}
+
+}
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
diff --git a/saber/funcs/scale.h b/saber/funcs/scale.h
index 1cf9d6212..95e0e6263 100644
--- a/saber/funcs/scale.h
+++ b/saber/funcs/scale.h
@@ -28,11 +28,29 @@
 #ifdef USE_X86_PLACE
 #include "saber/funcs/impl/x86/saber_scale.h"
 #endif
+#ifdef USE_ARM_PLACE
+//todo
+#include "saber/funcs/impl/impl_scale.h"
+#endif
+
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_scale.h"
+#endif
 
 namespace anakin {
 namespace saber {
 
+#ifdef USE_BM
 template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_BM,
+        DataType outDtype = AK_BM,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#else
+template <typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
         DataType outDtype = AK_FLOAT,
@@ -40,6 +58,7 @@ template<typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
+#endif
 class Scale : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,

From 25fa4815da5315d386dc36c07a710a61ec564f5f Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Thu, 28 Jun 2018 11:05:25 +0800
Subject: [PATCH 095/318] pooling test

---
 saber/core/tensor_op.cpp                     |  2 +-
 saber/funcs/impl/bm/vender_pooling.h         |  4 +--
 test/saber/bm/test_saber_func_pooling_BM.cpp | 28 +++++++-------------
 3 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 3d6494b9d..d7ee91231 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -438,7 +438,7 @@ void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tenso
     bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
 
     for (int i = 0; i < tensor.size(); ++i) {
-        printf("%.2f ", host_mem[i]);
+        printf("%.2f\t", host_mem[i]);
 
         if ((i + 1) % tensor.width() == 0){
             printf("\n");
diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index 6e5de79a4..1bdcfdecb 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -28,7 +28,7 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderPooling() : _handle(NULL), _pooling_type(NULL) {}
+    VenderPooling() : _handle(NULL) {}
 
     ~VenderPooling() {}
 
@@ -61,7 +61,7 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
         int stride_h = param.stride_h;
         int stride_w = param.stride_w;
         int is_avg_pooling;
-        if(_pooling_type == Pooling_max){
+        if(param.pooling_type == Pooling_max){
             is_avg_pooling = 0;
         } else {
             is_avg_pooling = 1;
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index fb1a7398d..7edfc677b 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -49,16 +49,6 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     int pad_w = 1;
     int stride_h = 1;
     int stride_w = 1;
-    LOG(INFO) << " img_num: " << img_num;
-    LOG(INFO) << " in_channels: " << in_channels;
-    LOG(INFO) << " img_h: " << img_h;
-    LOG(INFO) << " img_w: " << img_w;
-    LOG(INFO) << " window_h: " << window_h;
-    LOG(INFO) << " window_w: " << window_w;
-    LOG(INFO) << " pad_h: " << pad_h;
-    LOG(INFO) << " pad_w: " << pad_w;
-    LOG(INFO) << " stride_h: " << stride_h;
-    LOG(INFO) << " stride_w: " << stride_w;
 
     PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
                                   , stride_h, stride_w, Pooling_max);
@@ -80,7 +70,7 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     pooling(input, output, param, ctx1);
 
     SaberTimer<BM> t1;
-    int ts = 100;
+    int ts = 10;
 
     for (int i = 0; i < ts; ++i) {
         t1.start(ctx1);
@@ -100,7 +90,6 @@ TEST(TestSaberFuncBM, test_func_pooling) {
 
 TEST(TestSaberFuncBM, test_pooling_result) {
 
-    Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef TargetWrapper<X86> X86_API;
@@ -109,7 +98,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
     int img_num = 1;
-    int in_channels = 2;
+    int in_channels = 1;
     int img_h = 8;
     int img_w = 8;
 
@@ -122,7 +111,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     img_dev.re_alloc(img_s);
 
     for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
+        img_host.mutable_data()[i] = rand() % 20;
     }
 
     img_dev.copy_from(img_host);
@@ -150,8 +139,8 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     LOG(INFO) << " stride_h: " << stride_h;
     LOG(INFO) << " stride_w: " << stride_w;
 
-    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
-                                  , stride_h, stride_w, Pooling_max);
+    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w,
+                                  stride_h, stride_w, Pooling_average_include_padding);
 
     std::vector<TensorDf4*> input;
     std::vector<TensorDf4*> output;
@@ -169,12 +158,14 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     pooling(input, output, param, ctx1);
 
     output_dev.sync();
+    LOG(INFO) << "tensor data before pooling: ";
+    print_tensor_device(img_dev);
+    LOG(INFO) << "tensor data after pooling: ";
     print_tensor_device(output_dev);
 }
 
 TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
 
-    Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef TargetWrapper<X86> X86_API;
@@ -275,12 +266,13 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     out0.sync();
     out1.sync();
 
-    print_tensor_device(output_dev);
+    /* print_tensor_device(output_dev); */
 }
 
 int main(int argc, const char** argv) {
     // initial logger
     //logger::init(argv[0]);
+    Env<BM>::env_init();
     InitTest();
     RUN_ALL_TESTS(argv[0]);
     return 0;

From 56271d43c89aa9494fbf45212b7cf7de8161d912 Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Thu, 28 Jun 2018 03:22:08 +0000
Subject: [PATCH 096/318] Fix d2d mem copy

---
 saber/core/impl/bm/bm_impl.cpp               |  2 +-
 saber/core/tensor.h                          | 54 +++++++++++++++++++-
 test/saber/bm/test_saber_func_softmax_BM.cpp |  5 +-
 3 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 4d24dedf0..e73e355b7 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -81,7 +81,7 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
 void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __DtoD) {
     handle = get_bm_handle(); 
-    //BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
+    //BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
     BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count));
     LOG(INFO) << "BM sync_memcpy: device to device, finished";
 };
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 7c1d00052..860749981 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -609,7 +609,7 @@ class Tensor : public TensorBase {
         }
         CHECK_EQ(valid_size(), tensor.valid_size()) \
             << "sizes of two valid shapes must be the same";
-
+        
         /// get the proper process target wrapper
         typedef  TargetWrapper<TargetType_t> API_t;
         typedef typename TargetTypeTraits<TargetType_t>::target_type target_type_t;
@@ -756,7 +756,8 @@ class Tensor : public TensorBase {
     SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
         LOG(WARNING) << "Invalid: copy_from is not allowed for current type.";
         return SaberInvalidValue;
-    }
+    }  
+
 #endif
 
     /**
@@ -970,8 +971,11 @@ class Tensor : public TensorBase {
 };
 
 #ifdef USE_BM
+
 #ifndef BM_TENSOR_COPY
 #define BM_TENSOR_COPY
+
+
 template<> inline
 size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
     return 4;
@@ -998,9 +1002,55 @@ SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor
     BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
     return SaberSuccess;
 }
+
+/*
+    template<> inline
+    size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
+        return 4;
+    }  
+
+    template<>
+    template<> inline
+    SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+        LOG(INFO) << "BM copy_from X86";
+        CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
+
+        auto* device_data_ptr = mutable_data();
+        BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
+        //BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *(bm_device_mem_t *)(mutable_data()), bm_mem_from_system(tensor.data())));
+        return SaberSuccess;
+    }
+
+    template<>
+    template<> inline
+    SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
+        LOG(INFO) << "X86 copy_from BM";
+        CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
+
+        auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+        BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
+        //BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data())));
+        return SaberSuccess;
+    }
+
+    template<>
+    template<> inline
+    SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
+        LOG(INFO) << "BM copy_from BM";
+        CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
+
+        auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+        //BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
+        //BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data())));
+        return SaberSuccess;
+    } 
+*/
+
 #endif
+
 #endif
 
+
 } //namespace saber
 
 } //namespace anakin
diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index 6c38c7534..d7707fad7 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -17,7 +17,7 @@ TEST(TestSaberFuncBM, test_func_softmax_BM) {
 
     typedef TensorDf4::Dtype dtype;
 
-    int test_iter = 1000;
+    int test_iter = 10;
 
     int softmax_axis = 3; // channel
     int w_in = 3;
@@ -182,12 +182,13 @@ TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) {
 
     TensorDf4 troi(output_dev_4d[0]->valid_shape());
     troi.copy_from(*output_dev_4d[0]);
-    print_tensor_device(troi);
+    //print_tensor_device(troi);
 }
 
 int main(int argc, const char** argv) {
     // initial logger
     //logger::init(argv[0]);
+    Env<BM>::env_init();
     InitTest();
     RUN_ALL_TESTS(argv[0]);
     return 0;

From c5a30a79192a4d0cc7b2c4aa5238b4d3d3d6df97 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 11:39:02 +0800
Subject: [PATCH 097/318] Add batch norm operation

---
 saber/funcs/batch_norm.h                | 115 ++++++++++++++++++++++++
 saber/funcs/impl/bm/vender_batch_norm.h |  63 +++++++++++++
 saber/funcs/impl/bm/vender_scale.h      |   6 +-
 saber/funcs/impl/impl_batch_norm.h      |  14 +++
 saber/funcs/scale.h                     |   6 +-
 5 files changed, 200 insertions(+), 4 deletions(-)
 create mode 100644 saber/funcs/batch_norm.h
 create mode 100644 saber/funcs/impl/bm/vender_batch_norm.h
 create mode 100644 saber/funcs/impl/impl_batch_norm.h

diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h
new file mode 100644
index 000000000..604687303
--- /dev/null
+++ b/saber/funcs/batch_norm.h
@@ -0,0 +1,115 @@
+#ifndef ANAKIN_SABER_FUNCS_BATCH_NORM_H
+#define ANAKIN_SABER_FUNCS_BATCH_NORM_H
+
+#include "saber/core/tensor.h"
+#include "saber/funcs/base.h"
+#include "saber/saber_funcs_param.h"
+#include "saber/funcs/impl/impl_base.h"
+#include "saber/funcs/impl/impl_batch_norm.h"
+
+#ifdef NVIDIA_GPU
+//todo
+#include "saber/funcs/impl/impl_batch_norm.h"
+#endif
+
+#ifdef USE_X86_PLACE
+//todo
+#include "saber/funcs/impl/impl_batch_norm.h"
+#endif
+
+#ifdef USE_ARM_PLACE
+//todo
+#include "saber/funcs/impl/impl_batch_norm.h"
+#endif
+
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_batch_norm.h"
+#endif
+
+namespace anakin {
+namespace saber {
+
+#ifdef USE_BM
+template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_BM,
+        DataType outDtype = AK_BM,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#else
+template <typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_FLOAT,
+        DataType outDtype = AK_FLOAT,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#endif
+class BatchNorm : public BaseFunc<
+        Tensor<TargetType, inDtype, LayOutType_in>,
+        Tensor<TargetType, outDtype, LayOutType_out>,
+        Tensor<TargetType, OpDtype, LayOutType_op>,
+        ImplBase,
+        BatchNormParam
+> {
+public:
+    using BaseFunc<
+            Tensor<TargetType, inDtype, LayOutType_in>,
+            Tensor<TargetType, outDtype, LayOutType_out>,
+            Tensor<TargetType, OpDtype, LayOutType_op>,
+            ImplBase,
+            BatchNormParam>::BaseFunc;
+
+    BatchNorm() = default;
+
+    typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
+    typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
+    typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
+    typedef BatchNormParam<OpTensor> Param_t;
+    typedef std::vector<InDataTensor *> Input_v;
+    typedef std::vector<OutDataTensor *> Output_v;
+    typedef std::vector<Shape> Shape_v;
+
+    virtual SaberStatus compute_output_shape(const Input_v &input,
+                                             Output_v &output, Param_t &param) override {
+
+        Shape output_shape = (input[0]->valid_shape());
+        return output[0]->set_shape(output_shape);
+    }
+
+    virtual SaberStatus init_impl(ImplEnum implenum) override {
+        switch (implenum) {
+            case VENDER_IMPL:
+                this->_impl.push_back(new VenderBatchNorm <TargetType,
+                OpDtype, inDtype, outDtype,
+                LayOutType_op, LayOutType_in, LayOutType_out>);
+                return SaberSuccess;
+
+            case SABER_IMPL:
+                return SaberUnImplError;
+
+            default:
+                return SaberUnImplError;
+        }
+    }
+
+private:
+
+    virtual void pick_best_static() override {
+        if (true) // some condition?
+            this->_best_impl = this->_impl[0];
+    }
+
+    virtual void pick_best_specify(ImplEnum implenum) override {
+        this->_best_impl = this->_impl[0];
+    }
+
+};
+
+} // namespace saber
+} // namespace anakin
+
+#endif
diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h
new file mode 100644
index 000000000..cf767cd22
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_batch_norm.h
@@ -0,0 +1,63 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H
+
+#include "saber/funcs/impl/impl_batch_norm.h"
+
+namespace anakin{
+
+namespace saber {
+
+template <DataType OpDtype ,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+ public ImplBase<
+    Tensor<BM, inDtype, LayOutType_in>, 
+    Tensor<BM, outDtype, LayOutType_out>,
+    Tensor<BM, OpDtype, LayOutType_op>,
+    BatchNormParam<Tensor<BM, OpDtype, LayOutType_op>>> {
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderBatchNorm() : _handle(NULL) {}
+
+    ~VenderBatchNorm() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
+                  std::vector<DataTensor_out*>& outputs,
+                  BatchNormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
+
+        _handle = get_bm_handle();
+        return create(inputs, outputs, batch_norm_param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
+                std::vector<DataTensor_out*>& outputs,
+                BatchNormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          BatchNormParam<OpTensor> &param) {
+
+        return SaberSuccess;
+    }
+
+private:
+    bm_handle_t _handle;
+};
+
+} //namespace saber
+
+} // namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index e019f1b21..9ed364173 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -13,7 +13,7 @@ template <DataType OpDtype,
     typename LayOutType_op,
     typename LayOutType_in,
     typename LayOutType_out>
-class SaberScale<BM, OpDtype, inDtype, outDtype,\
+class VenderScale<BM, OpDtype, inDtype, outDtype,\
     LayOutType_op, LayOutType_in, LayOutType_out> : \
     public ImplBase<
         Tensor<BM, inDtype, LayOutType_in>,
@@ -29,10 +29,10 @@ class SaberScale<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    SaberScale()
+    VenderScale()
     {}
 
-    ~SaberScale() {}
+    ~VenderScale() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
diff --git a/saber/funcs/impl/impl_batch_norm.h b/saber/funcs/impl/impl_batch_norm.h
new file mode 100644
index 000000000..5a09220c7
--- /dev/null
+++ b/saber/funcs/impl/impl_batch_norm.h
@@ -0,0 +1,14 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H
+#define ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H
+
+#include "saber/funcs/impl/impl_macro.h"
+namespace anakin{
+
+namespace saber{
+
+DEFINE_OP_CLASS(BatchNorm, BatchnormParam);
+
+}
+}
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H
diff --git a/saber/funcs/scale.h b/saber/funcs/scale.h
index 95e0e6263..9a1762df6 100644
--- a/saber/funcs/scale.h
+++ b/saber/funcs/scale.h
@@ -28,6 +28,7 @@
 #ifdef USE_X86_PLACE
 #include "saber/funcs/impl/x86/saber_scale.h"
 #endif
+
 #ifdef USE_ARM_PLACE
 //todo
 #include "saber/funcs/impl/impl_scale.h"
@@ -94,7 +95,10 @@ class Scale : public BaseFunc<
     virtual SaberStatus init_impl(ImplEnum implenum) override {
         switch (implenum) {
             case VENDER_IMPL:
-                return SaberUnImplError;
+                this->_impl.push_back(new VenderScale <TargetType,
+                OpDtype, inDtype, outDtype,
+                LayOutType_op, LayOutType_in, LayOutType_out>);
+                return SaberSuccess;
 
             case SABER_IMPL:
                 this->_impl.push_back(new SaberScale <TargetType,

From b5cdc739e4c319b884a4eeec9beafcca330c7194 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 14:22:34 +0800
Subject: [PATCH 098/318] Implement batch norm for BM

---
 saber/funcs/impl/bm/vender_batch_norm.h | 31 ++++++++++++++++
 saber/saber_funcs_param.h               | 48 +++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h
index cf767cd22..917dc7219 100644
--- a/saber/funcs/impl/bm/vender_batch_norm.h
+++ b/saber/funcs/impl/bm/vender_batch_norm.h
@@ -49,6 +49,37 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
                           std::vector<DataTensor_out*>& outputs,
                           BatchNormParam<OpTensor> &param) {
 
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+
+        OpDataType eps = param.eps;
+        OpDataType scale = param.scale;
+
+        bm_device_mem_t mean_ma = bm_mem_from_system(&param.mean);
+        bm_device_mem_t variance_ma = bm_mem_from_system(&param.variance);
+
+        bmdnn_batchnorm_forward_inference(
+                _handle,
+                //input
+                *in_data,
+                mean_ma,
+                variance_ma,
+                scale,
+                new bm_device_mem_t(),
+                eps,
+                input_n,
+                input_c,
+                input_h,
+                input_w,
+                //output
+                *out_data
+        );
+
         return SaberSuccess;
     }
 
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index 284fbcbc5..1a32c9c87 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -776,6 +776,53 @@ struct SoftmaxParam {
     }
     int axis;
 };
+
+#ifdef USE_BM
+template <typename opTensor>
+struct BatchnormParam {
+    typedef typename opTensor::Dtype DataDtype;
+    BatchnormParam()
+        : scale(float(0))
+        , use_global_stats(true)
+        , moving_average_fraction(float(0.999))
+        , eps(float(1e-5))
+        , mean(), variance()
+    {}
+    //scale_factor = 1 / scale;
+    BatchnormParam(std::vector<float> mean_in, std::vector<float> variance_in,
+                float scale_in, float moving_average_fraction_in = float(0.999),
+                float eps_in = float(1e-5), bool use_global_stats_in = true)
+        : mean(mean_in), variance(variance_in), scale(scale_in)
+        , moving_average_fraction(moving_average_fraction_in)
+        , eps(eps_in), use_global_stats(use_global_stats_in)
+    {}
+    BatchnormParam &operator=(const BatchnormParam &right) {
+        scale = right.scale;
+        moving_average_fraction = right.moving_average_fraction;
+        eps = right.eps;
+        use_global_stats = right.use_global_stats;
+        mean = right.mean;
+        variance = right.variance;
+        return *this;
+    }
+    bool operator==(const BatchnormParam &right) {
+        bool comp_eq = true;
+        comp_eq = comp_eq && (scale == right.scale);
+        comp_eq = comp_eq && (moving_average_fraction == right.moving_average_fraction);
+        comp_eq = comp_eq && (eps == right.eps);
+        comp_eq = comp_eq && (use_global_stats == right.use_global_stats);
+        comp_eq = comp_eq && (mean == right.mean);
+        comp_eq = comp_eq && (variance == right.variance);
+        return comp_eq;
+    }
+    float scale;
+    float moving_average_fraction;
+    float eps;
+    bool use_global_stats;
+    std::vector<float> mean;
+    std::vector<float> variance;
+};
+#else
 template <typename opTensor>
 struct BatchnormParam {
     typedef typename opTensor::Dtype DataDtype;
@@ -820,6 +867,7 @@ struct BatchnormParam {
     std::vector<DataDtype> mean;
     std::vector<DataDtype> variance;
 };
+#endif
 
 template <typename opTensor>
 struct ActivationParam {

From 5c6ec7f965638ffe41a7c1c0db14ab8ec45921aa Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 14:57:18 +0800
Subject: [PATCH 099/318] Use template specifications instead of macro

---
 saber/funcs/activation.h                     | 11 -----------
 saber/funcs/batch_norm.h                     | 11 -----------
 saber/funcs/conv.h                           | 11 -----------
 saber/funcs/pooling.h                        | 11 -----------
 saber/funcs/scale.h                          | 11 -----------
 saber/funcs/softmax.h                        | 11 -----------
 test/saber/bm/test_saber_func_conv_BM.cpp    |  4 ++--
 test/saber/bm/test_saber_func_pooling_BM.cpp |  8 ++++----
 test/saber/bm/test_saber_func_softmax_BM.cpp |  4 ++--
 9 files changed, 8 insertions(+), 74 deletions(-)

diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h
index e1167bc9a..7af7a6f80 100644
--- a/saber/funcs/activation.h
+++ b/saber/funcs/activation.h
@@ -36,16 +36,6 @@
 namespace anakin {
 namespace saber {
 
-#ifdef USE_BM
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_BM,
-        DataType outDtype = AK_BM,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW
->
-#else
 template<typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -54,7 +44,6 @@ template<typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
-#endif
 class Activation : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h
index 604687303..f8cf3e693 100644
--- a/saber/funcs/batch_norm.h
+++ b/saber/funcs/batch_norm.h
@@ -29,16 +29,6 @@
 namespace anakin {
 namespace saber {
 
-#ifdef USE_BM
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_BM,
-        DataType outDtype = AK_BM,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW
->
-#else
 template <typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -47,7 +37,6 @@ template <typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
-#endif
 class BatchNorm : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h
index 596964dbe..e527f3d6f 100644
--- a/saber/funcs/conv.h
+++ b/saber/funcs/conv.h
@@ -34,16 +34,6 @@
 namespace anakin {
 namespace saber {
 
-#ifdef USE_BM
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_BM,
-        DataType outDtype = AK_BM,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW
->
-#else
 template<typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -52,7 +42,6 @@ template<typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
-#endif
 class Conv : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
diff --git a/saber/funcs/pooling.h b/saber/funcs/pooling.h
index aff883505..739d05851 100644
--- a/saber/funcs/pooling.h
+++ b/saber/funcs/pooling.h
@@ -34,16 +34,6 @@
 namespace anakin {
 namespace saber {
 
-#ifdef USE_BM
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_BM,
-        DataType outDtype = AK_BM,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW
->
-#else
 template<typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -52,7 +42,6 @@ template<typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
-#endif
 class Pooling : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
diff --git a/saber/funcs/scale.h b/saber/funcs/scale.h
index 9a1762df6..24e45138f 100644
--- a/saber/funcs/scale.h
+++ b/saber/funcs/scale.h
@@ -41,16 +41,6 @@
 namespace anakin {
 namespace saber {
 
-#ifdef USE_BM
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_BM,
-        DataType outDtype = AK_BM,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW
->
-#else
 template <typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -59,7 +49,6 @@ template <typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
-#endif
 class Scale : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
diff --git a/saber/funcs/softmax.h b/saber/funcs/softmax.h
index 4a1e631f0..1ad324908 100644
--- a/saber/funcs/softmax.h
+++ b/saber/funcs/softmax.h
@@ -35,16 +35,6 @@ namespace anakin{
 
 namespace saber{
 
-#ifdef USE_BM
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_BM,
-        DataType outDtype = AK_BM,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW
->
-#else
 template <typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -53,7 +43,6 @@ template <typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
-#endif
 class Softmax : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
index 554bcf843..35ffc6006 100644
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -492,7 +492,7 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Conv<BM, AK_BM> conv;
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
     conv.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -546,7 +546,7 @@ void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4
                                     stride, stride,
                                     1, 1,
                                     &weights, &bias);
-    Conv<BM, AK_BM> conv;
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
     conv.compute_output_shape(inputs, outputs, conv_param);
     outputs[0]->re_alloc(outputs[0]->shape());
     Context<BM> ctx1(0, 1, 1);
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index 7edfc677b..943ed130b 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -148,7 +148,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_BM> pooling;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling;
     pooling.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -234,9 +234,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_BM> pooling;
-    Pooling<BM, AK_BM> pooling0;
-    Pooling<BM, AK_BM> pooling1;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling0;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling1;
 
     pooling.compute_output_shape(input,output,  param);
 
diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index d7707fad7..645d081f1 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -52,7 +52,7 @@ TEST(TestSaberFuncBM, test_func_softmax_BM) {
     // start Reshape & doInfer
     Context<BM> ctx_dev(0, 1, 1);
 
-    Softmax<BM, AK_BM> softmax_dev;
+    Softmax<BM, AK_BM, AK_BM, AK_BM, NCHW> softmax_dev;
 
     typedef std::vector<Shape> Shape_v;
 
@@ -150,7 +150,7 @@ TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) {
     // start Reshape & doInfer
     Context<BM> ctx_dev(0, 1, 1);
 
-    Softmax<BM, AK_BM> softmax_dev;
+    Softmax<BM, AK_BM, AK_BM, AK_BM, NCHW> softmax_dev;
 
     LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
               shape_out[2] << ", " << shape_out[3];

From 597fc4c1d87974989df86065955f0b1aaa8a4035 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Thu, 28 Jun 2018 15:54:53 +0800
Subject: [PATCH 100/318] conv test

---
 test/saber/bm/test_saber_func_conv_BM.cpp | 283 +++++++---------------
 1 file changed, 92 insertions(+), 191 deletions(-)

diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
index 35ffc6006..75663cb8a 100644
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -4,7 +4,6 @@
 #include "tensor_op.h"
 #include "saber_types.h"
 #include <vector>
-//#include "cublas.h"
 
 using namespace anakin::saber;
 
@@ -39,10 +38,10 @@ inline int i_div_up(int a, int b)
     return (a % b != 0) ? (a / b + 1) : (a / b);
 }
 
-#if 1
-TEST(TestSaberFuncBM, test_depthwise_conv) {
 
-    int group = 2;
+TEST(TestSaberFuncBM, test_conv_result) {
+
+    int group = 1;
     int pad_h = 1;
     int pad_w = 1;
     int stride_h = 1;
@@ -52,30 +51,30 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
 
     int kernel_h = 3;
     int kernel_w = 3;
-    int out_channels = 2;
+    int out_channels = 1;
     
     int img_num = 1;
-    int in_channels = 2;
+    int in_channels = 1;
     int img_h = 8;
     int img_w = 8;
 
     bool bias_term = true;
 
     LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
+    LOG(INFO) << "  img_num = " << img_num;
+    LOG(INFO) << "  in_channels = " << in_channels;
+    LOG(INFO) << "  img_h = " << img_h;
+    LOG(INFO) << "  img_w = " << img_w;
+    LOG(INFO) << "  group = " << group;
+    LOG(INFO) << "  pad_h = " << pad_h;
+    LOG(INFO) << "  pad_w = " << pad_w;
+    LOG(INFO) << "  stride_h = " << stride_h;
+    LOG(INFO) << "  stride_w = " << stride_w;
+    LOG(INFO) << "  dilation_h = " << dilation_h;
+    LOG(INFO) << "  dilation_w = " << dilation_w;
+    LOG(INFO) << "  kernel_h = " << kernel_h;
+    LOG(INFO) << "  kernel_w = " << kernel_w;
+    LOG(INFO) << "  out_channels = " << out_channels;
 
     Shape img_s(img_num, in_channels, img_h, img_w);
     Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
@@ -88,7 +87,7 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
     img_dev.re_alloc(img_s);
 
     for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 63 & i;
+        img_host.mutable_data()[i] = i;
     }
 
     img_dev.copy_from(img_host);
@@ -142,10 +141,9 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
 
     conv(input, output, param, ctx1);
 
-    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    //output[0]->record_event(cuda_stream);
+    output_dev.sync();
 
-    //output_dev.sync();
+    print_tensor_device(img_dev);
     print_tensor_device(output_dev);
 }
 
@@ -165,27 +163,11 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
 
     int img_num = 1;
     int in_channels = 4;
-    int img_h = 65;
-    int img_w = 63;
+    int img_h = 64;
+    int img_w = 64;
 
     bool bias_term = true;
 
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-
     Shape img_s(img_num, in_channels, img_h, img_w);
     Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
     Shape bias_s(1, out_channels, 1, 1);
@@ -197,7 +179,7 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
     img_dev.re_alloc(img_s);
 
     for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
+        img_host.mutable_data()[i] = i;
     }
 
     img_dev.copy_from(img_host);
@@ -245,25 +227,20 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
     output_dev.re_alloc(output[0]->shape());
     output_host.re_alloc(output[0]->shape());
 
-            LOG(INFO)<<"regular start with group = "<<group;
+    LOG(INFO) << "regular start with group = " << group;
     // init assume output tensor has been reshpaed by user.
     conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
 
     conv(input, output, param, ctx1);
-    //output_dev.sync();
 
     param.group = 1;
     param.pad_h = 1;
     param.pad_w = 1;
 
-    LOG(INFO)<<" param changed start with group = "<<param.group;
+    LOG(INFO) << " param changed start with group = " << param.group;
     conv(input, output, param, ctx1);
-
-    //print_tensor_device(output_dev);
-
 }
 
-/*
 TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
 
     int group = 1;
@@ -283,7 +260,7 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     int img_h = 8;
     int img_w = 8;
 
-    bool bias_term = false;
+    bool bias_term = true;
 
     LOG(INFO) << "conv param: ";
     LOG(INFO) << " img_num = " << img_num;
@@ -376,7 +353,6 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     output0.push_back(&out0);
     output1.push_back(&out1);
 
-    // FIXME ? where do i get output shape
     output_dev.re_alloc(img_s);
 
     Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv0;
@@ -393,16 +369,7 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
 
     conv0(input0, output0, param0, ctx1);
     conv1(input1, output1, param1, ctx2);
-
-    print_tensor_device(output_dev);
-
-//    print_tensor_device(output_dev);
-
-    //cudaDeviceSynchronize();
-    //CUDA_CHECK(cudaPeekAtLastError());
 }
-*/
-#endif
 
 TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
 
@@ -414,32 +381,29 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
     int dilation_h = 1;
     int dilation_w = 1;
 
-    int kernel_h = 1;
-    int kernel_w = 1;
+    int kernel_h = 3;
+    int kernel_w = 3;
     int out_channels = 128;
 
-    int img_num = 7;
-    int in_channels = 13;
+    int img_num = 64;
+    int in_channels = 4;
     int img_h = 32;
     int img_w = 32;
 
-    bool bias_term = false;
+    bool bias_term = true;
 
     LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
+    LOG(INFO) << "  img_num = " << img_num;
+    LOG(INFO) << "  in_channels = " << in_channels;
+    LOG(INFO) << "  out_channels = " << out_channels;
+    LOG(INFO) << "  img_h = " << img_h;
+    LOG(INFO) << "  img_w = " << img_w;
+    LOG(INFO) << "  group = " << group;
+    LOG(INFO) << "  pad = " << pad_h;
+    LOG(INFO) << "  stride = " << stride_h;
+    LOG(INFO) << "  dilation = " << dilation_h;
+    LOG(INFO) << "  kernel_h = " << kernel_h;
+    LOG(INFO) << "  kernel_w = " << kernel_w;
     Shape img_s(img_num, in_channels, img_h, img_w);
     Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
     Shape bias_s(1, out_channels, 1, 1);
@@ -498,149 +462,86 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
     output_dev.re_alloc(output[0]->shape());
     LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \
         << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]";
-    //LOG(INFO) << " blocks = [ " <<  i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; 
-    //选择k最小的那一组，如果一样，则选128*N，N最大的那一组
-    int k0 = i_div_up(out_channels, 128) * 128 - out_channels;
-    int k1 = i_div_up(out_channels, 64) * 64 - out_channels;
-    int k2 = i_div_up(out_channels, 32) * 32 - out_channels;
-    int kk = std::min(std::min(k0,k1),k2);
-    LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk;
-    if (kk == k0)
-        LOG(INFO) << "thread = [256,1,1] 128*128" ;
-    if (kk == k1)
-        LOG(INFO) << "thread = [128,1,1] 128*64" ;
-    if (kk == k2)
-        LOG(INFO) << "thread = [128,1,1] 128*32" ;
 
     LOG(INFO) << "saber conv init";
-    conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
-
-    LOG(INFO) << "saber conv dispatch";
-    conv(input, output, param, ctx1);
-
-    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    //output[0]->record_event(cuda_stream);
+    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
 
-    //output_dev.sync();
+    /* conv(input, output, param, ctx1); */
+    /* output_dev.sync(); */
 
+    LOG(INFO) << "saber conv dispatch";
     SaberTimer<BM> t1;
-    int ts = 1;
-
+    int ts = 100;
+    t1.start(ctx1);
     for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
         conv(input, output, param, ctx1);
         output_dev.sync();
-        t1.end(ctx1);
     }
-
-    LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
-
+    t1.end(ctx1);
+    LOG(INFO) << "elapse time: " << t1.get_average_ms()/ts << " ms";
 }
 
-void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
-                         TensorDf4 &weights, int kernel_size, int stride, int pad,
-                         int in_channel, int out_channel, TensorDf4 &bias,
-                         anakin::saber::ImplEnum impl) {
-
-    ConvParam<TensorDf4> conv_param(1, pad, pad,
-                                    stride, stride,
-                                    1, 1,
-                                    &weights, &bias);
-    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
-    conv.compute_output_shape(inputs, outputs, conv_param);
-    outputs[0]->re_alloc(outputs[0]->shape());
-    Context<BM> ctx1(0, 1, 1);
-
-    SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1));
-
-    conv(inputs, outputs, conv_param, ctx1);
-    outputs[0]->record_event(ctx1.get_compute_stream());
-    outputs[0]->sync();
-
-    //cudaDeviceSynchronize();
-
-    SaberTimer<BM> t1;
-    int ts = 100;
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        conv(inputs, outputs, conv_param, ctx1);
-        outputs[0]->record_event(ctx1.get_compute_stream());
-        outputs[0]->sync();
-        t1.end(ctx1);
-    }
-            LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
-
-    //cudaDeviceSynchronize();
-}
-
-
 TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     int img_num = 1;
     int kernel = 1;
-
-//    int out_channels = 32;
-//    int in_channels = 128;
-//    int img_h = 52;
-//    int img_w = 112;
-//    int out_channels = 64;
-//    int in_channels = 256;
-//    int img_h = 26;
-//    int img_w = 56;
     int out_channels = 128;
     int in_channels = 512;
-    int img_h = 13;
-    int img_w = 28;
-
-//    int out_channels = 512;
-//    int in_channels = 128;
-//    int img_h = 13;
-//    int img_w = 28;
-
+    int img_h = 32;
+    int img_w = 32;
     int pad = 0;
     int stride = 1;
-    Context<BM> ctx1(0, 1, 1);
 
     TensorDf4 weights;
+    TensorDf4 bias;
     weights.re_alloc({out_channels, in_channels, 1, 1});
+    bias.re_alloc({1, out_channels, 1, 1});
 
-    TensorDf4 img;
+    TensorDf4 img, out;
     img.re_alloc({1, in_channels, img_h, img_w});
 
-    TensorDf4 out;
-    out.re_alloc({1, out_channels, img_h, img_w});
-    TensorDf4 out_gemm;
-    out_gemm.re_alloc({1, out_channels, img_h, img_w});
-
     fill_tensor_device_rand(weights, -1.f, 1.f);
+    fill_tensor_device_rand(bias, -1.f, 1.f);
     fill_tensor_device_rand(img, -1.f, 1.f);
 
-    LOG(INFO) << "img_num: " << img_num;
-    LOG(INFO) << "kernel: " << kernel;
-    LOG(INFO) << "out_channels: " << out_channels;
-    LOG(INFO) << "in_channels: " << in_channels;
-    LOG(INFO) << "img_h: " << img_h;
-    LOG(INFO) << "img_w: " << img_w;
-    LOG(INFO) << "pad: " << pad;
-    LOG(INFO) << "stride: " << stride;
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << "  img_num: " << img_num;
+    LOG(INFO) << "  kernel: " << kernel;
+    LOG(INFO) << "  out_channels: " << out_channels;
+    LOG(INFO) << "  in_channels: " << in_channels;
+    LOG(INFO) << "  img_h: " << img_h;
+    LOG(INFO) << "  img_w: " << img_w;
+    LOG(INFO) << "  pad: " << pad;
+    LOG(INFO) << "  stride: " << stride;
 
-    TensorDf4 bias;
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img);
+    output.push_back(&out);
+
+    ConvParam<TensorDf4> conv_param(1, pad, pad,
+                                    stride, stride,
+                                    1, 1,
+                                    &weights, &bias);
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
+    conv.compute_output_shape(input, output, conv_param);
+    out.re_alloc(output[0]->shape());
+    Context<BM> ctx1(0, 1, 1);
+    conv.init(input, output, conv_param, SPECIFY, VENDER_IMPL, ctx1);
 
-    std::vector<TensorDf4*> input_v;
-    std::vector<TensorDf4*> output_gemm_v, output_v;
-
-    input_v.push_back(&img);
-    output_v.push_back(&out);
-    output_gemm_v.push_back(&out_gemm);
-    //cudaDeviceSynchronize();
-    test_conv_fp32_speed(input_v, output_v,
-                         weights, kernel, stride, pad,
-            in_channels, out_channels, bias,
-            SABER_IMPL);
+    SaberTimer<BM> t1;
+    int ts = 100;
+    t1.start(ctx1);
+    for (int i = 0; i < ts; ++i) {
+        conv(input, output, conv_param, ctx1);
+        out.sync();
+    }
+    t1.end(ctx1);
+    LOG(INFO) << "elapse time: " << t1.get_average_ms()/ts << " ms";
 }
 
 int main(int argc, const char** argv){
-    anakin::saber::Env<BM>::env_init();
-
+    Env<BM>::env_init();
     // initial logger
     //logger::init(argv[0]);
     InitTest();

From a941292347101ec6fa0a266b672aee3a8b48ecc0 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 16:41:18 +0800
Subject: [PATCH 101/318] Add test for batch norm

---
 saber/funcs/batch_norm.h                      |  6 +-
 saber/funcs/impl/bm/vender_batch_norm.h       | 16 ++--
 saber/funcs/impl/bm/vender_scale.h            |  3 +-
 .../bm/test_saber_func_batch_norm_BM.cpp      | 81 +++++++++++++++++++
 4 files changed, 95 insertions(+), 11 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_batch_norm_BM.cpp

diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h
index f8cf3e693..2e817c734 100644
--- a/saber/funcs/batch_norm.h
+++ b/saber/funcs/batch_norm.h
@@ -42,7 +42,7 @@ class BatchNorm : public BaseFunc<
         Tensor<TargetType, outDtype, LayOutType_out>,
         Tensor<TargetType, OpDtype, LayOutType_op>,
         ImplBase,
-        BatchNormParam
+        BatchnormParam
 > {
 public:
     using BaseFunc<
@@ -50,14 +50,14 @@ class BatchNorm : public BaseFunc<
             Tensor<TargetType, outDtype, LayOutType_out>,
             Tensor<TargetType, OpDtype, LayOutType_op>,
             ImplBase,
-            BatchNormParam>::BaseFunc;
+            BatchnormParam>::BaseFunc;
 
     BatchNorm() = default;
 
     typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
     typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
     typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
-    typedef BatchNormParam<OpTensor> Param_t;
+    typedef BatchnormParam<OpTensor> Param_t;
     typedef std::vector<InDataTensor *> Input_v;
     typedef std::vector<OutDataTensor *> Output_v;
     typedef std::vector<Shape> Shape_v;
diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h
index 917dc7219..e362a256f 100644
--- a/saber/funcs/impl/bm/vender_batch_norm.h
+++ b/saber/funcs/impl/bm/vender_batch_norm.h
@@ -18,7 +18,7 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
     Tensor<BM, inDtype, LayOutType_in>, 
     Tensor<BM, outDtype, LayOutType_out>,
     Tensor<BM, OpDtype, LayOutType_op>,
-    BatchNormParam<Tensor<BM, OpDtype, LayOutType_op>>> {
+    BatchnormParam<Tensor<BM, OpDtype, LayOutType_op>>> {
 public:
     typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
     typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
@@ -34,7 +34,7 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
 
     virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
                   std::vector<DataTensor_out*>& outputs,
-                  BatchNormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
+                  BatchnormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
 
         _handle = get_bm_handle();
         return create(inputs, outputs, batch_norm_param, ctx);
@@ -42,12 +42,12 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
 
     virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
                 std::vector<DataTensor_out*>& outputs,
-                BatchNormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
+                BatchnormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
     }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
                           std::vector<DataTensor_out*>& outputs,
-                          BatchNormParam<OpTensor> &param) {
+                          BatchnormParam<OpTensor> &param) {
 
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
@@ -57,12 +57,14 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
         int input_h = inputs[0]->height();
         int input_w = inputs[0]->width();
 
-        OpDataType eps = param.eps;
-        OpDataType scale = param.scale;
+        float eps = param.eps;
+        float scale = param.scale;
 
         bm_device_mem_t mean_ma = bm_mem_from_system(&param.mean);
         bm_device_mem_t variance_ma = bm_mem_from_system(&param.variance);
 
+        bm_device_mem_t* variance_holder = new bm_device_mem_t();
+
         bmdnn_batchnorm_forward_inference(
                 _handle,
                 //input
@@ -70,7 +72,7 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
                 mean_ma,
                 variance_ma,
                 scale,
-                new bm_device_mem_t(),
+                *variance_holder,
                 eps,
                 input_n,
                 input_c,
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 9ed364173..b47716a03 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -70,6 +70,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
             CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid";
         }
 
+        bm_device_mem_t* scale_extension = new bm_device_mem_t();
         OpDataType* scale_data = param.scale_w[0];
         bmdnn_scale_forward(
                 _handle,
@@ -84,7 +85,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
                 inner_dim,
                 0,
                 //output
-                new bm_device_mem_t(),
+                *scale_extension,
                 *out_data
         );
 
diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
new file mode 100644
index 000000000..659d0f699
--- /dev/null
+++ b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
@@ -0,0 +1,81 @@
+#include "core/context.h"
+#include "funcs/batch_norm.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+
+TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
+
+    typedef TargetWrapper<BM> API;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+    typedef TensorDf4::Dtype dtype;
+
+    //Input / output tensor
+    Shape shape_in(1, 1, 2, 2);
+    Shape shape_out = shape_in;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = 10;
+    }
+
+    TensorDf4 tdin, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    input_dev_4d.push_back(&tdin);
+
+    //Batch norm param
+    std::vector<float> mean;
+    mean.push_back(10);
+
+    std::vector<float> variance;
+    variance.push_back(0);
+
+    float scale_in = 1;
+    float eps_in = float(1e-5);
+
+    BatchnormParam<TensorDf4> param(mean, variance, scale_in);
+
+    //BatachNorm
+    BatchNorm<BM, AK_BM, AK_BM, AK_BM, NCHW> batchNorm;
+
+    output_dev_4d.push_back(&tdout);
+    batchNorm.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "batch norm initialized to bm impl";
+    Context<BM> ctx_dev(0, 1, 1);
+    batchNorm.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "bm batch norm compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    batchNorm(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts);
+
+    print_tensor_device(*output_dev_4d[0]);
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    //Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+

From 64325fedf3597c26da2cbae429c690b2cbaba241 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 17:50:07 +0800
Subject: [PATCH 102/318] Use specialization

---
 saber/saber_funcs_param.h | 52 +++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index 1a32c9c87..a758b5881 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -777,21 +777,20 @@ struct SoftmaxParam {
     int axis;
 };
 
-#ifdef USE_BM
 template <typename opTensor>
 struct BatchnormParam {
     typedef typename opTensor::Dtype DataDtype;
     BatchnormParam()
-        : scale(float(0))
+        : scale(DataDtype(0))
         , use_global_stats(true)
-        , moving_average_fraction(float(0.999))
-        , eps(float(1e-5))
+        , moving_average_fraction(DataDtype(0.999))
+        , eps(DataDtype(1e-5))
         , mean(), variance()
     {}
     //scale_factor = 1 / scale;
-    BatchnormParam(std::vector<float> mean_in, std::vector<float> variance_in,
-                float scale_in, float moving_average_fraction_in = float(0.999),
-                float eps_in = float(1e-5), bool use_global_stats_in = true)
+    BatchnormParam(std::vector<DataDtype> mean_in, std::vector<DataDtype> variance_in,
+                DataDtype scale_in, DataDtype moving_average_fraction_in = DataDtype(0.999),
+                DataDtype eps_in = DataDtype(1e-5), bool use_global_stats_in = true)
         : mean(mean_in), variance(variance_in), scale(scale_in)
         , moving_average_fraction(moving_average_fraction_in)
         , eps(eps_in), use_global_stats(use_global_stats_in)
@@ -815,28 +814,27 @@ struct BatchnormParam {
         comp_eq = comp_eq && (variance == right.variance);
         return comp_eq;
     }
-    float scale;
-    float moving_average_fraction;
-    float eps;
+    DataDtype scale;
+    DataDtype moving_average_fraction;
+    DataDtype eps;
     bool use_global_stats;
-    std::vector<float> mean;
-    std::vector<float> variance;
+    std::vector<DataDtype> mean;
+    std::vector<DataDtype> variance;
 };
-#else
-template <typename opTensor>
-struct BatchnormParam {
-    typedef typename opTensor::Dtype DataDtype;
+#ifdef USE_BM
+template <>
+struct BatchnormParam<Tensor<BM, AK_BM, NCHW>> {
     BatchnormParam()
-        : scale(DataDtype(0))
+        : scale(float(0))
         , use_global_stats(true)
-        , moving_average_fraction(DataDtype(0.999))
-        , eps(DataDtype(1e-5))
+        , moving_average_fraction(float(0.999))
+        , eps(float(1e-5))
         , mean(), variance()
     {}
     //scale_factor = 1 / scale;
-    BatchnormParam(std::vector<DataDtype> mean_in, std::vector<DataDtype> variance_in,
-                DataDtype scale_in, DataDtype moving_average_fraction_in = DataDtype(0.999),
-                DataDtype eps_in = DataDtype(1e-5), bool use_global_stats_in = true)
+    BatchnormParam(std::vector<float> mean_in, std::vector<float> variance_in,
+                float scale_in, float moving_average_fraction_in = float(0.999),
+                float eps_in = float(1e-5), bool use_global_stats_in = true)
         : mean(mean_in), variance(variance_in), scale(scale_in)
         , moving_average_fraction(moving_average_fraction_in)
         , eps(eps_in), use_global_stats(use_global_stats_in)
@@ -860,12 +858,12 @@ struct BatchnormParam {
         comp_eq = comp_eq && (variance == right.variance);
         return comp_eq;
     }
-    DataDtype scale;
-    DataDtype moving_average_fraction;
-    DataDtype eps;
+    float scale;
+    float moving_average_fraction;
+    float eps;
     bool use_global_stats;
-    std::vector<DataDtype> mean;
-    std::vector<DataDtype> variance;
+    std::vector<float> mean;
+    std::vector<float> variance;
 };
 #endif
 

From da713a96794a5f65538420a2513167eeb6b88998 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 18:45:39 +0800
Subject: [PATCH 103/318] Update batch norm test for BM

---
 test/saber/bm/test_saber_func_batch_norm_BM.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
index 659d0f699..0453f818a 100644
--- a/test/saber/bm/test_saber_func_batch_norm_BM.cpp
+++ b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
@@ -23,7 +23,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
 
     Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
     for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = 10;
+        thin.mutable_data()[i] = 1+i;
     }
 
     TensorDf4 tdin, tdout;
@@ -31,9 +31,12 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
     tdin.copy_from(thin);
     input_dev_4d.push_back(&tdin);
 
+    LOG(INFO) << "Input tensor is:";
+    print_tensor_device(*input_dev_4d[0]);
+
     //Batch norm param
     std::vector<float> mean;
-    mean.push_back(10);
+    mean.push_back(1);
 
     std::vector<float> variance;
     variance.push_back(0);
@@ -65,7 +68,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
 
     t1.end(ctx_dev);
     float ts = t1.get_average_ms();
-    printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts);
+    printf("bm batch norm total time : %.4f, avg time : %.4f\n", ts, ts);
 
     print_tensor_device(*output_dev_4d[0]);
 }

From efd4524105661afec01a5c2d42120e74416f342b Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 18:45:39 +0800
Subject: [PATCH 104/318] Update batch norm test for BM

---
 saber/core/common.h                           |   1 +
 saber/funcs/impl/bm/vender_scale.h            |  52 ++------
 saber/saber_funcs_param.h                     |  42 ++++++
 .../bm/test_saber_func_batch_norm_BM.cpp      |   9 +-
 test/saber/bm/test_saber_func_scale_BM.cpp    | 121 ++++++++++++++++++
 5 files changed, 184 insertions(+), 41 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_scale_BM.cpp

diff --git a/saber/core/common.h b/saber/core/common.h
index 2e7cd2650..54d6c56dd 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -150,6 +150,7 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 
 #include "bmlib_runtime.h"
 #include "bmdnn_api.h"
+#include "bmdnn_ext_api.h"
 #include "bmlib_utils.h"
 
 #define BMDNN_CHECK(condition) \
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index b47716a03..13f1d6322 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -29,8 +29,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderScale()
-    {}
+    VenderScale() {}
 
     ~VenderScale() {}
 
@@ -52,8 +51,8 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
                           std::vector<DataTensor_out*>& outputs,
                           ScaleParam<OpTensor>& param) {
 
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        const InDataType in_data = *(inputs[0]->data());
+        OutDataType out_data = *(outputs[0]->mutable_data());
 
         int input_n = inputs[0]->num();
         int input_c = inputs[0]->channel();
@@ -66,43 +65,21 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         int outer_dim = inputs[0]->count(0, axis);
         int inner_dim = inputs[0]->count(axis + num_axes, inputs[0]->shape().dims());
         int scale_dim = inputs[0]->count(axis, axis + num_axes);
-        if (inputs.size() == 1) {
-            CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid";
-        }
+        /* if (inputs.size() == 1) { */
+        /*     CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; */
+        /* } */
 
-        bm_device_mem_t* scale_extension = new bm_device_mem_t();
-        OpDataType* scale_data = param.scale_w[0];
-        bmdnn_scale_forward(
-                _handle,
-                //input
-                *in_data,
-                *scale_data,
-                input_n,
-                input_c,
-                input_h,
-                input_w,
-                scale_dim,
-                inner_dim,
-                0,
-                //output
-                *scale_extension,
-                *out_data
-        );
+        OpDataType scale_data = param.scale_w;
+        BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, scale_data,
+                input_n, input_c, input_h, input_w,
+                scale_dim, inner_dim, 0,
+                bm_mem_null(), out_data));
 
         if (param.bias_term) {
-            OpDataType* bias_data = param.scale_b[0];
-            bmdnn_bias_forward(
-                    _handle,
-                    //input
-                    *out_data,
-                    *bias_data,
-                    outer_dim,
-                    inner_dim,
-                    //output
-                    *out_data
-            );
+            OpDataType bias_data = param.scale_b;
+            BMDNN_CHECK(bmdnn_bias_forward(_handle, in_data, bias_data,
+                    outer_dim, inner_dim, out_data));
         }
-
         return SaberSuccess;
     }
 private:
@@ -110,6 +87,5 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
 };
 
 }
-
 }
 #endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index a758b5881..021928a49 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -972,6 +972,48 @@ struct ScaleParam {
     std::vector<DataDtype> scale_w;
     std::vector<DataDtype> scale_b;
 };
+#ifdef USE_BM
+template <>
+struct ScaleParam<Tensor<BM, AK_BM, NCHW>> {
+    ScaleParam(): axis(1), num_axes(1), bias_term(false) {}
+    ScaleParam(bm_device_mem_t scale_w_in, bm_device_mem_t scale_b_in,
+               bool bias_term_in = true, int axis_in = 1, int num_axes_in = 1)
+            : scale_w(scale_w_in), scale_b(scale_b_in)
+            , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in)
+    {}
+    ScaleParam(bm_device_mem_t scale_w_in,
+               bool bias_term_in = false, int axis_in = 1, int num_axes_in = 1)
+            : scale_w(scale_w_in)
+            , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in)
+    {}
+    ScaleParam(const ScaleParam &right)
+            : scale_w(right.scale_w), scale_b(right.scale_b)
+            , bias_term(right.bias_term), axis(right.axis), num_axes(right.num_axes)
+    {}
+    ScaleParam &operator=(const ScaleParam &right) {
+        scale_w = right.scale_w;
+        scale_b = right.scale_b;
+        bias_term = right.bias_term;
+        axis = right.axis;
+        num_axes = right.num_axes;
+        return *this;
+    }
+    bool operator==(const ScaleParam &right) {
+        bool comp_eq = true;
+        /* comp_eq = comp_eq && (scale_w == right.scale_w); */
+        /* comp_eq = comp_eq && (scale_b == right.scale_b); */
+        comp_eq = comp_eq && (bias_term == right.bias_term);
+        comp_eq = comp_eq && (axis == right.axis);
+        comp_eq = comp_eq && (num_axes == right.num_axes);
+        return comp_eq;
+    }
+    int axis; // default is 1
+    int num_axes; // default is 1
+    bool bias_term; // default false
+    bm_device_mem_t scale_w;
+    bm_device_mem_t scale_b;
+};
+#endif
 template <typename opTensor>
 struct PoolingParam {
     PoolingParam() : window_h(-1), window_w(-1)
diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
index 659d0f699..0453f818a 100644
--- a/test/saber/bm/test_saber_func_batch_norm_BM.cpp
+++ b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
@@ -23,7 +23,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
 
     Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
     for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = 10;
+        thin.mutable_data()[i] = 1+i;
     }
 
     TensorDf4 tdin, tdout;
@@ -31,9 +31,12 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
     tdin.copy_from(thin);
     input_dev_4d.push_back(&tdin);
 
+    LOG(INFO) << "Input tensor is:";
+    print_tensor_device(*input_dev_4d[0]);
+
     //Batch norm param
     std::vector<float> mean;
-    mean.push_back(10);
+    mean.push_back(1);
 
     std::vector<float> variance;
     variance.push_back(0);
@@ -65,7 +68,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
 
     t1.end(ctx_dev);
     float ts = t1.get_average_ms();
-    printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts);
+    printf("bm batch norm total time : %.4f, avg time : %.4f\n", ts, ts);
 
     print_tensor_device(*output_dev_4d[0]);
 }
diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
new file mode 100644
index 000000000..c746a67ff
--- /dev/null
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -0,0 +1,121 @@
+#include "core/context.h"
+#include "funcs/scale.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+template <typename Tensor>
+void print_tensor_shape(std::string name, Tensor& t0) {
+
+    LOG(INFO) << name << " valid shape is ["
+              << t0.valid_shape()[0] << ", "
+              << t0.valid_shape()[1] << ", "
+              << t0.valid_shape()[2] << ", "
+              << t0.valid_shape()[3] << "].";
+
+    LOG(INFO) << name << " real shape is ["
+              << t0.shape()[0] << ", "
+              << t0.shape()[1] << ", "
+              << t0.shape()[2] << ", "
+              << t0.shape()[3] << "].";
+
+    LOG(INFO) << name << " offset is ["
+              << t0.offset()[0] << ", "
+              << t0.offset()[1] << ", "
+              << t0.offset()[2] << ", "
+              << t0.offset()[3] << "].";
+}
+void fill_vector_rand(std::vector<float>& vec) {
+    for (int i = 0; i < vec.size(); i++) {
+        vec[i] = rand() *1.0f/RAND_MAX - 0.5;
+    }
+}
+void print_vector_data(std::vector<float>& vec) {
+    for (int i = 0; i < vec.size(); i++) {
+        printf("%d, %f\n", i, vec[i]);
+    }
+}
+
+void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_term, int scale_dim) {
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    int img_num = n;
+    int in_channels = c;
+    int img_h = h;
+    int img_w = w;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+    fill_tensor_host_rand(img_host, -0.5, 0.5);
+    img_dev.copy_from(img_host);
+
+    TensorDf4 output_dev;
+
+    Context<BM> ctx1(0, 1, 1);
+    std::vector<float> scale_w;
+    std::vector<float> scale_b;
+    scale_w.resize(scale_dim);
+    fill_vector_rand(scale_w);
+    if (bias_term) {
+        scale_b.resize(scale_dim);
+        fill_vector_rand(scale_b);
+    }
+
+    ScaleParam<TensorDf4> param(bm_mem_from_system(&scale_w[0]), 
+                                bm_mem_from_system(&scale_b[0]), 
+                                bias_term, axis, num_axes);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Scale<BM, AK_BM, AK_BM, AK_BM, NCHW> scale;
+    scale.compute_output_shape(input, output, param);
+    output_dev.re_alloc(output[0]->valid_shape());
+
+    // init assume output tensor has been reshpaed by user.
+    scale.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
+    scale(input, output, param, ctx1);
+
+    output_dev.sync();
+    LOG(INFO) << "input data: ";
+    print_tensor_device(img_dev);
+    LOG(INFO) << "output data: ";
+    print_tensor_device(output_dev);
+    LOG(INFO) << "scale_w data: ";
+    print_vector_data(scale_w);
+    if (bias_term) {
+        LOG(INFO) << "scale_b data: ";
+        print_vector_data(scale_b);
+    }
+}
+
+TEST(TestSaberFuncBM, test_func_constructor_elt) {
+    test_scale(2, 2, 4, 4, 1, 1, false, 2);
+    /* test_scale(2, 2, 4, 4, 1, 1, true, 2); */
+    /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
+    /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
+    /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */
+    /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */
+}
+
+
+int main(int argc, const char** argv) {
+    Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+

From 44e1395bb7d8355ad18fe1ffa1e62d6b569eaefa Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 20:34:39 +0800
Subject: [PATCH 105/318] Update BM batch norm test

---
 saber/funcs/impl/bm/vender_batch_norm.h         | 6 +++---
 test/saber/bm/test_saber_func_batch_norm_BM.cpp | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h
index e362a256f..4f433a4a9 100644
--- a/saber/funcs/impl/bm/vender_batch_norm.h
+++ b/saber/funcs/impl/bm/vender_batch_norm.h
@@ -59,9 +59,9 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
 
         float eps = param.eps;
         float scale = param.scale;
-
-        bm_device_mem_t mean_ma = bm_mem_from_system(&param.mean);
-        bm_device_mem_t variance_ma = bm_mem_from_system(&param.variance);
+        
+        bm_device_mem_t mean_ma = bm_mem_from_system(&param.mean[0]);
+        bm_device_mem_t variance_ma = bm_mem_from_system(&param.variance[0]);
 
         bm_device_mem_t* variance_holder = new bm_device_mem_t();
 
diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
index 0453f818a..395eb525f 100644
--- a/test/saber/bm/test_saber_func_batch_norm_BM.cpp
+++ b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
@@ -39,7 +39,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
     mean.push_back(1);
 
     std::vector<float> variance;
-    variance.push_back(0);
+    variance.push_back(0.001);
 
     float scale_in = 1;
     float eps_in = float(1e-5);

From 609bcd81b98d1729fec370a87b12db29cd8064a2 Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 21:15:41 +0800
Subject: [PATCH 106/318] Use vender scale for test

---
 test/saber/bm/test_saber_func_scale_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index c746a67ff..6b0e309d8 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -86,7 +86,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
     output_dev.re_alloc(output[0]->valid_shape());
 
     // init assume output tensor has been reshpaed by user.
-    scale.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
+    scale.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
     scale(input, output, param, ctx1);
 
     output_dev.sync();

From 16b6f6ec36880772c5a7b786fe9df456eaac1f3e Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 21:21:42 +0800
Subject: [PATCH 107/318] Update BM scale

---
 saber/funcs/impl/bm/vender_scale.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 13f1d6322..ce32e898e 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -70,10 +70,11 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         /* } */
 
         OpDataType scale_data = param.scale_w;
+        bm_device_mem_t* scale_extension = new bm_device_mem_t();
         BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, scale_data,
                 input_n, input_c, input_h, input_w,
                 scale_dim, inner_dim, 0,
-                bm_mem_null(), out_data));
+                *scale_extension, out_data));
 
         if (param.bias_term) {
             OpDataType bias_data = param.scale_b;

From 20f7ed0028e8eca5485284a998cb058c3a0f7b5d Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 21:27:00 +0800
Subject: [PATCH 108/318] update BM bias input

---
 saber/funcs/impl/bm/vender_scale.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index ce32e898e..8ecaa1c38 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -78,7 +78,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
 
         if (param.bias_term) {
             OpDataType bias_data = param.scale_b;
-            BMDNN_CHECK(bmdnn_bias_forward(_handle, in_data, bias_data,
+            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bias_data,
                     outer_dim, inner_dim, out_data));
         }
         return SaberSuccess;

From b729f54ce777983e1527dc2e935b8cb0e34681dd Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 21:37:46 +0800
Subject: [PATCH 109/318] BM scale test with bias

---
 test/saber/bm/test_saber_func_scale_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index 6b0e309d8..d6833bb9a 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -104,7 +104,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
 
 TEST(TestSaberFuncBM, test_func_constructor_elt) {
     test_scale(2, 2, 4, 4, 1, 1, false, 2);
-    /* test_scale(2, 2, 4, 4, 1, 1, true, 2); */
+    test_scale(2, 2, 4, 4, 1, 1, true, 2);
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */

From 28a35e9036cb18b65a79048fa4213cf74a4b2266 Mon Sep 17 00:00:00 2001
From: liuhong03 <hongliu104@gmail.com>
Date: Thu, 28 Jun 2018 21:21:39 -0400
Subject: [PATCH 110/318] fix bias in scale

---
 saber/funcs/impl/bm/vender_scale.h         | 33 ++++++++++++++++++----
 test/saber/bm/test_saber_func_scale_BM.cpp |  6 ++--
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 8ecaa1c38..5f8b6d3bb 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -70,17 +70,40 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         /* } */
 
         OpDataType scale_data = param.scale_w;
-        bm_device_mem_t* scale_extension = new bm_device_mem_t();
+        bm_device_mem_t* data_extension = new bm_device_mem_t();
+        int size = input_n * input_c * input_h * input_w;
+        bm_malloc_device_byte(_handle, data_extension, size * sizeof(float));
         BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, scale_data,
                 input_n, input_c, input_h, input_w,
                 scale_dim, inner_dim, 0,
-                *scale_extension, out_data));
-
+                *data_extension, out_data));
+        
         if (param.bias_term) {
             OpDataType bias_data = param.scale_b;
-            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bias_data,
-                    outer_dim, inner_dim, out_data));
+            float* host_bias = new float[scale_dim];
+            float* host_extension = new float[size];
+            printf(".........\n");
+//        bm_device_mem_t temp;;
+//        bm_malloc_device_byte(_handle, &temp, scale_dim * sizeof(float));
+//            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), temp);
+//            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), reinterpret_cast<bm_device_mem_t>(param.scale_b));
+            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), bm_mem_from_device(&bias_data));
+            int dim = inner_dim * scale_dim;
+            host_bias[0] = 1;
+            host_bias[1] = 2;
+            for (int i = 0; i < size; ++i) {
+                 int bias_dim = (i % dim) / inner_dim;
+                 host_extension[i] = host_bias[bias_dim];
+                 printf("%f, ", host_extension[i]);
+            }
+            printf("\n");
+            bm_memcpy_s2d(_handle, *data_extension, bm_mem_from_system(const_cast<float *>(host_extension)));
+            delete [] host_bias;
+            delete [] host_extension; 
+            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, *data_extension,
+                    outer_dim, scale_dim * inner_dim, out_data));
         }
+        bm_free_device(_handle, *data_extension);
         return SaberSuccess;
     }
 private:
diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index d6833bb9a..cf0a1ad91 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -66,6 +66,8 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
     std::vector<float> scale_b;
     scale_w.resize(scale_dim);
     fill_vector_rand(scale_w);
+    scale_w[0] = 0;
+    scale_w[1] = 0;
     if (bias_term) {
         scale_b.resize(scale_dim);
         fill_vector_rand(scale_b);
@@ -103,8 +105,8 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
 }
 
 TEST(TestSaberFuncBM, test_func_constructor_elt) {
-    test_scale(2, 2, 4, 4, 1, 1, false, 2);
-    test_scale(2, 2, 4, 4, 1, 1, true, 2);
+//    test_scale(1, 2, 1, 2, 1, 1, false, 2);
+    test_scale(1, 2, 1, 2, 1, 1, true, 2);
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */

From 7291e2138a1dea6dd1335cd6b748fa1c37c14e88 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 10:19:07 +0800
Subject: [PATCH 111/318] Update BM scale ops

---
 saber/funcs/impl/bm/vender_scale.h         | 21 ++++++++-------------
 saber/saber_funcs_param.h                  |  8 ++++----
 test/saber/bm/test_saber_func_scale_BM.cpp |  4 ++--
 3 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 5f8b6d3bb..64c4d22a2 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -69,25 +69,19 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         /*     CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; */
         /* } */
 
-        OpDataType scale_data = param.scale_w;
+        float* scale_data = &param.scale_w[0];
         bm_device_mem_t* data_extension = new bm_device_mem_t();
         int size = input_n * input_c * input_h * input_w;
         bm_malloc_device_byte(_handle, data_extension, size * sizeof(float));
-        BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, scale_data,
+        BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, bm_mem_from_system(scale_data),
                 input_n, input_c, input_h, input_w,
                 scale_dim, inner_dim, 0,
                 *data_extension, out_data));
         
         if (param.bias_term) {
-            OpDataType bias_data = param.scale_b;
-            float* host_bias = new float[scale_dim];
+            float* host_bias = &param.scale_b[0];
             float* host_extension = new float[size];
             printf(".........\n");
-//        bm_device_mem_t temp;;
-//        bm_malloc_device_byte(_handle, &temp, scale_dim * sizeof(float));
-//            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), temp);
-//            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), reinterpret_cast<bm_device_mem_t>(param.scale_b));
-            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), bm_mem_from_device(&bias_data));
             int dim = inner_dim * scale_dim;
             host_bias[0] = 1;
             host_bias[1] = 2;
@@ -97,11 +91,12 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
                  printf("%f, ", host_extension[i]);
             }
             printf("\n");
-            bm_memcpy_s2d(_handle, *data_extension, bm_mem_from_system(const_cast<float *>(host_extension)));
-            delete [] host_bias;
-            delete [] host_extension; 
-            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, *data_extension,
+
+            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension),
                     outer_dim, scale_dim * inner_dim, out_data));
+
+            delete [] host_bias;
+            delete [] host_extension;
         }
         bm_free_device(_handle, *data_extension);
         return SaberSuccess;
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index 021928a49..c6a88cbe4 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -976,12 +976,12 @@ struct ScaleParam {
 template <>
 struct ScaleParam<Tensor<BM, AK_BM, NCHW>> {
     ScaleParam(): axis(1), num_axes(1), bias_term(false) {}
-    ScaleParam(bm_device_mem_t scale_w_in, bm_device_mem_t scale_b_in,
+    ScaleParam(std::vector<float> scale_w_in, std::vector<float> scale_b_in,
                bool bias_term_in = true, int axis_in = 1, int num_axes_in = 1)
             : scale_w(scale_w_in), scale_b(scale_b_in)
             , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in)
     {}
-    ScaleParam(bm_device_mem_t scale_w_in,
+    ScaleParam(std::vector<float> scale_w_in,
                bool bias_term_in = false, int axis_in = 1, int num_axes_in = 1)
             : scale_w(scale_w_in)
             , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in)
@@ -1010,8 +1010,8 @@ struct ScaleParam<Tensor<BM, AK_BM, NCHW>> {
     int axis; // default is 1
     int num_axes; // default is 1
     bool bias_term; // default false
-    bm_device_mem_t scale_w;
-    bm_device_mem_t scale_b;
+    std::vector<float> scale_w;
+    std::vector<float> scale_b;
 };
 #endif
 template <typename opTensor>
diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index cf0a1ad91..066ba194b 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -73,8 +73,8 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
         fill_vector_rand(scale_b);
     }
 
-    ScaleParam<TensorDf4> param(bm_mem_from_system(&scale_w[0]), 
-                                bm_mem_from_system(&scale_b[0]), 
+    ScaleParam<TensorDf4> param(scale_w,
+                                scale_b,
                                 bias_term, axis, num_axes);
 
     std::vector<TensorDf4*> input;

From c89d92cc18ed2273bdb0129f18909b4bd156b943 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 10:28:54 +0800
Subject: [PATCH 112/318] cleanup

---
 saber/funcs/impl/bm/vender_scale.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 64c4d22a2..e1acd3bfa 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -83,8 +83,8 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
             float* host_extension = new float[size];
             printf(".........\n");
             int dim = inner_dim * scale_dim;
-            host_bias[0] = 1;
-            host_bias[1] = 2;
+            //host_bias[0] = 1;
+            //host_bias[1] = 2;
             for (int i = 0; i < size; ++i) {
                  int bias_dim = (i % dim) / inner_dim;
                  host_extension[i] = host_bias[bias_dim];

From d8f4d47a186a0d5133d485b8975e6c66fb5cced8 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 10:58:57 +0800
Subject: [PATCH 113/318] Update BM scale test

---
 test/saber/bm/test_saber_func_scale_BM.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index 066ba194b..d4e40d44b 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -33,6 +33,11 @@ void fill_vector_rand(std::vector<float>& vec) {
         vec[i] = rand() *1.0f/RAND_MAX - 0.5;
     }
 }
+void fill_vector_const(std::vector<float>& vec, float num) {
+    for (int i = 0; i < vec.size(); i++) {
+        vec[i] = num;
+    }
+}
 void print_vector_data(std::vector<float>& vec) {
     for (int i = 0; i < vec.size(); i++) {
         printf("%d, %f\n", i, vec[i]);
@@ -56,7 +61,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
 
     img_host.re_alloc(img_s);
     img_dev.re_alloc(img_s);
-    fill_tensor_host_rand(img_host, -0.5, 0.5);
+    fill_tensor_host_const(img_host, 1);
     img_dev.copy_from(img_host);
 
     TensorDf4 output_dev;
@@ -65,12 +70,10 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
     std::vector<float> scale_w;
     std::vector<float> scale_b;
     scale_w.resize(scale_dim);
-    fill_vector_rand(scale_w);
-    scale_w[0] = 0;
-    scale_w[1] = 0;
+    fill_vector_const(scale_w, 2);
     if (bias_term) {
         scale_b.resize(scale_dim);
-        fill_vector_rand(scale_b);
+        fill_vector_const(scale_b, 0);
     }
 
     ScaleParam<TensorDf4> param(scale_w,
@@ -105,7 +108,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
 }
 
 TEST(TestSaberFuncBM, test_func_constructor_elt) {
-//    test_scale(1, 2, 1, 2, 1, 1, false, 2);
+    test_scale(1, 2, 1, 2, 1, 1, false, 2);
     test_scale(1, 2, 1, 2, 1, 1, true, 2);
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */

From d3cef11b1c26510dd49ad2bb52de5599b39a40cd Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 13:52:16 +0800
Subject: [PATCH 114/318] cleanup

---
 saber/funcs/impl/bm/vender_scale.h      |   8 +-
 test/saber/bm/test_saber_func_fc_BM.cpp | 146 ------------------------
 2 files changed, 3 insertions(+), 151 deletions(-)
 delete mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index e1acd3bfa..e2e6fb900 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -81,16 +81,14 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         if (param.bias_term) {
             float* host_bias = &param.scale_b[0];
             float* host_extension = new float[size];
-            printf(".........\n");
+            //printf(".........\n");
             int dim = inner_dim * scale_dim;
-            //host_bias[0] = 1;
-            //host_bias[1] = 2;
             for (int i = 0; i < size; ++i) {
                  int bias_dim = (i % dim) / inner_dim;
                  host_extension[i] = host_bias[bias_dim];
-                 printf("%f, ", host_extension[i]);
+                 //printf("%f, ", host_extension[i]);
             }
-            printf("\n");
+            //printf("\n");
 
             BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension),
                     outer_dim, scale_dim * inner_dim, out_data));
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
deleted file mode 100644
index 869ff1bfd..000000000
--- a/test/saber/bm/test_saber_func_fc_BM.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-#include "core/context.h"
-#include "funcs/fc.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-typedef TargetWrapper<BM> API;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef TensorDf4::Dtype ftype;
-
-void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
-                const TensorHf4& bias, TensorHf4& tout) {
-
-    int m = tin.num();
-    int k = tin.valid_size() / m;
-    int n = weight.valid_size() / k;
-    bool bias_term = bias.valid_size() > 0;
-
-    const float* din = tin.data();
-    const float* w = weight.data();
-    float* dout = tout.mutable_data();
-
-    for (int i = 0; i < m; ++i) {
-        float* pdout = dout + i * n;
-        const float* pdin = din + i * k;
-
-        for (int j = 0; j < n; ++j) {
-            if (bias_term) {
-                pdout[j] = bias.data()[j];
-            } else {
-                pdout[j] = 0;
-            }
-
-            for (int l = 0; l < k; ++l) {
-                pdout[j] += pdin[l] * w[l * n + j];
-            }
-        }
-    }
-}
-
-TEST(TestSaberFuncBM, test_func_fc) {
-
-    int test_iter = 100;
-    int w_in = 7;
-    int h_in = 7;
-    int ch_in = 512;
-    int num_in = 1;
-
-    int num_out = 4096;
-    int axis = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_out = {num_in, num_out, 1, 1};
-
-    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
-    TensorDf4 weight(sh_w);
-    Shape sh_b{1, 1, 1, num_out};
-    TensorDf4 bias(sh_b);
-    fill_tensor_device_const(weight, 1.f);
-    fill_tensor_device_const(bias, 1.f);
-
-    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
-              ch_in << ", height=" << h_in << ", width=" << w_in;
-
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-
-    TensorDf4 tdin;
-    TensorDf4 tdout;
-    tdin.re_alloc(shape_in);
-    fill_tensor_device_const(tdin, 1.f);
-    input_dev_4d.push_back(&tdin);
-    output_dev_4d.push_back(&tdout);
-
-    // start Reshape & doInfer
-    Context<BM> ctx_dev(0, 1, 1);
-
-    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
-
-    Fc<BM, AK_FLOAT> fc;
-
-    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
-              shape_out[2] << ", " << shape_out[3];
-
-    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
-
-    LOG(INFO) << "re-alloc tensor buffer";
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
-    Shape va_sh = tdout.valid_shape();
-    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
-              va_sh[2] << ", " << va_sh[3];
-    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
-
-    LOG(INFO) << "FC initialization";
-    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
-
-    LOG(INFO) << "FC compute";
-    SaberTimer<BM> t1;
-    t1.clear();
-    t1.start(ctx_dev);
-
-    for (int i = 0; i < test_iter; ++i) {
-        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
-        //cudaDeviceSynchronize();
-    }
-
-    t1.end(ctx_dev);
-    float ts = t1.get_average_ms();
-    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
-    //print_tensor_device(*output_dev_4d[0]);
-
-    //! check result
-    TensorHf4 thin(shape_in);
-    TensorHf4 thout(shape_out);
-    TensorHf4 thw(sh_w);
-    TensorHf4 thb(sh_b);
-    thin.copy_from(tdin);
-    thw.copy_from(weight);
-    thb.copy_from(bias);
-    fc_compute(thin, thw, thb, thout);
-    //print_tensor_host(thout);
-
-    TensorHf4 thout_d(shape_out);
-    thout_d.copy_from(tdout);
-    double max_ratio = 0;
-    double max_diff = 0;
-    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
-    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
-    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
-
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    Env<BM>::env_init();
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-

From ec902952b3bf399047ef95a2c1ada5da950902ad Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 14:57:30 +0800
Subject: [PATCH 115/318] flush before next operation

---
 saber/funcs/impl/bm/vender_scale.h         | 4 +---
 test/saber/bm/test_saber_func_scale_BM.cpp | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index e2e6fb900..4e9402a43 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -81,15 +81,13 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         if (param.bias_term) {
             float* host_bias = &param.scale_b[0];
             float* host_extension = new float[size];
-            //printf(".........\n");
             int dim = inner_dim * scale_dim;
             for (int i = 0; i < size; ++i) {
                  int bias_dim = (i % dim) / inner_dim;
                  host_extension[i] = host_bias[bias_dim];
-                 //printf("%f, ", host_extension[i]);
             }
-            //printf("\n");
 
+            bm_flush(get_bm_handle());
             BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension),
                     outer_dim, scale_dim * inner_dim, out_data));
 
diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index d4e40d44b..a20b61cbb 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -73,7 +73,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
     fill_vector_const(scale_w, 2);
     if (bias_term) {
         scale_b.resize(scale_dim);
-        fill_vector_const(scale_b, 0);
+        fill_vector_const(scale_b, 3);
     }
 
     ScaleParam<TensorDf4> param(scale_w,

From c38bf09e25d56c12c823abc2b6c410c7a0137d5d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 15:54:06 +0800
Subject: [PATCH 116/318] check BM conv bias

---
 saber/funcs/impl/bm/vender_conv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 220b8a14e..7243fd6a4 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -51,7 +51,6 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
                           ConvParam<OpTensor>& param) {
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
         const InDataType *weight = (const InDataType *) param.weight()->data();
-        const InDataType *bias = (const InDataType *) param.bias()->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
 
         int input_n = inputs[0]->num();
@@ -75,6 +74,7 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         int dilation_w = param.dilation_w;
 
         bool with_bias = param.bias()->size() > 0;
+        const InDataType *bias = with_bias? (const InDataType *) param.bias()->data() : &bm_mem_null();
 
         bm_tensor_4d_t input_shape = {
             input_n,

From 8dbb4b4cc22fba3fb693df72a4c831a793132b47 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 5 Jul 2018 13:47:59 +0800
Subject: [PATCH 117/318] Update BM tensor test

---
 test/saber/bm/test_saber_tensor_BM.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 423ffe221..2400e73c3 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -415,12 +415,12 @@ TEST(TestSaberTensorBM, test_tensor_deepcopy) {
     td21.copy_from(td01);
     print_tensor_device(td21);
     //cudaDeviceSynchronize();
-}
+}*/
 
 TEST(TestSaberTensorBM, test_tensor_shape) {
-    typedef Tensor<X86, AK_BM, NCHW> Tensor4_0;
-    typedef Tensor<X86, AK_BM, NHWC> Tensor4_1;
-    typedef Tensor<X86, AK_BM, HW> Tensor2;
+    typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4_0;
+    typedef Tensor<X86, AK_FLOAT, NHWC> Tensor4_1;
+    typedef Tensor<X86, AK_FLOAT, HW> Tensor2;
 
     int nin = 2;
     int cin = 2;
@@ -562,8 +562,8 @@ TEST(TestSaberTensorBM, test_tensor_op) {
     Shape sh{1, 2, 2, 10};
     TensorDf4 td1(sh);
     TensorHf4 th1(sh);
-    Tensor<BM, AK_INT8, NCHW> td2(sh);
-    Tensor<X86, AK_INT8, NCHW> th2(sh);
+    Tensor<BM, AK_BM, NCHW> td2(sh);
+    Tensor<X86, AK_FLOAT, NCHW> th2(sh);
     LOG(INFO) << "testing host fill tensor with const 1.";
     fill_tensor_host_const(th1, 1.f);
     LOG(INFO) << "data type: float";
@@ -616,9 +616,9 @@ TEST(TestSaberTensorBM, test_tensor_op) {
 TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) {
     Shape sh{1, 1, 2, 10};
     Tensor<BM, AK_BM, NCHW> td1(sh);
-    Tensor<X86, AK_BM, NCHW> th1(sh);
-    Tensor<BM, AK_INT8, NCHW> td2;
-    Tensor<X86, AK_INT8, NCHW> th2;
+    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
+    Tensor<BM, AK_BM, NCHW> td2;
+    Tensor<X86, AK_FLOAT, NCHW> th2;
     td2.set_shape(sh);
     th2.set_shape(sh);
     LOG(INFO) << "testing host fill tensor with const 1.";
@@ -641,7 +641,7 @@ TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) {
 TEST(TestSaberTensorBM, test_tensor_base_type) {
     Shape sh(1, 3, 10, 10);
     Tensor<BM, AK_BM, NCHW> td1(sh);
-    Tensor<X86, AK_BM, NCHW> th1(sh);
+    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
     fill_tensor_host_rand(th1, 0.f, 255.f);
     td1.copy_from(th1);
     TensorBase* tb1;
@@ -652,7 +652,7 @@ TEST(TestSaberTensorBM, test_tensor_base_type) {
     Shape sh11 = th1.valid_shape();
     LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \
               ", h=" << sh11[2] << ", w=" << sh11[3];
-}*/
+}
 
 int main(int argc, const char** argv) {
     // initial logger

From a19e6fae278dfd5374d8c25b4aec210c032481b4 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 10 Jul 2018 11:13:29 +0800
Subject: [PATCH 118/318] Implement fc for BM

---
 saber/funcs/fc.h                        |   4 +
 saber/funcs/impl/bm/vender_fc.h         |  22 ++--
 test/saber/bm/test_saber_func_fc_BM.cpp | 147 ++++++++++++++++++++++++
 3 files changed, 164 insertions(+), 9 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp

diff --git a/saber/funcs/fc.h b/saber/funcs/fc.h
index 06dc8695a..8b1d553be 100644
--- a/saber/funcs/fc.h
+++ b/saber/funcs/fc.h
@@ -26,6 +26,10 @@
 #ifdef USE_X86_PLACE
 #include "saber/funcs/impl/x86/vender_fc.h"
 #endif
+
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_fc.h"
+#endif
    
 namespace anakin{
 
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 82dd6000c..c0cd7ea66 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -34,6 +34,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
+        _handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -47,16 +48,20 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param){
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data();
-        const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data();
+        const InDataType *weights = (const InDataType *) param.weights->data();
+        const InDataType *bias = (const InDataType *) param.bias->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
-        int batch_size = inputs[0]->num();
-        int input_len = inputs[0]->channel();
+        int batch_size = inputs[0]->count_valid(0, param.axis);
+        int input_len = inputs[0]->count_valid(param.axis, inputs[0]->dims());
         int output_len = param.num_output;
-        int is_transpose = param.is_transpose_weights ? 1 : 0;
-        BMDNN_CHECK(bmdnn_fc_forward(_handle, in_data, weights, bias,
-                                    batch_size, output_len, input_len, is_transpose, 1, 0,
-                                    out_data));
+        if (output_len <= 0) {
+            int weight_size = param.weights->valid_size();
+            output_len = weight_size / input_len;
+        }
+
+        BMDNN_CHECK(bmdnn_fc_forward(_handle, *in_data, *weights, *bias,
+                                    batch_size, output_len, input_len, param.is_transpose_weights, 1, 0,
+                                    *out_data));
         return SaberSuccess;
     };
 
@@ -64,7 +69,6 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     bm_handle_t _handle;
 };
 
-template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
 } //namespace saber
 
 } //namespace anakin
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
new file mode 100644
index 000000000..7b56033e6
--- /dev/null
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -0,0 +1,147 @@
+#include "core/context.h"
+#include "funcs/fc.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+typedef TargetWrapper<BM> API;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef TensorDf4::Dtype ftype;
+
+void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
+                const TensorHf4& bias, TensorHf4& tout) {
+
+    int m = tin.num();
+    int k = tin.valid_size() / m;
+    int n = weight.valid_size() / k;
+    bool bias_term = bias.valid_size() > 0;
+
+    const float* din = tin.data();
+    const float* w = weight.data();
+    float* dout = tout.mutable_data();
+
+    for (int i = 0; i < m; ++i) {
+        float* pdout = dout + i * n;
+        const float* pdin = din + i * k;
+
+        for (int j = 0; j < n; ++j) {
+            if (bias_term) {
+                pdout[j] = bias.data()[j];
+            } else {
+                pdout[j] = 0;
+            }
+
+            for (int l = 0; l < k; ++l) {
+                pdout[j] += pdin[l] * w[l * n + j];
+            }
+        }
+    }
+}
+
+TEST(TestSaberFuncBM, test_func_fc) {
+
+    int test_iter = 10;
+    int w_in = 7;
+    int h_in = 7;
+    int ch_in = 1024;
+    int num_in = 4;
+
+    int num_out = 4096;
+    int axis = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = {num_in, num_out, 1, 1};
+
+    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
+    TensorDf4 weight(sh_w);
+    Shape sh_b{1, 1, 1, num_out};
+    TensorDf4 bias(sh_b);
+    fill_tensor_device_const(weight, 1.f);
+    fill_tensor_device_const(bias, 1.f);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    TensorDf4 tdin;
+    TensorDf4 tdout;
+    tdin.re_alloc(shape_in);
+    fill_tensor_device_const(tdin, 1.f);
+    input_dev_4d.push_back(&tdin);
+    output_dev_4d.push_back(&tdout);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
+
+    Fc<BM, AK_BM, AK_BM, AK_BM, NCHW> fc;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
+    Shape va_sh = tdout.valid_shape();
+    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
+              va_sh[2] << ", " << va_sh[3];
+    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
+
+    LOG(INFO) << "FC initialization";
+    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
+
+    LOG(INFO) << "FC compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
+        bm_flush(get_bm_handle());
+        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        //output_dev_4d[0]->sync();
+        //cudaDeviceSynchronize();
+    }
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
+    //print_tensor_device(*output_dev_4d[0]);
+
+    //! check result
+    TensorHf4 thin(shape_in);
+    TensorHf4 thout(shape_out);
+    TensorHf4 thw(sh_w);
+    TensorHf4 thb(sh_b);
+    thin.copy_from(tdin);
+    thw.copy_from(weight);
+    thb.copy_from(bias);
+    fc_compute(thin, thw, thb, thout);
+    //print_tensor_host(thout);
+
+    TensorHf4 thout_d(shape_out);
+    thout_d.copy_from(tdout);
+    double max_ratio = 0;
+    double max_diff = 0;
+    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
+    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
+    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
+
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+

From e340b1b38d150eb6a07389a40d6b523a0fd180b1 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 11 Jul 2018 10:08:45 +0800
Subject: [PATCH 119/318] Implement eltwise for BM

---
 saber/funcs/eltwise.h                |   4 +
 saber/funcs/impl/bm/vender_eltwise.h | 118 +++++++++++++++++++++++++++
 saber/funcs/impl/bm/vender_scale.h   |   2 +-
 saber/saber_funcs_param.h            |  48 +++++++++++
 4 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100644 saber/funcs/impl/bm/vender_eltwise.h

diff --git a/saber/funcs/eltwise.h b/saber/funcs/eltwise.h
index 7d3a4860c..490f9b6bf 100644
--- a/saber/funcs/eltwise.h
+++ b/saber/funcs/eltwise.h
@@ -26,6 +26,10 @@
 #include "saber/funcs/impl/x86/saber_eltwise.h"
 #endif
 
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_eltwise.h"
+#endif
+
 namespace anakin {
 namespace saber {
 
diff --git a/saber/funcs/impl/bm/vender_eltwise.h b/saber/funcs/impl/bm/vender_eltwise.h
new file mode 100644
index 000000000..62ac2c436
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_eltwise.h
@@ -0,0 +1,118 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H
+
+#include "saber/funcs/impl/impl_eltwise.h"
+
+namespace anakin {
+
+namespace saber {
+
+template <DataType OpDtype,
+            DataType inDtype,
+            DataType outDtype,
+            typename LayOutType_op,
+            typename LayOutType_in,
+            typename LayOutType_out>
+class SaberEltwise<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        EltwiseParam<Tensor<BM, OpDtype, LayOutType_op>>> {
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    SaberEltwise() {}
+
+    ~SaberEltwise() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
+                         std::vector<DataTensor_out*>& outputs,
+                         EltwiseParam<OpTensor> &param,
+                         Context<BM> &ctx) {
+        _handle = get_bm_handle();
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
+                           std::vector<DataTensor_out*>& outputs,
+                           EltwiseParam<OpTensor> &param,
+                           Context<BM> &ctx) {
+        return SaberSuccess;
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                             std::vector<DataTensor_out*>& outputs,
+                             EltwiseParam<OpTensor> &param) {
+
+        int op_ = 0;
+        switch (param.operation) {
+            case Eltwise_prod:
+                op_ = 0;
+                break;
+            case Eltwise_sum:
+                op_ = 1;
+                break;
+            case Eltwise_max:
+                op_ = 2;
+                break;
+            default:
+                return SaberUnImplError;
+        }
+
+        //int input_size = inputs.size();
+        //CHECK_GE(input_size, 2) << "Input size should >= 2!";
+
+        OutDataType out_data = *(outputs[0]->mutable_data());
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+
+        std::vector<float> coeff_ = param.coeff;
+        if (coeff_.size() != inputs.size()) {
+            for (int j=0; j<(inputs.size() - coeff_.size()); j++) {
+                coeff_.push_back(1);
+            }
+        }
+
+        bm_device_mem_t* mask_data = new bm_device_mem_t();
+
+        int flag_first = 1;
+        for (int i=0; i<inputs.size(); i++){
+            const InDataType in_data = *(inputs[i]->data());
+            bmdnn_eltwise_forward(
+                    _handle,
+                    op_,
+                    flag_first,
+                    coeff_[i],
+                    i,
+                    in_data,
+                    out_data,
+                    input_n,
+                    input_c * input_h * input_w,
+                    *mask_data,
+                    out_data);
+
+            bm_flush(_handle);
+            flag_first = 0;
+        }
+        bm_free_device(_handle, *mask_data);
+        return SaberSuccess;
+    }
+
+private:
+    bm_handle_t _handle;
+};
+
+} //namespace saber
+
+} //namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 4e9402a43..2876e8005 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -87,7 +87,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
                  host_extension[i] = host_bias[bias_dim];
             }
 
-            bm_flush(get_bm_handle());
+            bm_flush(_handle);
             BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension),
                     outer_dim, scale_dim * inner_dim, out_data));
 
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index c6a88cbe4..5e9b89b0f 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -1695,6 +1695,54 @@ struct EltwiseParam {
     std::vector<DataDtype> coeff;
 };
 
+#ifdef USE_BM
+template <>
+struct EltwiseParam<Tensor<BM, AK_BM, NCHW>> {
+    EltwiseParam()
+        : operation(Eltwise_unknow)
+        , coeff()
+    {}
+    EltwiseParam(EltwiseType operation_in
+            , std::vector<float> coeff_in = std::vector<float>({1,1}))
+        : operation(operation_in)
+        , coeff(coeff_in)
+    {
+        if ((operation == Eltwise_sum) && (coeff.size() == 0)) {
+            coeff.push_back(1);
+            coeff.push_back(1);
+        }
+    }
+
+    EltwiseParam(const EltwiseParam<Tensor<BM, AK_BM, NCHW>>& right)
+        : operation(right.operation)
+        , coeff(right.coeff)
+    {}
+
+    EltwiseParam<Tensor<BM, AK_BM, NCHW>>& operator=(const EltwiseParam<Tensor<BM, AK_BM, NCHW>>& right) {
+        operation = right.operation;
+        coeff.resize(right.coeff.size());
+        for (int i = 0; i < coeff.size(); ++i) {
+            coeff[i] = right.coeff[i];
+        }
+        return *this;
+    }
+
+    bool operator==(const EltwiseParam<Tensor<BM, AK_BM, NCHW>>& right) {
+        bool comp_eq = true;
+        comp_eq = comp_eq && (operation == right.operation);
+        comp_eq = comp_eq && (coeff.size() == right.coeff.size());
+        if (!comp_eq) {
+            return comp_eq;
+        }
+        for (int i = 0; i < coeff.size(); ++i) {
+            comp_eq = comp_eq && (coeff[i] == right.coeff[i]);
+        }
+    }
+    EltwiseType operation;
+    std::vector<float> coeff;
+};
+#endif
+
 template <typename opTensor>
 struct EltwiseActiveParam {
     EltwiseActiveParam()

From 2fe6ca07bb78555c420ffd0ca3c0b4db07f12be5 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 11 Jul 2018 11:27:17 +0800
Subject: [PATCH 120/318] Add test for BM eltwise

---
 saber/funcs/impl/bm/vender_eltwise.h         |  13 +-
 test/saber/bm/test_saber_func_eltwise_BM.cpp | 627 +++++++++++++++++++
 2 files changed, 634 insertions(+), 6 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_eltwise_BM.cpp

diff --git a/saber/funcs/impl/bm/vender_eltwise.h b/saber/funcs/impl/bm/vender_eltwise.h
index 62ac2c436..050fc4d43 100644
--- a/saber/funcs/impl/bm/vender_eltwise.h
+++ b/saber/funcs/impl/bm/vender_eltwise.h
@@ -13,7 +13,7 @@ template <DataType OpDtype,
             typename LayOutType_op,
             typename LayOutType_in,
             typename LayOutType_out>
-class SaberEltwise<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+class VenderEltwise<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
 public ImplBase<
         Tensor<BM, inDtype, LayOutType_in>,
         Tensor<BM, outDtype, LayOutType_out>,
@@ -28,9 +28,9 @@ public ImplBase<
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    SaberEltwise() {}
+    VenderEltwise() {}
 
-    ~SaberEltwise() {}
+    ~VenderEltwise() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
                          std::vector<DataTensor_out*>& outputs,
@@ -44,7 +44,6 @@ public ImplBase<
                            std::vector<DataTensor_out*>& outputs,
                            EltwiseParam<OpTensor> &param,
                            Context<BM> &ctx) {
-        return SaberSuccess;
     }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
@@ -77,7 +76,8 @@ public ImplBase<
 
         std::vector<float> coeff_ = param.coeff;
         if (coeff_.size() != inputs.size()) {
-            for (int j=0; j<(inputs.size() - coeff_.size()); j++) {
+            int diff = inputs.size() - coeff_.size();
+            for (int j=0; j<diff; j++) {
                 coeff_.push_back(1);
             }
         }
@@ -103,7 +103,8 @@ public ImplBase<
             bm_flush(_handle);
             flag_first = 0;
         }
-        bm_free_device(_handle, *mask_data);
+
+        //bm_free_device(_handle, *mask_data);
         return SaberSuccess;
     }
 
diff --git a/test/saber/bm/test_saber_func_eltwise_BM.cpp b/test/saber/bm/test_saber_func_eltwise_BM.cpp
new file mode 100644
index 000000000..da931510b
--- /dev/null
+++ b/test/saber/bm/test_saber_func_eltwise_BM.cpp
@@ -0,0 +1,627 @@
+#include "core/context.h"
+#include "funcs/eltwise.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+/*
+TEST(TestSaberFuncBM, test_func_prod) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_prod;
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    int w_in = 10;
+    int h_in = 2;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = shape_in;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin0(shape_in);
+    Tensor<X86, AK_FLOAT, NCHW> thin1(shape_in);
+    Tensor<X86, AK_FLOAT, NCHW> thin2(shape_in);
+    for (int i = 0; i < thin0.size(); ++i) {
+        thin0.mutable_data()[i] = i;
+    }
+    for (int i = 0; i < thin1.size(); ++i) {
+        thin1.mutable_data()[i] = i + 1;
+    }
+    for (int i = 0; i < thin2.size(); ++i) {
+        thin2.mutable_data()[i] = 1;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin0, tdin1, tdin2, tdout;
+    tdin0.re_alloc(shape_in);
+    tdin1.re_alloc(shape_in);
+    tdin2.re_alloc(shape_in);
+    tdin0.copy_from(thin0);
+    tdin1.copy_from(thin1);
+    tdin2.copy_from(thin2);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin0);
+    input_dev_4d.push_back(&tdin1);
+    input_dev_4d.push_back(&tdin2);
+    output_dev_4d.push_back(&tdout);
+
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param);
+
+    //SABER_CHECK(eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param));
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout{num_in, ch_in, h_in, w_in};
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+
+
+    TensorHf4 th_for_print;
+    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
+    th_for_print.copy_from(*output_dev_4d[0]);
+    print_tensor_host(th_for_print);
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+*/
+
+TEST(TestSaberFuncBM, test_func_sum) {
+
+    Env<BM>::env_init();
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_sum;
+
+    int w_in = 10;
+    int h_in = 2;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = shape_in;
+
+    // Host Tensor
+    TensorHf4 thin1(shape_in);
+    TensorHf4 thin2(shape_in);
+
+    for (int i = 0; i < thin1.size(); ++i) {
+        thin1.mutable_data()[i] = 1.0;
+    }
+
+    for (int i = 0; i < thin2.size(); ++i) {
+        thin2.mutable_data()[i] = 2.0;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin0, tdin1, tdout;
+    tdin0.re_alloc(shape_in);
+    tdin1.re_alloc(shape_in);
+    tdin0.copy_from(thin1);
+    tdin1.copy_from(thin2);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin0);
+    input_dev_4d.push_back(&tdin1);
+    input_dev_4d.push_back(&tdin1);
+    output_dev_4d.push_back(&tdout);
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: " << sh[0] << ", " << sh[1] << \
+              ", " << sh[2] << ", " << sh[3];
+    Shape shout{num_in, ch_in, h_in, w_in};
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+    print_tensor_device(*output_dev_4d[0]);
+}
+
+TEST(TestSaberFuncBM, test_func_max) {
+
+    Env<BM>::env_init();
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_max;
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    int w_in = 10;
+    int h_in = 2;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = shape_in;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin0(shape_in);
+    Tensor<X86, AK_FLOAT, NCHW> thin1(shape_in);
+    Tensor<X86, AK_FLOAT, NCHW> thin2(shape_in);
+    for (int i = 0; i < thin0.size(); ++i) {
+        thin0.mutable_data()[i] = i;
+    }
+    for (int i = 0; i < thin1.size(); ++i) {
+        thin1.mutable_data()[i] = i + 2;
+    }
+    for (int i = 0; i < thin2.size(); ++i) {
+        thin2.mutable_data()[i] = i + 1;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin0, tdin1, tdin2, tdout;
+    tdin0.re_alloc(shape_in);
+    tdin1.re_alloc(shape_in);
+    tdin2.re_alloc(shape_in);
+    tdin0.copy_from(thin0);
+    tdin1.copy_from(thin1);
+    tdin2.copy_from(thin2);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin0);
+    input_dev_4d.push_back(&tdin1);
+    input_dev_4d.push_back(&tdin2);
+    output_dev_4d.push_back(&tdout);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout{num_in, ch_in, h_in, w_in};
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+    print_tensor_device(*output_dev_4d[0]);
+
+}
+
+/*   0   1   2   3   4
+ *  10  11  12  13  14   (tdin_roi1, c=0)
+ *   (tdin_roi0, c=0)   25  26  27  28  29
+ *                      35  36  37  38  39
+ * =======================================
+ *  40  41  42  43  44
+ *  50  51  52  53  54   (tdin_roi1, c=1)
+ *   (tdin_roi0, c=1)   65  66  67  68  69
+ *                      75  76  77  78  79
+ */
+/*
+TEST(TestSaberFuncBM, test_func_prod_roi) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_prod;
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    int w_in = 10;
+    int h_in = 4;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
+    Shape off0{0, 0, 0, 0};
+    Shape off1{0, 0, 2, 5};
+    Shape shape_out = shape_in_roi;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
+    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
+    tdout.re_alloc(shape_out);
+
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin_roi0);
+    input_dev_4d.push_back(&tdin_roi1);
+    input_dev_4d.push_back(&tdin_roi1);
+    output_dev_4d.push_back(&tdout);
+
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout(shape_in_roi);
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+
+
+    TensorHf4 th_for_print;
+    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
+    th_for_print.copy_from(*output_dev_4d[0]);
+    print_tensor_host(th_for_print);
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+*/
+
+/*   0   1   2   3   4
+ *  10  11  12  13  14   (tdin_roi1, c=0)
+ *   (tdin_roi0, c=0)   25  26  27  28  29
+ *                      35  36  37  38  39
+ * =======================================
+ *  40  41  42  43  44
+ *  50  51  52  53  54   (tdin_roi1, c=1)
+ *   (tdin_roi0, c=1)   65  66  67  68  69
+ *                      75  76  77  78  79
+ */
+/*
+TEST(TestSaberFuncBM, test_func_sum_roi_new) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_sum;
+
+    int w_in = 10;
+    int h_in = 4;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
+
+    Shape off0{0, 0, 0, 0};
+    Shape off1{0, 0, 2, 5};
+    Shape shape_out = shape_in_roi;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
+    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin_roi0);
+    input_dev_4d.push_back(&tdin_roi1);
+//    input_dev_4d.push_back(&tdin_roi1);
+//    input_dev_4d.push_back(&tdin_roi1);
+    output_dev_4d.push_back(&tdout);
+
+//    Shape shape_coeff(1, 1, 1, input_dev_4d.size());
+//    TensorHf4 thcoeff(shape_coeff);
+//    for (int i = 0; i < thcoeff.size(); ++i) {
+//        thcoeff.mutable_data()[i] = 1;
+//    }
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout(shape_in_roi);
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    print_tensor_device(*input_dev_4d[0]);
+    print_tensor_device(*input_dev_4d[1]);
+    cudaDeviceSynchronize();
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+*/
+/*
+TEST(TestSaberFuncBM, test_func_sum_roi) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_sum;
+
+    int w_in = 10;
+    int h_in = 4;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
+    Shape off0{0, 0, 0, 0};
+    Shape off1{0, 0, 2, 5};
+    Shape shape_out = shape_in_roi;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
+    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin_roi0);
+    input_dev_4d.push_back(&tdin_roi1);
+    output_dev_4d.push_back(&tdout);
+
+    //Shape shape_coeff(1, 1, 1, 3);
+    Shape shape_coeff(1, 1, 1, input_dev_4d.size());
+    TensorHf4 thcoeff(shape_coeff);
+
+    for (int i = 0; i < thcoeff.size(); ++i) {
+        thcoeff.mutable_data()[i] = i;
+    }
+    TensorDf4 tdcoeff;
+    tdcoeff.re_alloc(shape_coeff);
+    tdcoeff.copy_from(thcoeff);
+
+    EltwiseParam<TensorDf4> param(elt_type, &tdcoeff);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout(shape_in_roi);
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+
+    TensorHf4 th_for_print;
+    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
+    th_for_print.copy_from(*output_dev_4d[0]);
+    print_tensor_host(th_for_print);
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+*/
+
+/*
+TEST(TestSaberFuncBM, test_func_max_roi) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_max;
+
+    int w_in = 10;
+    int h_in = 4;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
+    Shape off0{0, 0, 0, 0};
+    Shape off1{0, 0, 2, 5};
+    Shape shape_out = shape_in_roi;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
+    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin_roi0);
+    input_dev_4d.push_back(&tdin_roi1);
+    output_dev_4d.push_back(&tdout);
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout(shape_in_roi);
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+
+    TensorHf4 th_for_print;
+    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
+    th_for_print.copy_from(*output_dev_4d[0]);
+    print_tensor_host(th_for_print);
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+*/
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}

From 8125c944bec0486c34b1d44054a17dbe035e22e8 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 11 Jul 2018 11:40:43 +0800
Subject: [PATCH 121/318] test eltwise PROD for BM

---
 test/saber/bm/test_saber_func_eltwise_BM.cpp | 21 +++-----------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/test/saber/bm/test_saber_func_eltwise_BM.cpp b/test/saber/bm/test_saber_func_eltwise_BM.cpp
index da931510b..643f4e026 100644
--- a/test/saber/bm/test_saber_func_eltwise_BM.cpp
+++ b/test/saber/bm/test_saber_func_eltwise_BM.cpp
@@ -7,7 +7,7 @@
 
 using namespace anakin::saber;
 
-/*
+
 TEST(TestSaberFuncBM, test_func_prod) {
 
     Env<BM>::env_init();
@@ -63,12 +63,10 @@ TEST(TestSaberFuncBM, test_func_prod) {
 
 
     Context<BM> ctx_dev(0, 1, 1);
-    Eltwise<BM, AK_BM> eltwise_dev;
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
 
     LOG(INFO) << "eltwise compute output shape";
-    eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param);
-
-    //SABER_CHECK(eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param));
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
 
     // Verify output shape
     Shape sh = output_dev_4d[0]->valid_shape();
@@ -86,22 +84,9 @@ TEST(TestSaberFuncBM, test_func_prod) {
     LOG(INFO) << "eltwise compute";
     eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
 
-
-    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-    output_dev_4d[0]->sync();
     print_tensor_device(*output_dev_4d[0]);
-    cudaDeviceSynchronize();
-
-
-    TensorHf4 th_for_print;
-    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
-    th_for_print.copy_from(*output_dev_4d[0]);
-    print_tensor_host(th_for_print);
-
-    CUDA_CHECK(cudaPeekAtLastError());
 }
 
-*/
 
 TEST(TestSaberFuncBM, test_func_sum) {
 

From 0596def27ff050ff804522d176ecc490ddd9be08 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 18 Jun 2018 13:39:29 +0800
Subject: [PATCH 122/318] Initial checkin for BM device support

---
 .idea/workspace.xml                           | 393 +++++++++
 CMakeLists.txt                                |  24 +-
 cmake/compiler_options.cmake                  |  18 +
 cmake/config/anakin_config.h.in               |   2 +
 cmake/gather.cmake                            |   6 +
 framework/core/data_types.h                   |   9 +
 saber/CMakeLists.txt                          |  38 +-
 saber/core/common.h                           |  14 +
 saber/core/impl/bm/bm_device.cpp              |  24 +
 saber/core/impl/bm/bm_impl.cpp                |  89 ++
 saber/core/target_traits.h                    |   7 +
 saber/core/target_wrapper.h                   |  60 +-
 saber/core/tensor_op.cpp                      |   6 +-
 saber/funcs/CMakeLists.txt                    |  12 +
 saber/funcs/impl/bm/base/CMakeLists.txt       |  20 +
 .../impl/bm/base/include/bmdnn/bmdnn_api.h    | 814 ++++++++++++++++++
 .../bm/base/include/bmdnn/bmdnn_ext_api.h     | 438 ++++++++++
 .../bm/base/include/bmdnn/bmdnn_runtime.h     |  20 +
 .../impl/bm/base/include/bmdnn/op_code.h      |  62 ++
 .../bm/base/include/bmlib/bmlib_runtime.h     | 229 +++++
 .../impl/bm/base/include/bmlib/bmlib_utils.h  |  72 ++
 .../impl/bm/base/include/bmruntime/bmblob.h   |  97 +++
 .../impl/bm/base/include/bmruntime/bmcnnctx.h |  58 ++
 .../impl/bm/base/include/bmruntime/bmnet.h    |  78 ++
 .../bm/base/include/bmruntime/bmruntime.h     | 154 ++++
 .../base/include/bmruntime/bmruntime_common.h |  65 ++
 .../include/bmruntime/bmruntime_interface.h   |  11 +
 saber/funcs/impl/bm/vender_activation.h       |  96 +++
 saber/funcs/impl/bm/vender_conv.h             | 195 +++++
 saber/funcs/impl/bm/vender_conv_act.h         | 198 +++++
 saber/funcs/impl/bm/vender_conv_act_pooling.h | 176 ++++
 saber/funcs/impl/bm/vender_fc.h               | 114 +++
 saber/funcs/impl/bm/vender_pooling.h          | 151 ++++
 saber/saber_funcs_param.h                     |  12 +-
 saber/saber_types.h                           |  10 +-
 test/saber/bm/test_TargetWrapper_BM.cpp       |  16 +
 test/saber/bm/test_saber_buffer_BM.cpp        | 116 +++
 test/saber/bm/test_saber_buffer_BM.h          |  20 +
 test/saber/bm/test_saber_context_BM.cpp       |  31 +
 test/saber/bm/test_saber_context_BM.h         |  21 +
 test/saber/bm/test_saber_device_BM.cpp        |  20 +
 test/saber/bm/test_saber_device_BM.h          |  21 +
 test/saber/bm/test_saber_func_BM.h            |  38 +
 .../bm/test_saber_func_activation_BM.cpp      | 183 ++++
 test/saber/bm/test_saber_func_conv_BM.cpp     | 725 ++++++++++++++++
 test/saber/bm/test_saber_func_fc_BM.cpp       | 148 ++++
 test/saber/bm/test_saber_func_pooling_BM.cpp  | 311 +++++++
 test/saber/bm/test_saber_shape_BM.cpp         | 126 +++
 test/saber/bm/test_saber_shape_BM.h           |  25 +
 test/saber/bm/test_saber_tensor_BM.cpp        | 642 ++++++++++++++
 test/saber/bm/test_saber_tensor_BM.h          |  21 +
 51 files changed, 6218 insertions(+), 18 deletions(-)
 create mode 100644 .idea/workspace.xml
 create mode 100644 saber/core/impl/bm/bm_device.cpp
 create mode 100644 saber/core/impl/bm/bm_impl.cpp
 create mode 100644 saber/funcs/impl/bm/base/CMakeLists.txt
 create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/op_code.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
 create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
 create mode 100644 saber/funcs/impl/bm/vender_activation.h
 create mode 100644 saber/funcs/impl/bm/vender_conv.h
 create mode 100644 saber/funcs/impl/bm/vender_conv_act.h
 create mode 100644 saber/funcs/impl/bm/vender_conv_act_pooling.h
 create mode 100644 saber/funcs/impl/bm/vender_fc.h
 create mode 100644 saber/funcs/impl/bm/vender_pooling.h
 create mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp
 create mode 100644 test/saber/bm/test_saber_buffer_BM.cpp
 create mode 100644 test/saber/bm/test_saber_buffer_BM.h
 create mode 100644 test/saber/bm/test_saber_context_BM.cpp
 create mode 100644 test/saber/bm/test_saber_context_BM.h
 create mode 100644 test/saber/bm/test_saber_device_BM.cpp
 create mode 100644 test/saber/bm/test_saber_device_BM.h
 create mode 100644 test/saber/bm/test_saber_func_BM.h
 create mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp
 create mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp
 create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp
 create mode 100644 test/saber/bm/test_saber_func_pooling_BM.cpp
 create mode 100644 test/saber/bm/test_saber_shape_BM.cpp
 create mode 100644 test/saber/bm/test_saber_shape_BM.h
 create mode 100644 test/saber/bm/test_saber_tensor_BM.cpp
 create mode 100644 test/saber/bm/test_saber_tensor_BM.h

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 000000000..48b584478
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,393 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="CMakeRunConfigurationManager" shouldGenerate="true" shouldDeleteObsolete="true" buildAllGenerated="false">
+    <generated />
+  </component>
+  <component name="CMakeSettings">
+    <configurations>
+      <configuration PROFILE_NAME="Debug" CONFIG_NAME="Debug" />
+    </configurations>
+  </component>
+  <component name="ChangeListManager">
+    <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="">
+      <change afterPath="$PROJECT_DIR$/saber/core/impl/bm/bm_device.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/core/impl/bm/bm_impl.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/CMakeLists.txt" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmdnn/op_code.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_activation.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_conv.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_conv_act.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_conv_act_pooling.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_fc.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_pooling.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_TargetWrapper_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_buffer_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_buffer_BM.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_context_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_context_BM.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_device_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_device_BM.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_BM.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_activation_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_conv_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_fc_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_pooling_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_shape_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_shape_BM.h" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_tensor_BM.cpp" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_tensor_BM.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/CMakeLists.txt" beforeDir="false" afterPath="$PROJECT_DIR$/CMakeLists.txt" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/cmake/compiler_options.cmake" beforeDir="false" afterPath="$PROJECT_DIR$/cmake/compiler_options.cmake" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/cmake/config/anakin_config.h.in" beforeDir="false" afterPath="$PROJECT_DIR$/cmake/config/anakin_config.h.in" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/cmake/gather.cmake" beforeDir="false" afterPath="$PROJECT_DIR$/cmake/gather.cmake" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/framework/core/data_types.h" beforeDir="false" afterPath="$PROJECT_DIR$/framework/core/data_types.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/CMakeLists.txt" beforeDir="false" afterPath="$PROJECT_DIR$/saber/CMakeLists.txt" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/core/common.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/common.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/core/target_traits.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/target_traits.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/core/target_wrapper.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/target_wrapper.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/core/tensor_op.cpp" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/tensor_op.cpp" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/funcs/CMakeLists.txt" beforeDir="false" afterPath="$PROJECT_DIR$/saber/funcs/CMakeLists.txt" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/saber_funcs_param.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/saber_funcs_param.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/saber_types.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/saber_types.h" afterDir="false" />
+    </list>
+    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
+    <option name="TRACKING_ENABLED" value="true" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FileEditorManager">
+    <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
+      <file leaf-file-name="saber_funcs_param.h" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="417">
+              <caret line="33" column="21" lean-forward="true" selection-start-line="33" selection-start-column="21" selection-end-line="33" selection-end-column="21" />
+              <folding>
+                <element signature="e#897#918#0" expanded="true" />
+                <element signature="e#948#969#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="target_wrapper.h" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="750">
+              <caret line="559" lean-forward="true" selection-start-line="559" selection-end-line="559" />
+              <folding>
+                <element signature="e#14794#16797#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="tensor.h" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="337">
+              <caret line="670" column="15" selection-start-line="670" selection-start-column="10" selection-end-line="670" selection-end-column="15" />
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="tensor_op.cpp" pinned="false" current-in-tab="true">
+        <entry file="file://$PROJECT_DIR$/saber/core/tensor_op.cpp">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="491">
+              <caret line="127" lean-forward="true" selection-start-line="127" selection-end-line="127" />
+              <folding>
+                <element signature="e#12586#12607#0" expanded="true" />
+                <element signature="e#12632#12656#0" expanded="true" />
+                <element signature="e#12686#12707#0" expanded="true" />
+                <element signature="e#12745#12766#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name=".gitignore" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/.gitignore">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="315">
+              <caret line="21" selection-start-line="21" selection-end-line="21" />
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="CMakeLists.txt" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/CMakeLists.txt">
+          <provider selected="true" editor-type-id="text-editor" />
+        </entry>
+      </file>
+    </leaf>
+  </component>
+  <component name="FindInProjectRecents">
+    <findStrings>
+      <find>MvnParam</find>
+      <find>ConvParam</find>
+      <find>TargetType</find>
+      <find>mem_set</find>
+      <find>&amp;</find>
+      <find>BM</find>
+      <find>print</find>
+      <find>print_tensor_host</find>
+    </findStrings>
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="IdeDocumentHistory">
+    <option name="CHANGED_PATHS">
+      <list>
+        <option value="$PROJECT_DIR$/saber/core/tensor.h" />
+        <option value="$PROJECT_DIR$/.gitignore" />
+        <option value="$PROJECT_DIR$/CMakeLists.txt" />
+        <option value="$PROJECT_DIR$/saber/core/target_wrapper.h" />
+        <option value="$PROJECT_DIR$/saber/core/tensor_op.cpp" />
+      </list>
+    </option>
+  </component>
+  <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsGulpfileManager">
+    <detection-done>true</detection-done>
+    <sorting>DEFINITION_ORDER</sorting>
+  </component>
+  <component name="NodePackageJsonFileManager">
+    <packageJsonPaths />
+  </component>
+  <component name="ProjectFrameBounds">
+    <option name="y" value="23" />
+    <option name="width" value="2560" />
+    <option name="height" value="1353" />
+  </component>
+  <component name="ProjectLevelVcsManager">
+    <ConfirmationsSetting value="2" id="Add" />
+  </component>
+  <component name="ProjectView">
+    <navigator proportions="" version="1">
+      <foldersAlwaysOnTop value="true" />
+    </navigator>
+    <panes>
+      <pane id="ProjectPane">
+        <subPane>
+          <expand>
+            <path>
+              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
+              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
+              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+              <item name="framework" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
+              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+              <item name="saber" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
+              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+              <item name="saber" type="462c0819:PsiDirectoryNode" />
+              <item name="core" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
+              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+              <item name="saber" type="462c0819:PsiDirectoryNode" />
+              <item name="funcs" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
+              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+              <item name="saber" type="462c0819:PsiDirectoryNode" />
+              <item name="funcs" type="462c0819:PsiDirectoryNode" />
+              <item name="impl" type="462c0819:PsiDirectoryNode" />
+            </path>
+          </expand>
+          <select />
+        </subPane>
+      </pane>
+      <pane id="Scope" />
+    </panes>
+  </component>
+  <component name="PropertiesComponent">
+    <property name="WebServerToolWindowFactoryState" value="false" />
+  </component>
+  <component name="RunDashboard">
+    <option name="ruleStates">
+      <list>
+        <RuleState>
+          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
+        </RuleState>
+        <RuleState>
+          <option name="name" value="StatusDashboardGroupingRule" />
+        </RuleState>
+      </list>
+    </option>
+  </component>
+  <component name="SvnConfiguration">
+    <configuration />
+  </component>
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="" />
+      <created>1533519941069</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1533519941069</updated>
+      <workItem from="1533519943497" duration="1090000" />
+      <workItem from="1533533623166" duration="3417000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TimeTrackingManager">
+    <option name="totallyTimeSpent" value="4507000" />
+  </component>
+  <component name="ToolWindowManager">
+    <frame x="0" y="23" width="2560" height="1353" extended-state="0" />
+    <editor active="true" />
+    <layout>
+      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
+      <window_info anchor="bottom" id="TODO" order="6" />
+      <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
+      <window_info anchor="bottom" id="Version Control" order="7" weight="0.28850666" />
+      <window_info anchor="bottom" id="Run" order="2" />
+      <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
+      <window_info anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
+      <window_info id="Favorites" order="2" side_tool="true" />
+      <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
+      <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
+      <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
+      <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
+      <window_info anchor="bottom" id="Message" order="0" />
+      <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
+      <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
+      <window_info anchor="bottom" id="Find" order="1" />
+    </layout>
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="1" />
+  </component>
+  <component name="Vcs.Log.History.Properties">
+    <option name="COLUMN_ORDER">
+      <list>
+        <option value="0" />
+        <option value="2" />
+        <option value="3" />
+        <option value="1" />
+      </list>
+    </option>
+  </component>
+  <component name="Vcs.Log.Tabs.Properties">
+    <option name="TAB_STATES">
+      <map>
+        <entry key="MAIN">
+          <value>
+            <State>
+              <option name="RECENTLY_FILTERED_USER_GROUPS">
+                <collection />
+              </option>
+              <option name="RECENTLY_FILTERED_BRANCH_GROUPS">
+                <collection />
+              </option>
+              <option name="COLUMN_ORDER">
+                <list>
+                  <option value="0" />
+                  <option value="1" />
+                  <option value="2" />
+                  <option value="3" />
+                </list>
+              </option>
+            </State>
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
+  <component name="VcsContentAnnotationSettings">
+    <option name="myLimit" value="2678400000" />
+  </component>
+  <component name="editorHistoryManager">
+    <entry file="file://$PROJECT_DIR$/examples/cuda/example_nv_cnn_net.cpp">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="255">
+          <caret line="17" column="26" lean-forward="true" selection-start-line="17" selection-start-column="26" selection-end-line="17" selection-end-column="26" />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/examples/cuda/example_nv_cnn_net.cpp">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="255">
+          <caret line="17" column="26" selection-start-line="17" selection-start-column="26" selection-end-line="17" selection-end-column="26" />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/.gitignore">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="315">
+          <caret line="21" selection-start-line="21" selection-end-line="21" />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/CMakeLists.txt">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="417">
+          <caret line="33" column="21" lean-forward="true" selection-start-line="33" selection-start-column="21" selection-end-line="33" selection-end-column="21" />
+          <folding>
+            <element signature="e#897#918#0" expanded="true" />
+            <element signature="e#948#969#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="750">
+          <caret line="559" lean-forward="true" selection-start-line="559" selection-end-line="559" />
+          <folding>
+            <element signature="e#14794#16797#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="337">
+          <caret line="670" column="15" selection-start-line="670" selection-start-column="10" selection-end-line="670" selection-end-column="15" />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/saber/core/tensor_op.cpp">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="491">
+          <caret line="127" lean-forward="true" selection-start-line="127" selection-end-line="127" />
+          <folding>
+            <element signature="e#12586#12607#0" expanded="true" />
+            <element signature="e#12632#12656#0" expanded="true" />
+            <element signature="e#12686#12707#0" expanded="true" />
+            <element signature="e#12745#12766#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+  </component>
+</project>
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 369ccaf57..e1ac0aa80 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,16 @@ anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_CUDA)
 anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_CUDA)
 anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM)
 
+# compile options for BM place
+anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU)
+anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM)
+anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM)
+anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM)
+anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM)
+anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM)
+
+
+
 # build options for amd.
 anakin_option(USE_AMD "Use AMD ROCm OpenCL" YES if AMD_GPU)
 
@@ -87,6 +97,9 @@ cmake_minimum_required(VERSION ${MIN_CMAKE_V} FATAL_ERROR)
 if(USE_CUDA)
     # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well.
     set(SELECTED_SASS_TARGET_ARCH "61")
+elseif(USE_BM)
+    # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well.
+    #set(SELECTED_SASS_TARGET_ARCH "61")
 endif()
 if((NOT BUILD_FAT_BIN) AND (NOT BUILD_CROSS_PLANTFORM) AND USE_CUDA)
     # Select the only nvidia gpu arch you want to be built on
@@ -97,6 +110,11 @@ endif()
 anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_CUDA)
 anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_CUDA)
 
+# build options for BM.
+anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_BM)
+anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_BM)
+
+
 # common build options
 anakin_option(ENABLE_DEBUG "Enable DEBUG(default) mode." NO)
 anakin_option(ENABLE_VERBOSE_MSG "Enable verbose=1 : compile msg during make." NO)
@@ -142,7 +160,7 @@ else()
     set(CMAKE_BUILD_TYPE Release FORCE)
 endif()
 
-if(USE_LOGGER) 
+if(USE_LOGGER)
 	anakin_option(ENABLE_STACKTRACES "If enable local logger with stacktrace." YES if NOT USE_ARM_PLACE)
 	anakin_option(SUPPORT_PTHREADS "If enable local logger with supporting pthreads. " YES)
 endif()
@@ -171,6 +189,10 @@ if(USE_CUDA)
     include(cmake/cuda.cmake)
 endif()
 
+if(USE_BM)
+    #include(cmake/cuda.cmake)
+endif()
+
 if(USE_X86_PLACE)
     set(ANAKIN_TEMP_THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/third-party)
     if(USE_MKLML)
diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake
index ef5e953c4..e10d32783 100644
--- a/cmake/compiler_options.cmake
+++ b/cmake/compiler_options.cmake
@@ -120,3 +120,21 @@ if(USE_CUDA)
     # set default nvidia gpu arch
     set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1")
 endif()
+
+if(USE_BM)
+	if(CMAKE_BUILD_TYPE MATCHES Debug)
+		anakin_add_compile_option("-Xcompiler -fPIC" NVCC)
+		anakin_add_compile_option(-G NVCC)
+		anakin_add_compile_option(-g NVCC)
+		anakin_add_compile_option(-std=c++11 NVCC)
+		anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) # suppress warning by architectures are deprecated (2.0,2.1)
+	else()
+		anakin_add_compile_option("-Xcompiler -fPIC" NVCC)
+		anakin_add_compile_option(-O3 NVCC)
+		anakin_add_compile_option(-std=c++11 NVCC)
+		anakin_add_compile_option("--default-stream per-thread" NVCC)
+		anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC)
+	endif()
+	# set default nvidia gpu arch
+	set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1")
+endif()
diff --git a/cmake/config/anakin_config.h.in b/cmake/config/anakin_config.h.in
index 9de568015..663c68910 100644
--- a/cmake/config/anakin_config.h.in
+++ b/cmake/config/anakin_config.h.in
@@ -36,6 +36,8 @@
 
 #cmakedefine USE_CUDA
 
+#cmakedefine USE_BM
+
 #cmakedefine USE_CUDNN
 
 #cmakedefine USE_PYTHON
diff --git a/cmake/gather.cmake b/cmake/gather.cmake
index e3941ddd4..ff1a45d4e 100644
--- a/cmake/gather.cmake
+++ b/cmake/gather.cmake
@@ -24,6 +24,12 @@ if(USE_CUDA)
     anakin_find_cuda()
 endif()
 
+if(USE_BM)
+    #set other cuda path
+    #set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_PATH})
+    #anakin_find_cuda()
+endif()
+
 # set amd opencl path
 if(USE_AMD)
     #amd_set_opencl_path()
diff --git a/framework/core/data_types.h b/framework/core/data_types.h
index 496f1b688..c0c550a12 100644
--- a/framework/core/data_types.h
+++ b/framework/core/data_types.h
@@ -17,6 +17,7 @@
 #define ANAKIN_DATA_TYPES_H 
 
 #include "framework/core/parameter.h"
+#include "bmlib_runtime.h"
 #include <cstddef>
 
 namespace anakin {
@@ -45,6 +46,7 @@ SABER_TO_BASE_TYPE(AK_UINT16, uint16_t);
 SABER_TO_BASE_TYPE(AK_UINT32, uint32_t);
 SABER_TO_BASE_TYPE(AK_BOOL, bool);
 SABER_TO_BASE_TYPE(AK_STRING, std::string);
+SABER_TO_BASE_TYPE(AK_BM, bm_device_mem_t);
 
 template<typename T>
 struct DataTypeRecover {
@@ -69,6 +71,7 @@ BASE_TYPE_TO_SABER(uint8_t, AK_UINT8);
 BASE_TYPE_TO_SABER(uint32_t, AK_UINT32);
 BASE_TYPE_TO_SABER(bool, AK_BOOL);
 BASE_TYPE_TO_SABER(std::string, AK_STRING);
+BASE_TYPE_TO_SABER(bm_device_mem_t, AK_BM);
 
 template<typename T>
 struct TypeWarpper {
@@ -96,6 +99,7 @@ ANAKIN_TO_TYPE_ID(long long, anakin_int64)
 ANAKIN_TO_TYPE_ID(unsigned long long, anakin_uint64)
 ANAKIN_TO_TYPE_ID(bool, anakin_bool)
 ANAKIN_TO_TYPE_ID(std::string, anakin_string)
+ANAKIN_TO_TYPE_ID(bm_device_mem_t, anakin_bm)
 
 /// unique type tensor
 /// ANAKIN_TO_TYPE_ID(tensor, anakin_tensor)
@@ -133,6 +137,11 @@ ANAKIN_TO_TYPE_ID(Enum, anakin_tuple_enum)
 	ANAKIN_PBLOCK_TO_TYPE_ID(float, ARM, anakin_block_float)
 #endif
 
+#ifdef USE_BM
+	ANAKIN_PBLOCK_TO_TYPE_ID(bm_device_mem_t, BM, anakin_block_float)
+#endif
+
+
 template<typename T>
 struct type_id {
     typedef T type;
diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 7b05d7157..298d08ab8 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -67,7 +67,7 @@ if(USE_CUDA)
 	# set select arch for cuda
 	add_subdirectory(${ANAKIN_SABER}/funcs/impl/cuda/base)
 
-	set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) 
+	set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
 	set(CMAKE_CXX_FLAGS "")
 	if(BUILD_SHARED)
     		CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
@@ -77,16 +77,42 @@ if(USE_CUDA)
 	endif()
     	set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
 
-	set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} 
-				      ${BEGIN_WHOLE_ARCHIVE} 
-				      ${ANAKIN_SABER_SASS_STATIC_LIB} 
-				      ${WHOLE_ARCHIVE_END})	
+	set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY}
+				      ${BEGIN_WHOLE_ARCHIVE}
+				      ${ANAKIN_SABER_SASS_STATIC_LIB}
+				      ${WHOLE_ARCHIVE_END})
 endif()
 
+if(USE_BM)
+    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/bm "cpp" ANAKIN_SABER_BASE_SRC)
+    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/bm "cpp" ANAKIN_SABER_BASE_SRC)
+
+    # set root
+    set(BM_BASE_CODE_ROOT ${ANAKIN_SABER}/funcs/impl/bm/base)
+    # set select arch for cuda
+    add_subdirectory(${ANAKIN_SABER}/funcs/impl/bm/base)
+
+    set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
+    set(CMAKE_CXX_FLAGS "")
+    if(BUILD_SHARED)
+        CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+    endif()
+    if(BUILD_STATIC)
+        CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+    endif()
+    set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
+
+    set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY}
+            ${BEGIN_WHOLE_ARCHIVE}
+            ${ANAKIN_SABER_BM_STATIC_LIB}
+            ${WHOLE_ARCHIVE_END})
+endif()
+
+
 # add saber library to static
 if(UNIX OR APPLE)
     if (USE_ARM_PLACE)
-        ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} STATIC ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC})
+        ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} STATIC ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BM_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC})
         set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY
                 ${ANAKIN_ROOT}/output/)
     else()
diff --git a/saber/core/common.h b/saber/core/common.h
index 6296a8e79..a2110533b 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -176,3 +176,17 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 
 #endif //ANAKIN_SABER_CORE_COMMON_H
 
+#ifdef USE_BM
+
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+
+#define BMDNN_CHECK(condition) \
+  do { \
+    bm_status_t error = condition; \
+    CHECK_EQ(error, BM_SUCCESS) << " Failed with error code:" << error; \
+  } while (0)
+
+#endif // USE_BM
+
diff --git a/saber/core/impl/bm/bm_device.cpp b/saber/core/impl/bm/bm_device.cpp
new file mode 100644
index 000000000..c89045dcf
--- /dev/null
+++ b/saber/core/impl/bm/bm_device.cpp
@@ -0,0 +1,24 @@
+#include "core/device.h"
+namespace anakin{
+
+namespace saber{
+
+template <>
+void Device<BM>::create_stream() {
+    // todo
+    LOG(WARNING) << "BM create_stream is not implemented";
+}
+
+template <>
+void Device<BM>::get_info() {
+    // todo
+    LOG(WARNING) << "BM get_info is not implemented";
+}
+
+template void Device<BM>::get_info();
+template void Device<BM>::create_stream();
+
+
+} //namespace saber
+
+} //namespace anakin
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
new file mode 100644
index 000000000..3ff30773a
--- /dev/null
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -0,0 +1,89 @@
+#include "core/tensor.h"
+#include "env.h"
+
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+
+#ifdef USE_BM
+const char* bmdnn_get_errorstring(bm_status_t error) {
+    switch (error) {
+        case BM_SUCCESS:
+            return "BM API call correct";
+        case BM_ERR_FAILURE:
+            return "BM API fail to return";
+        case BM_ERR_TIMEOUT:
+            return "BM API time out";
+        case BM_ERR_PARAM:
+            return "BM API invalid parameter";
+        case BM_ERR_NOMEM:
+            return "BM API insufficient memory";
+        case BM_ERR_DATA:
+            return "BM API invalid data";
+        case BM_ERR_BUSY:
+            return "BM device is busy";
+        case BM_NOT_SUPPORTED:
+            return "BM unsupported operate";
+    }
+    return "Unknown bmdnn status";
+}
+#endif
+
+namespace anakin{
+
+namespace saber{
+
+#ifdef USE_BM
+
+typedef TargetWrapper<BM, __device_target> BM_API;
+
+static bm_handle_t handle;
+
+void BM_API::get_device_count(int &count) {
+    BMDNN_CHECK(bm_dev_getcount(&count));
+}
+
+void BM_API::set_device(int id){
+    //(bm_handle_t &handle, bool bmkernel_used, int id){
+    BMDNN_CHECK(bm_dev_request(&handle, 0, id));
+}
+
+//TODO: Do we have this functionality?
+int BM_API::get_device_id(){
+    return 0;
+}
+        
+void BM_API::mem_alloc(void** ptr, size_t n){
+    //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
+    bm_device_mem_t mem = bm_mem_from_system(ptr);
+    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
+}
+        
+void BM_API::mem_free(void* ptr){
+    //(bm_handle_t handle, bm_device_mem_t mem){
+    if(ptr != nullptr){
+        bm_free_device(handle, bm_mem_from_system(ptr));
+    }
+}
+        
+void BM_API::mem_set(void* ptr, int value, size_t n){
+    //(bm_handle_t handle, const int value, bm_device_mem_t mem){
+    BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr)));
+}
+
+//! target wrapper
+template struct TargetWrapper<BM, __device_target>;
+
+//! BM Buffer
+template class Buffer<BM>;
+
+//! BM Tensor
+INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
+
+template struct Env<BM>;
+
+#endif //USE_BM
+
+} //namespace saber
+
+} //namespace anakin
diff --git a/saber/core/target_traits.h b/saber/core/target_traits.h
index 52d1cc64e..9a059313a 100644
--- a/saber/core/target_traits.h
+++ b/saber/core/target_traits.h
@@ -28,6 +28,7 @@ struct __cuda_device{};
 struct __arm_device{};
 struct __amd_device{};
 struct __x86_device{};
+struct __bm_device{};
 
 struct __HtoD{};
 struct __HtoH{};
@@ -70,6 +71,12 @@ struct TargetTypeTraits<AMD> {
   typedef __amd_device target_type;
 };
 
+template <>
+struct TargetTypeTraits<BM> {
+  typedef __device_target target_category;
+  typedef __bm_device target_type;
+};
+
 } //namespace saber
 
 } //namespace anakin
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 120805b3b..cedf7023c 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -436,7 +436,7 @@ struct TargetWrapper<AMD, __device_target> {
 
     static void sync_stream(event_t event, stream_t stream);
     static void sync_stream(stream_t stream);
-    
+
     static void sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
         const TPtr src, size_t src_offset, int src_id, \
         size_t count, __DtoD);
@@ -502,6 +502,64 @@ struct TargetWrapper<AMD, __device_target> {
 
 #endif //USE_AMD
 
+#ifdef USE_BM
+        /**
+ * \brief for Bitmain sophon device target only, device target is BM tpu
+ * use bitmain api to manage memory
+ * support device to device, device to host, host to device memcpy
+*/
+template <>
+struct TargetWrapper<BM, __device_target> {
+    typedef void* event_t;
+    typedef void* stream_t;
+
+    static void get_device_count(int& count);
+
+    static void set_device(int id);
+
+    //We should add strategy to avoid malloc directly
+    static void mem_alloc(void** ptr, size_t n);
+
+    //template <typename void>
+    static void mem_free(void * ptr);
+
+    //template <typename void>
+    static void mem_set(void* ptr, int value, size_t n);
+
+    // brief create event, empty function for bitmain target
+    static void create_event(event_t event, bool flag = false) {}
+    static void destroy_event(event_t event) {}
+    static void create_stream(stream_t stream) {}
+    static void create_stream_with_flag(stream_t stream, unsigned int flag) {}
+    static void create_stream_with_priority(stream_t stream, unsigned int flag, int priority) {}
+    static void destroy_stream(stream_t stream) {}
+    static void record_event(event_t event, stream_t stream) {}
+    static void query_event(event_t event) {}
+    static void sync_event(event_t event) {}
+    static void sync_stream(event_t event, stream_t stream) {}
+    // brief create event, empty function for bitmain target
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+        size_t count, __DtoD);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+        size_t count, __HtoD);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+        size_t count, __DtoH);
+
+    static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+        int src_dev, size_t count);
+
+    /**
+     * \brief device target return currently used device id
+     * @return          currently activated device id
+     */
+    static int get_device_id();
+};
+
+#endif //USE_BM
+
 } //namespace saber
 
 } //namespace anakin
diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 667c3cd98..e2e4a80d0 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -241,7 +241,7 @@ double tensor_mean_value_valid(Tensor<TargetType>& tensor, typename Tensor<Targe
     template double tensor_mean_value<target>(Tensor<target>& tensor, typename Tensor<target>::API::stream_t stream); \
     template double tensor_mean_value_valid<target>(Tensor<target>& tensor, typename Tensor<target>::API::stream_t stream);
 
-#if defined(BUILD_LITE) || defined(USE_X86_PLACE) || defined(USE_AMD) || defined(USE_CUDA)
+#if defined(BUILD_LITE) || defined(USE_X86_PLACE) || defined(USE_AMD) || defined(USE_CUDA) || defined(USE_BM)
 FILL_TENSOR_HOST(X86)
 #endif
 
@@ -253,6 +253,10 @@ FILL_TENSOR_HOST(NVHX86)
 FILL_TENSOR_HOST(ARM)
 #endif
 
+#ifdef USE_ARM_PLACE
+FILL_TENSOR_HOST(BM)
+#endif
+
 template void tensor_cmp_host<float>(const float* src1, const float* src2, \
                                      int size, double& max_ratio, double& max_diff);
 template void tensor_cmp_host<char>(const char* src1, const char* src2, int size, \
diff --git a/saber/funcs/CMakeLists.txt b/saber/funcs/CMakeLists.txt
index 74c1d4a9b..fcd1013aa 100644
--- a/saber/funcs/CMakeLists.txt
+++ b/saber/funcs/CMakeLists.txt
@@ -24,6 +24,10 @@ if(USE_CUDA)
     #FILE(GLOB CUDA_BASE_SRCS "cuda/*.cpp" "cuda/*.cu")
     aux_source_directory(impl/cuda CUDA_BASE_SRCS)
 endif()
+if(USE_BM)
+    #FILE(GLOB BM_BASE_SRCS "cuda/*.cpp" "cuda/*.cu")
+    aux_source_directory(impl/bm BM_BASE_SRCS)
+endif()
 if(USE_AMD)
     #FILE(GLOB CUDA_BASE_SRCS "cuda/*.cpp" "cuda/*.cu")
     aux_source_directory(impl/amd AMD_BASE_SRCS)
@@ -62,6 +66,14 @@ foreach(SRC_NAME ${CUDA_BASE_SRCS})
     list(APPEND DIR_SRCS_CUR "${CMAKE_CURRENT_SOURCE_DIR}/${FILE_NAME}")
 endforeach()
 
+foreach(SRC_NAME ${BM_BASE_SRCS})
+    #unpack the dir "/"
+    string(REPLACE "./" "" FILE_NAME ${SRC_NAME})
+    string(REPLACE " " "" FILE_NAME ${FILE_NAME})
+    #string(REPLACE ".cpp" ".cpp;" FILE_NAME ${FILE_NAME})
+    list(APPEND DIR_SRCS_CUR "${CMAKE_CURRENT_SOURCE_DIR}/${FILE_NAME}")
+endforeach()
+
 foreach(SRC_NAME ${X86_BASE_SRCS})
     #unpack the dir "/"
     string(REPLACE "./" "" FILE_NAME ${SRC_NAME})
diff --git a/saber/funcs/impl/bm/base/CMakeLists.txt b/saber/funcs/impl/bm/base/CMakeLists.txt
new file mode 100644
index 000000000..fd4b3d680
--- /dev/null
+++ b/saber/funcs/impl/bm/base/CMakeLists.txt
@@ -0,0 +1,20 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved
+# @file     CMakeLists files in the saber  subdirectory for nvidia gpu code
+# @auther   cuichaowen
+# @date     2017-11-29
+# ----------------------------------------------------------------------------
+
+if(USE_BM)
+    anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/include "h" ANAKIN_SABER_BM_C_SRC)
+    anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "o" ANAKIN_SABER_BM_STATIC_LIB)
+endif()
+
+macro(anakin_set_upscope src)
+    set(${src} ${${src}} PARENT_SCOPE)
+endmacro()
+
+if(USE_BM)
+    anakin_set_upscope(ANAKIN_SABER_BM_C_SRC)
+    anakin_set_upscope(ANAKIN_SABER_BM_STATIC_LIB)
+endif()
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
new file mode 100644
index 000000000..97feb1972
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
@@ -0,0 +1,814 @@
+#ifndef BMDNN_API_H
+#define BMDNN_API_H
+
+#include "bmdnn_runtime.h"
+#include "op_code.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * All the name-style of input/output are in the viewpoint of forward operation
+ */
+
+typedef struct kernel_param{
+    int g;
+    int oc;
+    int ic;
+    int h;
+    int w;
+}bm_kernel_param_t;
+
+typedef struct bm_conv_param{
+    int stride_h;
+    int stride_w;
+    int pad_h;
+    int pad_w;
+    int dilation_h;
+    int dilation_w;
+    bool result_add;
+}bm_conv_param_t;
+
+typedef struct bm_pool_param{
+  int stride_h;
+  int stride_w;
+  int pad_h;
+  int pad_w;
+  int kh;
+  int kw;
+  bool is_avg_pooling;
+}bm_pool_param_t;
+
+bm_status_t bmdnn_conv_relu_pool_forward(
+    bm_handle_t      handle,
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_pool_param_t     pool_param,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_conv_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_tensor_4d_t      output_shape,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_deconv_forward(
+    bm_handle_t      handle,
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_tensor_4d_t      output_shape,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_conv_backward_bias(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 groups,
+    int                 output_c,
+    int                 kh,
+    int                 kw,
+    int                 pad_h,
+    int                 pad_w,
+    int                 stride_h,
+    int                 stride_w,
+    int                 result_add,
+    //output
+    bm_device_mem_t  bias_diff);
+
+bm_status_t bmdnn_pooling_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 kh,
+    int                 kw,
+    int                 pad_h,
+    int                 pad_w,
+    int                 stride_h,
+    int                 stride_w,
+    int                 is_avg_pooling,
+    //output
+    bm_device_mem_t  output
+    );
+bm_status_t bmdnn_upsample_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 size,
+    //output
+    bm_device_mem_t  output
+    );
+bm_status_t bmdnn_roi_pooling_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  rois,
+    int              input_n,
+    int              input_c,
+    int              input_h,
+    int              input_w,
+    int              pooled_h,
+    int              pooled_w,
+    int              roi_num,
+    int              spatial_scale,
+    //output
+    bm_device_mem_t  output
+    );
+
+bm_status_t bmdnn_fc_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    int              batch_size,
+    int              num_output_neuron,
+    int              num_input_neuron,
+    int              transpose,
+    int              using_bias,
+    int              using_relu,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_fc_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    int              num_output_neuron,
+    int              batch_size,
+    int              num_input_neuron,
+    int              using_bias,
+    int              propagate_down_bias_diff,
+    int              propagate_down_weight_diff,
+    int              propagate_down_bottom,
+    //output
+    bm_device_mem_t  weight_diff,
+    bm_device_mem_t  bias_diff,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_dropout_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    float            dropout_ratio,
+    int              input_n,
+    int              input_dim,
+    //output
+    bm_device_mem_t  output,
+    bm_device_mem_t  mask);
+
+bm_status_t bmdnn_dropout_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    float               dropout_ratio,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_batchnorm_forward_inference(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  mean_ma,
+    bm_device_mem_t  variance_ma,
+    float               scale_ma,
+    bm_device_mem_t  variance,
+    float               eps,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_batchnorm_forward_train(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    float               ma_fraction,
+    float               eps,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output,
+    bm_device_mem_t  mean,
+    bm_device_mem_t  variance,
+    bm_device_mem_t  mean_ma,
+    bm_device_mem_t  variance_ma);
+
+bm_status_t bmdnn_batchnorm_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    bm_device_mem_t  variance,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 using_global_stats,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_lrn_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 lrn_n,
+    float               alpha,
+    float               beta,
+    float               k,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_lrn_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    bm_device_mem_t  input,
+    int                 lrn_n,
+    float               alpha,
+    float               beta,
+    float               k,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_relu_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    float               negative_slope,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_relu_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    float               negative_slope,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_sigmoid_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_sigmoid_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_tanh_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_tanh_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_softmax_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_softmax_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_softmax_loss_forward(
+    bm_handle_t      handle,
+    bm_device_mem_t  input,
+    bm_device_mem_t  label,
+    float               normalizer,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    bm_device_mem_t  output,
+    bm_device_mem_t  loss);
+bm_status_t bmdnn_interp_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 pad_bag,
+    int                 pad_end,
+    int                 output_h,
+    int                 output_w,
+    //output
+    bm_device_mem_t  output
+    );
+bm_status_t bmdnn_softmax_loss_backward(
+    bm_handle_t      handle,
+    bm_device_mem_t  output,
+    bm_device_mem_t  label,
+    bm_device_mem_t  loss,
+    float               normalizer,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_softmax_loss_bidirection(
+    bm_handle_t      handle,
+    bm_device_mem_t  input,
+    bm_device_mem_t  label,
+    float               normalizer,
+    int                 input_n,
+    int                 input_c,
+    int                 input_inner_dim,
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  loss);
+
+bm_status_t bmdnn_multiregion_forward_parallel(
+    bm_handle_t         handle,
+    //input
+    bm_device_mem_t*     input,
+    int*                 input_n,
+    int*                 input_c,
+    int*                 input_h,
+    int*                 input_w,
+    int                  input_num,
+    int                 classes,
+    int                 coords,
+    int                 nums,
+    int*                 Activate_parm,
+    //output
+    bm_device_mem_t*  output
+);
+
+bm_status_t bmdnn_accuracy(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  label_idx,
+    bm_device_mem_t  input_mem_buffer,
+    int                 input_num,
+    int                 input_dim,
+    int                 top_k,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_coeff_update_sgd(
+    bm_handle_t      handle,
+    bm_device_mem_t  weight_diff,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  history_weight,
+    int                 weight_count,
+    float               base_lr,
+    float               momentum,
+    float               weight_decay);
+
+bm_status_t bmdnn_fc_backward_sgd(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    //input and output
+    bm_device_mem_t  weight,
+    bm_device_mem_t  weight_history,
+    int                 num_output_neuron,
+    int                 batch_size,
+    int                 num_input_neuron,
+    int                 using_bias,
+    int                 propagate_down_bias_diff,
+    int                 propagate_down_weight_diff,
+    int                 propagate_down_bottom,
+    float               base_lr,
+    float               momentum,
+    float               weight_decay,
+    //output
+    bm_device_mem_t  bias_diff,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_permute(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_normalize_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  scale,
+    float               eps,
+    float               scale_val,
+    bool                across_spatial,
+    bool                channel_share,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output);
+
+/*
+ * MD Operations for user
+ */
+
+
+bm_status_t bmdnn_md_scalar(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    bm_device_mem_t  tensor_B,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    ALIGN_TENSOR_OP             align_tensor_op,
+    int                 result_add,
+    int                 A_is_constant,
+    int                 B_is_constant,
+    float               A_const_val,
+    float               B_const_val,
+    int                 B_N_is_1,
+    int                 B_index_is_1,
+    //output
+    bm_device_mem_t  tensor_R);
+
+bm_status_t bmdnn_md_cmp(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    bm_device_mem_t  tensor_B,
+    bm_device_mem_t  tensor_C,
+    bm_device_mem_t  tensor_D,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 A_is_constant,
+    int                 B_is_constant,
+    int                 C_is_constant,
+    int                 D_is_constant,
+    float               A_constant,
+    float               B_constant,
+    unsigned int        C_constant,
+    unsigned int        D_constant,
+    int                 result_skip,
+    //output
+    bm_device_mem_t  tensor_Y,
+    bm_device_mem_t  tensor_R);
+
+bm_status_t bmdnn_md_sfu(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    SFU_OP              sfu_op,
+    float               a,
+    int                 n,
+    //output
+    bm_device_mem_t  tensor_Y);
+
+bm_status_t bmdnn_md_sum(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 result_add,
+    //output
+    bm_device_mem_t  tensor_Y);
+
+
+bm_status_t bmdnn_md_linear(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    bm_device_mem_t  tensor_B,
+    bm_device_mem_t  tensor_S,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    LINEAR_OP           linear_op,
+    int                 result_add,
+    int                 B_is_const,
+    int                 S_is_const,
+    float               B_const_val,
+    float               S_const_val,
+    //output
+    bm_device_mem_t  tensor_Y);
+
+bm_status_t bmdnn_img_sum(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  tensor_A,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 result_add,
+    //output
+    bm_device_mem_t  tensor_Y);
+
+/*
+ * fullnet mode
+ */
+bm_status_t bmdnn_fullnet(
+        bm_handle_t handle,
+        unsigned long long bdc_cmd_offset,
+        unsigned long long gdma_cmd_offset,
+        unsigned long long cdma_cmd_offset,
+        unsigned long long cmd_num_offset
+        );
+
+/*
+ * multiple fullnet mode
+ */
+bm_status_t bmdnn_multi_fullnet(
+        bm_handle_t handle,
+        int input_num,
+        unsigned long long* user_input_global_offset,
+        unsigned long long* cmd_input_global_offset,
+        int* input_tensor_size,
+        int output_num,
+        unsigned long long* user_output_global_offset,
+        unsigned long long* cmd_output_global_offset,
+        int* output_tensor_size,
+        unsigned long long bdc_cmd_offset,
+        unsigned long long gdma_cmd_offset,
+        unsigned long long cdma_cmd_offset,
+        int* bdc_cmd_num,
+        int* gdma_cmd_num,
+        int* cdma_cmd_num,
+        int cmdgroup_num
+        );
+
+/*
+ * dynamic fullnet mode
+ */
+bm_status_t bmdnn_dynamic_fullnet(
+        bm_handle_t handle,
+        unsigned long long compiled_ir_global_addr,
+        unsigned int compiled_ir_length,
+        unsigned int batch_num,
+        unsigned int input_num,
+        unsigned long long* input_global_offset,
+        unsigned int* input_height,
+        unsigned int* input_width,
+        unsigned int output_num,
+        unsigned long long* output_global_offset,
+        unsigned long long apd_ctx_mem_offset
+#if defined(USING_CMODEL) && !defined(USING_FULLNET)
+        ,float**    p_refer_result
+#endif
+        );
+
+/**
+  * Depthwise convolution.
+  */
+bm_status_t bmdnn_depthwise_forward(
+        bm_handle_t         handle,
+        bm_device_mem_t     input,
+        bm_device_mem_t     weight,
+        bm_device_mem_t     bias,
+        int                 input_n,
+        int                 input_c,
+        int                 input_h,
+        int                 input_w,
+        int                 kernel_h,
+        int                 kernel_w,
+        int                 dilation_h,
+        int                 dilation_w,
+        int                 pad_h,
+        int                 pad_w,
+        int                 stride_h,
+        int                 stride_w,
+        int                 using_bias,
+        bm_device_mem_t     output);
+
+bm_status_t bmdnn_lstm_forward(
+        bm_handle_t      handle,
+        //input
+        bm_device_mem_t  input,
+        bm_device_mem_t  cont,
+        bm_device_mem_t  input_static,
+        /*bm_device_mem_t  w_hc,
+        bm_device_mem_t  w_xc,*/
+        bm_device_mem_t  w_hxc,
+        bm_device_mem_t  w_xstatic,
+        bm_device_mem_t  b_c,
+        bm_device_mem_t  h_0,
+        bm_device_mem_t  c_0,
+        int                 input_n,
+        int                 seq_len,
+        int                 input_dim,
+        int                 input_static_dim,
+        int                 output_dim,
+        int                 with_input_static,
+        //output
+        bm_device_mem_t  c,
+        bm_device_mem_t  gate,
+        bm_device_mem_t  h_T,
+        bm_device_mem_t  c_T,
+        bm_device_mem_t  h);
+
+bm_status_t bmdnn_netease_ocr_forward(
+        bm_handle_t      handle,
+        //input
+        bm_device_mem_t  conv1_ifmap,
+        bm_device_mem_t  params,
+        bm_device_mem_t  result);
+
+typedef struct dim4_s {
+    int n, c, h, w;
+} dim4_t;
+enum
+{
+    CONV_DEPTHWISE,
+    CONV_3D
+};
+typedef struct mobilenet_conv_param_s
+{
+    /** convolution. */
+    int type;
+    bm_device_mem_t kernel;
+    bm_device_mem_t bias;
+    dim4_t          kernel_shape;
+    int             dilation_h, dilation_w;
+    int             pad_h, pad_w;
+    int             stride_h, stride_w;
+    bool            using_bias;
+    /** batchnorm. */
+    bm_device_mem_t mean;
+    bm_device_mem_t variance;
+    /** relu. */
+    float           slope;
+} mobilenet_conv_param_t;
+bm_status_t bmdnn_mobilenet_forward(
+        bm_handle_t handle,
+        const mobilenet_conv_param_t   *conv,
+        int                             num,
+        const dim4_t                   &input_shape,
+        const bm_device_mem_t          &input_global_mem,
+        dim4_t                         &output_shape,
+        bm_device_mem_t                &output_global_mem,
+        float                           parallel_performance_factor = 1.f);
+
+bm_status_t bmdnn_conv_forward_bank_conflict(
+    bm_handle_t         handle,
+    //input
+    bm_device_mem_t     input,
+    bm_device_mem_t     weight,
+    bm_device_mem_t     bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_tensor_4d_t      output_shape,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    //output
+    bm_device_mem_t     output);
+
+bm_status_t bmdnn_pooling_forward_bank_conflict(
+    bm_handle_t         handle,
+    //input
+    bm_device_mem_t     input,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 kh,
+    int                 kw,
+    int                 pad_h,
+    int                 pad_w,
+    int                 stride_h,
+    int                 stride_w,
+    int                 is_avg_pooling,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_fc_forward_bank_conflict(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  weight,
+    bm_device_mem_t  bias,
+    int              batch_size,
+    int              num_output_neuron,
+    int              num_input_neuron,
+    int              transpose,
+    int              using_bias,
+    int              using_relu,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_conv_forward_power_evaluation(
+    bm_handle_t         handle,
+    //input
+    bm_device_mem_t     input,
+    bm_device_mem_t     weight,
+    bm_device_mem_t     bias,
+    bm_tensor_4d_t      input_shape,
+    bm_kernel_param_t   kernel_param,
+    bm_tensor_4d_t      output_shape,
+    bm_conv_param_t     conv_param,
+    bool                with_bias,
+    //output
+    bm_device_mem_t     output);
+
+bm_status_t bmdnn_img_scale(
+        bm_handle_t handle, bm_device_mem_t dst, bm_device_mem_t src, int n,
+        int c, int dh, int sh, int dw, int sw);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BMDNN_API_H */
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
new file mode 100644
index 000000000..384cd4108
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
@@ -0,0 +1,438 @@
+#ifndef BMDNN_EXT_API_H
+#define BMDNN_EXT_API_H
+
+#include "bmdnn_runtime.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+bm_status_t bmdnn_threshold_forward(
+    bm_handle_t      handle,
+    float               threshold,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output
+    );
+
+bm_status_t bmdnn_exp_forward(
+    bm_handle_t      handle,
+    float               base,
+    float               input_scale,
+    float               input_shift,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output
+    );
+
+bm_status_t bmdnn_exp_backward(
+    bm_handle_t      handle,
+    float               base,
+    float               input_scale,
+    float               input_shift,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff
+    );
+
+bm_status_t bmdnn_power_forward(
+    bm_handle_t      handle,
+    float               power_,
+    float               scale_,
+    float               shift_,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output
+    );
+
+bm_status_t bmdnn_power_backward(
+    bm_handle_t      handle,
+    float               power_,
+    float               scale_,
+    float               shift_,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff
+    );
+
+bm_status_t bmdnn_euclidean_loss_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  label,
+    bm_device_mem_t  temp_,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  diff,
+    bm_device_mem_t  loss);
+
+bm_status_t bmdnn_euclidean_loss_backward(
+    bm_handle_t      handle,
+    float               alpha,
+    //input
+    bm_device_mem_t  output,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_silence_backward(
+    bm_handle_t      handle,
+    //input
+    //bm_device_mem_t  output_data,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_lstm_unit_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  X_i,
+    bm_device_mem_t  X_f,
+    bm_device_mem_t  X_o,
+    bm_device_mem_t  X_g,
+    bm_device_mem_t  C_prev,
+    bm_device_mem_t  cont_expand,
+    int                 num,
+    int                 hidden_dim,
+    //output
+    bm_device_mem_t  C,
+    bm_device_mem_t  H);
+
+bm_status_t bmdnn_lstm_unit_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  C_diff,
+    bm_device_mem_t  H_diff,
+    bm_device_mem_t  X_i,
+    bm_device_mem_t  X_f,
+    bm_device_mem_t  X_o,
+    bm_device_mem_t  X_g,
+    bm_device_mem_t  C_prev,
+    bm_device_mem_t  C,
+    bm_device_mem_t  cont_expand,
+    int                 num,
+    int                 hidden_dim,
+    //output
+    bm_device_mem_t  C_prev_diff,
+    bm_device_mem_t  X_i_diff,
+    bm_device_mem_t  X_f_diff,
+    bm_device_mem_t  X_o_diff,
+    bm_device_mem_t  X_g_diff);
+
+bm_status_t bmdnn_eltwise_forward(
+    bm_handle_t      handle,
+    int                 op_,
+    int                 flag_first,
+    float               coeffs_,
+    int                 index,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  target,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  mask_data,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_eltwise_backward(
+    bm_handle_t      handle,
+    int                 op_,
+    int                 flag_first,
+    float               coeffs_,
+    int                 index,
+    //input
+    bm_device_mem_t  output_data,
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input_data,
+    bm_device_mem_t  mask_data,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_bias_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  bias,
+    int                 outer_dim,
+    int                 dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_bias_backward(
+    bm_handle_t      handle,
+    int                 flag,
+    //input
+    bm_device_mem_t  output_diff,
+    int                 outer_dim,
+    int                 bias_dim,
+    int                 inner_dim,
+    //output
+    bm_device_mem_t  input_diff,
+    bm_device_mem_t  bias_diff);
+
+bm_status_t bmdnn_log_forward(
+    bm_handle_t      handle,
+    float               scale,
+    float               shift,
+    float               base,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_log_backward(
+    bm_handle_t      handle,
+    float               scale,
+    float               shift,
+    float               base,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_absval_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_absval_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_sigmoid_cross_entropy_loss_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  target,
+    bm_device_mem_t  buffer,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output,
+    bm_device_mem_t  loss);
+
+bm_status_t bmdnn_sigmoid_cross_entropy_loss_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output,
+    bm_device_mem_t  target,
+    bm_device_mem_t  output_diff,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_contrastive_loss_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input_0,
+    bm_device_mem_t  input_1,
+    bm_device_mem_t  label,
+    bm_device_mem_t  buffer,
+    int                 input_n,
+    int                 input_c,
+    float               margin,
+    bool                legacy_version,
+    //output
+    bm_device_mem_t  diff,
+    bm_device_mem_t  dist_sq,
+    bm_device_mem_t  loss);
+
+bm_status_t bmdnn_contrastive_loss_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  label,
+    bm_device_mem_t  diff,
+    bm_device_mem_t  dist_sq,
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  buffer,
+    int                 input_n,
+    int                 input_dim,
+    float               margin,
+    bool                legacy_version,
+    int                 propagate_down_flag,
+    //output
+    bm_device_mem_t  input_diff_0,
+    bm_device_mem_t  input_diff_1);
+
+bm_status_t bmdnn_filter_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  filter,
+    int                 input_n,
+    int                 output_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_filter_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  filter,
+    int                 input_n,
+    int                 output_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_split_backward(
+    bm_handle_t      handle,
+    //input
+    int                 is_first,
+    bm_device_mem_t  output_diff,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_bnll_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_bnll_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    float               threshold,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_prelu_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  slope,
+    float            slope0,
+    int                 channel_shared,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_prelu_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input,
+    bm_device_mem_t  slope,
+    int                 propagate_down_flag,
+    int                 channel_shared,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    //output
+    bm_device_mem_t  slope_diff,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_scale_forward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  input,
+    bm_device_mem_t  scale,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 scale_dim,
+    int                 inner_dim,
+    int                 scale_is_neuron,
+    //output
+    bm_device_mem_t  scale_extension,
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_scale_backward(
+    bm_handle_t      handle,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  input_data,
+    bm_device_mem_t  scale_extension,
+    int                 propagate_down_flag,
+    int                 input_n,
+    int                 input_c,
+    int                 input_h,
+    int                 input_w,
+    int                 scale_dim,
+    int                 inner_dim,
+    int                 scale_is_neuron,
+    //output
+    bm_device_mem_t  scale_diff,
+    bm_device_mem_t  input_diff);
+
+bm_status_t bmdnn_elu_forward(
+    bm_handle_t      handle,
+    float               alpha,
+    //input
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  output);
+
+bm_status_t bmdnn_elu_backward(
+    bm_handle_t      handle,
+    float               alpha,
+    //input
+    bm_device_mem_t  output_diff,
+    bm_device_mem_t  output,
+    bm_device_mem_t  input,
+    int                 input_n,
+    int                 input_dim,
+    //output
+    bm_device_mem_t  input_diff);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BMDNN_EXT_API_H */
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
new file mode 100644
index 000000000..6fede1338
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
@@ -0,0 +1,20 @@
+#ifndef BMDNN_RUNTIME_H_
+#define BMDNN_RUNTIME_H_
+
+#include "bmlib_runtime.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+bm_status_t bmdnn_init(
+    bm_handle_t     *handle);
+
+void bmdnn_deinit(
+    bm_handle_t      handle);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
new file mode 100644
index 000000000..f85846a8a
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
@@ -0,0 +1,62 @@
+#ifndef OP_CODE_H_
+#define OP_CODE_H_
+
+
+typedef enum align_tensor_op {
+    ALIGN_TENSOR_ADD,
+    ALIGN_TENSOR_SUB,
+    ALIGN_TENSOR_MUL,
+    ALIGN_TENSOR_DIV,
+    TENSOR_INVALID
+} ALIGN_TENSOR_OP;
+
+typedef enum linear_op {
+    LINEAR_MAC,
+    LINEAR_ADD_SQR,
+    LINEAR_SUB_SQR
+} LINEAR_OP;
+
+typedef enum sfu_op {
+    SFU_XN,
+    SFU_EX,
+    SFU_LNX,
+    SFU_RSQ,
+    SFU_INVALID
+} SFU_OP;
+typedef struct tensor_4d_t {
+    int n;
+    int c;
+    int h;
+    int w;
+}bm_tensor_4d_t;
+
+
+#define TENSOR_ADD 0
+#define TENSOR_SUB 1
+#define TENSOR_MUL 2
+//Note the div should be implmented by KAMAKE algorithm
+#define TENSOR_DIV 3
+#define TENSOR_MAX 4
+#define TENSOR_CPY 5
+#define TENSOR_MAC 6
+
+#define TENSOR_N_DIM 0
+#define TENSOR_C_DIM 1
+#define TENSOR_H_DIM 2
+#define TENSOR_W_DIM 3
+
+#define SHARE_REG_MESSAGE_WP            0
+#define SHARE_REG_MESSAGE_RP            1
+#define SHARE_REG_MESSAGE_IRQSTATUS     2
+#define SHARE_REG_CDMA_IRQSTATUS    3 
+
+#define SHAREMEM_MSG_FIXED_OFFSET  (8192)
+#define SHAREMEM_SIZE_BIT  8
+#define SHAREMEM_MASK      ((1<<SHAREMEM_SIZE_BIT) - 1)
+#define SHARE_REG_CNT      16
+
+#define IRQ_STATUS_CDMA_INT             0x1111
+#define IRQ_STATUS_MSG_DONE_INT         0x2222
+
+ 
+#endif /* OP_CODE_H_ */
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
new file mode 100644
index 000000000..932b17138
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
@@ -0,0 +1,229 @@
+#ifndef BMLIB_RUNTIME_H_
+#define BMLIB_RUNTIME_H_
+#include <stdbool.h>
+#include <stddef.h>
+
+#if !defined(__x86_64__) && !defined(__aarch64__)
+#error "BM needs 64-bit to compile"
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+typedef enum {
+  BM_SUCCESS                 = 0,
+  BM_ERR_DEVNOTREADY          = 1,   /* Device not ready yet */
+  BM_ERR_FAILURE             = 2,   /* General failure */
+  BM_ERR_TIMEOUT             = 3,   /* Timeout */
+  BM_ERR_PARAM               = 4,   /* Parameters invalid */
+  BM_ERR_NOMEM               = 5,   /* Not enough memory */
+  BM_ERR_DATA                = 6,   /* Data error */
+  BM_ERR_BUSY                = 7,   /* Busy */
+  BM_ERR_NOFEATURE           = 8,    /* Not supported yet */
+  BM_NOT_SUPPORTED           = 9
+} bm_status_t;
+
+typedef enum {
+  BM_MEM_TYPE_DEVICE  = 0,
+  BM_MEM_TYPE_HOST    = 1,
+  BM_MEM_TYPE_SYSTEM  = 2,
+  BM_MEM_TYPE_INT8_DEVICE  = 3,
+  BM_MEM_TYPE_INVALID = 4
+} bm_mem_type_t;
+
+#define BM_MEM_ADDR_NULL     (0xfffffffff)
+
+typedef struct bm_mem_desc {
+  unsigned char                 desc[16];
+} bm_mem_desc_t;
+
+struct bm_context;
+typedef struct bm_context *  bm_handle_t;
+typedef struct bm_mem_desc   bm_device_mem_t;
+typedef struct bm_mem_desc   bm_host_mem_t;
+typedef struct bm_mem_desc   bm_system_mem_t;
+
+#define BM_CHECK_RET(call)                         \
+    do {                                        \
+      bm_status_t ret = call;                \
+	  if ( ret != BM_SUCCESS ) {             \
+        printf("BM_CHECK_RET failed %d\n", ret);   \
+        ASSERT(0);                              \
+        exit(-ret);                             \
+      }                                         \
+    } while(0)
+
+/*
+ * control 
+ */
+void bm_flush(
+    bm_handle_t      handle);
+/*
+ * brief malloc host memory according to a tensor shape(each neuron is 32 bits)
+*/
+
+bm_status_t bm_malloc_neuron_device(
+    bm_handle_t      handle,
+    bm_device_mem_t *pmem,
+    int              n,
+    int              c,
+    int              h,
+    int              w);
+
+/*
+ * brief malloc host memory in size of dword(32 bits)
+*/
+
+bm_status_t bm_malloc_device_dword(
+    bm_handle_t      handle,
+    bm_device_mem_t *pmem,
+    int              count);
+
+/*
+ * brief malloc host memory in size of byte
+*/
+
+bm_status_t bm_malloc_device_byte(
+    bm_handle_t      handle,
+    bm_device_mem_t *pmem,
+    unsigned int     size);
+
+void bm_free_device(
+    bm_handle_t      handle,
+    bm_device_mem_t  mem);
+
+/*
+ * brief malloc host memory in size of byte
+ */
+bm_status_t bm_malloc_host(
+    bm_handle_t      handle,
+    bm_host_mem_t   *pmem,
+    unsigned int     size);
+
+void bm_free_host(
+    bm_handle_t      handle,
+    bm_host_mem_t    mem);
+
+void *bm_host_mem_get_pointer(
+    bm_host_mem_t    mem);
+
+/*
+ * Memory copy and set
+ */
+bm_status_t bm_memcpy_h2d(
+    bm_handle_t      handle,
+    bm_device_mem_t  dst,
+    bm_host_mem_t    src);
+
+bm_status_t bm_memcpy_d2h(
+    bm_handle_t      handle,
+    bm_host_mem_t    dst,
+    bm_device_mem_t  src);
+
+
+bm_status_t bm_memcpy_s2d(
+    bm_handle_t      handle,
+    bm_device_mem_t  dst,
+    bm_system_mem_t  src);
+
+bm_status_t bm_memcpy_d2s(
+    bm_handle_t      handle,
+    bm_system_mem_t  dst,
+    bm_device_mem_t  src);
+
+bm_status_t bm_memcpy_d2d(
+    bm_handle_t     handle,
+    bm_device_mem_t dst,
+    int             dst_offset,
+    bm_device_mem_t src,
+    int             src_offset,
+    int             len);
+
+bm_status_t bm_memset_device(
+    bm_handle_t      handle,
+    const int        value,
+    bm_device_mem_t  mem);
+
+bm_device_mem_t bm_mem_from_system(
+    void *              system_addr);
+
+/*
+*brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to
+device mem if need_copy is true
+*/
+
+bm_status_t bm_mem_convert_system_to_device_neuron(
+    bm_handle_t          handle,
+    struct bm_mem_desc  *dev_mem,
+    struct bm_mem_desc   sys_mem,
+    bool                 need_copy,
+    int                  n,
+    int                  c,
+    int                  h,
+    int                  w);
+
+/*
+*brief malloc one device memory with the size of coeff_count, copy the sys_mem to
+device mem if need_copy is true
+*/
+bm_status_t bm_mem_convert_system_to_device_coeff(
+    bm_handle_t          handle,
+    struct bm_mem_desc  *dev_mem,
+    struct bm_mem_desc   sys_mem,
+    bool                 need_copy,
+    int                  coeff_count);
+
+/*
+ * memory info get and set
+ */
+unsigned long long bm_mem_get_device_addr(struct bm_mem_desc mem);
+void               bm_mem_set_device_addr(struct bm_mem_desc & mem, unsigned long long addr);
+unsigned int       bm_mem_get_device_size(struct bm_mem_desc mem);
+void               bm_mem_set_device_size(struct bm_mem_desc & mem, unsigned int size);
+bm_mem_type_t      bm_mem_get_type(struct bm_mem_desc mem);
+
+/* 
+* brief Get the handle of bmlib_runtime
+* return : If the handle has been inited, return the handle it self , else init one and return it
+*/
+bm_handle_t get_bm_handle();
+
+/*
+ * Helper functions
+ */
+
+/**
+* \brief Get the number of nodechip (Constant 1 in bm1682)
+* \return
+* \ref NO
+*/
+int bm_get_nodechip_num(
+    bm_handle_t      handle);
+
+/**
+* \brief Get the number of nodechip (Constant 64 in bm1682)
+* \return
+* \ref NO
+*/
+int bm_get_npu_num(
+    bm_handle_t      handle);
+int bm_get_eu_num( bm_handle_t handle);
+/**
+* \brief Get the number of nodechip (Constant 64 in bm1682)
+* \return
+* \ref NO
+*/
+bm_device_mem_t bm_mem_null(void);
+#define BM_MEM_NULL  (bm_mem_null())
+
+bm_status_t bm_dev_getcount(int* count);
+bm_status_t bm_dev_query(int devid);
+bm_status_t bm_dev_request(bm_handle_t *handle, bool bmkernel_used, int devid);
+void bm_dev_free(bm_handle_t handle);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BM_RUNTIME_H_ */
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
new file mode 100644
index 000000000..e878343ef
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
@@ -0,0 +1,72 @@
+#ifndef BMLIB_UTILS_H
+#define BMLIB_UTILS_H
+#include <stdlib.h>
+
+/*
+ * Debug definitions for user app only
+ * Copy from common.h
+ * Don't include for internal usage
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define UNUSED(x)               (void)(x)
+
+#define __ALIGN_MASK(x,mask)    (((x)+(mask))&~(mask))
+#define ALIGN(x,a)              __ALIGN_MASK(x,(__typeof__(x))(a)-1)
+
+int array_cmp(
+    float *p_exp,
+    float *p_got,
+    int len,
+    const char *info_label,
+    float delta);
+
+int tri_array_cmp(
+    float *p_exp,
+    float *p_got,
+    float *third_party,
+    int len,
+    const char *info_label,
+    float delta,
+    int* err_idx);
+
+int array_cmp_int(
+    int *p_exp,
+    int *p_got,
+    int len,
+    const char *info_label
+);
+
+void dump_hex(char *desc, void *addr, int len);
+void dump_data_float(char *desc, void *addr, int n, int c, int h, int w);
+void dump_data_int(char *desc, void *addr, int n, int c, int h, int w);
+void dump_matrix_float(char *desc, void *addr, int row, int col);
+void dump_array_file(char * file, int row_num, int col_num, int transpose, float * parr);
+
+/* dump to file */
+void dump_float_tensor(const char * filename,
+    int length, float * dump_data);
+
+#ifdef __cplusplus
+/* not available in C */
+void random_param(
+    int &n, int &c, int &h, int &w,
+    int &kh, int &kw, int &ph, int &pw, int &sh, int &sw,
+    int &oc);
+
+void random_conv_param(
+    int &n, int &ic, int &ih, int &iw, int &oc,
+    int &kh, int &kw, int &dh, int &dw,
+    int &ph, int &pw, int &sh, int &sw);
+#endif
+
+int conv_coeff_storage_convert(float * coeff_orig, float ** coeff_reformat, unsigned int oc, unsigned int ic, unsigned int kh, unsigned int kw, unsigned int npu_num);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BMLIB_UTILS_H */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h b/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
new file mode 100644
index 000000000..f3e086f91
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
@@ -0,0 +1,97 @@
+#ifndef __BM_BLOB_H__
+#define __BM_BLOB_H__
+
+struct bm_mem_desc;
+typedef struct bm_mem_desc bm_device_mem_t;
+namespace bmcnn {
+
+typedef struct { int n, c, h, w; } Shape;
+
+class BMBlob
+{
+public:
+    /**
+     * \brief Constructor of blob.
+     *
+     * \param shape - Shape of blob
+     */
+    explicit BMBlob(const Shape &shape, void *handle);
+    /**
+     * \brief Deconstructor of blob.
+     */
+    virtual ~BMBlob();
+    /**
+     * \brief Reshape blob.
+     * 
+     * \param n - Batch number of blob
+     * \param c - Channel number of blob
+     * \param h - Height of blob section
+     * \param w - Width of blob section
+     *
+     * \note
+     * (1) For now, number of channels is not allowed to be reshaped.\n
+     * (2) After reshaping, data in this blob will be set vanished.\n
+     */
+    void Reshape(int n, int c, int h, int w);
+    /**
+     * \brief Get shape.
+     */
+    inline Shape shape() const
+    { return shape_; }
+    /**
+     * \brief Get batch size.
+     */
+    inline int batch_num() const
+    { return shape_.n; }
+    /**
+     * \brief Get feature
+     *
+     * \return Channel number of the blob\n
+     */
+    inline int channels() const
+    { return shape_.c; }
+    /**
+     * \brief Get height of section
+     */
+    int height() const
+    { return shape_.h; }
+    /**
+     * \brief Get width of section.
+     */
+    int width() const
+    { return shape_.w; }
+    /**
+     * \brief Get read-only pointer to data in cpu.
+     */
+    const float *cpu_data(); 
+    /**
+     * \brief Get mutable pointer of data in cpu.
+     */    
+    float *mutable_cpu_data();
+    /**
+     * \brief Get mutable pointer of memory in device.
+     */    
+    bm_device_mem_t *mutable_dev_mem();
+    /**
+     * \brief Get read-only pointer of memory in device.
+     */    
+    const bm_device_mem_t *dev_mem();
+private:
+    BMBlob(const BMBlob &other);
+    BMBlob &operator=(const BMBlob &other);
+    
+    bm_device_mem_t *dev_mem_;
+    float *sys_data_;
+    Shape shape_;
+    int data_pos_;
+    int capacity_;
+    void *handle_;
+    
+    enum { AIR = 0x00, SYS = 0x01, DEV = 0x10 };
+    void sync_s2d();
+    void sync_d2s();
+};
+
+} /* namespace bmcnn */
+
+#endif /* __BM_BLOB_H__ */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h b/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
new file mode 100644
index 000000000..6b0bfe857
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
@@ -0,0 +1,58 @@
+#ifndef __BM_CNN_CONTEXT_H__
+#define __BM_CNN_CONTEXT_H__
+
+#include <string>
+#include "bmruntime.h"
+
+namespace bmcnn {
+
+typedef void *bmcnn_ctx_t;
+/**
+ * \brief Create context of BMCNN.
+ *
+ * \param ctx_dir - Directory of context files generated by BMNETC
+ *
+ * \note
+ * The context will be created in the device of ID 0.\n
+ *  
+ * \return
+ * NULL - Creating failed.\n
+ * non-NULL - The handle of the context (creating succeeded).\n
+ */
+bmcnn_ctx_t bmcnn_ctx_create(const std::string &ctx_dir);
+/**
+ * \brief Destroy context of BMCNN
+ * 
+ * \param handle - Handle of the context to be destroyed
+ */
+void bmcnn_ctx_destroy(bmcnn_ctx_t handle);
+/**
+ * \brief Create context of BMCNN in specific devide.
+ * 
+ * \param ctx_dir - Directory of context files generated by BMNETC
+ * \param devid - ID of device where the context will be placed.
+ *
+ * \note
+ * Call \ref bm_dev_getcount to get total number of devices, e.g. N is returned,
+ * valid devid should be in range of 0 ~ (N-1).\n
+ *
+ * \return
+ * NULL - Creating failed that might be caused by incorrect parameter.\n
+ * non-NULL - The handle of the context (creating succeeded).\n
+ */
+bmcnn_ctx_t bmcnn_ctx_create_by_devid(const std::string &ctx_dir, int devid);
+/**
+ * \brief Append context of BMCNN.
+ *
+ * \param ctx_dir - Directory of context files generated by BMNETC or BMNETD.
+ * \param bmrt    - The created handle of context.
+ *  
+ * \return
+ * false - Appending failed.\n
+ * true  - Appending succeeded.\n
+ */
+bool bmcnn_ctx_append(const std::string &ctx_dir, bmruntime *bmrt);
+
+} /* namespace bmcnn */
+
+#endif /* __BM_CNN_CONTEXT_H__ */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h b/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
new file mode 100644
index 000000000..88005e1b8
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
@@ -0,0 +1,78 @@
+#ifndef __BM_NET_H__
+#define __BM_NET_H__
+
+#include "bmblob.h"
+#include "bmcnnctx.h"
+#include <vector>
+#include <map>
+#include <string>
+
+#ifdef CROSS_COMPILE
+  #include <memory>
+#else
+  #include <boost/shared_ptr.hpp>
+#endif
+
+
+#ifdef CROSS_COMPILE
+#define NAMESPACE_USED  std
+#else
+#define NAMESPACE_USED  boost
+#endif
+
+namespace bmcnn {
+    
+class BMNet
+{
+public:
+    /**
+     * \brief Constructor of net.
+     *
+     * \param handle - Handler of BMCNN context (created by \ref bmcnn_ctx_create)
+     * \param name - Name of net
+     */
+    explicit BMNet(bmcnn_ctx_t handle, const std::string &name);
+    /**
+     * \brief Deconstructor of blob.
+     */
+    virtual ~BMNet();
+    /**
+     * \brief Reshape all layers from bottom to top.
+     */
+    void Reshape();
+    /**
+     * \brief Run forward.
+     * 
+     * \param sync - Flag of synchronizing.
+     */
+    void Forward(bool sync = false);
+    /**
+     * \brief Get blob by name.
+     *
+     * \param name - Name of blob 
+     * \note
+     * (1) The name could only be of blob in input or output.\n
+     * (2) If the name is not spotted, null pointer will be returned.\n
+     */
+    const NAMESPACE_USED::shared_ptr<BMBlob> blob_by_name(const std::string &name) const;
+    /**
+     * \brief Get maximum shape allowed.
+     */
+    inline const Shape &max_shape() const
+    { return max_shape_; }
+private:
+    BMNet(const BMNet &other);
+    BMNet &operator=(const BMNet &other);
+
+    bmcnn_ctx_t bmcc_ctx_;
+    std::vector<NAMESPACE_USED::shared_ptr<BMBlob> > blobs_;
+    std::vector<BMBlob *> net_input_blobs_;
+    std::vector<BMBlob *> net_output_blobs_;
+    std::string name_;
+    std::map<std::string, size_t> blob_name_index_;
+    Shape max_shape_;
+};
+
+} /* namespace bmcnn */
+
+#endif /* __BM_NET_H__ */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
new file mode 100644
index 000000000..daa101fce
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
@@ -0,0 +1,154 @@
+#ifndef  BMRUNTIME_H_
+#define  BMRUNTIME_H_
+#include <algorithm>
+#include <vector>
+#include "bmlib_runtime.h"
+#include "bmruntime_common.h"
+#include "stdio.h"
+#include <string>
+#include <map>
+#include <set>
+#include <iostream>
+
+using std::vector;
+using std::map;
+using std::set;
+using std::string;
+using std::pair;
+using std::make_pair;
+using std::cout;
+using std::endl;
+typedef unsigned int            u32;
+typedef unsigned long long      u64;
+
+typedef struct stage_param_with_idx{
+  int height_high;
+  int height_low;
+  int width_high;
+  int width_low;
+  int stage_index;
+}stage_param_with_idx_t;
+
+class bmruntime {
+  public:
+    bmruntime(bm_handle_t bm_handle);
+    ~bmruntime();
+
+    bool load_context(const string& ctx_dir);
+
+    const set<string>& get_input_tensor(int net_idx) const;
+    const set<string>& get_input_tensor(const string& net_name);
+
+    const set<string>& get_output_tensor(int net_idx) const;
+    const set<string>& get_output_tensor(const string& net_name);
+
+    const bm_device_mem_t* get_input_blob(const string& tensor_name, int net_idx);
+    const bm_device_mem_t* get_input_blob(const string& tensor_name, const string& net_name);
+
+    const bm_device_mem_t* get_output_blob(const string& tensor_name, int net_idx);
+    const bm_device_mem_t* get_output_blob(const string& tensor_name, const string& net_name);
+
+    bool launch(int net_idx);
+    bool launch(const string& net_name);
+
+    bool launch(int net_idx, const bm_device_mem_t* input_tensors, int input_num,
+            const bm_device_mem_t* output_tensors, int output_num);
+    bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num,
+            const bm_device_mem_t* output_tensors, int output_num);
+
+    bool launch(int net_idx, int n, int h , int w);
+    bool launch(const string& net_name, int n, int h, int w);
+    bool launch(int net_idx, const bm_device_mem_t* input_tensors, int input_num,
+            const bm_device_mem_t* output_tensors, int output_num, int n, int h, int w);
+    bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num,
+            const bm_device_mem_t* output_tensors, int output_num, int n , int h, int w);
+
+    void get_input_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w);
+    void get_input_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int * max_c, int * max_h, int * max_w);
+    void get_output_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w);
+    void get_output_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int *max_c, int * max_h, int * max_w);
+
+    int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int ih);
+    int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int ih);
+    int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int iw);
+    int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int iw);
+
+
+
+
+    bool can_batch_size_change(int net_idx);
+    bool can_batch_size_change(const string& net_name);
+    bool can_height_and_width_change(int net_idx);
+    bool can_height_and_width_change(const string& net_name);
+
+    void show_neuron_network();
+
+    int get_network_number() {return net_num;}
+
+    inline bm_handle_t get_bm_handle() {return m_handle;}
+
+  protected:
+    bool setup_mem_context(const string& ctx_dir);
+    bool setup_cmd_context(const string& ctx_dir);
+    bool set_using_cmd_file(const string& ctx_dir);
+    void load_cmd(u32* cmd, int engine_id, bool last_cmd, u64 start_address, u64 append_mem_offset);
+    bool setup_ir_context(const string& ctx_dir);
+
+    void wrong_net_idx_handle(int net_idx) const;
+
+    int get_net_idx(const string& net_name);
+    int get_stage_idx(int net_idx, int h, int w);
+    u64 get_stage_offset(int net_idx, int stage_idx);
+
+    int compute_output_height(int input_height, int global_kh, int global_stride_h, int global_pad_h, int global_pool_kh);
+    int compute_output_width(int input_width, int global_kw, int global_stride_w, int global_pad_w, int global_pool_kw);
+
+    bm_handle_t m_handle;
+    std::vector<DEVICE_MEM_INFO>            m_device_mem_info_vec;
+    std::vector<bm_device_mem_t>            m_device_mem_vec;
+
+    vector<int>                             m_gdma_total_id_v;
+    vector<int>                             m_cdma_total_id_v;
+    vector<int>                             m_bdc_total_id_v;
+    vector<vector<int> >                    m_gdma_group_id_v;
+    vector<vector<int> >                    m_cdma_group_id_v;
+    vector<vector<int> >                    m_bdc_group_id_v;
+    vector<int>                             m_cmdgroup_num;
+    vector<u64>                             m_gdma_cmd_start_address_v;
+    vector<u64>                             m_cdma_cmd_start_address_v;
+    vector<u64>                             m_bdc_cmd_start_address_v;
+    vector<map<string, bm_device_mem_t> >   input_tensor_mem_map_v;
+    vector<map<string, bm_device_mem_t> >   output_tensor_mem_map_v;
+    vector<set<string> >                    m_input_tensor_set_v;
+    vector<set<string> >                    m_output_tensor_set_v;
+    int                                     net_num;
+    map<string,int>                         net_name_to_idx;
+    vector<int>                             stage_num;
+
+    bool                                    have_ir_info;
+    vector<vector<unsigned int> >           m_ir_info_len;
+    vector<u64>                             m_ir_info_start_address_v;
+    vector<vector<stage_param_with_idx_t> > stage_param_with_idx_vv;
+
+    //io tensor param
+    vector<int>                             n_can_change_v;
+    vector<int>                             h_w_can_change_v;
+
+    vector<vector<map<string, tensor_max_shape_t> > >           input_tensor_max_shape_vv;
+    vector<vector<map<string, tensor_max_shape_t> > >           output_tensor_max_shape_vv;
+    vector<vector<map<string, global_output_tensor_param_t> > > global_output_tensor_param_vv;
+
+    bool m_using_cmd_file;
+    FILE * m_gdma_cmd_file;
+    FILE * m_cdma_cmd_file;
+    FILE * m_bdc_cmd_file;
+
+    //previous value or state
+    int pre_net_num;
+    int pre_m_device_mem_info_vec_size;  
+
+    //append mem offset when appending another framework's context.
+    vector<u64> apd_ctx_mem_offset;
+};
+
+#endif
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
new file mode 100644
index 000000000..200656739
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
@@ -0,0 +1,65 @@
+#ifndef BMRUNTIME_COMMON_H
+#define BMRUNTIME_COMMON_H
+
+#define BMRT_ASSERT(_cond)                       \
+  do {                                           \
+    if (!(_cond)) {                              \
+      printf("ASSERT %s: %s: %d: %s\n",          \
+          __FILE__, __func__, __LINE__, #_cond); \
+      exit(-1);                                  \
+    }                                            \
+  } while(0)
+
+typedef enum neuron_device_mem_type {
+    INPUT_NEURON_TENSOR = 0,
+    INTERMEDIATE_NEURON_TENSOR = 1,
+    OUTPUT_NEURON_TENSOR = 2,
+    CMD_BUF_TENSOR = 3,
+    CMD_NUM_TENSOR = 4
+} NEURON_DEVICE_MEM_TYPE;
+
+typedef enum device_mem_type {
+    NEURON = 0,
+    COEFF = 1,
+#ifdef INT8_COEFF_FUNC
+    COEFF_INT8 = 2,
+    COEFF_INT8SCALE = 3,
+    LOCAL = 4
+#else
+    LOCAL = 2
+#endif
+} DEVICE_MEM_TYPE;
+
+typedef struct device_mem_info {
+    DEVICE_MEM_TYPE device_mem_type;
+    NEURON_DEVICE_MEM_TYPE neuron_device_mem_type;
+    int n;
+    int c;
+    int h;
+    int w;
+    int coeff_count;
+    int groups;
+    unsigned long long address;
+} DEVICE_MEM_INFO;
+
+//info for compute output tensor
+typedef struct tensor_max_shape {
+  int max_n;
+  int channel;
+  int max_h;
+  int max_w;
+} tensor_max_shape_t;
+
+typedef struct global_output_tensor_param {
+  int input_idx;
+  int global_kh;
+  int global_kw;
+  int global_stride_h;
+  int global_stride_w;
+  int global_pad_h;
+  int global_pad_w;
+  int global_pool_kh;
+  int global_pool_kw;
+} global_output_tensor_param_t; 
+
+#endif
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
new file mode 100644
index 000000000..4214674f3
--- /dev/null
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
@@ -0,0 +1,11 @@
+#ifndef BMRUNTIME_INTERFACE_H_
+#define BMRUNTIME_INTERFACE_H_
+
+#include "bmruntime.h"
+#include "bmdnn_runtime.h"
+
+bmruntime* create_bmruntime(bm_handle_t* bm_handle);
+
+void destroy_bmruntime(bm_handle_t bm_handle, bmruntime* p_bmrt);
+
+#endif
diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
new file mode 100644
index 000000000..45541add9
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_activation.h
@@ -0,0 +1,96 @@
+#ifndef ANAKIN_SABER_FUNCS_BMDNN_ACT_H
+#define ANAKIN_SABER_FUNCS_BMDNN_ACT_H
+#include "saber/funcs/impl/impl_activation.h"
+namespace anakin {
+
+namespace saber {
+
+template <DataType OpDtype ,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderActivation<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>, 
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ActivationParam<Tensor<BM, OpDtype, LayOutType_op> > > 
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderActivation()
+            : _handle(NULL), _active_descs(NULL), _input_descs(NULL), _output_descs(NULL) {}
+
+    ~VenderActivation() {
+        if (_input_descs) {
+            BMDNN_CHECK(bm_free_device(_input_descs));
+        }
+        if (_output_descs) {
+            BMDNN_CHECK(bm_free_device(_output_descs));
+        }
+    }
+
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ActivationParam<OpTensor>& param, Context<BM>& ctx) {
+        // not sure
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ActivationParam<OpTensor>& param, Context<BM>& ctx) {
+        // not sure
+        return SaberSuccess;
+    }
+
+    //call bmdnn activation funcs here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ActivationParam<OpTensor>& param) {
+
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width();
+        int input_n = inputs[0]->num();
+
+        switch (_active_type) {
+            case Active_sigmoid:
+                BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                break;
+            case Active_relu:
+                BMDNN_CHECK(bmdnn_relu_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                break;
+            case Active_tanh:
+                BMDNN_CHECK(bmdnn_tanh_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                break;
+        }
+        /* BMDNN_CHECK(cudnnActivationForward(_handle, _active_descs, */
+        /*                                    cudnn::cudnnTypeWrapper<InDataType>::kOne(), */
+        /*                                    _input_descs, in_data, */
+        /*                                    cudnn::cudnnTypeWrapper<InDataType>::kZero(), */
+        /*                                    _output_descs, out_data */
+        /* )); */
+        return SaberSuccess;
+    }
+
+private:
+    bm_handle_t _handle;
+    bm_device_mem_t _input_descs;
+    bm_device_mem_t _output_descs;
+    ActiveType _active_type;
+};
+template class VenderActivation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+}
+}
+
+#endif //ANAKIN_SABER_FUNCS_BMDNN_ACT_H
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
new file mode 100644
index 000000000..7efdfa611
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -0,0 +1,195 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
+
+#include "saber/funcs/impl/impl_conv.h"
+#include "saber/funcs/impl/bm/bmdnn_api.h"   
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ConvParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+    VenderConv2D()
+            : _handle(NULL)
+            , _workspaceData(NULL)
+            , _workspace(NULL)
+            , _conv_descs(NULL)
+            , _input_descs(NULL)
+            , _output_descs(NULL)
+            , _filter_desc(NULL)
+            , _workspace_fwd_sizes(0)
+            , _workspaceSizeInBytes(0)
+            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
+            , _input_nchw_descs(NULL)
+            , _output_nchw_descs(NULL)
+            , x8_data(NULL)
+            , y8_data(NULL)
+            , x8_data_size(0)
+            , y8_data_size(0)
+    {}
+
+    ~VenderConv2D() {
+
+        if (_conv_descs) {
+            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
+        }
+        if (_input_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
+        }
+        if (_output_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
+        }
+        if (_filter_desc) {
+            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
+        }
+        if (_handle != NULL) {
+            CUDNN_CHECK(cudnnDestroy(_handle));
+        }
+        if (_workspaceData != NULL) {
+            cudaFree(_workspaceData);
+        }
+        if (_input_nchw_descs != NULL) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs));
+        }
+        if (_output_nchw_descs != NULL) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs));
+        }
+        if (x8_data != NULL) {
+            CUDA_CHECK(cudaFree(x8_data));
+        }
+        if (y8_data != NULL) {
+            CUDA_CHECK(cudaFree(y8_data));
+        }
+    }
+
+    /**
+     * [Create description] Init all cudnn resource here
+     * @AuthorHTL
+     * @DateTime  2018-02-01T16:13:06+0800
+     * @param     inputs                    [description]
+     * @param     outputs                   [description]
+     * @param     param                [conv parameters]
+     */
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvParam<OpTensor>& param, Context<BM>& ctx) {
+
+        // ---- init cudnn resources ----
+
+        _workspaceSizeInBytes = 0;
+        _workspaceData = NULL;
+
+        _workspace_fwd_sizes = 0;
+
+        this->_ctx = ctx;
+        // ---- get cuda resources ----
+
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUDNN_CHECK(cudnnCreate(&_handle));
+        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+
+        _workspace = NULL;
+
+        int in_channels = inputs[0]->channel();
+
+        // ---- create cudnn Descs ----
+        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
+
+        cudnn::createTensorDesc<InDataType>(&_input_descs);
+        cudnn::createTensorDesc<InDataType>(&_output_descs);
+        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
+
+        if (param.bias()->size() > 0) {
+            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
+        }
+
+        cudnnCreateTensorDescriptor(&_input_nchw_descs);
+        cudnnCreateTensorDescriptor(&_output_nchw_descs);
+
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvParam<OpTensor>& param, Context<BM>& ctx);
+
+    //call cudnnConvolutionForward here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          ConvParam<OpTensor>& param);
+
+private:
+    cudnnHandle_t _handle;
+    cudnnConvolutionFwdAlgo_t _fwd_algo;
+
+    cudnnTensorDescriptor_t _input_descs;
+    cudnnTensorDescriptor_t _output_descs;
+    cudnnTensorDescriptor_t _bias_desc;
+
+    cudnnFilterDescriptor_t _filter_desc;
+
+    cudnnConvolutionDescriptor_t _conv_descs;
+
+    size_t _workspace_fwd_sizes;
+    size_t _workspaceSizeInBytes;  // size of underlying storage
+
+    void *_workspaceData;  // underlying storage
+    void *_workspace;  // aliases into _workspaceData
+
+    const bool _use_tensor_core = true;
+    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
+    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+
+    // create transform descriptor
+    cudnnTensorDescriptor_t _input_nchw_descs;
+    cudnnTensorDescriptor_t _output_nchw_descs;
+
+    void *x8_data;
+    void *y8_data;
+
+    int x8_data_size;
+    int y8_data_size;
+};
+
+
+}
+
+}
+#endif //ANAKIN_SABER_FUNCS_BMDNN_CONV2D_H
diff --git a/saber/funcs/impl/bm/vender_conv_act.h b/saber/funcs/impl/bm/vender_conv_act.h
new file mode 100644
index 000000000..4d9c9f3bb
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_conv_act.h
@@ -0,0 +1,198 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H
+#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H
+
+#include "saber/funcs/impl/impl_conv_act.h"
+#include "saber/funcs/impl/cuda/cudnn_helper.h"   
+#include <cudnn.h>
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderConv2DAct<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ConvActiveParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderConv2DAct()
+            : _handle(NULL)
+            , _workspaceData(NULL)
+            , _workspace(NULL)
+            , _conv_descs(NULL)
+            , _input_descs(NULL)
+            , _output_descs(NULL)
+            , _filter_desc(NULL)
+            , _workspace_fwd_sizes(0)
+            , _workspaceSizeInBytes(0)
+            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
+            , _input_nchw_descs(NULL)
+            , _output_nchw_descs(NULL)
+            , x8_data(NULL)
+            , y8_data(NULL)
+            , x8_data_size(0)
+            , y8_data_size(0)
+    {}
+
+    ~VenderConv2DAct() {
+
+        if (_conv_descs) {
+            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
+        }
+        if (_input_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
+        }
+        if (_output_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
+        }
+        if (_filter_desc) {
+            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
+        }
+        if (_handle != NULL) {
+            CUDNN_CHECK(cudnnDestroy(_handle));
+        }
+        if (_workspaceData != NULL) {
+            cudaFree(_workspaceData);
+        }
+        if (_input_nchw_descs != NULL) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs));
+        }
+        if (_output_nchw_descs != NULL) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs));
+        }
+        if (x8_data != NULL) {
+            CUDA_CHECK(cudaFree(x8_data));
+        }
+        if (y8_data != NULL) {
+            CUDA_CHECK(cudaFree(y8_data));
+        }
+    }
+
+    /**
+     * [Create description] Init all cudnn resource here
+     * @AuthorHTL
+     * @DateTime  2018-02-01T16:13:06+0800
+     * @param     inputs                    [description]
+     * @param     outputs                   [description]
+     * @param     conv_param                [conv parameters]
+     */
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvActiveParam<OpTensor>& param, Context<BM>& ctx) {
+        // ---- init cudnn resources ----
+
+        _workspaceSizeInBytes = 0;
+        _workspaceData = NULL;
+
+        _workspace_fwd_sizes = 0;
+
+        this->_ctx = ctx;
+        // ---- get cuda resources ----
+
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUDNN_CHECK(cudnnCreate(&_handle));
+        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+
+        _workspace = NULL;
+
+        int in_channels = inputs[0]->channel();
+
+        // ---- create cudnn Descs ----
+        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
+
+        cudnn::createTensorDesc<InDataType>(&_input_descs);
+        cudnn::createTensorDesc<OutDataType>(&_output_descs);
+        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
+        cudnn::create_activation_des<InDataType>(&_active_descs);
+
+        if (param.conv_param.bias()->size() > 0) {
+            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
+        }
+
+        cudnnCreateTensorDescriptor(&_input_nchw_descs);
+        cudnnCreateTensorDescriptor(&_output_nchw_descs);
+
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvActiveParam<OpTensor>& param, Context<BM>& ctx);
+    //call cudnnConvolutionForward here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          ConvActiveParam<OpTensor>& param);
+private:
+    cudnnHandle_t _handle;
+    cudnnConvolutionFwdAlgo_t _fwd_algo;
+
+    cudnnTensorDescriptor_t _input_descs;
+    cudnnTensorDescriptor_t _output_descs;
+    cudnnTensorDescriptor_t _bias_desc;
+
+    cudnnFilterDescriptor_t _filter_desc;
+
+    cudnnConvolutionDescriptor_t _conv_descs;
+
+    size_t _workspace_fwd_sizes;
+    size_t _workspaceSizeInBytes;  // size of underlying storage
+
+    void *_workspaceData;  // underlying storage
+    void *_workspace;  // aliases into workspaceData
+
+    const bool _use_tensor_core = true;
+    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
+    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+
+    // activation descriptor
+    cudnnActivationDescriptor_t _active_descs;
+
+    // create transform descriptor
+    cudnnTensorDescriptor_t _input_nchw_descs;
+    cudnnTensorDescriptor_t _output_nchw_descs;
+
+    void *x8_data;
+    void *y8_data;
+
+    int x8_data_size;
+    int y8_data_size;
+};
+
+
+}
+
+}
+#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H
diff --git a/saber/funcs/impl/bm/vender_conv_act_pooling.h b/saber/funcs/impl/bm/vender_conv_act_pooling.h
new file mode 100644
index 000000000..e602a693d
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_conv_act_pooling.h
@@ -0,0 +1,176 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H
+#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H
+
+#include "saber/funcs/impl/impl_conv_act_pooling.h"
+#include "saber/funcs/impl/cuda/cudnn_helper.h"   
+#include <cudnn.h>
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderConv2DActPooling<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ConvActivePoolingParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderConv2DActPooling()
+            : _handle(NULL)
+            , _workspaceData(NULL)
+            , _workspace(NULL)
+            , _conv_descs(NULL)
+            , _input_descs(NULL)
+            , _output_descs(NULL)
+            , _filter_desc(NULL)
+            , _workspace_fwd_sizes(0)
+            , _workspaceSizeInBytes(0)
+            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
+    {}
+    ~VenderConv2DActPooling() {
+
+        if (_conv_descs) {
+            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
+        }
+        if (_input_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
+        }
+        if (_output_descs) {
+            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
+        }
+        if (_filter_desc) {
+            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
+        }
+        if (_handle != NULL) {
+            CUDNN_CHECK(cudnnDestroy(_handle));
+        }
+        if (_workspaceData != NULL) {
+            cudaFree(_workspaceData);
+        }
+    }
+
+    /**
+     * [Create description] Init all cudnn resource here
+     * @AuthorHTL
+     * @DateTime  2018-02-01T16:13:06+0800
+     * @param     inputs                    [description]
+     * @param     outputs                   [description]
+     * @param     conv_param                [conv parameters]
+     */
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvActivePoolingParam<OpTensor>& param, Context<BM>& ctx) {
+        // ---- init cudnn resources ----
+
+        _workspaceSizeInBytes = 0;
+        _workspaceData = NULL;
+
+        _workspace_fwd_sizes = 0;
+
+        this->_ctx = ctx;
+        // ---- get cuda resources ----
+
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUDNN_CHECK(cudnnCreate(&_handle));
+        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+
+        _workspace = NULL;
+
+        int in_channels = inputs[0]->channel();
+
+        // ---- create cudnn Descs ----
+        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
+
+        cudnn::createTensorDesc<InDataType>(&_input_descs);
+        cudnn::createTensorDesc<InDataType>(&_inner_descs);
+        cudnn::createTensorDesc<OutDataType>(&_output_descs);
+        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
+        if (param.has_activation) {
+            cudnn::create_activation_des<InDataType>(&_active_descs);
+        }
+        if (param.has_pooling) {
+            cudnn::create_pooling_des<InDataType>(&_pooling_descs);
+        }
+        if (param.conv_param.bias()->size() > 0) {
+            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
+        }
+
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ConvActivePoolingParam<OpTensor>& param, Context<BM>& ctx);
+    //call cudnnConvolutionForward here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          ConvActivePoolingParam<OpTensor>& param);
+private:
+    cudnnHandle_t _handle;
+    cudnnConvolutionFwdAlgo_t _fwd_algo;
+
+    cudnnTensorDescriptor_t _input_descs;
+    cudnnTensorDescriptor_t _output_descs;
+    cudnnTensorDescriptor_t _inner_descs;
+    cudnnTensorDescriptor_t _bias_desc;
+
+    cudnnFilterDescriptor_t _filter_desc;
+
+    cudnnConvolutionDescriptor_t _conv_descs;
+    cudnnPoolingDescriptor_t _pooling_descs;
+
+    size_t _workspace_fwd_sizes;
+    size_t _workspaceSizeInBytes;  // size of underlying storage
+
+    void *_workspaceData;  // underlying storage
+    void *_workspace;  // aliases into workspaceData
+
+    const bool _use_tensor_core = true;
+    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
+    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+
+    // activation descriptor
+    cudnnActivationDescriptor_t _active_descs;
+
+    Shape _inner_shape;
+    DataTensor_out _inner_tensor;
+};
+
+
+}
+
+}
+#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
new file mode 100644
index 000000000..5c7c23e67
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+#define ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+
+#include "saber/funcs/impl/impl_fc.h"
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+        DataType inDtype,
+        DataType outDtype,
+        typename LayOutType_op,
+        typename LayOutType_in,
+        typename LayOutType_out>
+class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>: \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>, \
+        Tensor<BM, outDtype, LayOutType_out>, \
+        Tensor<BM, OpDtype, LayOutType_op>, \
+        FcParam<Tensor<BM, OpDtype, LayOutType_op>>> {
+
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderFc() = default;
+    ~VenderFc() {
+        if (_handle != nullptr) {
+            CUBLAS_CHECK(cublasDestroy(_handle));
+        }
+    }
+
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            FcParam<OpTensor>& param, Context<BM>& ctx){
+        // get context
+        this->_ctx = ctx;
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUBLAS_CHECK(cublasCreate(&_handle));
+        CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream));
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            FcParam<OpTensor>& param, Context<BM>& ctx){
+
+        if (!(ctx == this->_ctx)) {
+            if (_handle != NULL) {
+                CUBLAS_CHECK(cublasDestroy(_handle));
+            }
+            this->_ctx = ctx;
+
+            cudaStream_t cuda_stream;
+            cuda_stream = ctx.get_compute_stream();
+            CUBLAS_CHECK(cublasCreate(&_handle));
+            CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream));
+        }
+
+        Shape shape_out = inputs[0]->valid_shape();
+        _M = inputs[0]->count_valid(0, param.axis);
+        _K = inputs[0]->count_valid(param.axis, inputs[0]->dims());
+        _N = param.num_output;
+        if (_N <= 0) {
+            int weight_size = param.weights->valid_size();
+            _N = weight_size / _K;
+        }
+        //! weights dims must be in h and w
+        _flag_trans_weights = param.is_transpose_weights;
+        return SaberSuccess;
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            FcParam<OpTensor>& param);
+
+
+private:
+    bool _flag_trans_weights{false};
+    int _M;
+    int _K;
+    int _N;
+    cublasHandle_t _handle;
+    bool _is_continue_buf{true};
+};
+
+template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+} //namespace saber
+
+} //namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
new file mode 100644
index 000000000..4990a5357
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. 
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
+#define ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
+
+#include "saber/funcs/impl/impl_pooling.h"
+#include "saber/funcs/impl/cuda/cudnn_helper.h"
+
+namespace anakin{
+
+namespace saber {
+
+template <DataType OpDtype ,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+ public ImplBase<
+    Tensor<NV, inDtype, LayOutType_in>, 
+    Tensor<NV, outDtype, LayOutType_out>,
+    Tensor<NV, OpDtype, LayOutType_op>,
+    PoolingParam<Tensor<NV, OpDtype, LayOutType_op>>> {
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderPooling() : _handle(NULL) {}
+
+    ~VenderPooling() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
+                  std::vector<DataTensor_out*>& outputs,
+                  PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
+
+        this->_ctx = ctx;
+
+        cudaStream_t cuda_stream;
+        cuda_stream = ctx.get_compute_stream();
+
+        CUDNN_CHECK(cudnnCreate(&_handle));
+        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+
+        cudnn::createTensorDesc<InDataType>(&_input_descs);
+        cudnn::createTensorDesc<OutDataType>(&_output_descs);
+
+        cudnn::create_pooling_des<OpDataType>(&_pooling_descs);
+
+        return create(inputs, outputs, pooling_param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
+                std::vector<DataTensor_out*>& outputs,
+                PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
+        if (!(ctx == this->_ctx)) {
+            if (_handle != NULL) {
+                CUDNN_CHECK(cudnnDestroy(_handle));
+            }
+            this->_ctx = ctx;
+
+            cudaStream_t cuda_stream;
+            cuda_stream = ctx.get_compute_stream();
+
+            CUDNN_CHECK(cudnnCreate(&_handle));
+            CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
+        }
+
+        int input_num = inputs[0]->num();
+        int input_channel = inputs[0]->channel();
+        int input_height = inputs[0]->height();
+        int input_width = inputs[0]->width();
+        int output_channel = outputs[0]->channel();
+        int output_height = outputs[0]->height();
+        int output_width = outputs[0]->width();
+
+        Shape stride_in = inputs[0]->get_stride();
+        Shape stride_out = outputs[0]->get_stride();
+
+        int dim_a[] = {input_num, input_channel,
+                       input_height, input_width};
+
+        int dim_b[] = {input_num, output_channel,
+                       output_height, output_width};
+
+        cudnn::setTensorNdDesc<InDataType>(&_input_descs,
+                                            inputs[0]->dims(), dim_a, &stride_in[0]);
+
+        cudnn::setTensorNdDesc<OutDataType>(&_output_descs,
+                                             outputs[0]->dims(), dim_b, &stride_out[0]);
+
+        int windowHeight[] = {pooling_param.window_h, pooling_param.window_w};
+        int padding[] = {pooling_param.pad_h, pooling_param.pad_w};
+
+        int stride[] = {pooling_param.stride_h, pooling_param.stride_w};
+
+        cudnn::set_nd_pooling_des<OpDataType>(&_pooling_descs, pooling_param.pooling_type,
+                                               inputs[0]->dims() - 2, windowHeight,
+                                               padding,stride);
+        return SaberSuccess;
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          PoolingParam<OpTensor> &param) {
+        const InDataType *in_data = inputs[0]->data();
+        OutDataType *out_data = outputs[0]->mutable_data();
+
+        CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs,
+                                        cudnn::cudnnTypeWrapper<InDataType>::kOne(),
+                                        _input_descs, in_data,
+                                        cudnn::cudnnTypeWrapper<OutDataType>::kZero(),
+                                        _output_descs, out_data
+        ));
+
+        return SaberSuccess;
+    }
+
+private:
+    cudnnHandle_t _handle;
+    cudnnTensorDescriptor_t _input_descs;
+    cudnnTensorDescriptor_t _output_descs;
+    cudnnPoolingDescriptor_t _pooling_descs;
+
+};
+
+template class VenderPooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+
+} //namespace saber
+
+} // namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h
index b82ae4c2b..f014c2523 100644
--- a/saber/saber_funcs_param.h
+++ b/saber/saber_funcs_param.h
@@ -185,7 +185,7 @@ struct ConvParam {
 template <typename TargetType>
 struct NormalizeParam {
     NormalizeParam() = default;
-    
+
     NormalizeParam(bool is_across_spatial, float eps_in = 1e-6f, int pin = 2) {
         across_spatial = is_across_spatial;
         p = pin;
@@ -196,7 +196,7 @@ struct NormalizeParam {
     }
     NormalizeParam(bool is_across_spatial, bool is_shared_channel, \
                    Tensor<TargetType>* input_scale, float eps_in = 1e-6f, int pin = 2) {
-        
+
         across_spatial = is_across_spatial;
         channel_shared = is_shared_channel;
         p = pin;
@@ -205,7 +205,7 @@ struct NormalizeParam {
         eps = eps_in;
         CHECK_EQ(p == 2 || p == 1, true) << "only support L1 and L2 norm";
     }
-    
+
     NormalizeParam(const NormalizeParam<TargetType>& right) {
         channel_shared = right.channel_shared;
         across_spatial = right.across_spatial;
@@ -224,7 +224,7 @@ struct NormalizeParam {
         this->eps = right.eps;
         return *this;
     }
-    
+
     bool operator==(const NormalizeParam<TargetType>& right) {
         bool flag = this->across_spatial == right.across_spatial;
         flag = flag && (this->channel_shared == right.channel_shared);
@@ -233,7 +233,7 @@ struct NormalizeParam {
         flag = flag && (fabsf(this->eps - right.eps) < 1e-7f);
         return flag && (this->scale == right.scale);
     }
-    
+
     //! p = 1, L1 normalize, p = 2, L2 normalize
     int  p{2};
     //! whether normalize is across the spatial
@@ -247,7 +247,7 @@ struct NormalizeParam {
     Tensor<TargetType>* scale{nullptr};
     float eps{1e-6f};
 };
-  
+
 template <typename TargetType>
 struct PreluParam {
     PreluParam() = default;
diff --git a/saber/saber_types.h b/saber/saber_types.h
index 7f2a7464d..16bdf9e35 100644
--- a/saber/saber_types.h
+++ b/saber/saber_types.h
@@ -33,7 +33,8 @@ enum TargetTypeEnum {
     eX86 = 4,
     eNVHX86 = 5,
     eNVHARM = 6,
-    eARMGPU = 7,
+    eBM = 7,
+    eARMGPU = 8,
     eARMDSP
 };
 
@@ -48,6 +49,8 @@ typedef TargetType<eX86> X86;
 // NV device with pinned memory
 typedef TargetType<eNVHX86> NVHX86;
 //typedef TargetType<eNVHARM> NVHARM;
+// Bitmain device support
+typedef TargetType<eBM> BM;
 // invalid target type, for target has only one memory block
 typedef TargetType<eINVALID> INVLD;
 
@@ -165,7 +168,8 @@ enum DataType {
     AK_STRING       =       10,
     AK_BOOL         =       11,
     AK_SHAPE        =       12,
-    AK_TENSOR       =       13
+    AK_TENSOR       =       13,
+    AK_BM           =       14
 };
 
 typedef enum {
@@ -285,7 +289,7 @@ typedef enum {
     PRIOR_MAX = 1,
     PRIOR_COM = 2
 } PriorType;
-    
+
 typedef enum{
     RANDOM=0,
     SPECIAL,
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
new file mode 100644
index 000000000..c6ee0811b
--- /dev/null
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -0,0 +1,16 @@
+#include "saber_types.h"
+#include "target_wrapper.h"
+#include <iostream>
+
+#ifdef USE_BM
+using namespace anakin::saber;
+int main() {
+    typedef TargetWrapper<BM> API;
+    void *pmem;
+    int dev_count;
+    API::get_device_count(&dev_count);
+    API::mem_alloc(&pmem, 3*200*200);
+    API::mem_free(pmem);
+}
+#endif
+
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
new file mode 100644
index 000000000..a204e7807
--- /dev/null
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -0,0 +1,116 @@
+#include "test_saber_buffer_bm.h"
+#include "saber/core/buffer.h"
+#include "saber/core/data_traits.h"
+
+using namespace anakin::saber;
+
+template <DataType datatype>
+void test_buffer() {
+
+    typedef TargetWrapper<X86> X86_API;
+    typedef TargetWrapper<BM> BM_API;
+    typedef typename DataTrait<datatype>::dtype Dtype;
+    typedef Buffer<X86> BufferH;
+    typedef Buffer<BM> BufferD;
+
+    int n0 = 1024;
+    int n1 = 2048;
+
+    void* tmp_x86;
+    Dtype* x86_ptr;
+    X86_API::mem_alloc(&tmp_x86, sizeof(Dtype) * n0);
+    x86_ptr = static_cast<Dtype*>(tmp_x86);
+
+    for (int i = 0; i < n0; i++) {
+        x86_ptr[i] = static_cast<Dtype>(i);
+    }
+
+    void* tmp_bm;
+    Dtype* bm_ptr;
+    BM_API::mem_alloc(&tmp_bm, sizeof(Dtype) * n0);
+    bm_ptr = static_cast<Dtype*>(tmp_bm);
+
+    LOG(INFO) << "Buffer: test default(empty) constructor";
+    BufferH x86_buf0;
+    BufferD bm_buf0;
+
+    LOG(INFO) << "Buffer: test constructor with data size";
+    BufferH x86_buf1(n0 * sizeof(Dtype));
+    BufferD bm_buf1(n0 * sizeof(Dtype));
+
+    LOG(INFO) << "Buffer: test constructor with data pointer, size and device id";
+    BufferH x86_buf2(x86_ptr, n0 * sizeof(Dtype), X86_API::get_device_id());
+    BufferD bm_buf2(bm_ptr, n0 * sizeof(Dtype), BM_API::get_device_id());
+
+    LOG(INFO) << "Buffer: test copy constructor";
+    BufferH x86_buf3(x86_buf2);
+    LOG(INFO) << "BM Buffer copy constructor";
+    LOG(INFO) << "bm target id: " << BM_API::get_device_id();
+    LOG(INFO) << "bm buffer target id: " << bm_buf2.get_id();
+    BufferD bm_buf3(bm_buf2);
+    CHECK_EQ(x86_buf3.get_count(), x86_buf2.get_count()) << \
+            "shared buffer should have same data count";
+    CHECK_EQ(bm_buf3.get_count(), bm_buf2.get_count()) << \
+            "shared buffer should have same data count";
+
+    LOG(INFO) << "Buffer: test operator =";
+    x86_buf0 = x86_buf2;
+    bm_buf0 = bm_buf2;
+    CHECK_EQ(x86_buf0.get_count(), x86_buf2.get_count()) << \
+            "shared buffer should have same data count";
+    CHECK_EQ(bm_buf0.get_count(), bm_buf2.get_count()) << \
+            "shared buffer should have same data count";
+
+    LOG(INFO) << "Buffer: test re_alloc";
+    x86_buf1.re_alloc(n1 * sizeof(Dtype));
+    bm_buf1.re_alloc(n1 * sizeof(Dtype));
+    CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+    CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error";
+    CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+    x86_buf1.re_alloc(n0 * sizeof(Dtype));
+    bm_buf1.re_alloc(n0 * sizeof(Dtype));
+    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+
+    LOG(INFO) << "Buffer: test get_id()";
+    LOG(INFO) << "X86 device id: " << x86_buf0.get_id() << \
+              ", bm device id: " << bm_buf0.get_id();
+    CHECK_EQ(X86_API::get_device_id(), x86_buf0.get_id()) << "x86 device id error";
+    CHECK_EQ(BM_API::get_device_id(), bm_buf0.get_id()) << "bm device id error";
+
+    LOG(INFO) << "Buffer: test deep_cpy()";
+    x86_buf1.sync_copy_from(x86_buf2);
+    LOG(INFO) << "deep copy between two host buffer: ";
+    const Dtype* ptr1 = static_cast<const Dtype*>(x86_buf1.get_data());
+    const Dtype* ptr2 = static_cast<const Dtype*>(x86_buf1.get_data());
+
+    for (int i = 0; i < 10; i++) {
+        std::cout << ptr1[i] << std::endl;
+    }
+
+    CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect";
+    LOG(INFO) << "deep copy from host buffer to device buffer";
+    bm_buf1.sync_copy_from(x86_buf2);
+    x86_buf1.sync_copy_from(bm_buf1);
+    LOG(INFO) << "deep copy from device buffer to host buffer: ";
+    ptr1 = static_cast<const Dtype*>(x86_buf1.get_data());
+
+    for (int i = 0; i < 10; i++) {
+        std::cout << ptr1[i] << std::endl;
+    }
+}
+
+TEST(TestSaberBufferBM, test_buffer_memcpy) {
+    test_buffer<AK_BM>();
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h
new file mode 100644
index 000000000..8bbbe4511
--- /dev/null
+++ b/test/saber/bm/test_saber_buffer_BM.h
@@ -0,0 +1,20 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+
+using namespace anakin::test;
+
+class TestSaberBufferBM : public Test {
+public:
+    TestSaberBufferBM() {}
+    ~TestSaberBufferBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp
new file mode 100644
index 000000000..e221ba8f4
--- /dev/null
+++ b/test/saber/bm/test_saber_context_BM.cpp
@@ -0,0 +1,31 @@
+#include "test_saber_context_BM.h"
+
+#ifdef USE_BM
+
+using namespace anakin::saber;
+
+TEST(TestSaberContextBM, test_BM_context) {
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+    typename API::event_t event;
+    API::create_event(event);
+    LOG(INFO) << "test context constructor";
+    Context<BM> ctx0;
+    Context<BM> ctx1(0, 1, 1);
+    LOG(INFO) << "test record event to context data stream and compute stream";
+    API::record_event(event, ctx0.get_data_stream());
+    API::record_event(event, ctx0.get_compute_stream());
+    API::record_event(event, ctx1.get_data_stream());
+    API::record_event(event, ctx1.get_compute_stream());
+}
+
+#endif
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h
new file mode 100644
index 000000000..653ee11fd
--- /dev/null
+++ b/test/saber/bm/test_saber_context_BM.h
@@ -0,0 +1,21 @@
+#ifndef SABER_TEST_SABER_CONTEXT_BM_H
+#define SABER_TEST_SABER_CONTEXT_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/context.h"
+
+using namespace anakin::test;
+
+class TestSaberContextBM : public Test {
+public:
+    TestSaberContextBM() {}
+    ~TestSaberContextBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //SABER_TEST_SABER_CONTEXT_BM_H
diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp
new file mode 100644
index 000000000..1c7086cf1
--- /dev/null
+++ b/test/saber/bm/test_saber_device_BM.cpp
@@ -0,0 +1,20 @@
+#include "test_saber_device_BM.h"
+
+#ifdef USE_BM
+
+using namespace anakin::saber;
+
+TEST(TestSaberDeviceBM, test_BM_device) {
+    Device<BM> dev_BM;
+}
+
+#endif
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h
new file mode 100644
index 000000000..3a6d61236
--- /dev/null
+++ b/test/saber/bm/test_saber_device_BM.h
@@ -0,0 +1,21 @@
+#ifndef SABER_TEST_SABER_DEVICE_BM_H
+#define SABER_TEST_SABER_DEVICE_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/device.h"
+
+using namespace anakin::test;
+
+class TestSaberDeviceBM : public Test {
+public:
+    TestSaberDeviceBM() {}
+    ~TestSaberDeviceBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //SABER_TEST_SABER_DEVICE_BM_H
diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h
new file mode 100644
index 000000000..61d27d6f9
--- /dev/null
+++ b/test/saber/bm/test_saber_func_BM.h
@@ -0,0 +1,38 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/tensor.h"
+#include <fstream>
+#include <vector>
+
+using namespace anakin::test;
+
+int read_file(std::vector<float> &results, const char* file_name) {
+
+    std::ifstream infile(file_name);
+    if (!infile.good()) {
+        std::cout << "Cannot open " << std::endl;
+        return false;
+    }
+    LOG(INFO)<<"found filename: "<<file_name;
+    std::string line;
+    while (std::getline(infile, line)) {
+        results.push_back((float)atof(line.c_str()));
+    }
+    return 0;
+}
+
+class TestSaberFuncBM : public Test {
+public:
+    TestSaberFuncBM() {}
+    ~TestSaberFuncBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp
new file mode 100644
index 000000000..5d30a6d64
--- /dev/null
+++ b/test/saber/bm/test_saber_func_activation_BM.cpp
@@ -0,0 +1,183 @@
+#include "core/context.h"
+#include "funcs/activation.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+template <typename Tensor>
+void print_tensor_shape(std::string name, Tensor& t0) {
+
+    LOG(INFO) << name << " valid shape is ["
+              << t0.valid_shape()[0] << ", "
+              << t0.valid_shape()[1] << ", "
+              << t0.valid_shape()[2] << ", "
+              << t0.valid_shape()[3] << "].";
+
+    LOG(INFO) << name << " real shape is ["
+              << t0.shape()[0] << ", "
+              << t0.shape()[1] << ", "
+              << t0.shape()[2] << ", "
+              << t0.shape()[3] << "].";
+
+    LOG(INFO) << name << " offset is ["
+              << t0.offset()[0] << ", "
+              << t0.offset()[1] << ", "
+              << t0.offset()[2] << ", "
+              << t0.offset()[3] << "].";
+}
+
+TEST(TestSaberFuncBM, test_func_constructor) {
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1);
+    }
+
+    img_dev.copy_from(img_host);
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+
+    Context<BM> ctx1(0, 1, 1);
+
+    ActivationParam<TensorDf4> param(Active_elu, 0.1f, 0.1f);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act;
+    act.compute_output_shape(input, output, param);
+    output_dev.re_alloc(output[0]->shape());
+
+    // init assume output tensor has been reshpaed by user.
+    act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+    act(input, output, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+    output_dev.sync();
+    print_tensor_device(output_dev);
+    cudaDeviceSynchronize();
+    CUDA_POST_KERNEL_CHECK;
+}
+
+TEST(TestSaberFuncBM, test_func_sub_tensor) {
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1);
+    }
+
+    img_dev.copy_from(img_host);
+    Shape img_s_t0(img_num, in_channels, 4, 4);
+
+    TensorDf4 t0;
+    TensorDf4 t1;
+
+    t0.share_sub_buffer(img_dev, img_s_t0, {0, 0, 0, 0});
+    t1.share_sub_buffer(img_dev, img_s_t0, {0, 0, 4, 4});
+
+    print_tensor_shape("t0", t0);
+    print_tensor_shape("t1", t1);
+
+    TensorDf4 output_dev;
+
+    TensorDf4 out0;
+    TensorDf4 out1;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+    Context<BM> ctx2(0, 2, 2);
+
+    ActivationParam<TensorDf4> param1(Active_elu, 0.1f, 0.1f);
+    ActivationParam<TensorDf4> param2(Active_elu, 0.1f, 0.1f);
+
+    std::vector<TensorDf4*> input1, input2;
+    std::vector<TensorDf4*> output1, output2;
+
+    input1.push_back(&t0);
+    input2.push_back(&t1);
+
+    output1.push_back(&out0);
+    output2.push_back(&out1);
+
+    //FIXME where do I get img_s and all those shapes ????
+    output_dev.re_alloc(img_s);
+
+    out0.share_sub_buffer(output_dev, img_s_t0, {0, 0, 0, 0});
+    out1.share_sub_buffer(output_dev, img_s_t0, {0, 0, 4, 4});
+
+    print_tensor_shape("output_dev", output_dev);
+
+    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act1;
+    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act2;
+
+    act1.compute_output_shape(output1, input1, param1);
+    act2.compute_output_shape(output2, input2, param2);
+
+    print_tensor_shape("out0", out0);
+    print_tensor_shape("out1", out1);
+
+    // init assume output tensor has been reshpaed by user.
+    act1.init(input1, output1, param1, SPECIFY, SABER_IMPL, ctx1);
+    act1(input1, output1, param1, ctx1);
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output1[0]->record_event(cuda_stream);
+
+    act2.init(input2, output2, param2, SPECIFY, SABER_IMPL, ctx2);
+    act2(input2, output2, param2, ctx2);
+    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
+    output2[0]->record_event(cuda_stream2);
+
+    out0.sync();
+    out1.sync();
+    print_tensor_device(output_dev);
+    cudaDeviceSynchronize();
+    CUDA_POST_KERNEL_CHECK;
+}
+
+int main(int argc, const char** argv) {
+    Env<BM>::env_init();
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
new file mode 100644
index 000000000..7881cdb97
--- /dev/null
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -0,0 +1,725 @@
+#include "core/context.h"
+#include "funcs/conv.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+//#include "cublas.h"
+
+using namespace anakin::saber;
+
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+template <typename Tensor>
+void print_tensor_shape(std::string name, Tensor &t0) {
+
+            LOG(INFO) << name << " valid shape is ["
+                      << t0.valid_shape()[0] << ", "
+                      << t0.valid_shape()[1] << ", "
+                      << t0.valid_shape()[2] << ", "
+                      << t0.valid_shape()[3] << "].";
+
+            LOG(INFO) << name << " real shape is ["
+                      << t0.shape()[0] << ", "
+                      << t0.shape()[1] << ", "
+                      << t0.shape()[2] << ", "
+                      << t0.shape()[3] << "].";
+
+            LOG(INFO) << name << " offset is ["
+                      << t0.offset()[0] << ", "
+                      << t0.offset()[1] << ", "
+                      << t0.offset()[2] << ", "
+                      << t0.offset()[3] << "].";
+}
+
+
+
+#if 1
+TEST(TestSaberFuncBM, test_depthwise_conv) {
+
+    int group = 2;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 3;
+    int kernel_w = 3;
+    int out_channels = 2;
+    
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    bool bias_term = true;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+    
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 63 & i;
+    }
+
+    img_dev.copy_from(img_host);
+    
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+    
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorHf4 output_host;
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+    
+    ConvParam<TensorDf4> param(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
+    conv.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    output_host.re_alloc(output[0]->shape());
+
+    LOG(INFO) << "regular start with group = " << group;
+    // init assume output tensor has been reshpaed by user.
+    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+    conv(input, output, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+
+    output_dev.sync();
+    print_tensor_device(output_dev);
+
+//    param.group = 1;
+//    param.pad_h = 1;
+//    param.pad_w = 1;
+//
+//    LOG(INFO) << " param changed start with group = "<<param.group;
+//    conv(input, output, param, ctx1);
+//
+//    output_dev.sync();
+//    print_tensor_device(output_dev);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_conv_param_change) {
+
+    int group = 4;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 3;
+    int kernel_w = 3;
+    int out_channels = 4;
+
+    int img_num = 1;
+    int in_channels = 4;
+    int img_h = 65;
+    int img_w = 63;
+
+    bool bias_term = true;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorHf4 output_host;
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+
+    ConvParam<TensorDf4> param(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
+    conv.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    output_host.re_alloc(output[0]->shape());
+
+            LOG(INFO)<<"regular start with group = "<<group;
+    // init assume output tensor has been reshpaed by user.
+    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+    conv(input, output, param, ctx1);
+    output_dev.sync();
+//    print_tensor_device(output_dev);
+
+    param.group = 1;
+    param.pad_h = 1;
+    param.pad_w = 1;
+
+    LOG(INFO)<<" param changed start with group = "<<param.group;
+    conv(input, output, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+
+    output_dev.sync();
+//    print_tensor_device(output_dev);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
+
+    int group = 1;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 3;
+    int kernel_w = 3;
+    int out_channels = 2;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    bool bias_term = false;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    Shape img_s_sub(img_num, in_channels, 4, 4);
+
+    TensorDf4 t0;
+    TensorDf4 t1;
+
+    t0.share_sub_buffer(img_dev, img_s_sub, {0,0,0,0});
+    t1.share_sub_buffer(img_dev, img_s_sub, {0,0,4,4});
+
+    print_tensor_shape("t0", t0);
+    print_tensor_shape("t1", t1);
+
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+    Context<BM> ctx2(0, 2, 2);
+
+    TensorDf4 out0;
+    TensorDf4 out1;
+
+    ConvParam<TensorDf4> param0(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    ConvParam<TensorDf4> param1(group, pad_h, pad_w,
+                                stride_h, stride_w,
+                                dilation_h, dilation_w,
+                                &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input0, input1;
+    std::vector<TensorDf4*> output0, output1;
+
+    input0.push_back(&t0);
+    input1.push_back(&t1);
+
+    output0.push_back(&out0);
+    output1.push_back(&out1);
+
+    // FIXME ? where do i get output shape
+    output_dev.re_alloc(img_s);
+
+    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv0;
+    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv1;
+
+    conv0.compute_output_shape(input0, output0, param0);
+    conv1.compute_output_shape(input1, output1, param1);
+
+    out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0});
+    out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4});
+
+    conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1);
+    conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2);
+
+    conv0(input0, output0, param0, ctx1);
+    conv1(input1, output1, param1, ctx2);
+
+    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
+    output0[0]->record_event(cuda_stream1);
+
+    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
+    output1[0]->record_event(cuda_stream2);
+
+    out0.sync();
+    out1.sync();
+
+    print_tensor_device(output_dev);
+
+//    print_tensor_device(output_dev);
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+#endif
+
+TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
+
+    int group = 1;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 1;
+    int kernel_w = 1;
+    int out_channels = 128;
+
+    int img_num = 7;
+    int in_channels = 13;
+    int img_h = 32;
+    int img_w = 32;
+
+    bool bias_term = false;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 1;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+
+    ConvParam<TensorDf4> param(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Conv<BM, AK_FLOAT> conv;
+    conv.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \
+        << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]";
+    //LOG(INFO) << " blocks = [ " <<  i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; 
+    //选择k最小的那一组，如果一样，则选128*N，N最大的那一组
+    int k0 = i_div_up(out_channels, 128) * 128 - out_channels;
+    int k1 = i_div_up(out_channels, 64) * 64 - out_channels;
+    int k2 = i_div_up(out_channels, 32) * 32 - out_channels;
+    int kk = std::min(std::min(k0,k1),k2);
+    LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk;
+    if (kk == k0)
+        LOG(INFO) << "thread = [256,1,1] 128*128" ;
+    if (kk == k1)
+        LOG(INFO) << "thread = [128,1,1] 128*64" ;
+    if (kk == k2)
+        LOG(INFO) << "thread = [128,1,1] 128*32" ;
+
+    LOG(INFO) << "saber conv init";
+    conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
+
+    LOG(INFO) << "saber conv dispatch";
+    conv(input, output, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+
+    output_dev.sync();
+
+    SaberTimer<BM> t1;
+    int ts = 1;
+
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        conv(input, output, param, ctx1);
+        output_dev.sync();
+        t1.end(ctx1);
+    }
+
+    LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
+                         TensorDf4 &weights, int kernel_size, int stride, int pad,
+                         int in_channel, int out_channel, TensorDf4 &bias,
+                         anakin::saber::ImplEnum impl) {
+
+    ConvParam<TensorDf4> conv_param(1, pad, pad,
+                                    stride, stride,
+                                    1, 1,
+                                    &weights, &bias);
+    Conv<BM, AK_FLOAT> conv;
+    conv.compute_output_shape(inputs, outputs, conv_param);
+    outputs[0]->re_alloc(outputs[0]->shape());
+    Context<BM> ctx1(0, 1, 1);
+
+    SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1));
+
+    conv(inputs, outputs, conv_param, ctx1);
+    outputs[0]->record_event(ctx1.get_compute_stream());
+    outputs[0]->sync();
+
+    cudaDeviceSynchronize();
+
+    SaberTimer<BM> t1;
+    int ts = 100;
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        conv(inputs, outputs, conv_param, ctx1);
+        outputs[0]->record_event(ctx1.get_compute_stream());
+        outputs[0]->sync();
+        t1.end(ctx1);
+    }
+            LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
+
+    cudaDeviceSynchronize();
+}
+
+
+cublasHandle_t  cublas_handle;
+
+void caffe_gemm(const int M, const int N, const int K,\
+					 const float alpha, const float* A,\
+					 const float* B, const float beta, float* C) {
+    int lda = K;
+    int ldb = N;
+    CUBLAS_CHECK(cublasSgemm(cublas_handle,
+                             CUBLAS_OP_N,
+                             CUBLAS_OP_N,
+                             N, M, K,
+                             &alpha, B,
+                             ldb, A,
+                             lda, &beta,
+                             C, N));
+}
+
+TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
+    int img_num = 1;
+    int kernel = 1;
+
+//    int out_channels = 32;
+//    int in_channels = 128;
+//    int img_h = 52;
+//    int img_w = 112;
+//    int out_channels = 64;
+//    int in_channels = 256;
+//    int img_h = 26;
+//    int img_w = 56;
+    int out_channels = 128;
+    int in_channels = 512;
+    int img_h = 13;
+    int img_w = 28;
+
+//    int out_channels = 512;
+//    int in_channels = 128;
+//    int img_h = 13;
+//    int img_w = 28;
+
+    int pad = 0;
+    int stride = 1;
+    Context<BM> ctx1(0, 1, 1);
+
+    CUBLAS_CHECK(cublasCreate(&cublas_handle));
+    CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream()));
+
+    TensorDf4 weights;
+    weights.re_alloc({out_channels, in_channels, 1, 1});
+
+    TensorDf4 img;
+    img.re_alloc({1, in_channels, img_h, img_w});
+
+    TensorDf4 out;
+    out.re_alloc({1, out_channels, img_h, img_w});
+    TensorDf4 out_gemm;
+    out_gemm.re_alloc({1, out_channels, img_h, img_w});
+
+    fill_tensor_device_rand(weights, -1.f, 1.f);
+    fill_tensor_device_rand(img, -1.f, 1.f);
+
+    LOG(INFO) << "img_num: " << img_num;
+    LOG(INFO) << "kernel: " << kernel;
+    LOG(INFO) << "out_channels: " << out_channels;
+    LOG(INFO) << "in_channels: " << in_channels;
+    LOG(INFO) << "img_h: " << img_h;
+    LOG(INFO) << "img_w: " << img_w;
+    LOG(INFO) << "pad: " << pad;
+    LOG(INFO) << "stride: " << stride;
+
+    TensorDf4 bias;
+
+    std::vector<TensorDf4*> input_v;
+    std::vector<TensorDf4*> output_gemm_v, output_v;
+
+    input_v.push_back(&img);
+    output_v.push_back(&out);
+    output_gemm_v.push_back(&out_gemm);
+    cudaDeviceSynchronize();
+    test_conv_fp32_speed(input_v, output_v,
+                         weights, kernel, stride, pad,
+            in_channels, out_channels, bias,
+            SABER_IMPL);
+    cudaDeviceSynchronize();
+    caffe_gemm(out_channels, img_h * img_w, in_channels,\
+					 1.f, weights.data(),\
+					 img.data(), 0.f, out_gemm.mutable_data());
+    cudaDeviceSynchronize();
+    SaberTimer<BM> t1;
+    int ts = 100;
+
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        caffe_gemm(out_channels, img_h * img_w, in_channels,\
+					 1.f, weights.data(),\
+					 img.data(), 0.f, out_gemm.mutable_data());
+        out_gemm.record_event(ctx1.get_compute_stream());
+        out_gemm.sync();
+        t1.end(ctx1);
+    }
+    LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
+
+    cudaDeviceSynchronize();
+//    print_tensor_device(out);
+//    print_tensor_device(out_gemm);
+    TensorHf4 out_host;
+    TensorHf4 out_gemm_host;
+    out_host.re_alloc(out.shape());
+    out_host.copy_from(out);
+
+    out_gemm_host.re_alloc(out_gemm.shape());
+    out_gemm_host.copy_from(out_gemm);
+    double max_r, max_d;
+    tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d);
+    LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d;
+}
+
+int main(int argc, const char** argv){
+    anakin::saber::Env<BM>::env_init();
+
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
new file mode 100644
index 000000000..5101c75f8
--- /dev/null
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -0,0 +1,148 @@
+#include "core/context.h"
+#include "funcs/fc.h"
+#include "test_saber_func_fc_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+typedef TargetWrapper<BM> API;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef TensorDf4::Dtype ftype;
+
+void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
+                const TensorHf4& bias, TensorHf4& tout) {
+
+    int m = tin.num();
+    int k = tin.valid_size() / m;
+    int n = weight.valid_size() / k;
+    bool bias_term = bias.valid_size() > 0;
+
+    const float* din = tin.data();
+    const float* w = weight.data();
+    float* dout = tout.mutable_data();
+
+    for (int i = 0; i < m; ++i) {
+        float* pdout = dout + i * n;
+        const float* pdin = din + i * k;
+
+        for (int j = 0; j < n; ++j) {
+            if (bias_term) {
+                pdout[j] = bias.data()[j];
+            } else {
+                pdout[j] = 0;
+            }
+
+            for (int l = 0; l < k; ++l) {
+                pdout[j] += pdin[l] * w[l * n + j];
+            }
+        }
+    }
+}
+
+TEST(TestSaberFuncFcBM, test_func_fc) {
+
+    int test_iter = 100;
+    int w_in = 7;
+    int h_in = 7;
+    int ch_in = 512;
+    int num_in = 1;
+
+    int num_out = 4096;
+    int axis = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = {num_in, num_out, 1, 1};
+
+    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
+    TensorDf4 weight(sh_w);
+    Shape sh_b{1, 1, 1, num_out};
+    TensorDf4 bias(sh_b);
+    fill_tensor_device_const(weight, 1.f);
+    fill_tensor_device_const(bias, 1.f);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    TensorDf4 tdin;
+    TensorDf4 tdout;
+    tdin.re_alloc(shape_in);
+    fill_tensor_device_const(tdin, 1.f);
+    input_dev_4d.push_back(&tdin);
+    output_dev_4d.push_back(&tdout);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
+
+    Fc<BM, AK_FLOAT> fc;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
+    Shape va_sh = tdout.valid_shape();
+    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
+              va_sh[2] << ", " << va_sh[3];
+    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
+
+    LOG(INFO) << "FC initialization";
+    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
+
+    LOG(INFO) << "FC compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+        //cudaDeviceSynchronize();
+    }
+
+    CUDA_POST_KERNEL_CHECK;
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
+    //print_tensor_device(*output_dev_4d[0]);
+    //cudaDeviceSynchronize();
+
+    //! check result
+    TensorHf4 thin(shape_in);
+    TensorHf4 thout(shape_out);
+    TensorHf4 thw(sh_w);
+    TensorHf4 thb(sh_b);
+    thin.copy_from(tdin);
+    thw.copy_from(weight);
+    thb.copy_from(bias);
+    fc_compute(thin, thw, thb, thout);
+    //print_tensor_host(thout);
+
+    TensorHf4 thout_d(shape_out);
+    thout_d.copy_from(tdout);
+    double max_ratio = 0;
+    double max_diff = 0;
+    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
+    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
+    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
+
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
new file mode 100644
index 000000000..04b963675
--- /dev/null
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -0,0 +1,311 @@
+#include "core/context.h"
+#include "funcs/pooling.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include "funcs/timer.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+TEST(TestSaberFuncBM, test_func_pooling) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+    typename API::event_t event;
+    API::create_event(event);
+
+    typedef TargetWrapper<X86> X86_API;
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 4;
+    int img_h = 800;
+    int img_w = 1440;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorHf4 output_host;
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+
+    Context<BM> ctx1(0, 1, 1);
+    int window_h = 2;
+    int window_w = 2;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    LOG(INFO) << " img_num: " << img_num;
+    LOG(INFO) << " in_channels: " << in_channels;
+    LOG(INFO) << " img_h: " << img_h;
+    LOG(INFO) << " img_w: " << img_w;
+    LOG(INFO) << " window_h: " << window_h;
+    LOG(INFO) << " window_w: " << window_w;
+    LOG(INFO) << " pad_h: " << pad_h;
+    LOG(INFO) << " pad_w: " << pad_w;
+    LOG(INFO) << " stride_h: " << stride_h;
+    LOG(INFO) << " stride_w: " << stride_w;
+
+    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
+                                  , stride_h, stride_w, Pooling_max);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Pooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> pooling;
+    pooling.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    output_host.re_alloc(output[0]->shape());
+
+    // init assume output tensor has been reshpaed by user.
+    pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+    pooling(input, output, param, ctx1);
+
+    SaberTimer<BM> t1;
+    int ts = 1000;
+
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        pooling(input, output, param, ctx1);
+        output[0]->sync();
+        t1.end(ctx1);
+    }
+
+    output_dev.sync();
+    cudaDeviceSynchronize();
+    LOG(INFO) << " average time: " << t1.get_average_ms() << " ms";
+    LOG(INFO) << " tile 10% time: " << t1.get_tile_time(10) << " ms";
+    LOG(INFO) << " tile 50% time: " << t1.get_tile_time(50) << " ms";
+    LOG(INFO) << " tile 90% time: " << t1.get_tile_time(90) << " ms";
+    LOG(INFO) << " tile 95% time: " << t1.get_tile_time(95) << " ms";
+    LOG(INFO) << " tile 99% time: " << t1.get_tile_time(99) << " ms";
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_pooling_result) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+    typename API::event_t event;
+    API::create_event(event);
+
+    typedef TargetWrapper<X86> X86_API;
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+
+    Context<BM> ctx1(0, 1, 1);
+    int window_h = 2;
+    int window_w = 2;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+
+    LOG(INFO) << " img_num: " << img_num;
+    LOG(INFO) << " in_channels: " << in_channels;
+    LOG(INFO) << " img_h: " << img_h;
+    LOG(INFO) << " img_w: " << img_w;
+    LOG(INFO) << " window_h: " << window_h;
+    LOG(INFO) << " window_w: " << window_w;
+    LOG(INFO) << " pad_h: " << pad_h;
+    LOG(INFO) << " pad_w: " << pad_w;
+    LOG(INFO) << " stride_h: " << stride_h;
+    LOG(INFO) << " stride_w: " << stride_w;
+
+    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
+                                  , stride_h, stride_w, Pooling_max);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Pooling<BM, AK_FLOAT> pooling;
+    pooling.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+
+    // init assume output tensor has been reshpaed by user.
+    pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+    pooling(input, output, param, ctx1);
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+
+    output_dev.sync();
+    print_tensor_device(output_dev);
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+    typename API::event_t event;
+    API::create_event(event);
+
+    typedef TargetWrapper<X86> X86_API;
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorDf4 t0;
+    TensorDf4 t1;
+    Shape img_s_sub(img_num, in_channels, img_h / 2, img_w / 2);
+
+    t0.share_sub_buffer(img_dev, img_s_sub, {0, 0, 0, 0});
+    t1.share_sub_buffer(img_dev, img_s_sub, {0, 0, 4, 4});
+
+    TensorDf4 output_dev;
+
+    TensorDf4 out0;
+    TensorDf4 out1;
+
+    // start Reshape & doInfer
+
+    Context<BM> ctx1(0, 1, 1);
+    int window_h = 2;
+    int window_w = 2;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+
+    LOG(INFO) << " img_num: " << img_num;
+    LOG(INFO) << " in_channels: " << in_channels;
+    LOG(INFO) << " img_h: " << img_h;
+    LOG(INFO) << " img_w: " << img_w;
+    LOG(INFO) << " window_h: " << window_h;
+    LOG(INFO) << " window_w: " << window_w;
+    LOG(INFO) << " pad_h: " << pad_h;
+    LOG(INFO) << " pad_w: " << pad_w;
+    LOG(INFO) << " stride_h: " << stride_h;
+    LOG(INFO) << " stride_w: " << stride_w;
+
+    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
+                                  , stride_h, stride_w, Pooling_max);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Pooling<BM, AK_FLOAT> pooling;
+    Pooling<BM, AK_FLOAT> pooling0;
+    Pooling<BM, AK_FLOAT> pooling1;
+
+    pooling.compute_output_shape(input,output,  param);
+
+    Shape total_shape = output[0]->shape();
+
+    output_dev.re_alloc(total_shape);
+    Shape out_sub_shape = {total_shape[0], total_shape[1], total_shape[2] / 2, total_shape[3] / 2};
+
+    out0.share_sub_buffer(output_dev, out_sub_shape, {0, 0, 0, 0});
+    out1.share_sub_buffer(output_dev, out_sub_shape, {0, 0, out_sub_shape[2], out_sub_shape[3]});
+
+    std::vector<TensorDf4*> input0, input1;
+    std::vector<TensorDf4*> output0, output1;
+
+    input0.push_back(&t0);
+    input1.push_back(&t1);
+    output0.push_back(&out0);
+    output1.push_back(&out1);
+
+    // init assume output tensor has been reshpaed by user.
+    pooling0.init(input0, output0, param, SPECIFY, VENDER_IMPL, ctx1);
+    pooling0(input0, output0, param, ctx1);
+
+    pooling1.init(input1, output1, param, SPECIFY, VENDER_IMPL, ctx1);
+    pooling1(input1, output1, param, ctx1);
+
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    out0.record_event(cuda_stream);
+
+    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
+    out1.record_event(cuda_stream1);
+
+    out0.sync();
+    out1.sync();
+
+    print_tensor_device(output_dev);
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp
new file mode 100644
index 000000000..18479cd18
--- /dev/null
+++ b/test/saber/bm/test_saber_shape_BM.cpp
@@ -0,0 +1,126 @@
+#include "test_saber_shape_BM.h"
+#include "shape.h"
+#include "anakin_config.h"
+
+#ifdef USE_OPENMP
+#include <omp.h>
+#include <core/shape.h>
+#endif
+
+using namespace anakin;
+using namespace saber;
+
+
+TEST(TestSaberShapeBM, test_saber_shape) {
+
+    int dim = 4;
+    Shape sh4d0{0, 0, 0, 0};
+    CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error";
+
+    for (int i = 0; i < dim; ++i) {
+        CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error";
+    }
+
+    CHECK_EQ(sh4d0.count(), 0) << "check shape count error";
+
+    int N = 1;
+    int C = 3;
+    int H = 11;
+    int W = 11;
+    std::vector<int> sh_size = {N, C, H, W};
+    //Shape sh4d1(sh_size);
+    Shape sh4d1(N, C, H, W);
+    LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size();
+    CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!";
+    //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!";
+
+    CHECK_EQ(sh4d1[0], N) << "get shape size error";
+    CHECK_EQ(sh4d1[1], C) << "get shape size error";
+    CHECK_EQ(sh4d1[2], H) << "get shape size error";
+    CHECK_EQ(sh4d1[3], W) << "get shape size error";
+
+    //CHECK_EQ(sh4d2[0], N) << "get shape size error";
+    //CHECK_EQ(sh4d2[1], C) << "get shape size error";
+    //CHECK_EQ(sh4d2[2], H) << "get shape size error";
+    //CHECK_EQ(sh4d2[3], W) << "get shape size error";
+
+    CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed";
+
+    C = 10;
+    sh4d1[1] = C;
+    CHECK_EQ(sh4d1[1], C) << "set shape size error";
+
+    bool is_equal = (sh4d0 == sh4d1);
+    CHECK_EQ(is_equal, false) << "check shape is_equal failed";
+
+    sh4d0 = sh4d1;
+    CHECK_EQ(sh4d1[0], N) << "constructor failed";
+    CHECK_EQ(sh4d1[1], C) << "get shape size error";
+    CHECK_EQ(sh4d1[2], H) << "get shape size error";
+    CHECK_EQ(sh4d1[3], W) << "get shape size error";
+
+    Shape sh4d3 = sh4d1;
+    CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error";
+
+    Shape sh4d4(sh4d1);
+    CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error";
+
+    Shape sh1d0{0};
+    //std::vector<int> sh1d_size = {W};
+
+    //Shape sh1d1(sh1d_size);
+    //Shape sh1d0{W};
+    Shape sh1d1(W);
+
+    Shape sh1d3 = sh1d1;
+    Shape sh1d4(sh1d1);
+
+    CHECK_EQ(sh1d0.dims(), 1) << "shape dim error";
+
+    CHECK_EQ(sh1d0.count(), 0) << "shape size error";
+
+    CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error";
+
+    CHECK_EQ(sh1d1[0], W) << "get shape size error";
+
+    //CHECK_EQ(sh1d2.count(0), W) << "shape dim error";
+
+    CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error";
+
+    CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error";
+
+    CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error";
+
+    Shape sh0{2, 2, 3, 4};
+    Shape sh1{2, 1, 1, 24};
+    Shape sh2{2, 2, 3, 4};
+    Shape sh3{1, 1, 2, 3};
+
+    CHECK_EQ(sh0 == sh2, true) << "error ==";
+    CHECK_EQ(sh3 < sh0, true) << "error <";
+    CHECK_EQ(sh3 >= sh0, false) << "error >=";
+    CHECK_EQ(sh3 > sh0, false) << "error >";
+    CHECK_EQ(sh0 > sh3, true) << "error >";
+    CHECK_EQ(sh0 < sh1, false) << "error <";
+    CHECK_EQ(sh0 <= sh2, true) << "error <=";
+    CHECK_EQ(sh0 >= sh2, true) << "error >=";
+
+    Shape sh001 = Shape::zero(2);
+    Shape sh002 = Shape::zero(3);
+
+    if (sh001 > sh002) {
+        LOG(ERROR) << "error <";
+    }
+
+}
+
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
+
diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h
new file mode 100644
index 000000000..a2ca02c9b
--- /dev/null
+++ b/test/saber/bm/test_saber_shape_BM.h
@@ -0,0 +1,25 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "saber/core/shape.h"
+
+using namespace anakin::test;
+
+class TestSaberShapeBM : public Test {
+public:
+    TestSaberShapeBM() {}
+    ~TestSaberShapeBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+protected:
+    std::string name;
+    std::string _test;
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
new file mode 100644
index 000000000..d9c65c7b4
--- /dev/null
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -0,0 +1,642 @@
+#include "test_saber_tensor_BM.h"
+#include "tensor_op.h"
+#include <vector>
+using namespace anakin::saber;
+
+typedef TargetWrapper<X86> X86_API;
+typedef TargetWrapper<BM> BM_API;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef TensorHf4::Dtype dtype;
+
+TEST(TestSaberTensorBM, test_tensor_constructor) {
+
+    //! test empty constructor
+    LOG(INFO) << "test default (empty) constructor";
+    TensorHf4 thost0;
+    TensorDf4 tdev0;
+
+    //! test tensor re_alloc function empty constructor
+    Shape sh0(2, 2, 8, 8);
+    LOG(INFO) << "|--test tensor re_alloc function on empty tensor";
+    thost0.re_alloc(sh0);
+    tdev0.re_alloc(sh0);
+    LOG(INFO) << "|--tensor size of host: " << thost0.size();
+    LOG(INFO) << "|--tensor size of device: " << tdev0.size();
+    CHECK_EQ(thost0.size(), 256) << "error with tensor size";
+    CHECK_EQ(tdev0.size(), 256) << "error with tensor size";
+/*
+    //! test tensor re_alloc function on tensor with data
+    LOG(INFO) << "|--test tensor re_alloc function on tensor with data";
+    Shape sh1(1, 2, 4, 4);
+    thost0.re_alloc(sh1);
+    tdev0.re_alloc(sh1);
+    LOG(INFO) << "|--tensor size of host: " << thost0.size();
+    LOG(INFO) << "|--tensor size of device: " << tdev0.size();
+    CHECK_EQ(thost0.size(), 32) << "error with tensor size";
+    CHECK_EQ(tdev0.size(), 32) << "error with tensor size";
+
+    //! test tensor shape() function
+    LOG(INFO) << "|--test tensor shape() function";
+    Shape sho = thost0.shape();
+    LOG(INFO) << "|--shape of tensor: " << sho[0] << ", " << sho[1] << "," << sho[2] << "," << sho[3];
+    LOG(INFO) << "|--test get tensor n, c, h, w function, num = " \
+              << thost0.num() << ", channel = " << thost0.channel() << ", height = " \
+              << thost0.height() << ", width = " << thost0.width();
+
+    //! test tensor mutable_data() function
+    LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 1.f";
+    fill_tensor_host_const(thost0, 1.f);
+    LOG(INFO) << "|--test tensor data() function, show the const data, 1.f";
+    print_tensor_host(thost0);
+
+    //! test tensor constructor with shape
+    LOG(INFO) << "test tensor constructor with shape";
+    TensorHf4 thost1(sh1);
+    TensorDf4 tdev1(sh1);
+
+    //! test tensor copy_from() function
+    LOG(INFO) << "test copy_from() function, input tensor could be any target";
+    thost1.copy_from(thost0);
+    tdev1.copy_from(thost0);
+    print_tensor_device(tdev1);
+    cudaDeviceSynchronize();
+    thost1.copy_from(tdev1);
+    tdev1.copy_from(tdev0);
+    print_tensor_host(thost1);
+
+    //! test tensor constructor with shape and real_shape
+    LOG(INFO) << "test tensor constructor with shape and real_shape";
+    //! constructor with 3 shapes is removed
+    TensorHf4 thost2(sh0);
+    TensorDf4 tdev2(sh0);
+
+    //! test tensor constructor with data, if target is different, create buffer, and copy the data
+    LOG(INFO) <<
+              "test tensor constructor with data, if target is different, create buffer, and copy the data";
+    dtype* host_data_ptr;
+    dtype* dev_data_ptr;
+    void* tmp_pt_host;
+    void* tmp_pt_dev;
+    X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count());
+    host_data_ptr = static_cast<dtype*>(tmp_pt_host);
+
+    for (int i = 0; i < sh1.count(); ++i) {
+        host_data_ptr[i] = i;
+    }
+
+    NV_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
+    dev_data_ptr = static_cast<dtype*>(tmp_pt_dev);
+    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
+    LOG(INFO) << "|--construct host tensor from host data ptr";
+    TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+    LOG(INFO) << "|--constructor device tensor from host data ptr";
+    TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+    print_tensor_host(thost3);
+    print_tensor_device(tdev3);
+    cudaDeviceSynchronize();
+
+    LOG(INFO) << "|--construct host tensor from device data ptr";
+    TensorHf4 thost4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1);
+    LOG(INFO) << "|--constructor device tensor from device data ptr";
+    TensorDf4 tdev4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1);
+    print_tensor_host(thost4);
+    print_tensor_device(tdev4);
+    NV_API::stream_t dev_stream0;
+    NV_API::create_stream_with_flag(dev_stream0, 1);
+    cudaDeviceSynchronize();
+
+    //! test tensor copy constructor
+    LOG(INFO) << "test tensor copy constructor";
+    LOG(INFO) << "|--normal copy constructor";
+    TensorHf4 thost5(thost4);
+    TensorDf4 tdev5(tdev4);
+
+    LOG(INFO) << "|--push back to vector";
+    std::vector<TensorHf4> vthost;
+    std::vector<TensorDf4> vtdev;
+    vthost.push_back(thost0);
+    vthost.push_back(thost1);
+    vthost.push_back(thost2);
+    vthost.push_back(thost3);
+    vthost.push_back(thost4);
+    vthost.push_back(thost5);
+    vtdev.push_back(tdev0);
+    vtdev.push_back(tdev1);
+    vtdev.push_back(tdev2);
+    vtdev.push_back(tdev3);
+    vtdev.push_back(tdev4);
+    vtdev.push_back(tdev5);
+    print_tensor_host(vthost[5]);
+    print_tensor_device(vtdev[5]);
+    cudaDeviceSynchronize();
+
+    //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied
+    LOG(INFO) << "test share_from function";
+    TensorHf4 thost6, thost7;
+    TensorDf4 tdev6, tdev7;
+    thost6.set_shape(thost4.shape());
+    thost7.set_shape(thost4.shape());
+    tdev6.set_shape(thost4.shape());
+    tdev7.set_shape(thost4.shape());
+    Shape sh2(1, 2, 2, 2);
+    Shape offset(0, 0, 1, 1);
+    LOG(INFO) << "|--shared host";
+    thost6.share_sub_buffer(thost4, sh2, offset);
+    LOG(INFO) << "|--copied host";
+    tdev6.share_from(thost4);
+    LOG(INFO) << "|--copied device";
+    thost7.share_from(tdev4);
+    LOG(INFO) << "|--shared device";
+    tdev7.share_from(tdev4);
+
+    LOG(INFO) << "|--change data in shared tensor";
+
+    //Shape sh_real = thost6.shape();
+    //Shape sh_act = thost6.valid_shape();
+    //Shape offset_act = thost6.offset();
+
+    //int start_w = offset_act[3];
+    //int start_h = offset_act[2];
+    //int start_c = offset_act[1];
+    //int start_n = offset_act[0];
+    //int stride_h = sh_real.count(3);
+    //int stride_c = sh_real.count(2);
+    //int stride_n = sh_real.count(1);
+    //int stride_n = sh_real.count(0);
+    Shape stride = thost6.get_stride();
+    int w = thost6.width();
+    int h = thost6.height();
+    int c = thost6.channel();
+    int n = thost6.num();
+
+    dtype* ptr_host = thost6.mutable_data();
+
+    for (int in = 0; in < n; ++in) {
+        dtype* ptr_batch = ptr_host + in * stride[0];
+
+        for (int ic = 0; ic < c; ++ic) {
+            dtype* ptr_channel = ptr_batch + ic * stride[1];
+
+            for (int ih = 0; ih < h; ++ih) {
+                dtype* ptr_row = ptr_channel + ih * stride[2];
+
+                for (int iw = 0; iw < w; ++iw) {
+                    ptr_row[iw] = 1.f;
+                }
+            }
+        }
+    }
+
+    LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
+    print_tensor_host(thost4);
+
+    //! test record tensor event
+    LOG(INFO) << "test record tensor event";
+    NV_API::stream_t dev_stream;
+    NV_API::stream_t dev_stream1;
+    NV_API::create_stream_with_flag(dev_stream, 1);
+    NV_API::create_stream_with_flag(dev_stream1, 1);
+    X86_API::stream_t host_stream;
+    X86_API::create_stream_with_flag(host_stream, 1);
+    LOG(INFO) << "|--test record event on host tensor";
+    fill_tensor_host_const(thost4, 888.f);
+    thost4.record_event(host_stream);
+    thost4.sync();
+    print_tensor_host(thost4);
+    LOG(INFO) << "|--test record event on device tensor";
+    fill_tensor_device_const(tdev4, 666.f, dev_stream);
+    tdev4.record_event(dev_stream);
+    tdev4.sync();
+    print_tensor_device(tdev4, dev_stream1);
+    tdev4.record_event(dev_stream1);
+    tdev4.sync();
+}
+
+TEST(TestSaberTensorNV, test_tensor_deepcopy) {
+    //! tensor constructor with alloc data, if target is different, create buffer, and copy the data
+    LOG(INFO) << "test tensor deep copy";
+    Shape sh0(2, 2, 4, 4);
+    Shape va_sh0(2, 2, 2, 2);
+    Shape off_sh0(0, 0, 1, 1);
+
+    Shape sh1(2, 2, 4, 4);
+    Shape va_sh1(va_sh0);
+    Shape off_sh1(0, 0, 1, 0);
+
+    Shape sh2(2, 32);
+    Shape va_sh2(2, 8);
+    Shape off_sh2(0, 8);
+
+    X86_API::stream_t x86_stream;
+    NV_API::stream_t nv_stream;
+    X86_API::create_stream(x86_stream);
+    NV_API::create_stream(nv_stream);
+
+    //! create source tensor, th0, td0, th01, td01, th1, td1;
+    TensorHf4 th0(sh0);
+
+    for (int i = 0; i < sh0.count(); ++i) {
+        th0.mutable_data()[i] = i;
+    }
+
+    TensorHf4 th1(va_sh0);
+
+    for (int i = 0; i < va_sh0.count(); ++i) {
+        th1.mutable_data()[i] = i;
+    }
+
+    TensorHf4 th01;
+    th01.share_sub_buffer(th0, va_sh0, off_sh0);
+
+    TensorDf4 td0, td1, td01;
+    td0.set_shape(th0.shape());
+    td1.set_shape(th1.shape());
+    td0.share_from(th0);
+    td1.share_from(th1);
+    TensorDf4 dev_tmp0;
+    dev_tmp0.set_shape(th0.shape());
+    dev_tmp0.share_from(th0);
+    td01.share_sub_buffer(dev_tmp0, va_sh0, off_sh0);
+
+    print_tensor_host(th0);
+    print_tensor_host(th1);
+    print_tensor_device(td0);
+    print_tensor_device(td1);
+
+    //! create th2, th3, th21, td2, td3, td21 as dst tensor
+    TensorHf2 th2(sh2);
+    fill_tensor_host_const(th2, 0.f);
+    TensorHf2 th21;
+    th21.share_sub_buffer(th2, va_sh2, off_sh2);
+    TensorHf2 th3(va_sh2);
+
+    TensorDf2 td2(sh2);
+    fill_tensor_device_const(td2, 0.f);
+    cudaDeviceSynchronize();
+    TensorDf2 td21;
+    td21.share_sub_buffer(td2, va_sh2, off_sh2);
+    TensorDf2 td3(va_sh2);
+
+    double max_diff;
+    double  max_ratio;
+    //! test tensor deep copy, entire buffer copy
+    LOG(INFO) << "test tensor deep copy, entire buffer copy, H2H";
+    th3.copy_from(th1);
+    print_tensor_host(th3);
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, H2H";
+    fill_tensor_host_const(th3, 0.f);
+    th3.async_copy_from(th1, x86_stream);
+    th3.record_event(x86_stream);
+    th3.sync();
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, H2H";
+
+    LOG(INFO) << "test tensor deep copy, entire buffer copy, D2H";
+    th3.copy_from(td1);
+    print_tensor_host(th3);
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H";
+    fill_tensor_host_const(th3, 0.f);
+    th3.async_copy_from(td1, nv_stream);
+    th3.record_event(x86_stream);
+    th3.sync();
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2H";
+
+    LOG(INFO) << "test tensor deep copy, entire buffer copy, H2D";
+    td3.copy_from(th1);
+    print_tensor_device(td3);
+    cudaDeviceSynchronize();
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H";
+    fill_tensor_device_const(td3, 0.f);
+    cudaDeviceSynchronize();
+    td3.async_copy_from(th1, nv_stream);
+    td3.record_event(nv_stream);
+    td3.sync();
+    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2H";
+
+    LOG(INFO) << "test tensor deep copy, entire buffer copy, D2D";
+    td3.copy_from(td1);
+    print_tensor_device(td3);
+    cudaDeviceSynchronize();
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2D";
+    fill_tensor_device_const(td3, 0.f);
+    cudaDeviceSynchronize();
+    td3.async_copy_from(td1, nv_stream);
+    td3.record_event(nv_stream);
+    td3.sync();
+    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2D";
+
+
+    //! test tensor deep copy, src with roi
+    LOG(INFO) << "test tensor deep copy, src with roi, H2H";
+    th3.copy_from(th01);
+    print_tensor_host(th3);
+
+    LOG(INFO) << "test tensor deep copy, src with roi, D2H";
+    th3.copy_from(td01);
+    print_tensor_host(th3);
+
+    LOG(INFO) << "test tensor deep copy, src with roi, H2D";
+    td3.copy_from(th01);
+    print_tensor_device(td3);
+    cudaDeviceSynchronize();
+
+    LOG(INFO) << "test tensor deep copy, src with roi, D2D";
+    td3.copy_from(td01);
+    print_tensor_device(td3);
+    cudaDeviceSynchronize();
+
+
+    //! test tensor deep copy, dst with roi
+    LOG(INFO) << "test tensor deep copy, dst with roi, H2H";
+    print_tensor_host(th21);
+    print_tensor_host(th1);
+    th21.copy_from(th1);
+    print_tensor_host(th21);
+
+    LOG(INFO) << "test tensor deep copy, dst with roi, D2H";
+    th21.copy_from(td1);
+    print_tensor_host(th21);
+
+    LOG(INFO) << "test tensor deep copy, dst with roi, H2D";
+    td21.copy_from(th1);
+    print_tensor_device(td21);
+    cudaDeviceSynchronize();
+
+    LOG(INFO) << "test tensor deep copy, dst with roi, D2D";
+    td21.copy_from(td1);
+    print_tensor_device(td21);
+    cudaDeviceSynchronize();
+
+
+    //! test tensor deep copy, src and dst are with roi
+    LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2H";
+    th21.copy_from(th01);
+    print_tensor_host(th21);
+
+    LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2H";
+    th21.copy_from(td01);
+    print_tensor_host(th21);
+
+    LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2D";
+    td21.copy_from(th01);
+    print_tensor_device(td21);
+    cudaDeviceSynchronize();
+
+    LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2D";
+    td21.copy_from(td01);
+    print_tensor_device(td21);
+    cudaDeviceSynchronize();
+}
+
+TEST(TestSaberTensorNV, test_tensor_shape) {
+    typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4_0;
+    typedef Tensor<X86, AK_FLOAT, NHWC> Tensor4_1;
+    typedef Tensor<X86, AK_FLOAT, HW> Tensor2;
+
+    int nin = 2;
+    int cin = 2;
+    int hin = 4;
+    int win = 4;
+
+    LOG(INFO) << "test tensor interface";
+
+    Tensor4_0 t1(Shape(nin, cin, hin, win));
+    Tensor4_1 t2(Shape(nin, hin, win, cin));
+    Tensor2 t3(Shape(hin, win));
+
+    LOG(INFO) << "test tensor with layout of NCHW";
+    LOG(INFO) << "num: " << t1.num() << ", num idx: " << t1.num_index() << \
+              ", channel: " << t1.channel() << ", channel idx: " << t1.channel_index() << \
+              ", height: " << t1.height() << ", height idx: " << t1.height_index() << \
+              ", widhth: " << t1.width() << ", width idx: " << t1.width_index();
+
+    CHECK_EQ(t1.num(), nin) << "NCHW get num error";
+    CHECK_EQ(t1.channel(), cin) << "NCHW get channel error";
+    CHECK_EQ(t1.height(), hin) << "NCHW get height error";
+    CHECK_EQ(t1.width(), win) << "NCHW get width error";
+
+    CHECK_EQ(t1.num_index(), 0) << "NCHW get num index error";
+    CHECK_EQ(t1.channel_index(), 1) << "NCHW get channel index error";
+    CHECK_EQ(t1.height_index(), 2) << "NCHW get height index error";
+    CHECK_EQ(t1.width_index(), 3) << "NCHW get width index error";
+
+    LOG(INFO) << "test tensor with layout of NHWC";
+    LOG(INFO) << "num: " << t2.num() << ", num idx: " << t2.num_index() << \
+              ", channel: " << t2.channel() << ", channel idx: " << t2.channel_index() << \
+              ", height: " << t2.height() << ", height idx: " << t2.height_index() << \
+              ", widhth: " << t2.width() << ", width idx: " << t2.width_index();
+
+    CHECK_EQ(t2.num(), nin) << "NHWC get num error";
+    CHECK_EQ(t2.channel(), cin) << "NHWC get channel error";
+    CHECK_EQ(t2.height(), hin) << "NHWC get height error";
+    CHECK_EQ(t2.width(), win) << "NHWC get width error";
+
+    CHECK_EQ(t2.num_index(), 0) << "NHWC get num index error";
+    CHECK_EQ(t2.channel_index(), 3) << "NHWC get channel index error";
+    CHECK_EQ(t2.height_index(), 1) << "NHWC get height index error";
+    CHECK_EQ(t2.width_index(), 2) << "NHWC get width index error";
+
+    LOG(INFO) << "test tensor with layout of HW";
+    LOG(INFO) << "num: " << t3.num() << ", num idx: " << t3.num_index() << \
+              ", channel: " << t3.channel() << ", channel idx: " << t3.channel_index() << \
+              ", height: " << t3.height() << ", height idx: " << t3.height_index() << \
+              ", widhth: " << t3.width() << ", width idx: " << t3.width_index();
+
+    CHECK_EQ(t3.num(), 1) << "HW get num error";
+    CHECK_EQ(t3.channel(), 1) << "HW get channel error";
+    CHECK_EQ(t3.height(), hin) << "HW get height error";
+    CHECK_EQ(t3.width(), win) << "HW get width error";
+
+    CHECK_EQ(t3.num_index(), -1) << "HW get num index error";
+    CHECK_EQ(t3.channel_index(), -1) << "HW get channel index error";
+    CHECK_EQ(t3.height_index(), 0) << "HW get height index error";
+    CHECK_EQ(t3.width_index(), 1) << "HW get width index error";
+
+}
+
+TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
+
+    LOG(INFO) << "test tensor reshape and re_alloc funcs";
+
+    Shape sh0(2, 2, 2, 2);
+    Shape sh1(2, 2, 4, 4);
+    TensorHf4 th0(sh1);
+    TensorDf4 td0(sh1);
+    fill_tensor_host_const(th0, 1);
+    fill_tensor_device_const(td0, 1);
+    LOG(INFO) << "ori tensor with size: " << th0.valid_size();
+    print_tensor_host(th0);
+    print_tensor_device(td0);
+    cudaDeviceSynchronize();
+
+    th0.reshape(sh0);
+    td0.reshape(sh0);
+    LOG(INFO) << "tensor after reshape(from big space to small) with size: " << th0.valid_size();
+    print_tensor_host(th0);
+    print_tensor_device(td0);
+    cudaDeviceSynchronize();
+    fill_tensor_host_const(th0, 1);
+    fill_tensor_device_const(td0, 1);
+    cudaDeviceSynchronize();
+
+    th0.reshape(sh1);
+    td0.reshape(sh1);
+    LOG(INFO) << "tensor after reshape(from small to big, not larger than ori) with size: " <<
+              th0.valid_size();
+    print_tensor_host(th0);
+    print_tensor_device(td0);
+    cudaDeviceSynchronize();
+
+    th0.re_alloc(sh0);
+    td0.re_alloc(sh0);
+    LOG(INFO) << "tensor after re_alloc(from big space to small) with size: " << th0.valid_size();
+    print_tensor_host(th0);
+    print_tensor_device(td0);
+    cudaDeviceSynchronize();
+
+    TensorHf4 th1(sh0);
+    TensorDf4 td1(sh0);
+    LOG(INFO) << "ori tensor with size: " << th1.valid_size();
+    fill_tensor_host_const(th1, 1);
+    fill_tensor_device_const(td1, 1);
+    cudaDeviceSynchronize();
+    print_tensor_host(th1);
+    print_tensor_device(td1);
+    cudaDeviceSynchronize();
+
+    th1.reshape(sh1);
+    td1.reshape(sh1);
+    LOG(INFO) << "tensor after reshape(from small space to big) with size: " << th1.valid_size();
+    //printf("real_shape: %d,%d, %d, %d, valid_shape: %d, %d, %d, %d\n", \
+    th1.shape()[0], th1.shape()[1], th1.shape()[2], th1.shape()[3], \
+    th1.valid_shape()[0], th1.valid_shape()[1], th1.valid_shape()[2], th1.valid_shape()[3]);
+    print_tensor_host(th1);
+    print_tensor_device(td1);
+    cudaDeviceSynchronize();
+    fill_tensor_host_const(th1, 1);
+    fill_tensor_device_const(td1, 1);
+    cudaDeviceSynchronize();
+
+    th1.reshape(sh0);
+    td1.reshape(sh0);
+
+    LOG(INFO) << "tensor after re_alloc(from small space to big) with size: " << th1.valid_size();
+    th1.re_alloc(sh1);
+    td1.re_alloc(sh1);
+    print_tensor_host(th1);
+    print_tensor_device(td1);
+    cudaDeviceSynchronize();
+
+}
+
+TEST(TestSaberTensorNV, test_tensor_op) {
+    Shape sh{1, 2, 2, 10};
+    TensorDf4 td1(sh);
+    TensorHf4 th1(sh);
+    Tensor<NV, AK_INT8, NCHW> td2(sh);
+    Tensor<X86, AK_INT8, NCHW> th2(sh);
+    LOG(INFO) << "testing host fill tensor with const 1.";
+    fill_tensor_host_const(th1, 1.f);
+    LOG(INFO) << "data type: float";
+    print_tensor_host(th1);
+    fill_tensor_host_const(th2, 1);
+    LOG(INFO) << "data type: int8";
+    print_tensor_host(th2);
+
+    LOG(INFO) << "testing device fill tensor with const 1.";
+    fill_tensor_device_const(td1, 1.f);
+    LOG(INFO) << "data type: float";
+    print_tensor_device(td1);
+    fill_tensor_device_const(td2, 1);
+    LOG(INFO) << "data type: int8";
+    print_tensor_device(td2);
+
+    LOG(INFO) << "testing host fill tensor with rand";
+    fill_tensor_host_rand(th1);
+    LOG(INFO) << "data type: float";
+    print_tensor_host(th1);
+    fill_tensor_host_rand(th2);
+    LOG(INFO) << "data type: int8";
+    print_tensor_host(th2);
+
+    LOG(INFO) << "testing device fill tensor with rand";
+    fill_tensor_device_rand(td1);
+    LOG(INFO) << "data type: float";
+    print_tensor_device(td1);
+    fill_tensor_device_rand(td2);
+    LOG(INFO) << "data type: int8";
+    print_tensor_device(td2);
+
+    LOG(INFO) << "testing host fill tensor with rand from 1 to 10";
+    fill_tensor_host_rand(th1, 1, 10);
+    LOG(INFO) << "data type: float";
+    print_tensor_host(th1);
+    fill_tensor_host_rand(th2, 1, 10);
+    LOG(INFO) << "data type: int8";
+    print_tensor_host(th2);
+
+    LOG(INFO) << "testing device fill tensor with rand from 1 to 10";
+    fill_tensor_device_rand(td1, 1, 10);
+    LOG(INFO) << "data type: float";
+    print_tensor_device(td1);
+    fill_tensor_device_rand(td2, 1, 10);
+    LOG(INFO) << "data type: int8";
+    print_tensor_device(td2);
+}
+
+TEST(TestSaberTensorNV, test_tensor_share_diff_dtype) {
+    Shape sh{1, 1, 2, 10};
+    Tensor<NV, AK_FLOAT, NCHW> td1(sh);
+    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
+    Tensor<NV, AK_INT8, NCHW> td2;
+    Tensor<X86, AK_INT8, NCHW> th2;
+    td2.set_shape(sh);
+    th2.set_shape(sh);
+    LOG(INFO) << "testing host fill tensor with const 1.";
+    fill_tensor_host_const(th1, -1);
+    LOG(INFO) << "data type: float";
+    print_tensor_host(th1);
+    fill_tensor_device_const(td1, -1);
+    LOG(INFO) << "data type: int8";
+    print_tensor_device(td1);
+    cudaDeviceSynchronize();
+
+    td2.share_from(td1);
+    th2.share_from(th1);
+
+    print_tensor_host(th2);
+    print_tensor_device(td2);
+    cudaDeviceSynchronize();
+}
+
+TEST(TestSaberTensorNV, test_tensor_base_type) {
+    Shape sh(1, 3, 10, 10);
+    Tensor<NV, AK_FLOAT, NCHW> td1(sh);
+    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
+    fill_tensor_host_rand(th1, 0.f, 255.f);
+    td1.copy_from(th1);
+    TensorBase* tb1;
+    TensorBase* tb2;
+    tb1 = &th1;
+    Shape sh1(1, 1, 10, 10);
+    tb1->set_shape(sh1);
+    Shape sh11 = th1.valid_shape();
+    LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \
+              ", h=" << sh11[2] << ", w=" << sh11[3];
+*/
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_tensor_BM.h b/test/saber/bm/test_saber_tensor_BM.h
new file mode 100644
index 000000000..32a402258
--- /dev/null
+++ b/test/saber/bm/test_saber_tensor_BM.h
@@ -0,0 +1,21 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/tensor.h"
+
+using namespace anakin::test;
+
+class TestSaberTensorBM : public Test {
+public:
+    TestSaberTensorBM() {}
+    ~TestSaberTensorBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H

From a8c7357ecdaefeb45d4b0b742c5f1e3c3efa1ecd Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 18 Jun 2018 14:14:23 +0800
Subject: [PATCH 123/318] Fix cmake issues

---
 CMakeLists.txt       | 12 ++++++------
 saber/CMakeLists.txt |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1ac0aa80..cc6d9d559 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,12 +80,12 @@ anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plan
 anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM)
 
 # compile options for BM place
-anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU)
-anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM)
-anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM)
-anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM)
-anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM)
-anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM)
+#anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU)
+#anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM)
+#anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM)
+#anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM)
+#anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM)
+#anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM)
 
 
 
diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 298d08ab8..86ed4ef81 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -95,10 +95,10 @@ if(USE_BM)
     set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
     set(CMAKE_CXX_FLAGS "")
     if(BUILD_SHARED)
-        CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+        #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
     endif()
     if(BUILD_STATIC)
-        CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+        #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
     endif()
     set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
 

From d7a941c30669bbc792301fa511ac80e845c159fe Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 19 Jun 2018 11:04:03 +0800
Subject: [PATCH 124/318] Resolve BM library compilation issue

---
 saber/CMakeLists.txt                    | 10 +++++-----
 saber/funcs/impl/bm/base/CMakeLists.txt |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 86ed4ef81..e60617373 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -70,7 +70,7 @@ if(USE_CUDA)
 	set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
 	set(CMAKE_CXX_FLAGS "")
 	if(BUILD_SHARED)
-    		CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
+        CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
 	endif()
 	if(BUILD_STATIC)
 		CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS STATIC ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
@@ -94,12 +94,12 @@ if(USE_BM)
 
     set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
     set(CMAKE_CXX_FLAGS "")
-    if(BUILD_SHARED)
+    #if(BUILD_SHARED)
         #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
-    endif()
-    if(BUILD_STATIC)
+    #endif()
+    #if(BUILD_STATIC)
         #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG})
-    endif()
+    #endif()
     set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
 
     set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY}
diff --git a/saber/funcs/impl/bm/base/CMakeLists.txt b/saber/funcs/impl/bm/base/CMakeLists.txt
index fd4b3d680..59b82abb5 100644
--- a/saber/funcs/impl/bm/base/CMakeLists.txt
+++ b/saber/funcs/impl/bm/base/CMakeLists.txt
@@ -7,7 +7,7 @@
 
 if(USE_BM)
     anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/include "h" ANAKIN_SABER_BM_C_SRC)
-    anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "o" ANAKIN_SABER_BM_STATIC_LIB)
+    anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "so" ANAKIN_SABER_BM_STATIC_LIB)
 endif()
 
 macro(anakin_set_upscope src)

From be88f933dfe5501650bce21804fa990c693e6f64 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 19 Jun 2018 11:30:42 +0800
Subject: [PATCH 125/318] Remove unnecessary files

---
 .../impl/bm/base/include/bmruntime/bmcnnctx.h | 58 --------------
 .../impl/bm/base/include/bmruntime/bmnet.h    | 78 -------------------
 2 files changed, 136 deletions(-)
 delete mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmnet.h

diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h b/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
deleted file mode 100644
index 6b0bfe857..000000000
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef __BM_CNN_CONTEXT_H__
-#define __BM_CNN_CONTEXT_H__
-
-#include <string>
-#include "bmruntime.h"
-
-namespace bmcnn {
-
-typedef void *bmcnn_ctx_t;
-/**
- * \brief Create context of BMCNN.
- *
- * \param ctx_dir - Directory of context files generated by BMNETC
- *
- * \note
- * The context will be created in the device of ID 0.\n
- *  
- * \return
- * NULL - Creating failed.\n
- * non-NULL - The handle of the context (creating succeeded).\n
- */
-bmcnn_ctx_t bmcnn_ctx_create(const std::string &ctx_dir);
-/**
- * \brief Destroy context of BMCNN
- * 
- * \param handle - Handle of the context to be destroyed
- */
-void bmcnn_ctx_destroy(bmcnn_ctx_t handle);
-/**
- * \brief Create context of BMCNN in specific devide.
- * 
- * \param ctx_dir - Directory of context files generated by BMNETC
- * \param devid - ID of device where the context will be placed.
- *
- * \note
- * Call \ref bm_dev_getcount to get total number of devices, e.g. N is returned,
- * valid devid should be in range of 0 ~ (N-1).\n
- *
- * \return
- * NULL - Creating failed that might be caused by incorrect parameter.\n
- * non-NULL - The handle of the context (creating succeeded).\n
- */
-bmcnn_ctx_t bmcnn_ctx_create_by_devid(const std::string &ctx_dir, int devid);
-/**
- * \brief Append context of BMCNN.
- *
- * \param ctx_dir - Directory of context files generated by BMNETC or BMNETD.
- * \param bmrt    - The created handle of context.
- *  
- * \return
- * false - Appending failed.\n
- * true  - Appending succeeded.\n
- */
-bool bmcnn_ctx_append(const std::string &ctx_dir, bmruntime *bmrt);
-
-} /* namespace bmcnn */
-
-#endif /* __BM_CNN_CONTEXT_H__ */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h b/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
deleted file mode 100644
index 88005e1b8..000000000
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef __BM_NET_H__
-#define __BM_NET_H__
-
-#include "bmblob.h"
-#include "bmcnnctx.h"
-#include <vector>
-#include <map>
-#include <string>
-
-#ifdef CROSS_COMPILE
-  #include <memory>
-#else
-  #include <boost/shared_ptr.hpp>
-#endif
-
-
-#ifdef CROSS_COMPILE
-#define NAMESPACE_USED  std
-#else
-#define NAMESPACE_USED  boost
-#endif
-
-namespace bmcnn {
-    
-class BMNet
-{
-public:
-    /**
-     * \brief Constructor of net.
-     *
-     * \param handle - Handler of BMCNN context (created by \ref bmcnn_ctx_create)
-     * \param name - Name of net
-     */
-    explicit BMNet(bmcnn_ctx_t handle, const std::string &name);
-    /**
-     * \brief Deconstructor of blob.
-     */
-    virtual ~BMNet();
-    /**
-     * \brief Reshape all layers from bottom to top.
-     */
-    void Reshape();
-    /**
-     * \brief Run forward.
-     * 
-     * \param sync - Flag of synchronizing.
-     */
-    void Forward(bool sync = false);
-    /**
-     * \brief Get blob by name.
-     *
-     * \param name - Name of blob 
-     * \note
-     * (1) The name could only be of blob in input or output.\n
-     * (2) If the name is not spotted, null pointer will be returned.\n
-     */
-    const NAMESPACE_USED::shared_ptr<BMBlob> blob_by_name(const std::string &name) const;
-    /**
-     * \brief Get maximum shape allowed.
-     */
-    inline const Shape &max_shape() const
-    { return max_shape_; }
-private:
-    BMNet(const BMNet &other);
-    BMNet &operator=(const BMNet &other);
-
-    bmcnn_ctx_t bmcc_ctx_;
-    std::vector<NAMESPACE_USED::shared_ptr<BMBlob> > blobs_;
-    std::vector<BMBlob *> net_input_blobs_;
-    std::vector<BMBlob *> net_output_blobs_;
-    std::string name_;
-    std::map<std::string, size_t> blob_name_index_;
-    Shape max_shape_;
-};
-
-} /* namespace bmcnn */
-
-#endif /* __BM_NET_H__ */

From ec7ed850a38700a5ba07eee8a178e59362703be4 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 19 Jun 2018 17:43:20 +0800
Subject: [PATCH 126/318] Put empty implementation for BM sync_mem for now

---
 saber/CMakeLists.txt        | 2 ++
 saber/core/target_wrapper.h | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index e60617373..ff5959cc0 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -136,4 +136,6 @@ if(UNIX OR APPLE)
         endif ()
     endif()
 endif()
+
+
 set(ANAKIN_SABER_LIB_TARGET ${ANAKIN_SABER_TEMP_COMMMON_LIB} PARENT_SCOPE)
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index cedf7023c..62055db48 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -540,16 +540,16 @@ struct TargetWrapper<BM, __device_target> {
     // brief create event, empty function for bitmain target
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoD);
+        size_t count, __DtoD) {};
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __HtoD);
+        size_t count, __HtoD) {};
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoH);
+        size_t count, __DtoH) {};
 
     static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-        int src_dev, size_t count);
+        int src_dev, size_t count) {};
 
     /**
      * \brief device target return currently used device id

From 953f99acf05a602bdd6f7f7085dd3f2631f63774 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 19 Jun 2018 18:03:13 +0800
Subject: [PATCH 127/318] Fix wrong input param

---
 test/saber/bm/test_TargetWrapper_BM.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index c6ee0811b..c54b392d1 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -7,8 +7,8 @@ using namespace anakin::saber;
 int main() {
     typedef TargetWrapper<BM> API;
     void *pmem;
-    int dev_count;
-    API::get_device_count(&dev_count);
+    int dev_count = 0;
+    API::get_device_count(dev_count);
     API::mem_alloc(&pmem, 3*200*200);
     API::mem_free(pmem);
 }

From 85435462b09bb89860245a12216907128dd0cfef Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 11:46:35 +0800
Subject: [PATCH 128/318] Fix param type issue

---
 saber/core/impl/bm/bm_impl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 3ff30773a..143fbec9a 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -55,7 +55,7 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t mem = bm_mem_from_system(ptr);
+    bm_device_mem_t mem = bm_mem_from_system(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
 }
         

From db24efb8590e2f98163bca8f98527d919e9a35db Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 14:17:06 +0800
Subject: [PATCH 129/318] Initialize BM handler

---
 saber/core/impl/bm/bm_impl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 143fbec9a..bee5ddab6 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,7 +37,7 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
-static bm_handle_t handle;
+static bm_handle_t handle = get_bm_handle();
 
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));

From 4a509e2839e2871a46be6840efb6dea48b031e9a Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 15:33:44 +0800
Subject: [PATCH 130/318] Add more unit test for tensor

---
 test/saber/bm/test_TargetWrapper_BM.cpp |  16 ---
 test/saber/bm/test_saber_tensor_BM.cpp  | 130 ++++++++++--------------
 2 files changed, 55 insertions(+), 91 deletions(-)
 delete mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp

diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
deleted file mode 100644
index c54b392d1..000000000
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "saber_types.h"
-#include "target_wrapper.h"
-#include <iostream>
-
-#ifdef USE_BM
-using namespace anakin::saber;
-int main() {
-    typedef TargetWrapper<BM> API;
-    void *pmem;
-    int dev_count = 0;
-    API::get_device_count(dev_count);
-    API::mem_alloc(&pmem, 3*200*200);
-    API::mem_free(pmem);
-}
-#endif
-
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index d9c65c7b4..0634d0a2d 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -5,8 +5,8 @@ using namespace anakin::saber;
 
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<X86, AK_BM, NCHW> TensorHf4;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {
@@ -25,7 +25,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) << "|--tensor size of device: " << tdev0.size();
     CHECK_EQ(thost0.size(), 256) << "error with tensor size";
     CHECK_EQ(tdev0.size(), 256) << "error with tensor size";
-/*
+
     //! test tensor re_alloc function on tensor with data
     LOG(INFO) << "|--test tensor re_alloc function on tensor with data";
     Shape sh1(1, 2, 4, 4);
@@ -60,7 +60,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     thost1.copy_from(thost0);
     tdev1.copy_from(thost0);
     print_tensor_device(tdev1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     thost1.copy_from(tdev1);
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
@@ -85,7 +85,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
         host_data_ptr[i] = i;
     }
 
-    NV_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
+    BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
     dev_data_ptr = static_cast<dtype*>(tmp_pt_dev);
     cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
     LOG(INFO) << "|--construct host tensor from host data ptr";
@@ -94,17 +94,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
     print_tensor_host(thost3);
     print_tensor_device(tdev3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     LOG(INFO) << "|--construct host tensor from device data ptr";
-    TensorHf4 thost4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1);
+    TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from device data ptr";
-    TensorDf4 tdev4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1);
+    TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     print_tensor_host(thost4);
     print_tensor_device(tdev4);
-    NV_API::stream_t dev_stream0;
-    NV_API::create_stream_with_flag(dev_stream0, 1);
-    cudaDeviceSynchronize();
+
+    //BM_API::stream_t dev_stream0;
+    //BM_API::create_stream_with_flag(dev_stream0, 1);
+    //cudaDeviceSynchronize();
 
     //! test tensor copy constructor
     LOG(INFO) << "test tensor copy constructor";
@@ -129,7 +130,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     vtdev.push_back(tdev5);
     print_tensor_host(vthost[5]);
     print_tensor_device(vtdev[5]);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied
     LOG(INFO) << "test share_from function";
@@ -190,30 +191,10 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
     print_tensor_host(thost4);
-
-    //! test record tensor event
-    LOG(INFO) << "test record tensor event";
-    NV_API::stream_t dev_stream;
-    NV_API::stream_t dev_stream1;
-    NV_API::create_stream_with_flag(dev_stream, 1);
-    NV_API::create_stream_with_flag(dev_stream1, 1);
-    X86_API::stream_t host_stream;
-    X86_API::create_stream_with_flag(host_stream, 1);
-    LOG(INFO) << "|--test record event on host tensor";
-    fill_tensor_host_const(thost4, 888.f);
-    thost4.record_event(host_stream);
-    thost4.sync();
-    print_tensor_host(thost4);
-    LOG(INFO) << "|--test record event on device tensor";
-    fill_tensor_device_const(tdev4, 666.f, dev_stream);
-    tdev4.record_event(dev_stream);
-    tdev4.sync();
-    print_tensor_device(tdev4, dev_stream1);
-    tdev4.record_event(dev_stream1);
-    tdev4.sync();
 }
 
-TEST(TestSaberTensorNV, test_tensor_deepcopy) {
+/*
+TEST(TestSaberTensorBM, test_tensor_deepcopy) {
     //! tensor constructor with alloc data, if target is different, create buffer, and copy the data
     LOG(INFO) << "test tensor deep copy";
     Shape sh0(2, 2, 4, 4);
@@ -229,9 +210,9 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     Shape off_sh2(0, 8);
 
     X86_API::stream_t x86_stream;
-    NV_API::stream_t nv_stream;
+    BM_API::stream_t nv_stream;
     X86_API::create_stream(x86_stream);
-    NV_API::create_stream(nv_stream);
+    BM_API::create_stream(nv_stream);
 
     //! create source tensor, th0, td0, th01, td01, th1, td1;
     TensorHf4 th0(sh0);
@@ -273,7 +254,7 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
 
     TensorDf2 td2(sh2);
     fill_tensor_device_const(td2, 0.f);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     TensorDf2 td21;
     td21.share_sub_buffer(td2, va_sh2, off_sh2);
     TensorDf2 td3(va_sh2);
@@ -308,11 +289,11 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, entire buffer copy, H2D";
     td3.copy_from(th1);
     print_tensor_device(td3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
     CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H";
     fill_tensor_device_const(td3, 0.f);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     td3.async_copy_from(th1, nv_stream);
     td3.record_event(nv_stream);
     td3.sync();
@@ -322,10 +303,10 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, entire buffer copy, D2D";
     td3.copy_from(td1);
     print_tensor_device(td3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2D";
     fill_tensor_device_const(td3, 0.f);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     td3.async_copy_from(td1, nv_stream);
     td3.record_event(nv_stream);
     td3.sync();
@@ -344,12 +325,12 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, src with roi, H2D";
     td3.copy_from(th01);
     print_tensor_device(td3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     LOG(INFO) << "test tensor deep copy, src with roi, D2D";
     td3.copy_from(td01);
     print_tensor_device(td3);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
 
     //! test tensor deep copy, dst with roi
@@ -366,12 +347,12 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, dst with roi, H2D";
     td21.copy_from(th1);
     print_tensor_device(td21);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     LOG(INFO) << "test tensor deep copy, dst with roi, D2D";
     td21.copy_from(td1);
     print_tensor_device(td21);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
 
     //! test tensor deep copy, src and dst are with roi
@@ -386,18 +367,18 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) {
     LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2D";
     td21.copy_from(th01);
     print_tensor_device(td21);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2D";
     td21.copy_from(td01);
     print_tensor_device(td21);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 }
 
-TEST(TestSaberTensorNV, test_tensor_shape) {
-    typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4_0;
-    typedef Tensor<X86, AK_FLOAT, NHWC> Tensor4_1;
-    typedef Tensor<X86, AK_FLOAT, HW> Tensor2;
+TEST(TestSaberTensorBM, test_tensor_shape) {
+    typedef Tensor<X86, AK_BM, NCHW> Tensor4_0;
+    typedef Tensor<X86, AK_BM, NHWC> Tensor4_1;
+    typedef Tensor<X86, AK_BM, HW> Tensor2;
 
     int nin = 2;
     int cin = 2;
@@ -460,7 +441,7 @@ TEST(TestSaberTensorNV, test_tensor_shape) {
 
 }
 
-TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
+TEST(TestSaberTensorBM, test_tensor_reshape_realloc) {
 
     LOG(INFO) << "test tensor reshape and re_alloc funcs";
 
@@ -473,17 +454,17 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
     LOG(INFO) << "ori tensor with size: " << th0.valid_size();
     print_tensor_host(th0);
     print_tensor_device(td0);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th0.reshape(sh0);
     td0.reshape(sh0);
     LOG(INFO) << "tensor after reshape(from big space to small) with size: " << th0.valid_size();
     print_tensor_host(th0);
     print_tensor_device(td0);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     fill_tensor_host_const(th0, 1);
     fill_tensor_device_const(td0, 1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th0.reshape(sh1);
     td0.reshape(sh1);
@@ -491,24 +472,24 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
               th0.valid_size();
     print_tensor_host(th0);
     print_tensor_device(td0);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th0.re_alloc(sh0);
     td0.re_alloc(sh0);
     LOG(INFO) << "tensor after re_alloc(from big space to small) with size: " << th0.valid_size();
     print_tensor_host(th0);
     print_tensor_device(td0);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     TensorHf4 th1(sh0);
     TensorDf4 td1(sh0);
     LOG(INFO) << "ori tensor with size: " << th1.valid_size();
     fill_tensor_host_const(th1, 1);
     fill_tensor_device_const(td1, 1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     print_tensor_host(th1);
     print_tensor_device(td1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th1.reshape(sh1);
     td1.reshape(sh1);
@@ -518,10 +499,10 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
     th1.valid_shape()[0], th1.valid_shape()[1], th1.valid_shape()[2], th1.valid_shape()[3]);
     print_tensor_host(th1);
     print_tensor_device(td1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     fill_tensor_host_const(th1, 1);
     fill_tensor_device_const(td1, 1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     th1.reshape(sh0);
     td1.reshape(sh0);
@@ -531,15 +512,15 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) {
     td1.re_alloc(sh1);
     print_tensor_host(th1);
     print_tensor_device(td1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
 }
 
-TEST(TestSaberTensorNV, test_tensor_op) {
+TEST(TestSaberTensorBM, test_tensor_op) {
     Shape sh{1, 2, 2, 10};
     TensorDf4 td1(sh);
     TensorHf4 th1(sh);
-    Tensor<NV, AK_INT8, NCHW> td2(sh);
+    Tensor<BM, AK_INT8, NCHW> td2(sh);
     Tensor<X86, AK_INT8, NCHW> th2(sh);
     LOG(INFO) << "testing host fill tensor with const 1.";
     fill_tensor_host_const(th1, 1.f);
@@ -590,11 +571,11 @@ TEST(TestSaberTensorNV, test_tensor_op) {
     print_tensor_device(td2);
 }
 
-TEST(TestSaberTensorNV, test_tensor_share_diff_dtype) {
+TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) {
     Shape sh{1, 1, 2, 10};
-    Tensor<NV, AK_FLOAT, NCHW> td1(sh);
-    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
-    Tensor<NV, AK_INT8, NCHW> td2;
+    Tensor<BM, AK_BM, NCHW> td1(sh);
+    Tensor<X86, AK_BM, NCHW> th1(sh);
+    Tensor<BM, AK_INT8, NCHW> td2;
     Tensor<X86, AK_INT8, NCHW> th2;
     td2.set_shape(sh);
     th2.set_shape(sh);
@@ -605,20 +586,20 @@ TEST(TestSaberTensorNV, test_tensor_share_diff_dtype) {
     fill_tensor_device_const(td1, -1);
     LOG(INFO) << "data type: int8";
     print_tensor_device(td1);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     td2.share_from(td1);
     th2.share_from(th1);
 
     print_tensor_host(th2);
     print_tensor_device(td2);
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 }
 
-TEST(TestSaberTensorNV, test_tensor_base_type) {
+TEST(TestSaberTensorBM, test_tensor_base_type) {
     Shape sh(1, 3, 10, 10);
-    Tensor<NV, AK_FLOAT, NCHW> td1(sh);
-    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
+    Tensor<BM, AK_BM, NCHW> td1(sh);
+    Tensor<X86, AK_BM, NCHW> th1(sh);
     fill_tensor_host_rand(th1, 0.f, 255.f);
     td1.copy_from(th1);
     TensorBase* tb1;
@@ -629,8 +610,7 @@ TEST(TestSaberTensorNV, test_tensor_base_type) {
     Shape sh11 = th1.valid_shape();
     LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \
               ", h=" << sh11[2] << ", w=" << sh11[3];
-*/
-}
+}*/
 
 int main(int argc, const char** argv) {
     // initial logger

From e93ec7805dafc10e92335a91a61c24d94aeb90ed Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 16:01:18 +0800
Subject: [PATCH 131/318] Update Dtype for host

---
 test/saber/bm/test_saber_tensor_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 0634d0a2d..14f86c8b5 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -5,7 +5,7 @@ using namespace anakin::saber;
 
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
-typedef Tensor<X86, AK_BM, NCHW> TensorHf4;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
 typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 

From 717207468e106752d2e29ef0b4e53fa147c8e5e6 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 16:38:08 +0800
Subject: [PATCH 132/318] Conversion from void* to bm_device_mem_t*

---
 saber/core/impl/bm/bm_impl.cpp         | 10 ++++++----
 test/saber/bm/test_saber_tensor_BM.cpp |  2 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index bee5ddab6..faca480f0 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -55,20 +55,22 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t mem = bm_mem_from_system(*ptr);
-    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
+    bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr)
+    BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        bm_free_device(handle, bm_mem_from_system(ptr));
+        bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr)
+        bm_free_device(handle, *pmem);
     }
 }
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
-    BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr)));
+    bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr)
+    BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
 //! target wrapper
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 14f86c8b5..af279797e 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,6 +55,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
+    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -191,6 +192,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
     print_tensor_host(thost4);
+     */
 }
 
 /*

From 8f131ab80ce5459150bb47aa921fdee807bf1352 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 16:39:48 +0800
Subject: [PATCH 133/318] Convert from void* to bm_device_mem_t*

---
 saber/core/impl/bm/bm_impl.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index faca480f0..f2993426c 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -55,21 +55,21 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr)
+    bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr)
+        bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
         bm_free_device(handle, *pmem);
     }
 }
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
-    bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr)
+    bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
     BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 

From 190dc305c192ef331f3bc56220c4062f12f3c281 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 16:45:27 +0800
Subject: [PATCH 134/318] Revert back first

---
 saber/core/impl/bm/bm_impl.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index f2993426c..f432cc863 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -55,22 +55,26 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
-    BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
+    bm_device_mem_t mem = bm_mem_from_system(*ptr);
+    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
+    //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
+    //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
-        bm_free_device(handle, *pmem);
+        bm_free_device(handle, bm_mem_from_system(ptr));
+        //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
+        //bm_free_device(handle, *pmem);
     }
 }
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
-    bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
-    BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
+    BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr)));
+    //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
+    //BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
 //! target wrapper

From dcdfa8a79cfe9635d93615f9263de0b1f4eea361 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 17:00:14 +0800
Subject: [PATCH 135/318] test

---
 saber/core/impl/bm/bm_impl.cpp         | 2 +-
 test/saber/bm/test_saber_tensor_BM.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index f432cc863..baa25f484 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -84,7 +84,7 @@ template struct TargetWrapper<BM, __device_target>;
 template class Buffer<BM>;
 
 //! BM Tensor
-INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
+INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW);
 
 template struct Env<BM>;
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index af279797e..ce0bad95a 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -6,7 +6,7 @@ using namespace anakin::saber;
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {
@@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
-    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -66,6 +65,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
+    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From 76c64b7d1a52545f8f0bae5c0c85de77f05df6a4 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 17:10:11 +0800
Subject: [PATCH 136/318] Revert "test"

This reverts commit 5ea5263eaf7f103b975e710d74f38617227fd117.
---
 saber/core/impl/bm/bm_impl.cpp         | 2 +-
 test/saber/bm/test_saber_tensor_BM.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index baa25f484..f432cc863 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -84,7 +84,7 @@ template struct TargetWrapper<BM, __device_target>;
 template class Buffer<BM>;
 
 //! BM Tensor
-INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW);
+INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
 
 template struct Env<BM>;
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index ce0bad95a..af279797e 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -6,7 +6,7 @@ using namespace anakin::saber;
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {
@@ -55,6 +55,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
+    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -65,7 +66,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
-    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From 54dfffd273d4bf27eacdefda7e3f40d08cf1f916 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 17:14:21 +0800
Subject: [PATCH 137/318] Debug on copy_from

---
 test/saber/bm/test_saber_tensor_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index af279797e..13b9deff1 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
-    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -66,6 +65,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
+    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From a21ed77a82b4dea11f10026103296bb9f3aefee8 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Wed, 20 Jun 2018 17:40:20 +0800
Subject: [PATCH 138/318] Revert "Revert "test""

This reverts commit 1cc471ee9f0845ac0e59f422ebc7622338ae9947.
---
 saber/core/impl/bm/bm_impl.cpp         | 2 +-
 test/saber/bm/test_saber_tensor_BM.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index f432cc863..baa25f484 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -84,7 +84,7 @@ template struct TargetWrapper<BM, __device_target>;
 template class Buffer<BM>;
 
 //! BM Tensor
-INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
+INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW);
 
 template struct Env<BM>;
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 13b9deff1..ce0bad95a 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -6,7 +6,7 @@ using namespace anakin::saber;
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {

From f46db516e73f8145cff95dfe378ce6164e44a650 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 09:44:59 +0800
Subject: [PATCH 139/318] Print tensor for BM

---
 test/saber/bm/test_saber_tensor_BM.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index ce0bad95a..cc2adc774 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -59,8 +59,8 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
     tdev1.copy_from(thost0);
-    print_tensor_device(tdev1);
-    //cudaDeviceSynchronize();
+    //TODO: print tensor for BM device
+    print_tensor_host(tdev1);
     thost1.copy_from(tdev1);
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);

From f3d589dcaf7e64b6d6365018ed7ff01b2b90864e Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 09:45:35 +0800
Subject: [PATCH 140/318] Revert "Revert "Revert "test"""

This reverts commit 75f5063122cdcae1045b78f7d29055ca6b058e42.
---
 saber/core/impl/bm/bm_impl.cpp         | 2 +-
 test/saber/bm/test_saber_tensor_BM.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index baa25f484..f432cc863 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -84,7 +84,7 @@ template struct TargetWrapper<BM, __device_target>;
 template class Buffer<BM>;
 
 //! BM Tensor
-INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW);
+INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
 
 template struct Env<BM>;
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index cc2adc774..db0edce6d 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -6,7 +6,7 @@ using namespace anakin::saber;
 typedef TargetWrapper<X86> X86_API;
 typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
 TEST(TestSaberTensorBM, test_tensor_constructor) {

From fd54ed89d77e2ca7b6b6603423e6458af01cc919 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 10:00:22 +0800
Subject: [PATCH 141/318] Passing through BM handler

---
 saber/core/context.h           | 12 ++++++++++++
 saber/core/impl/bm/bm_impl.cpp |  5 +++++
 saber/core/target_wrapper.h    |  2 ++
 3 files changed, 19 insertions(+)

diff --git a/saber/core/context.h b/saber/core/context.h
index b8a916578..ad5d8d3f4 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -19,6 +19,12 @@
 #include "core/env.h"
 #include "saber/saber_types.h"
 
+#ifdef USE_BM
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+#endif
+
 namespace anakin{
 
 namespace saber{
@@ -114,6 +120,12 @@ class Context final{
         return _stream_compute;
     }
 
+#ifdef USE_BM
+    bm_handle_t get_handler() {
+        return API::get_handler();
+    }
+#endif
+
 #ifdef USE_ARM_PLACE
     //void set_act_cores(std::vector<int> ids);
     //void set_power_mode(PowerMode mode);
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index f432cc863..ecfe755d6 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,8 +37,13 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
+//TODO: check exception
 static bm_handle_t handle = get_bm_handle();
 
+bm_handle_t BM_API::get_handler() {
+    return handle;
+}
+
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
 }
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 62055db48..4507af462 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -556,6 +556,8 @@ struct TargetWrapper<BM, __device_target> {
      * @return          currently activated device id
      */
     static int get_device_id();
+
+    static bm_handle_t get_handler();
 };
 
 #endif //USE_BM

From eefea6d8b0fd53d005ba4bf294de2cc85e16baeb Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 11:20:14 +0800
Subject: [PATCH 142/318] Implement copy_from for BM; Add back
 test_TargetWrapper_BM

---
 saber/core/tensor.h                     | 17 +++++++++++++++++
 test/saber/bm/test_TargetWrapper_BM.cpp | 16 ++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index f18884421..c200db805 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -20,6 +20,12 @@
 #include "saber/core/events.h"
 #include "saber/core/buffer.h"
 
+#ifdef USE_BM
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+#endif
+
 namespace anakin{
 
 namespace saber{
@@ -583,6 +589,17 @@ class Tensor {
         return SaberSuccess;
     }
 
+#ifdef USE_BM
+    SaberStatus copy_from(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+        CHECK_EQ(valid_size(), tensor.valid_size()) \
+            << "sizes of two valid shapes must be the same";
+
+        BMDNN_CHECK(m_memcpy_s2d(API::get_handler(), mutable_data(), bm_mem_from_system(tensor.data())));
+
+        return SaberSuccess;
+    }
+#endif
+
     /**
      *  \brief Deep copy data within region of interest from input tensor.
      */
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
new file mode 100644
index 000000000..c54b392d1
--- /dev/null
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -0,0 +1,16 @@
+#include "saber_types.h"
+#include "target_wrapper.h"
+#include <iostream>
+
+#ifdef USE_BM
+using namespace anakin::saber;
+int main() {
+    typedef TargetWrapper<BM> API;
+    void *pmem;
+    int dev_count = 0;
+    API::get_device_count(dev_count);
+    API::mem_alloc(&pmem, 3*200*200);
+    API::mem_free(pmem);
+}
+#endif
+

From 4c2fc50ef2146a9b2387ebabd33508f8914318f4 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 11:35:11 +0800
Subject: [PATCH 143/318] check tensor target type

---
 saber/core/tensor.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index c200db805..018f07668 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -591,6 +591,9 @@ class Tensor {
 
 #ifdef USE_BM
     SaberStatus copy_from(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+        CHECK_EQ(typeof(BM), typeof(targetType_t)) \
+            << "this method is only for BM tensor";
+
         CHECK_EQ(valid_size(), tensor.valid_size()) \
             << "sizes of two valid shapes must be the same";
 

From c6524797c34cded518fd0c7ef57b224b23e715d2 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 11:39:13 +0800
Subject: [PATCH 144/318] Change back to compliable version

---
 saber/core/tensor.h                    | 14 --------------
 test/saber/bm/test_saber_tensor_BM.cpp |  2 +-
 2 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 018f07668..532283a0c 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -589,20 +589,6 @@ class Tensor {
         return SaberSuccess;
     }
 
-#ifdef USE_BM
-    SaberStatus copy_from(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
-        CHECK_EQ(typeof(BM), typeof(targetType_t)) \
-            << "this method is only for BM tensor";
-
-        CHECK_EQ(valid_size(), tensor.valid_size()) \
-            << "sizes of two valid shapes must be the same";
-
-        BMDNN_CHECK(m_memcpy_s2d(API::get_handler(), mutable_data(), bm_mem_from_system(tensor.data())));
-
-        return SaberSuccess;
-    }
-#endif
-
     /**
      *  \brief Deep copy data within region of interest from input tensor.
      */
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index db0edce6d..8aead4bb1 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,6 +55,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
+    /*
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
     thost1.copy_from(thost0);
@@ -65,7 +66,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
-    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From 4594e9a3ca47c3496cd07622ec8361dae6ca3e92 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Thu, 21 Jun 2018 15:38:08 +0800
Subject: [PATCH 145/318] modify activation op and test

---
 saber/funcs/impl/bm/vender_activation.h       | 38 +++-----
 saber/funcs/impl/bm/vender_fc.h               | 46 ++-------
 .../bm/test_saber_func_activation_BM.cpp      | 97 +------------------
 3 files changed, 21 insertions(+), 160 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
index 45541add9..fadd817b9 100644
--- a/saber/funcs/impl/bm/vender_activation.h
+++ b/saber/funcs/impl/bm/vender_activation.h
@@ -27,17 +27,9 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderActivation()
-            : _handle(NULL), _active_descs(NULL), _input_descs(NULL), _output_descs(NULL) {}
+    VenderActivation(): _handle(NULL), _active_type(NULL) {}
 
-    ~VenderActivation() {
-        if (_input_descs) {
-            BMDNN_CHECK(bm_free_device(_input_descs));
-        }
-        if (_output_descs) {
-            BMDNN_CHECK(bm_free_device(_output_descs));
-        }
-    }
+    ~VenderActivation() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
@@ -64,33 +56,29 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
         int input_n = inputs[0]->num();
 
         switch (_active_type) {
-            case Active_sigmoid:
-                BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
-                break;
             case Active_relu:
-                BMDNN_CHECK(bmdnn_relu_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, input_n, input_dim, out_data));
+                break;
+            case Active_sigmoid:
+                BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data));
                 break;
             case Active_tanh:
-                BMDNN_CHECK(bmdnn_tanh_forward(_handle, _input_descs, input_n, input_dim, _output_descs));
+                BMDNN_CHECK(bmdnn_tanh_forward(_handle, in_data, input_n, input_dim, out_data));
+                break;
+            case Active_elu:
+                BMDNN_CHECK(bmdnn_elu_forward(_handle, 1.0, in_data, input_n, input_dim, out_data));
                 break;
         }
-        /* BMDNN_CHECK(cudnnActivationForward(_handle, _active_descs, */
-        /*                                    cudnn::cudnnTypeWrapper<InDataType>::kOne(), */
-        /*                                    _input_descs, in_data, */
-        /*                                    cudnn::cudnnTypeWrapper<InDataType>::kZero(), */
-        /*                                    _output_descs, out_data */
-        /* )); */
         return SaberSuccess;
     }
 
 private:
     bm_handle_t _handle;
-    bm_device_mem_t _input_descs;
-    bm_device_mem_t _output_descs;
     ActiveType _active_type;
 };
+
 template class VenderActivation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
-}
-}
+} // namespace saber
 
+} // namespace anakin
 #endif //ANAKIN_SABER_FUNCS_BMDNN_ACT_H
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 5c7c23e67..3b018686c 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -1,20 +1,5 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
-#define ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+#ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H
+#define ANAKIN_SABER_FUNCS_BMDNN_FC_H
 
 #include "saber/funcs/impl/impl_fc.h"
 
@@ -43,23 +28,12 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderFc() = default;
-    ~VenderFc() {
-        if (_handle != nullptr) {
-            CUBLAS_CHECK(cublasDestroy(_handle));
-        }
-    }
+    VenderFc() {};
+    ~VenderFc() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
-        // get context
-        this->_ctx = ctx;
-        cudaStream_t cuda_stream;
-        cuda_stream = ctx.get_compute_stream();
-
-        CUBLAS_CHECK(cublasCreate(&_handle));
-        CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream));
         return create(inputs, outputs, param, ctx);
     }
 
@@ -94,16 +68,10 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
-                            FcParam<OpTensor>& param);
+                            FcParam<OpTensor>& param){
 
+    };
 
-private:
-    bool _flag_trans_weights{false};
-    int _M;
-    int _K;
-    int _N;
-    cublasHandle_t _handle;
-    bool _is_continue_buf{true};
 };
 
 template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
@@ -111,4 +79,4 @@ template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
 
 } //namespace anakin
 
-#endif //ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+#endif // ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp
index 5d30a6d64..523e94121 100644
--- a/test/saber/bm/test_saber_func_activation_BM.cpp
+++ b/test/saber/bm/test_saber_func_activation_BM.cpp
@@ -58,7 +58,7 @@ TEST(TestSaberFuncBM, test_func_constructor) {
 
     Context<BM> ctx1(0, 1, 1);
 
-    ActivationParam<TensorDf4> param(Active_elu, 0.1f, 0.1f);
+    ActivationParam<TensorDf4> param(Active_relu, 0.1f, 0.1f);
 
     std::vector<TensorDf4*> input;
     std::vector<TensorDf4*> output;
@@ -74,102 +74,7 @@ TEST(TestSaberFuncBM, test_func_constructor) {
     act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
     act(input, output, param, ctx1);
 
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-    output_dev.sync();
     print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_POST_KERNEL_CHECK;
-}
-
-TEST(TestSaberFuncBM, test_func_sub_tensor) {
-
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1);
-    }
-
-    img_dev.copy_from(img_host);
-    Shape img_s_t0(img_num, in_channels, 4, 4);
-
-    TensorDf4 t0;
-    TensorDf4 t1;
-
-    t0.share_sub_buffer(img_dev, img_s_t0, {0, 0, 0, 0});
-    t1.share_sub_buffer(img_dev, img_s_t0, {0, 0, 4, 4});
-
-    print_tensor_shape("t0", t0);
-    print_tensor_shape("t1", t1);
-
-    TensorDf4 output_dev;
-
-    TensorDf4 out0;
-    TensorDf4 out1;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-    Context<BM> ctx2(0, 2, 2);
-
-    ActivationParam<TensorDf4> param1(Active_elu, 0.1f, 0.1f);
-    ActivationParam<TensorDf4> param2(Active_elu, 0.1f, 0.1f);
-
-    std::vector<TensorDf4*> input1, input2;
-    std::vector<TensorDf4*> output1, output2;
-
-    input1.push_back(&t0);
-    input2.push_back(&t1);
-
-    output1.push_back(&out0);
-    output2.push_back(&out1);
-
-    //FIXME where do I get img_s and all those shapes ????
-    output_dev.re_alloc(img_s);
-
-    out0.share_sub_buffer(output_dev, img_s_t0, {0, 0, 0, 0});
-    out1.share_sub_buffer(output_dev, img_s_t0, {0, 0, 4, 4});
-
-    print_tensor_shape("output_dev", output_dev);
-
-    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act1;
-    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act2;
-
-    act1.compute_output_shape(output1, input1, param1);
-    act2.compute_output_shape(output2, input2, param2);
-
-    print_tensor_shape("out0", out0);
-    print_tensor_shape("out1", out1);
-
-    // init assume output tensor has been reshpaed by user.
-    act1.init(input1, output1, param1, SPECIFY, SABER_IMPL, ctx1);
-    act1(input1, output1, param1, ctx1);
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output1[0]->record_event(cuda_stream);
-
-    act2.init(input2, output2, param2, SPECIFY, SABER_IMPL, ctx2);
-    act2(input2, output2, param2, ctx2);
-    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
-    output2[0]->record_event(cuda_stream2);
-
-    out0.sync();
-    out1.sync();
-    print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_POST_KERNEL_CHECK;
 }
 
 int main(int argc, const char** argv) {

From 2fa9cc03025cf657f9ddcd9559950749c4f2164b Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Thu, 21 Jun 2018 18:31:38 +0800
Subject: [PATCH 146/318] Enable copy from tensor with different Dtype

---
 saber/core/data_traits.h               | 13 +++++++++++++
 saber/core/tensor.h                    | 16 ++++++++++++++++
 test/saber/bm/test_saber_tensor_BM.cpp |  6 ++++--
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h
index d92cc1842..45fd9b25f 100644
--- a/saber/core/data_traits.h
+++ b/saber/core/data_traits.h
@@ -18,6 +18,12 @@
 
 #include "saber_types.h"
 
+#ifdef USE_BM
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+#endif
+
 namespace anakin{
 
 namespace saber{
@@ -44,6 +50,13 @@ struct DataTraitBase<AMD>{
 };
 #endif
 
+#ifdef USE_BM
+template <>
+struct DataTrait<AK_BM> {
+    typedef bm_device_mem_t dtype;
+};
+#endif
+
 static size_t type_length(DataType type) {
     switch (type){
         case AK_INT8:
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 532283a0c..26b1320c1 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -21,6 +21,7 @@
 #include "saber/core/buffer.h"
 
 #ifdef USE_BM
+#include <typeinfo>
 #include "bmlib_runtime.h"
 #include "bmdnn_api.h"
 #include "bmlib_utils.h"
@@ -764,6 +765,21 @@ class Tensor {
         return SaberSuccess;
     }
 
+#ifdef USE_BM
+    template <typename TargetType_t, DataType DataType_t, typename LayOutType_t>
+    SaberStatus copy_from(const Tensor<TargetType_t, DataType_t, LayOutType_t>& tensor) {
+        if (typeid(BM) == typeid(targetType_t) &&
+            typeid(X86) == typeid(TargetType_t) &&
+            typeid(AK_FLOAT) == typeid(DataType_t)){
+
+            Dtype* device_data_ptr = mutable_data();
+            BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
+        }
+
+        return SaberSuccess;
+    };
+#endif
+
     /**
      * \brief Asynchronously copy entire buffer from source tensor.
      */
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 8aead4bb1..83eb472b7 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,11 +55,13 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
-    /*
+
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
-    thost1.copy_from(thost0);
+    //thost1.copy_from(thost0);
     tdev1.copy_from(thost0);
+
+    /*
     //TODO: print tensor for BM device
     print_tensor_host(tdev1);
     thost1.copy_from(tdev1);

From 71f0c6f6b684a4da44c14324dac30b9aca14734c Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Fri, 22 Jun 2018 09:25:03 +0800
Subject: [PATCH 147/318] Complete copy_from method

---
 saber/core/tensor.h | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 26b1320c1..99aed86bc 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -768,15 +768,36 @@ class Tensor {
 #ifdef USE_BM
     template <typename TargetType_t, DataType DataType_t, typename LayOutType_t>
     SaberStatus copy_from(const Tensor<TargetType_t, DataType_t, LayOutType_t>& tensor) {
+
+        CHECK_EQ(valid_size(), tensor.valid_size()) \
+            << "sizes of two valid shapes must be the same";
+
+        /// copy from system to device
         if (typeid(BM) == typeid(targetType_t) &&
+            typeid(AK_BM) == typeid(datatype) &&
             typeid(X86) == typeid(TargetType_t) &&
             typeid(AK_FLOAT) == typeid(DataType_t)){
 
             Dtype* device_data_ptr = mutable_data();
             BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
+
+            return SaberSuccess;
         }
 
-        return SaberSuccess;
+        /// copy from device to system
+        if (typeid(X86) == typeid(targetType_t) &&
+            typeid(AK_FLOAT) == typeid(datatype) &&
+            typeid(BM) == typeid(TargetType_t) &&
+            typeid(AK_BM) == typeid(DataType_t)){
+
+            Dtype* device_data_ptr = tensor.data();
+            BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
+
+            return SaberSuccess;
+        }
+
+        /// other types are not allowed here
+        return SaberInvalidValue;
     };
 #endif
 

From d7ed46b65db12a378122449f89cdc99264a0d300 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Fri, 22 Jun 2018 09:41:21 +0800
Subject: [PATCH 148/318] const_cast the immutable target data pointer

---
 saber/core/tensor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 99aed86bc..4c558af2a 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -790,7 +790,7 @@ class Tensor {
             typeid(BM) == typeid(TargetType_t) &&
             typeid(AK_BM) == typeid(DataType_t)){
 
-            Dtype* device_data_ptr = tensor.data();
+            Dtype* device_data_ptr = const_cast<Dtype*>(tensor.data());
             BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
 
             return SaberSuccess;

From d8119d546a373df220492b6ef828019d3619dc4f Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Fri, 22 Jun 2018 10:06:33 +0800
Subject: [PATCH 149/318] Revert back to compilable version

---
 saber/core/tensor.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 4c558af2a..bd29c7c74 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -785,16 +785,16 @@ class Tensor {
         }
 
         /// copy from device to system
-        if (typeid(X86) == typeid(targetType_t) &&
+        /*if (typeid(X86) == typeid(targetType_t) &&
             typeid(AK_FLOAT) == typeid(datatype) &&
             typeid(BM) == typeid(TargetType_t) &&
             typeid(AK_BM) == typeid(DataType_t)){
 
-            Dtype* device_data_ptr = const_cast<Dtype*>(tensor.data());
+            auto* device_data_ptr = tensor.data();
             BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
 
             return SaberSuccess;
-        }
+        }*/
 
         /// other types are not allowed here
         return SaberInvalidValue;

From fd07e728414e87498b370c661d8a02b9f12ab25f Mon Sep 17 00:00:00 2001
From: root <weihao.huang@bitmain.com>
Date: Fri, 22 Jun 2018 02:43:01 +0000
Subject: [PATCH 150/318] Modify handle usage & mem_alloc function

---
 saber/core/impl/bm/bm_impl.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index ecfe755d6..6088b3af6 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,12 +37,17 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
+<<<<<<< HEAD
 //TODO: check exception
 static bm_handle_t handle = get_bm_handle();
 
 bm_handle_t BM_API::get_handler() {
     return handle;
 }
+=======
+//static bm_handle_t handle = get_bm_handle();
+static bm_handle_t handle;
+>>>>>>> Modify handle usage & mem_alloc function
 
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
@@ -60,18 +65,31 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
+<<<<<<< HEAD
     bm_device_mem_t mem = bm_mem_from_system(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
     //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
     //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
+=======
+    //bm_device_mem_t mem = bm_mem_from_system(*ptr);
+    handle = get_bm_handle();
+    bm_device_mem_t *mem = new bm_device_mem_t[1];
+    mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
+    BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
+>>>>>>> Modify handle usage & mem_alloc function
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
         bm_free_device(handle, bm_mem_from_system(ptr));
+<<<<<<< HEAD
         //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
         //bm_free_device(handle, *pmem);
+=======
+        //handle = get_bm_handle();
+	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(*ptr));
+>>>>>>> Modify handle usage & mem_alloc function
     }
 }
         

From 1261fa227dfb4e4c45ac24a6405c6af25e9dec07 Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Fri, 22 Jun 2018 04:01:48 +0000
Subject: [PATCH 151/318] Modify handle usage & mem_alloc function

---
 saber/core/impl/bm/bm_impl.cpp | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 6088b3af6..5ad6af84e 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,17 +37,13 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
-<<<<<<< HEAD
 //TODO: check exception
-static bm_handle_t handle = get_bm_handle();
+//static bm_handle_t handle = get_bm_handle();
+static bm_handle_t handle;
 
 bm_handle_t BM_API::get_handler() {
     return handle;
 }
-=======
-//static bm_handle_t handle = get_bm_handle();
-static bm_handle_t handle;
->>>>>>> Modify handle usage & mem_alloc function
 
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
@@ -65,31 +61,25 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-<<<<<<< HEAD
     bm_device_mem_t mem = bm_mem_from_system(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
     //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
     //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
-=======
     //bm_device_mem_t mem = bm_mem_from_system(*ptr);
     handle = get_bm_handle();
     bm_device_mem_t *mem = new bm_device_mem_t[1];
     mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
->>>>>>> Modify handle usage & mem_alloc function
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
         bm_free_device(handle, bm_mem_from_system(ptr));
-<<<<<<< HEAD
         //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
         //bm_free_device(handle, *pmem);
-=======
         //handle = get_bm_handle();
 	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(*ptr));
->>>>>>> Modify handle usage & mem_alloc function
     }
 }
         

From 90efca01d00c2664afe278d7bdc0f8c9a343758d Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Fri, 22 Jun 2018 05:31:05 +0000
Subject: [PATCH 152/318] Modify test_TargetWrapper

---
 saber/core/impl/bm/bm_impl.cpp          | 2 --
 test/saber/bm/test_TargetWrapper_BM.cpp | 9 ++++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 5ad6af84e..4aecb169d 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -61,8 +61,6 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    bm_device_mem_t mem = bm_mem_from_system(*ptr);
-    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
     //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
     //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
     //bm_device_mem_t mem = bm_mem_from_system(*ptr);
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index c54b392d1..a76bef279 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -4,13 +4,20 @@
 
 #ifdef USE_BM
 using namespace anakin::saber;
+static bm_handle_t handle;
 int main() {
+    bmdnn_init(&handle);
     typedef TargetWrapper<BM> API;
     void *pmem;
     int dev_count = 0;
     API::get_device_count(dev_count);
+    std::cout << dev_count << std::endl;
     API::mem_alloc(&pmem, 3*200*200);
-    API::mem_free(pmem);
+    //API::mem_free(pmem);
+    std::cout << "Press any key to finish execution." << std::endl;
+    int a;
+    std::cin >> a;
+    bmdnn_deinit(handle);
 }
 #endif
 

From 9c8d71aeaedbe784c199e69d53007d6f28b9d3ab Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Fri, 22 Jun 2018 13:52:18 +0800
Subject: [PATCH 153/318] fill activation and fc op; compile error

---
 saber/funcs/impl/bm/vender_activation.h |  1 -
 saber/funcs/impl/bm/vender_fc.h         | 42 ++++++---------
 saber/funcs/timer.h                     | 68 ++++++++++++++++++++++++-
 test/saber/bm/test_saber_buffer_BM.cpp  |  2 +-
 test/saber/bm/test_saber_func_fc_BM.cpp |  6 +--
 5 files changed, 86 insertions(+), 33 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
index fadd817b9..c4baf8365 100644
--- a/saber/funcs/impl/bm/vender_activation.h
+++ b/saber/funcs/impl/bm/vender_activation.h
@@ -49,7 +49,6 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             ActivationParam<OpTensor>& param) {
-
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
         int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width();
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 3b018686c..82dd6000c 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -28,7 +28,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderFc() {};
+    VenderFc(): _handle(NULL) {};
     ~VenderFc() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
@@ -40,38 +40,28 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
-
-        if (!(ctx == this->_ctx)) {
-            if (_handle != NULL) {
-                CUBLAS_CHECK(cublasDestroy(_handle));
-            }
-            this->_ctx = ctx;
-
-            cudaStream_t cuda_stream;
-            cuda_stream = ctx.get_compute_stream();
-            CUBLAS_CHECK(cublasCreate(&_handle));
-            CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream));
-        }
-
-        Shape shape_out = inputs[0]->valid_shape();
-        _M = inputs[0]->count_valid(0, param.axis);
-        _K = inputs[0]->count_valid(param.axis, inputs[0]->dims());
-        _N = param.num_output;
-        if (_N <= 0) {
-            int weight_size = param.weights->valid_size();
-            _N = weight_size / _K;
-        }
-        //! weights dims must be in h and w
-        _flag_trans_weights = param.is_transpose_weights;
         return SaberSuccess;
     }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param){
-
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data();
+        const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        int batch_size = inputs[0]->num();
+        int input_len = inputs[0]->channel();
+        int output_len = param.num_output;
+        int is_transpose = param.is_transpose_weights ? 1 : 0;
+        BMDNN_CHECK(bmdnn_fc_forward(_handle, in_data, weights, bias,
+                                    batch_size, output_len, input_len, is_transpose, 1, 0,
+                                    out_data));
+        return SaberSuccess;
     };
 
+private:
+    bm_handle_t _handle;
 };
 
 template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
@@ -79,4 +69,4 @@ template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
 
 } //namespace anakin
 
-#endif // ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H
+#endif // ANAKIN_SABER_FUNCS_BMDNN_FC_H
diff --git a/saber/funcs/timer.h b/saber/funcs/timer.h
index f5685c82b..9e300b821 100644
--- a/saber/funcs/timer.h
+++ b/saber/funcs/timer.h
@@ -231,7 +231,7 @@ class SaberTimer<AMD> final {
         }
 #if 0
         for(auto time : ms_time)
-           LOG(INFO) << time; 
+           LOG(INFO) << time;
 #endif
         ms_time.sort();
         LOG(INFO) << ms_time.front() <<" - " << ms_time.back();
@@ -269,6 +269,72 @@ class SaberTimer<AMD> final {
 #endif
 
 
+#ifdef USE_BM
+template <>
+class SaberTimer<BM> final {
+
+public:
+    SaberTimer() {}
+
+    ~SaberTimer() {}
+
+    void clear() {
+        ms_time.clear();
+    }
+
+    void start(Context<BM> &ctx) {
+        tstart = std::chrono::system_clock::now();
+    }
+
+    void end(Context<BM> &ctx) {
+        tend = std::chrono::system_clock::now();
+        auto ts = std::chrono::duration_cast<std::chrono::microseconds>(tend - tstart);
+        float elapse_ms = 1000.f * float(ts.count()) * std::chrono::microseconds::period::num / \
+            std::chrono::microseconds::period::den;
+        ms_time.push_back(elapse_ms);
+    }
+
+    float get_average_ms() {
+        if (ms_time.size() == 0) {
+            return 0.f;
+        }
+        float sum = 0.f;
+        for (auto i : ms_time){
+            sum += i;
+        }
+        return sum / ms_time.size();
+    }
+
+    // return tile (0-99) time.
+    float get_tile_time(float tile) {
+
+        if (tile <0 || tile > 100) {
+            return -1.f;
+        }
+        int total_items = (int)ms_time.size();
+        if (total_items <= 0) {
+            return -2.f;
+        }
+        ms_time.sort();
+        int pos = (int)(tile * total_items / 100);
+        auto it = ms_time.begin();
+        for (int i = 0; i < pos; ++i) {
+            ++it;
+        }
+        return *it;
+    }
+
+    const std::list<float> get_time_stat() {
+        return ms_time;
+    }
+
+private:
+    std::chrono::time_point<std::chrono::system_clock> tstart;
+    std::chrono::time_point<std::chrono::system_clock> tend;
+    std::list<float> ms_time;
+};
+#endif // USE_BM
+
 }
 }
 
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index a204e7807..93aa6d36e 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -1,4 +1,4 @@
-#include "test_saber_buffer_bm.h"
+#include "test_saber_buffer_BM.h"
 #include "saber/core/buffer.h"
 #include "saber/core/data_traits.h"
 
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
index 5101c75f8..869ff1bfd 100644
--- a/test/saber/bm/test_saber_func_fc_BM.cpp
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -1,6 +1,6 @@
 #include "core/context.h"
 #include "funcs/fc.h"
-#include "test_saber_func_fc_BM.h"
+#include "test_saber_func_BM.h"
 #include "tensor_op.h"
 #include "saber_types.h"
 #include <vector>
@@ -41,7 +41,7 @@ void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
     }
 }
 
-TEST(TestSaberFuncFcBM, test_func_fc) {
+TEST(TestSaberFuncBM, test_func_fc) {
 
     int test_iter = 100;
     int w_in = 7;
@@ -109,12 +109,10 @@ TEST(TestSaberFuncFcBM, test_func_fc) {
         //cudaDeviceSynchronize();
     }
 
-    CUDA_POST_KERNEL_CHECK;
     t1.end(ctx_dev);
     float ts = t1.get_average_ms();
     LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
     //print_tensor_device(*output_dev_4d[0]);
-    //cudaDeviceSynchronize();
 
     //! check result
     TensorHf4 thin(shape_in);

From cddbb5b1ad95b0317fae0b936acaeaf0b83001e0 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 15:58:33 +0800
Subject: [PATCH 154/318] allow copy from tensor with different data type

---
 saber/core/tensor.cpp                  | 24 +++++++++++++++
 saber/core/tensor.h                    | 42 ++------------------------
 test/saber/bm/test_saber_tensor_BM.cpp | 18 ++++++++---
 3 files changed, 40 insertions(+), 44 deletions(-)
 create mode 100644 saber/core/tensor.cpp

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
new file mode 100644
index 000000000..9283aac90
--- /dev/null
+++ b/saber/core/tensor.cpp
@@ -0,0 +1,24 @@
+#include "tensor.h"
+
+#ifdef USE_BM
+
+#include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmlib_utils.h"
+
+template<>
+template<>
+SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+    //auto* device_data_ptr = mutable_data();
+    //BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
+    return SaberSuccess;
+}
+
+template<>
+template<>
+SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
+    return SaberSuccess;
+}
+
+#endif
+
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index bd29c7c74..0ac9a1454 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -20,13 +20,6 @@
 #include "saber/core/events.h"
 #include "saber/core/buffer.h"
 
-#ifdef USE_BM
-#include <typeinfo>
-#include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmlib_utils.h"
-#endif
-
 namespace anakin{
 
 namespace saber{
@@ -766,39 +759,10 @@ class Tensor {
     }
 
 #ifdef USE_BM
-    template <typename TargetType_t, DataType DataType_t, typename LayOutType_t>
-    SaberStatus copy_from(const Tensor<TargetType_t, DataType_t, LayOutType_t>& tensor) {
-
-        CHECK_EQ(valid_size(), tensor.valid_size()) \
-            << "sizes of two valid shapes must be the same";
-
-        /// copy from system to device
-        if (typeid(BM) == typeid(targetType_t) &&
-            typeid(AK_BM) == typeid(datatype) &&
-            typeid(X86) == typeid(TargetType_t) &&
-            typeid(AK_FLOAT) == typeid(DataType_t)){
-
-            Dtype* device_data_ptr = mutable_data();
-            BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
-
-            return SaberSuccess;
-        }
-
-        /// copy from device to system
-        /*if (typeid(X86) == typeid(targetType_t) &&
-            typeid(AK_FLOAT) == typeid(datatype) &&
-            typeid(BM) == typeid(TargetType_t) &&
-            typeid(AK_BM) == typeid(DataType_t)){
-
-            auto* device_data_ptr = tensor.data();
-            BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
-
-            return SaberSuccess;
-        }*/
-
-        /// other types are not allowed here
+    template <typename NewTargetType_t, DataType NewDataType_t, typename NewLayOutType_t>
+    SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
         return SaberInvalidValue;
-    };
+    }
 #endif
 
     /**
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 83eb472b7..ed3ff0503 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -58,16 +58,24 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
-    //thost1.copy_from(thost0);
-    tdev1.copy_from(thost0);
 
-    /*
+    // host to host
+    thost1.copy_from(thost0);
+    print_tensor_host(thost1);
+
+    // host to device
+    tdev1.copy_from(thost0);
     //TODO: print tensor for BM device
-    print_tensor_host(tdev1);
+    //print_tensor_host(tdev1);
+
+    // device to host
     thost1.copy_from(tdev1);
-    tdev1.copy_from(tdev0);
     print_tensor_host(thost1);
 
+    /*
+    // device to device
+    tdev1.copy_from(tdev0);
+
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From 849006dae424914156e09c345df67cc733f0ba19 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 16:28:04 +0800
Subject: [PATCH 155/318] AK_BM size should return 1

---
 saber/core/tensor.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
index 9283aac90..3a283c1f6 100644
--- a/saber/core/tensor.cpp
+++ b/saber/core/tensor.cpp
@@ -6,6 +6,9 @@
 #include "bmdnn_api.h"
 #include "bmlib_utils.h"
 
+template<>
+size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
+
 template<>
 template<>
 SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {

From 5ce796ddc3be7cf79a9795307fdb0dbc10ca8b83 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 19:23:11 +0800
Subject: [PATCH 156/318] Comment out specialization of _type_len for now.

---
 saber/core/tensor.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
index 3a283c1f6..3203f4779 100644
--- a/saber/core/tensor.cpp
+++ b/saber/core/tensor.cpp
@@ -1,13 +1,19 @@
 #include "tensor.h"
 
 #ifdef USE_BM
-
 #include "bmlib_runtime.h"
 #include "bmdnn_api.h"
 #include "bmlib_utils.h"
+#endif
 
-template<>
-size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
+namespace anakin {
+
+namespace saber {
+
+#ifdef USE_BM
+
+        //template<>
+//size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
 
 template<>
 template<>
@@ -25,3 +31,5 @@ SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor
 
 #endif
 
+}
+}
\ No newline at end of file

From 6e48d847fac30bf562aec0f6ffb7d6edd1849b01 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 19:59:29 +0800
Subject: [PATCH 157/318] Add implementation for copy_from between device and
 system

---
 saber/core/tensor.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
index 3203f4779..1978666bc 100644
--- a/saber/core/tensor.cpp
+++ b/saber/core/tensor.cpp
@@ -12,20 +12,22 @@ namespace saber {
 
 #ifdef USE_BM
 
-        //template<>
+//template<>
 //size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
 
 template<>
 template<>
 SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
-    //auto* device_data_ptr = mutable_data();
-    //BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data())));
+    auto* device_data_ptr = mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
     return SaberSuccess;
 }
 
 template<>
 template<>
 SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
+    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+    BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
     return SaberSuccess;
 }
 

From 56433bc449a6d2832f01ae04bdfac7038869c324 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Sat, 23 Jun 2018 22:46:42 +0800
Subject: [PATCH 158/318] Redefine _type_len as function so that we can do
 specialization

---
 saber/core/tensor.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
index 1978666bc..081854c86 100644
--- a/saber/core/tensor.cpp
+++ b/saber/core/tensor.cpp
@@ -12,8 +12,10 @@ namespace saber {
 
 #ifdef USE_BM
 
-//template<>
-//size_t Tensor<BM, AK_BM, NCHW>::_type_len{1};
+template<>
+size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
+    return 1;
+}
 
 template<>
 template<>

From 2a963f68c4f844596acf6bc59dc570de82ff67de Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Sun, 24 Jun 2018 05:50:24 +0000
Subject: [PATCH 159/318] Fix mem_free function

---
 saber/core/impl/bm/bm_impl.cpp          | 21 ++++++++++++++++-----
 test/saber/bm/test_TargetWrapper_BM.cpp | 19 ++++++++++++-------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 4aecb169d..d4a312fcf 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -65,21 +65,32 @@ void BM_API::mem_alloc(void** ptr, size_t n){
     //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
     //bm_device_mem_t mem = bm_mem_from_system(*ptr);
     handle = get_bm_handle();
-    bm_device_mem_t *mem = new bm_device_mem_t[1];
-    mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
+    //bm_device_mem_t *mem = new bm_device_mem_t[1];
+    bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
 }
         
 void BM_API::mem_free(void* ptr){
     //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        bm_free_device(handle, bm_mem_from_system(ptr));
+        //bm_free_device(handle, bm_mem_from_system(ptr));
         //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
         //bm_free_device(handle, *pmem);
-        //handle = get_bm_handle();
-	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(*ptr));
+        handle = get_bm_handle();
+        bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
+	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(ptr));
+	bm_free_device(handle, *mem);
     }
 }
+
+void BM_API::mem_free_BM(bm_device_mem_t mem){
+    //(bm_handle_t handle, bm_device_mem_t mem){
+    if(&mem != nullptr){
+        handle = get_bm_handle();
+	bm_free_device(handle, mem);
+    }
+}
+ 
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index a76bef279..c50df3fa3 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -8,15 +8,20 @@ static bm_handle_t handle;
 int main() {
     bmdnn_init(&handle);
     typedef TargetWrapper<BM> API;
-    void *pmem;
     int dev_count = 0;
     API::get_device_count(dev_count);
-    std::cout << dev_count << std::endl;
-    API::mem_alloc(&pmem, 3*200*200);
-    //API::mem_free(pmem);
-    std::cout << "Press any key to finish execution." << std::endl;
-    int a;
-    std::cin >> a;
+    std::cout << "dev_count: " << dev_count << std::endl;
+    
+    //void *pmem;
+    bm_device_mem_t *pmem = new bm_device_mem_t();
+    std::cout << "mem addr before mem_alloc: " << pmem << std::endl;
+    API::mem_alloc(&pmem, 3*200*400);
+    std::cout << "mem addr after  mem_alloc: " << pmem << std::endl;
+    
+    bm_device_mem_t *test = reinterpret_cast<bm_device_mem_t *>(pmem);
+    API::mem_free_BM((bm_device_mem_t)(*test));
+    std::cout << "End mem_free test." << std::endl;
+    delete pmem;
     bmdnn_deinit(handle);
 }
 #endif

From 4e944abcbab7c1a41227f6de0aacc94eb1ee7e94 Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Sun, 24 Jun 2018 07:26:59 +0000
Subject: [PATCH 160/318] Fix mem_free function

---
 saber/core/target_wrapper.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 4507af462..c1c776072 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -522,7 +522,9 @@ struct TargetWrapper<BM, __device_target> {
 
     //template <typename void>
     static void mem_free(void * ptr);
-
+    
+    static void mem_free_BM(bm_device_mem_t mem);
+    
     //template <typename void>
     static void mem_set(void* ptr, int value, size_t n);
 

From f8fe05c441b2d337ec2c0c7105332dff4e6468d3 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Sun, 24 Jun 2018 19:55:07 +0800
Subject: [PATCH 161/318] change mem_free_BM to mem_free; tensor test passed

---
 saber/core/impl/bm/bm_impl.cpp          | 22 +---------------------
 test/saber/bm/test_TargetWrapper_BM.cpp | 12 +++++-------
 2 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index d4a312fcf..e2e5b9e65 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -60,37 +60,17 @@ int BM_API::get_device_id(){
 }
         
 void BM_API::mem_alloc(void** ptr, size_t n){
-    //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n)
-    //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr);
-    //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n));
-    //bm_device_mem_t mem = bm_mem_from_system(*ptr);
     handle = get_bm_handle();
-    //bm_device_mem_t *mem = new bm_device_mem_t[1];
     bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr);
     BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
 }
         
 void BM_API::mem_free(void* ptr){
-    //(bm_handle_t handle, bm_device_mem_t mem){
     if(ptr != nullptr){
-        //bm_free_device(handle, bm_mem_from_system(ptr));
-        //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
-        //bm_free_device(handle, *pmem);
         handle = get_bm_handle();
-        bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(ptr);
-	//bm_free_device(handle, reinterpret_cast<struct bm_mem_desc>(ptr));
-	bm_free_device(handle, *mem);
+        bm_free_device(handle, *(struct bm_mem_desc*)(ptr));
     }
 }
-
-void BM_API::mem_free_BM(bm_device_mem_t mem){
-    //(bm_handle_t handle, bm_device_mem_t mem){
-    if(&mem != nullptr){
-        handle = get_bm_handle();
-	bm_free_device(handle, mem);
-    }
-}
- 
         
 void BM_API::mem_set(void* ptr, int value, size_t n){
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index c50df3fa3..9d445f16a 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -8,18 +8,16 @@ static bm_handle_t handle;
 int main() {
     bmdnn_init(&handle);
     typedef TargetWrapper<BM> API;
-    int dev_count = 0;
-    API::get_device_count(dev_count);
-    std::cout << "dev_count: " << dev_count << std::endl;
+    //int dev_count = 0;
+    //API::get_device_count(dev_count);
+    //std::cout << "dev_count: " << dev_count << std::endl;
     
-    //void *pmem;
     bm_device_mem_t *pmem = new bm_device_mem_t();
     std::cout << "mem addr before mem_alloc: " << pmem << std::endl;
     API::mem_alloc(&pmem, 3*200*400);
     std::cout << "mem addr after  mem_alloc: " << pmem << std::endl;
-    
-    bm_device_mem_t *test = reinterpret_cast<bm_device_mem_t *>(pmem);
-    API::mem_free_BM((bm_device_mem_t)(*test));
+    std::cout << "Start mem_free test." << pmem << std::endl;
+    API::mem_free(pmem);
     std::cout << "End mem_free test." << std::endl;
     delete pmem;
     bmdnn_deinit(handle);

From cf2daea7a21b7f1dec57ee781f4fd191de8c76c5 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 10:44:03 +0800
Subject: [PATCH 162/318] remove stream test in context

---
 test/saber/bm/test_saber_context_BM.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp
index e221ba8f4..ed93866cf 100644
--- a/test/saber/bm/test_saber_context_BM.cpp
+++ b/test/saber/bm/test_saber_context_BM.cpp
@@ -12,11 +12,8 @@ TEST(TestSaberContextBM, test_BM_context) {
     LOG(INFO) << "test context constructor";
     Context<BM> ctx0;
     Context<BM> ctx1(0, 1, 1);
-    LOG(INFO) << "test record event to context data stream and compute stream";
-    API::record_event(event, ctx0.get_data_stream());
-    API::record_event(event, ctx0.get_compute_stream());
-    API::record_event(event, ctx1.get_data_stream());
-    API::record_event(event, ctx1.get_compute_stream());
+
+    //for BM no need to test stream as it is not in use
 }
 
 #endif

From 366e01fe28887efe94f02bafeaf2ef589584ecd8 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 11:46:19 +0800
Subject: [PATCH 163/318] Update buffer test for BM

---
 test/saber/bm/test_saber_buffer_BM.cpp | 68 +++++++++++++++-----------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 93aa6d36e..ea8d7101d 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -4,12 +4,22 @@
 
 using namespace anakin::saber;
 
-template <DataType datatype>
+static bm_handle_t handle;
+
+int get_bm_size() {
+    return 1;
+}
+
+template <DataType Ddatatype, DataType Hdatatype>
 void test_buffer() {
 
+    //TODO: init in another place
+    bmdnn_init(&handle);
+
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
-    typedef typename DataTrait<datatype>::dtype Dtype;
+    typedef typename DataTrait<Ddatatype>::dtype Ddtype;
+    typedef typename DataTrait<Hdatatype>::dtype Hdtype;
     typedef Buffer<X86> BufferH;
     typedef Buffer<BM> BufferD;
 
@@ -17,30 +27,30 @@ void test_buffer() {
     int n1 = 2048;
 
     void* tmp_x86;
-    Dtype* x86_ptr;
-    X86_API::mem_alloc(&tmp_x86, sizeof(Dtype) * n0);
-    x86_ptr = static_cast<Dtype*>(tmp_x86);
+    Hdtype* x86_ptr;
+    X86_API::mem_alloc(&tmp_x86, sizeof(Hdtype) * n0);
+    x86_ptr = static_cast<Hdtype*>(tmp_x86);
 
     for (int i = 0; i < n0; i++) {
-        x86_ptr[i] = static_cast<Dtype>(i);
+        x86_ptr[i] = static_cast<Hdtype>(i);
     }
 
     void* tmp_bm;
-    Dtype* bm_ptr;
-    BM_API::mem_alloc(&tmp_bm, sizeof(Dtype) * n0);
-    bm_ptr = static_cast<Dtype*>(tmp_bm);
+    Ddtype* bm_ptr;
+    BM_API::mem_alloc(&tmp_bm, get_bm_size() * n0);
+    bm_ptr = static_cast<Ddtype*>(tmp_bm);
 
     LOG(INFO) << "Buffer: test default(empty) constructor";
     BufferH x86_buf0;
     BufferD bm_buf0;
 
     LOG(INFO) << "Buffer: test constructor with data size";
-    BufferH x86_buf1(n0 * sizeof(Dtype));
-    BufferD bm_buf1(n0 * sizeof(Dtype));
+    BufferH x86_buf1(n0 * sizeof(Hdtype));
+    BufferD bm_buf1(n0 * sizeof(Ddtype));
 
     LOG(INFO) << "Buffer: test constructor with data pointer, size and device id";
-    BufferH x86_buf2(x86_ptr, n0 * sizeof(Dtype), X86_API::get_device_id());
-    BufferD bm_buf2(bm_ptr, n0 * sizeof(Dtype), BM_API::get_device_id());
+    BufferH x86_buf2(x86_ptr, n0 * sizeof(Hdtype), X86_API::get_device_id());
+    BufferD bm_buf2(bm_ptr, n0 * get_bm_size(), BM_API::get_device_id());
 
     LOG(INFO) << "Buffer: test copy constructor";
     BufferH x86_buf3(x86_buf2);
@@ -62,18 +72,18 @@ void test_buffer() {
             "shared buffer should have same data count";
 
     LOG(INFO) << "Buffer: test re_alloc";
-    x86_buf1.re_alloc(n1 * sizeof(Dtype));
-    bm_buf1.re_alloc(n1 * sizeof(Dtype));
-    CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
-    CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error";
-    CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
-    x86_buf1.re_alloc(n0 * sizeof(Dtype));
-    bm_buf1.re_alloc(n0 * sizeof(Dtype));
-    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
-    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error";
+    x86_buf1.re_alloc(n1 * sizeof(Hdtype));
+    bm_buf1.re_alloc(n1 * sizeof(Ddtype));
+    CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Hdtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
+    CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Ddtype)) << "buffer count error";
+    CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Ddtype)) << "buffer capacity error";
+    x86_buf1.re_alloc(n0 * sizeof(Hdtype));
+    bm_buf1.re_alloc(n0 * sizeof(Ddtype));
+    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
+    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error";
+    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
 
     LOG(INFO) << "Buffer: test get_id()";
     LOG(INFO) << "X86 device id: " << x86_buf0.get_id() << \
@@ -84,8 +94,8 @@ void test_buffer() {
     LOG(INFO) << "Buffer: test deep_cpy()";
     x86_buf1.sync_copy_from(x86_buf2);
     LOG(INFO) << "deep copy between two host buffer: ";
-    const Dtype* ptr1 = static_cast<const Dtype*>(x86_buf1.get_data());
-    const Dtype* ptr2 = static_cast<const Dtype*>(x86_buf1.get_data());
+    const Hdtype* ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
+    const Hdtype* ptr2 = static_cast<const Hdtype*>(x86_buf1.get_data());
 
     for (int i = 0; i < 10; i++) {
         std::cout << ptr1[i] << std::endl;
@@ -96,7 +106,7 @@ void test_buffer() {
     bm_buf1.sync_copy_from(x86_buf2);
     x86_buf1.sync_copy_from(bm_buf1);
     LOG(INFO) << "deep copy from device buffer to host buffer: ";
-    ptr1 = static_cast<const Dtype*>(x86_buf1.get_data());
+    ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
 
     for (int i = 0; i < 10; i++) {
         std::cout << ptr1[i] << std::endl;
@@ -104,7 +114,7 @@ void test_buffer() {
 }
 
 TEST(TestSaberBufferBM, test_buffer_memcpy) {
-    test_buffer<AK_BM>();
+    test_buffer<AK_BM, AK_FLOAT>();
 }
 
 int main(int argc, const char** argv) {

From 4134a33f62d68d677437d5d97266645290a26c57 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 13:26:51 +0800
Subject: [PATCH 164/318] Specialization for Env<BM>

---
 saber/core/env.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 saber/core/env.cpp

diff --git a/saber/core/env.cpp b/saber/core/env.cpp
new file mode 100644
index 000000000..b294fead4
--- /dev/null
+++ b/saber/core/env.cpp
@@ -0,0 +1,19 @@
+#include "env.h"
+
+namespace anakin {
+
+    namespace saber {
+
+#ifdef USE_BM
+
+        template<>
+        void Env<BM>::env_init(int max_stream){
+            //TODO: decide what to put here
+            LOG(INFO) << "env init for BM";
+        }
+
+#endif
+
+
+    }
+}

From e147561e4dc4534bf3dd71057b1008d5f90755bb Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Mon, 25 Jun 2018 14:07:30 +0800
Subject: [PATCH 165/318] env skip bm

---
 saber/core/env.cpp | 19 -------------------
 saber/core/env.h   |  5 +++++
 2 files changed, 5 insertions(+), 19 deletions(-)
 delete mode 100644 saber/core/env.cpp

diff --git a/saber/core/env.cpp b/saber/core/env.cpp
deleted file mode 100644
index b294fead4..000000000
--- a/saber/core/env.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "env.h"
-
-namespace anakin {
-
-    namespace saber {
-
-#ifdef USE_BM
-
-        template<>
-        void Env<BM>::env_init(int max_stream){
-            //TODO: decide what to put here
-            LOG(INFO) << "env init for BM";
-        }
-
-#endif
-
-
-    }
-}
diff --git a/saber/core/env.h b/saber/core/env.h
index c6f73dfca..ef824e1d9 100644
--- a/saber/core/env.h
+++ b/saber/core/env.h
@@ -17,6 +17,7 @@
 #define ANAKIN_SABER_CORE_ENV_H
 
 #include "core/device.h"
+#include <type_traits>
 
 namespace anakin{
 
@@ -32,6 +33,10 @@ class Env {
         return *_g_env;
     }
     static void env_init(int max_stream = 4){
+        if(std::is_same<TargetType,BM>::value){
+            LOG(INFO) << "env init for BM";
+            return;
+        }
         Devs& devs = cur_env();
         if (devs.size() > 0){
             return;

From 6d864531042d200334dca2113969f5ab787f9802 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Mon, 25 Jun 2018 15:10:32 +0800
Subject: [PATCH 166/318] modify mem_alloc for void*

---
 saber/core/impl/bm/bm_impl.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index e2e5b9e65..c93703a5d 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -61,8 +61,10 @@ int BM_API::get_device_id(){
         
 void BM_API::mem_alloc(void** ptr, size_t n){
     handle = get_bm_handle();
-    bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr);
+    /* bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr); */
+    bm_device_mem_t *mem = new bm_device_mem_t();
     BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
+    *ptr = mem;
 }
         
 void BM_API::mem_free(void* ptr){

From 0a4a48f6d6e17911954658e30b7ea151f4169461 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 15:40:43 +0800
Subject: [PATCH 167/318] Specialization for copy_from

---
 saber/core/tensor.cpp | 39 ---------------------------------------
 saber/core/tensor.h   | 28 ++++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 39 deletions(-)
 delete mode 100644 saber/core/tensor.cpp

diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp
deleted file mode 100644
index 081854c86..000000000
--- a/saber/core/tensor.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include "tensor.h"
-
-#ifdef USE_BM
-#include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmlib_utils.h"
-#endif
-
-namespace anakin {
-
-namespace saber {
-
-#ifdef USE_BM
-
-template<>
-size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
-    return 1;
-}
-
-template<>
-template<>
-SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
-    auto* device_data_ptr = mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
-    return SaberSuccess;
-}
-
-template<>
-template<>
-SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
-    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
-    BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
-    return SaberSuccess;
-}
-
-#endif
-
-}
-}
\ No newline at end of file
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 0ac9a1454..61be376f8 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -761,6 +761,7 @@ class Tensor {
 #ifdef USE_BM
     template <typename NewTargetType_t, DataType NewDataType_t, typename NewLayOutType_t>
     SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
+        LOG(INFO) << "base copy_from";
         return SaberInvalidValue;
     }
 #endif
@@ -1001,6 +1002,33 @@ class Tensor {
     }
 };
 
+#ifdef USE_BM
+
+template<> inline
+size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
+    return 1;
+}
+
+template<>
+template<> inline
+SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+    LOG(INFO) << "BM copy_from";
+    auto* device_data_ptr = mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
+    return SaberSuccess;
+}
+
+template<>
+template<> inline
+SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
+    LOG(INFO) << "X86 copy_from";
+    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+    BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
+    return SaberSuccess;
+}
+
+#endif
+
 } //namespace saber
 
 } //namespace anakin

From 630e58e10836e6b27dfd04db8801d79fc6d1de9a Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Mon, 25 Jun 2018 15:50:04 +0800
Subject: [PATCH 168/318] Revert speical handling for Env<BM>

---
 saber/core/env.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/saber/core/env.h b/saber/core/env.h
index ef824e1d9..c6f73dfca 100644
--- a/saber/core/env.h
+++ b/saber/core/env.h
@@ -17,7 +17,6 @@
 #define ANAKIN_SABER_CORE_ENV_H
 
 #include "core/device.h"
-#include <type_traits>
 
 namespace anakin{
 
@@ -33,10 +32,6 @@ class Env {
         return *_g_env;
     }
     static void env_init(int max_stream = 4){
-        if(std::is_same<TargetType,BM>::value){
-            LOG(INFO) << "env init for BM";
-            return;
-        }
         Devs& devs = cur_env();
         if (devs.size() > 0){
             return;

From 74f27b536f45386c38a6542191e7c97036bc350d Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Mon, 25 Jun 2018 17:29:29 +0800
Subject: [PATCH 169/318] add conv op, did't test

---
 saber/core/impl/bm/bm_impl.cpp          |   1 +
 saber/funcs/impl/bm/vender_conv.h       | 167 ++++--------------------
 test/saber/bm/test_TargetWrapper_BM.cpp |   6 +-
 3 files changed, 29 insertions(+), 145 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index c93703a5d..1bdb5d140 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -71,6 +71,7 @@ void BM_API::mem_free(void* ptr){
     if(ptr != nullptr){
         handle = get_bm_handle();
         bm_free_device(handle, *(struct bm_mem_desc*)(ptr));
+        delete ptr;
     }
 }
         
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 7efdfa611..a0a3b3fb5 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -1,18 +1,3 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
 #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 
@@ -44,105 +29,13 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_in::Dtype InDataType;
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
-    VenderConv2D()
-            : _handle(NULL)
-            , _workspaceData(NULL)
-            , _workspace(NULL)
-            , _conv_descs(NULL)
-            , _input_descs(NULL)
-            , _output_descs(NULL)
-            , _filter_desc(NULL)
-            , _workspace_fwd_sizes(0)
-            , _workspaceSizeInBytes(0)
-            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
-            , _input_nchw_descs(NULL)
-            , _output_nchw_descs(NULL)
-            , x8_data(NULL)
-            , y8_data(NULL)
-            , x8_data_size(0)
-            , y8_data_size(0)
-    {}
-
-    ~VenderConv2D() {
 
-        if (_conv_descs) {
-            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
-        }
-        if (_input_descs) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
-        }
-        if (_output_descs) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
-        }
-        if (_filter_desc) {
-            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
-        }
-        if (_handle != NULL) {
-            CUDNN_CHECK(cudnnDestroy(_handle));
-        }
-        if (_workspaceData != NULL) {
-            cudaFree(_workspaceData);
-        }
-        if (_input_nchw_descs != NULL) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs));
-        }
-        if (_output_nchw_descs != NULL) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs));
-        }
-        if (x8_data != NULL) {
-            CUDA_CHECK(cudaFree(x8_data));
-        }
-        if (y8_data != NULL) {
-            CUDA_CHECK(cudaFree(y8_data));
-        }
-    }
+    VenderConv2D(): _handle(NULL) {}
+    ~VenderConv2D() {}
 
-    /**
-     * [Create description] Init all cudnn resource here
-     * @AuthorHTL
-     * @DateTime  2018-02-01T16:13:06+0800
-     * @param     inputs                    [description]
-     * @param     outputs                   [description]
-     * @param     param                [conv parameters]
-     */
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             ConvParam<OpTensor>& param, Context<BM>& ctx) {
-
-        // ---- init cudnn resources ----
-
-        _workspaceSizeInBytes = 0;
-        _workspaceData = NULL;
-
-        _workspace_fwd_sizes = 0;
-
-        this->_ctx = ctx;
-        // ---- get cuda resources ----
-
-        cudaStream_t cuda_stream;
-        cuda_stream = ctx.get_compute_stream();
-
-        CUDNN_CHECK(cudnnCreate(&_handle));
-        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
-
-        _workspace = NULL;
-
-        int in_channels = inputs[0]->channel();
-
-        // ---- create cudnn Descs ----
-        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
-
-        cudnn::createTensorDesc<InDataType>(&_input_descs);
-        cudnn::createTensorDesc<InDataType>(&_output_descs);
-        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
-
-        if (param.bias()->size() > 0) {
-            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
-        }
-
-        cudnnCreateTensorDescriptor(&_input_nchw_descs);
-        cudnnCreateTensorDescriptor(&_output_nchw_descs);
-
         return create(inputs, outputs, param, ctx);
     }
 
@@ -150,46 +43,36 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
                             std::vector<DataTensor_out *>& outputs,
                             ConvParam<OpTensor>& param, Context<BM>& ctx);
 
-    //call cudnnConvolutionForward here
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
                           std::vector<DataTensor_out*>& outputs,
-                          ConvParam<OpTensor>& param);
+                          ConvParam<OpTensor>& param) {
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        const InDataType *weight = (const InDataType *) param.weight()->data();
+        const InDataType *bias = (const InDataType *) param.bias()->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+        int group = param.group;
+        int output_c = outputs[0]->channel();
+        int kh = param.weight()->height();
+        int kw = param.weight()->width();
+        int pad_h = param.pad_h;
+        int pad_w = param.pad_w;
+        int stride_h = param.stride_h;
+        int stride_w = param.stride_w;
+        BMDNN_CHECK(bmdnn_conv_forward(_handle, in_data, weights, bias,
+                                    input_n, input_c, input_h, input_w, group, output_c,
+                                    kh, kw, pad_h, pad_w, stride_h, stride_w, 1, 0, 0, 
+                                    out_data, NULL));
+        return SaberSuccess;
+    }
 
 private:
     cudnnHandle_t _handle;
-    cudnnConvolutionFwdAlgo_t _fwd_algo;
-
-    cudnnTensorDescriptor_t _input_descs;
-    cudnnTensorDescriptor_t _output_descs;
-    cudnnTensorDescriptor_t _bias_desc;
-
-    cudnnFilterDescriptor_t _filter_desc;
-
-    cudnnConvolutionDescriptor_t _conv_descs;
-
-    size_t _workspace_fwd_sizes;
-    size_t _workspaceSizeInBytes;  // size of underlying storage
-
-    void *_workspaceData;  // underlying storage
-    void *_workspace;  // aliases into _workspaceData
-
-    const bool _use_tensor_core = true;
-    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
-    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-
-    // create transform descriptor
-    cudnnTensorDescriptor_t _input_nchw_descs;
-    cudnnTensorDescriptor_t _output_nchw_descs;
-
-    void *x8_data;
-    void *y8_data;
-
-    int x8_data_size;
-    int y8_data_size;
 };
 
-
 }
-
 }
 #endif //ANAKIN_SABER_FUNCS_BMDNN_CONV2D_H
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index 9d445f16a..b893183a2 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -12,14 +12,14 @@ int main() {
     //API::get_device_count(dev_count);
     //std::cout << "dev_count: " << dev_count << std::endl;
     
-    bm_device_mem_t *pmem = new bm_device_mem_t();
+    //bm_device_mem_t *pmem = new bm_device_mem_t();
+    void* pmem;
     std::cout << "mem addr before mem_alloc: " << pmem << std::endl;
     API::mem_alloc(&pmem, 3*200*400);
     std::cout << "mem addr after  mem_alloc: " << pmem << std::endl;
-    std::cout << "Start mem_free test." << pmem << std::endl;
+    std::cout << "Start mem_free test." << std::endl;
     API::mem_free(pmem);
     std::cout << "End mem_free test." << std::endl;
-    delete pmem;
     bmdnn_deinit(handle);
 }
 #endif

From 6346b23e55f7a073c30868b265770b04cfb54bad Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Mon, 25 Jun 2018 13:21:36 +0000
Subject: [PATCH 170/318] Add sync_memcpy function & fix test_saber_buffer_BM

---
 saber/core/impl/bm/bm_impl.cpp         | 19 +++++++++++++++++++
 saber/core/target_wrapper.h            |  6 ++----
 test/saber/bm/test_saber_buffer_BM.cpp | 21 ++++++++++++++++++++-
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 1bdb5d140..dacca58b6 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -82,6 +82,25 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
     //BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
+//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+//    size_t count, __DtoD) {};
+
+//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+//    size_t count, __HtoD) {};
+
+void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+    size_t count, __DtoH) {
+    handle = get_bm_handle(); 
+    //auto* dev_ptr = const_cast<bm_device_mem_t *>(src);
+    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
+    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *src));
+    LOG(INFO) << "End sync_memcpy process";
+};
+
+//static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+//    int src_dev, size_t count) {};
+
+
 //! target wrapper
 template struct TargetWrapper<BM, __device_target>;
 
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index c1c776072..8fc99d48b 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -521,9 +521,7 @@ struct TargetWrapper<BM, __device_target> {
     static void mem_alloc(void** ptr, size_t n);
 
     //template <typename void>
-    static void mem_free(void * ptr);
-    
-    static void mem_free_BM(bm_device_mem_t mem);
+    static void mem_free(void * ptr); 
     
     //template <typename void>
     static void mem_set(void* ptr, int value, size_t n);
@@ -548,7 +546,7 @@ struct TargetWrapper<BM, __device_target> {
         size_t count, __HtoD) {};
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoH) {};
+        size_t count, __DtoH);
 
     static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
         int src_dev, size_t count) {};
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index ea8d7101d..434bd221a 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -104,11 +104,30 @@ void test_buffer() {
     CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect";
     LOG(INFO) << "deep copy from host buffer to device buffer";
     bm_buf1.sync_copy_from(x86_buf2);
+    
+    /*
+    const Hdtype* x86_buf2_ptr = static_cast<const Hdtype*>(x86_buf2.get_data());
+    for (int i = 0; i < 10; i++) {
+	std::cout << "x86: " << x86_buf2_ptr[i] << std::endl;
+    }
+
+    const Hdtype* bm_buf1_ptr = static_cast<const Hdtype*>(bm_buf1.get_data());
+    for (int i = 0; i < 10; i++) {
+	std::cout << "bm: " << bm_buf1_ptr[i] << std::endl;
+    }
+
+    LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count();
+    LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); 
+    LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype);
+    LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype);
+    */
+
+    x86_buf1.re_alloc(bm_buf1.get_capacity());
     x86_buf1.sync_copy_from(bm_buf1);
     LOG(INFO) << "deep copy from device buffer to host buffer: ";
     ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
 
-    for (int i = 0; i < 10; i++) {
+    for (int i = 0; i < 30; i++) {
         std::cout << ptr1[i] << std::endl;
     }
 }

From f0c6e7879e6e662d350f9a07b97d233dcd0a7ecc Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Mon, 25 Jun 2018 21:55:27 +0800
Subject: [PATCH 171/318] init handle for tensor test

---
 test/saber/bm/test_saber_tensor_BM.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index ed3ff0503..d42665528 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -625,6 +625,10 @@ TEST(TestSaberTensorBM, test_tensor_base_type) {
 }*/
 
 int main(int argc, const char** argv) {
+    //TODO: init in another place
+    static bm_handle_t handle;
+    bmdnn_init(&handle);
+
     // initial logger
     logger::init(argv[0]);
     InitTest();

From 595677d3941b7c0b25b3feeb4bb68808c8acb177 Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Mon, 25 Jun 2018 22:06:13 +0800
Subject: [PATCH 172/318] init handle for BM context test

---
 test/saber/bm/test_saber_context_BM.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp
index ed93866cf..f2df59c88 100644
--- a/test/saber/bm/test_saber_context_BM.cpp
+++ b/test/saber/bm/test_saber_context_BM.cpp
@@ -19,6 +19,10 @@ TEST(TestSaberContextBM, test_BM_context) {
 #endif
 
 int main(int argc, const char** argv) {
+    //TODO: init in another place
+    static bm_handle_t handle;
+    bmdnn_init(&handle);
+    
     // initial logger
     logger::init(argv[0]);
     InitTest();

From 43c3a68cb2994445200f74b12160001341da05e2 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 26 Jun 2018 09:12:18 +0800
Subject: [PATCH 173/318] handle init rearrange

---
 test/saber/bm/test_saber_buffer_BM.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 434bd221a..00f77d308 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -4,8 +4,6 @@
 
 using namespace anakin::saber;
 
-static bm_handle_t handle;
-
 int get_bm_size() {
     return 1;
 }
@@ -13,9 +11,6 @@ int get_bm_size() {
 template <DataType Ddatatype, DataType Hdatatype>
 void test_buffer() {
 
-    //TODO: init in another place
-    bmdnn_init(&handle);
-
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
     typedef typename DataTrait<Ddatatype>::dtype Ddtype;
@@ -137,6 +132,10 @@ TEST(TestSaberBufferBM, test_buffer_memcpy) {
 }
 
 int main(int argc, const char** argv) {
+    //TODO: init in another place
+    static bm_handle_t handle;
+    bmdnn_init(&handle);
+
     // initial logger
     logger::init(argv[0]);
     InitTest();

From 4fbb9517f4584a1037d1aafd85d92360a73b6cf4 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Tue, 26 Jun 2018 09:43:57 +0800
Subject: [PATCH 174/318] add pooling wrapper, didn't test

---
 saber/funcs/impl/bm/vender_pooling.h         | 95 +++++---------------
 test/saber/bm/test_saber_func_pooling_BM.cpp | 33 ++-----
 2 files changed, 30 insertions(+), 98 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index 4990a5357..0da1a1106 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -44,78 +44,19 @@ class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderPooling() : _handle(NULL) {}
+    VenderPooling() : _handle(NULL), _pooling_type(NULL) {}
 
     ~VenderPooling() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
                   std::vector<DataTensor_out*>& outputs,
                   PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
-
-        this->_ctx = ctx;
-
-        cudaStream_t cuda_stream;
-        cuda_stream = ctx.get_compute_stream();
-
-        CUDNN_CHECK(cudnnCreate(&_handle));
-        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
-
-        cudnn::createTensorDesc<InDataType>(&_input_descs);
-        cudnn::createTensorDesc<OutDataType>(&_output_descs);
-
-        cudnn::create_pooling_des<OpDataType>(&_pooling_descs);
-
         return create(inputs, outputs, pooling_param, ctx);
     }
 
     virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
                 std::vector<DataTensor_out*>& outputs,
                 PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
-        if (!(ctx == this->_ctx)) {
-            if (_handle != NULL) {
-                CUDNN_CHECK(cudnnDestroy(_handle));
-            }
-            this->_ctx = ctx;
-
-            cudaStream_t cuda_stream;
-            cuda_stream = ctx.get_compute_stream();
-
-            CUDNN_CHECK(cudnnCreate(&_handle));
-            CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
-        }
-
-        int input_num = inputs[0]->num();
-        int input_channel = inputs[0]->channel();
-        int input_height = inputs[0]->height();
-        int input_width = inputs[0]->width();
-        int output_channel = outputs[0]->channel();
-        int output_height = outputs[0]->height();
-        int output_width = outputs[0]->width();
-
-        Shape stride_in = inputs[0]->get_stride();
-        Shape stride_out = outputs[0]->get_stride();
-
-        int dim_a[] = {input_num, input_channel,
-                       input_height, input_width};
-
-        int dim_b[] = {input_num, output_channel,
-                       output_height, output_width};
-
-        cudnn::setTensorNdDesc<InDataType>(&_input_descs,
-                                            inputs[0]->dims(), dim_a, &stride_in[0]);
-
-        cudnn::setTensorNdDesc<OutDataType>(&_output_descs,
-                                             outputs[0]->dims(), dim_b, &stride_out[0]);
-
-        int windowHeight[] = {pooling_param.window_h, pooling_param.window_w};
-        int padding[] = {pooling_param.pad_h, pooling_param.pad_w};
-
-        int stride[] = {pooling_param.stride_h, pooling_param.stride_w};
-
-        cudnn::set_nd_pooling_des<OpDataType>(&_pooling_descs, pooling_param.pooling_type,
-                                               inputs[0]->dims() - 2, windowHeight,
-                                               padding,stride);
-        return SaberSuccess;
     }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
@@ -123,23 +64,31 @@ class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
                           PoolingParam<OpTensor> &param) {
         const InDataType *in_data = inputs[0]->data();
         OutDataType *out_data = outputs[0]->mutable_data();
-
-        CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs,
-                                        cudnn::cudnnTypeWrapper<InDataType>::kOne(),
-                                        _input_descs, in_data,
-                                        cudnn::cudnnTypeWrapper<OutDataType>::kZero(),
-                                        _output_descs, out_data
-        ));
-
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+        int kh = param.window_h;
+        int kw = param.window_w;
+        int pad_h = param.pad_h;
+        int pad_w = param.pad_w;
+        int stride_h = param.stride_h;
+        int stride_w = param.stride_w;
+        if(_pooling_type == Pooling_max){
+            int is_avg_pooling = 0;
+        } else {
+            int is_avg_pooling = 1;
+        }
+        BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, 
+                            input_n, input_c, input_h, input_w, kh, hw, pad_h, pad_w, 
+                            stride_h, stride_w, is_avg_pooling, 0,
+                            out_data, NULL, NULL));
         return SaberSuccess;
     }
 
 private:
-    cudnnHandle_t _handle;
-    cudnnTensorDescriptor_t _input_descs;
-    cudnnTensorDescriptor_t _output_descs;
-    cudnnPoolingDescriptor_t _pooling_descs;
-
+    bm_handle_t _handle;
+    PoolType _pooling_type;
 };
 
 template class VenderPooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index 04b963675..ce8e7f8f5 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -18,7 +18,7 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
     typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
     int img_num = 1;
     int in_channels = 4;
@@ -71,7 +71,7 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> pooling;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling;
     pooling.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -92,15 +92,12 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     }
 
     output_dev.sync();
-    cudaDeviceSynchronize();
     LOG(INFO) << " average time: " << t1.get_average_ms() << " ms";
     LOG(INFO) << " tile 10% time: " << t1.get_tile_time(10) << " ms";
     LOG(INFO) << " tile 50% time: " << t1.get_tile_time(50) << " ms";
     LOG(INFO) << " tile 90% time: " << t1.get_tile_time(90) << " ms";
     LOG(INFO) << " tile 95% time: " << t1.get_tile_time(95) << " ms";
     LOG(INFO) << " tile 99% time: " << t1.get_tile_time(99) << " ms";
-
-    CUDA_CHECK(cudaPeekAtLastError());
 }
 
 TEST(TestSaberFuncBM, test_pooling_result) {
@@ -113,7 +110,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
     typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
     int img_num = 1;
     int in_channels = 2;
@@ -166,7 +163,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_FLOAT> pooling;
+    Pooling<BM, AK_BM> pooling;
     pooling.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -174,14 +171,9 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     // init assume output tensor has been reshpaed by user.
     pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
     pooling(input, output, param, ctx1);
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
 
     output_dev.sync();
     print_tensor_device(output_dev);
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
 }
 
 TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
@@ -194,7 +186,7 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
     typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
     int img_num = 1;
     int in_channels = 2;
@@ -257,9 +249,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_FLOAT> pooling;
-    Pooling<BM, AK_FLOAT> pooling0;
-    Pooling<BM, AK_FLOAT> pooling1;
+    Pooling<BM, AK_BM> pooling;
+    Pooling<BM, AK_BM> pooling0;
+    Pooling<BM, AK_BM> pooling1;
 
     pooling.compute_output_shape(input,output,  param);
 
@@ -286,19 +278,10 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     pooling1.init(input1, output1, param, SPECIFY, VENDER_IMPL, ctx1);
     pooling1(input1, output1, param, ctx1);
 
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    out0.record_event(cuda_stream);
-
-    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
-    out1.record_event(cuda_stream1);
-
     out0.sync();
     out1.sync();
 
     print_tensor_device(output_dev);
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
 }
 
 int main(int argc, const char** argv) {

From 222f562c1ecd72c69764e64a7abba3c5cf916d7e Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 26 Jun 2018 09:44:43 +0800
Subject: [PATCH 175/318] ptr2 should be from buf2

---
 test/saber/bm/test_saber_buffer_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 00f77d308..9910638fb 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -90,7 +90,7 @@ void test_buffer() {
     x86_buf1.sync_copy_from(x86_buf2);
     LOG(INFO) << "deep copy between two host buffer: ";
     const Hdtype* ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
-    const Hdtype* ptr2 = static_cast<const Hdtype*>(x86_buf1.get_data());
+    const Hdtype* ptr2 = static_cast<const Hdtype*>(x86_buf2.get_data());
 
     for (int i = 0; i < 10; i++) {
         std::cout << ptr1[i] << std::endl;

From 0ebed06260e607e2913857f851247b60ad8f4609 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 26 Jun 2018 10:03:24 +0800
Subject: [PATCH 176/318] Restrict copy_from for different types

---
 saber/core/tensor.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 61be376f8..2979abeda 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -761,7 +761,7 @@ class Tensor {
 #ifdef USE_BM
     template <typename NewTargetType_t, DataType NewDataType_t, typename NewLayOutType_t>
     SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
-        LOG(INFO) << "base copy_from";
+        LOG(WARNING) << "Invalid: copy_from is not allowed for current type.";
         return SaberInvalidValue;
     }
 #endif
@@ -1013,6 +1013,8 @@ template<>
 template<> inline
 SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
     LOG(INFO) << "BM copy_from";
+    CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
+
     auto* device_data_ptr = mutable_data();
     BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
     return SaberSuccess;
@@ -1022,6 +1024,8 @@ template<>
 template<> inline
 SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
     LOG(INFO) << "X86 copy_from";
+    CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
+
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
     BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
     return SaberSuccess;

From cbb287864c877334912f942084c33bb2c7e973f3 Mon Sep 17 00:00:00 2001
From: "frank.xie" <frank.xie@bitmain.com>
Date: Tue, 26 Jun 2018 11:24:19 +0800
Subject: [PATCH 177/318] Implement fill_tensor_device_rand &
 fill_tensor_device_const for BM

No test yet
---
 saber/core/tensor_op.cpp | 52 ++++++++++++++++++++++++++++++++++++++++
 saber/core/tensor_op.h   | 10 ++++++++
 2 files changed, 62 insertions(+)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index e2e4a80d0..6de80bce4 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -262,6 +262,58 @@ template void tensor_cmp_host<float>(const float* src1, const float* src2, \
 template void tensor_cmp_host<char>(const char* src1, const char* src2, int size, \
                                     double& max_ratio, double& max_diff);
 
+#ifdef USE_BM
+
+        template<>
+void fill_tensor_device_rand<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor, \
+    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {
+
+    float *host_mem_input = new float[tensor.size()];
+    for (int i = 0; i < tensor.size(); ++i) {
+        host_mem_input[i] = static_cast<float>(rand());
+    }
+
+    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+
+    delete [] host_mem_input;
+}
+
+void fill_tensor_device_rand(Tensor<BM, AK_BM, NCHW>& tensor, float vstart, \
+    float vend, typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL){
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(0, 1.f);
+
+    float *host_mem_input = new float[tensor.size()];
+    for (int i = 0; i < tensor.size(); ++i) {
+        float random_num = vstart + (vend - vstart) * dis(gen);
+        host_mem_input[i] = random_num;
+    }
+
+    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+
+    delete [] host_mem_input;
+}
+
+void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
+    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL){
+
+    float *host_mem_input = new float[tensor.size()];
+    for (int i = 0; i < tensor.size(); ++i) {
+        host_mem_input[i] = value;
+    }
+
+    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+
+    delete [] host_mem_input;
+}
+
+#endif
+
 } //namespace saber
 
 } //namespace anakin
diff --git a/saber/core/tensor_op.h b/saber/core/tensor_op.h
index 4c3d5974f..e8cb2f42d 100644
--- a/saber/core/tensor_op.h
+++ b/saber/core/tensor_op.h
@@ -154,6 +154,16 @@ class DataTensorTransformHelper{
 
 #endif
 
+#ifdef USE_BM
+
+void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
+    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL);
+
+void fill_tensor_device_rand(Tensor<BM, AK_BM, NCHW>& tensor, float vstart, \
+    float vend, typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL);
+
+#endif
+
 } // namespace saber
 
 } // namespace anakin

From 599cf501597cbc7cfb57635b307477a3cf7a9eeb Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 11:51:04 +0800
Subject: [PATCH 178/318] get handle directly by calling get_handler()

---
 saber/core/context.h           | 5 -----
 saber/core/impl/bm/bm_impl.cpp | 4 ----
 saber/core/target_wrapper.h    | 2 --
 saber/core/tensor.h            | 4 ++--
 4 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/saber/core/context.h b/saber/core/context.h
index ad5d8d3f4..292f38449 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -120,11 +120,6 @@ class Context final{
         return _stream_compute;
     }
 
-#ifdef USE_BM
-    bm_handle_t get_handler() {
-        return API::get_handler();
-    }
-#endif
 
 #ifdef USE_ARM_PLACE
     //void set_act_cores(std::vector<int> ids);
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index dacca58b6..d2790d0a9 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -41,10 +41,6 @@ typedef TargetWrapper<BM, __device_target> BM_API;
 //static bm_handle_t handle = get_bm_handle();
 static bm_handle_t handle;
 
-bm_handle_t BM_API::get_handler() {
-    return handle;
-}
-
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
 }
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 8fc99d48b..648c85ed4 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -556,8 +556,6 @@ struct TargetWrapper<BM, __device_target> {
      * @return          currently activated device id
      */
     static int get_device_id();
-
-    static bm_handle_t get_handler();
 };
 
 #endif //USE_BM
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 2979abeda..ff4728aa9 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -1016,7 +1016,7 @@ SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
 
     auto* device_data_ptr = mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
     return SaberSuccess;
 }
 
@@ -1027,7 +1027,7 @@ SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
 
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
-    BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper<BM>::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr));
+    BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
     return SaberSuccess;
 }
 

From 5bbc2ee67fefdbc9c8230af490594f420ba12767 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Tue, 26 Jun 2018 12:38:48 +0800
Subject: [PATCH 179/318] modify pooling, test failed

---
 .idea/workspace.xml                          | 156 +++++++------------
 saber/funcs/impl/bm/vender_pooling.h         |  50 +++---
 test/saber/bm/test_saber_func_pooling_BM.cpp |   6 -
 3 files changed, 76 insertions(+), 136 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 48b584478..aec21f6ee 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -10,56 +10,8 @@
   </component>
   <component name="ChangeListManager">
     <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="">
-      <change afterPath="$PROJECT_DIR$/saber/core/impl/bm/bm_device.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/core/impl/bm/bm_impl.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/CMakeLists.txt" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmdnn/op_code.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_activation.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_conv.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_conv_act.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_conv_act_pooling.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_fc.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_pooling.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_TargetWrapper_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_buffer_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_buffer_BM.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_context_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_context_BM.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_device_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_device_BM.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_BM.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_activation_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_conv_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_fc_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_pooling_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_shape_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_shape_BM.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_tensor_BM.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_tensor_BM.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/CMakeLists.txt" beforeDir="false" afterPath="$PROJECT_DIR$/CMakeLists.txt" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/cmake/compiler_options.cmake" beforeDir="false" afterPath="$PROJECT_DIR$/cmake/compiler_options.cmake" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/cmake/config/anakin_config.h.in" beforeDir="false" afterPath="$PROJECT_DIR$/cmake/config/anakin_config.h.in" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/cmake/gather.cmake" beforeDir="false" afterPath="$PROJECT_DIR$/cmake/gather.cmake" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/framework/core/data_types.h" beforeDir="false" afterPath="$PROJECT_DIR$/framework/core/data_types.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/CMakeLists.txt" beforeDir="false" afterPath="$PROJECT_DIR$/saber/CMakeLists.txt" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/core/common.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/common.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/core/target_traits.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/target_traits.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/core/target_wrapper.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/target_wrapper.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/core/tensor_op.cpp" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/tensor_op.cpp" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/funcs/CMakeLists.txt" beforeDir="false" afterPath="$PROJECT_DIR$/saber/funcs/CMakeLists.txt" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/saber_funcs_param.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/saber_funcs_param.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/saber_types.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/saber_types.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_pooling.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_pooling.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_pooling_BM.cpp" beforeDir="false" afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_pooling_BM.cpp" afterDir="false" />
     </list>
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
     <option name="TRACKING_ENABLED" value="true" />
@@ -73,12 +25,8 @@
       <file leaf-file-name="saber_funcs_param.h" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="417">
-              <caret line="33" column="21" lean-forward="true" selection-start-line="33" selection-start-column="21" selection-end-line="33" selection-end-column="21" />
-              <folding>
-                <element signature="e#897#918#0" expanded="true" />
-                <element signature="e#948#969#0" expanded="true" />
-              </folding>
+            <state relative-caret-position="357">
+              <caret line="29" column="12" selection-start-line="29" selection-start-column="12" selection-end-line="29" selection-end-column="12" />
             </state>
           </provider>
         </entry>
@@ -87,7 +35,7 @@
         <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
           <provider selected="true" editor-type-id="text-editor">
             <state relative-caret-position="750">
-              <caret line="559" lean-forward="true" selection-start-line="559" selection-end-line="559" />
+              <caret line="559" selection-start-line="559" selection-end-line="559" />
               <folding>
                 <element signature="e#14794#16797#0" expanded="true" />
               </folding>
@@ -98,8 +46,8 @@
       <file leaf-file-name="tensor.h" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="337">
-              <caret line="670" column="15" selection-start-line="670" selection-start-column="10" selection-end-line="670" selection-end-column="15" />
+            <state relative-caret-position="157">
+              <caret line="670" column="15" selection-start-line="670" selection-start-column="15" selection-end-line="670" selection-end-column="15" />
             </state>
           </provider>
         </entry>
@@ -107,13 +55,27 @@
       <file leaf-file-name="tensor_op.cpp" pinned="false" current-in-tab="true">
         <entry file="file://$PROJECT_DIR$/saber/core/tensor_op.cpp">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="491">
-              <caret line="127" lean-forward="true" selection-start-line="127" selection-end-line="127" />
+            <state relative-caret-position="-4">
+              <caret line="113" column="5" selection-start-line="113" selection-start-column="5" selection-end-line="113" selection-end-column="5" />
               <folding>
-                <element signature="e#12586#12607#0" expanded="true" />
-                <element signature="e#12632#12656#0" expanded="true" />
-                <element signature="e#12686#12707#0" expanded="true" />
-                <element signature="e#12745#12766#0" expanded="true" />
+                <element signature="e#90#106#0" expanded="true" />
+                <element signature="e#268#289#0" expanded="true" />
+                <element signature="e#1629#1645#0" expanded="true" />
+                <element signature="e#1814#1835#0" expanded="true" />
+                <element signature="e#2886#2902#0" expanded="true" />
+                <element signature="e#3276#3297#0" expanded="true" />
+                <element signature="e#5075#5091#0" expanded="true" />
+                <element signature="e#5365#5386#0" expanded="true" />
+                <element signature="e#6578#6599#0" expanded="true" />
+                <element signature="e#8139#8155#0" expanded="true" />
+                <element signature="e#8800#8816#0" expanded="true" />
+                <element signature="e#9017#9038#0" expanded="true" />
+                <element signature="e#10133#10154#0" expanded="true" />
+                <element signature="e#12605#12626#0" expanded="true" />
+                <element signature="e#12651#12675#0" expanded="true" />
+                <element signature="e#12705#12726#0" expanded="true" />
+                <element signature="e#12756#12776#0" expanded="true" />
+                <element signature="e#13129#14820#0" expanded="true" />
               </folding>
             </state>
           </provider>
@@ -142,9 +104,10 @@
       <find>TargetType</find>
       <find>mem_set</find>
       <find>&amp;</find>
-      <find>BM</find>
       <find>print</find>
       <find>print_tensor_host</find>
+      <find>AMD_API</find>
+      <find>BM</find>
     </findStrings>
   </component>
   <component name="Git.Settings">
@@ -190,11 +153,6 @@
               <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
               <item name="Anakin" type="462c0819:PsiDirectoryNode" />
             </path>
-            <path>
-              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
-              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-              <item name="framework" type="462c0819:PsiDirectoryNode" />
-            </path>
             <path>
               <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
               <item name="Anakin" type="462c0819:PsiDirectoryNode" />
@@ -212,13 +170,6 @@
               <item name="saber" type="462c0819:PsiDirectoryNode" />
               <item name="funcs" type="462c0819:PsiDirectoryNode" />
             </path>
-            <path>
-              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
-              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-              <item name="saber" type="462c0819:PsiDirectoryNode" />
-              <item name="funcs" type="462c0819:PsiDirectoryNode" />
-              <item name="impl" type="462c0819:PsiDirectoryNode" />
-            </path>
           </expand>
           <select />
         </subPane>
@@ -252,24 +203,23 @@
       <option name="presentableId" value="Default" />
       <updated>1533519941069</updated>
       <workItem from="1533519943497" duration="1090000" />
-      <workItem from="1533533623166" duration="3417000" />
+      <workItem from="1533533623166" duration="4753000" />
     </task>
     <servers />
   </component>
   <component name="TimeTrackingManager">
-    <option name="totallyTimeSpent" value="4507000" />
+    <option name="totallyTimeSpent" value="5843000" />
   </component>
   <component name="ToolWindowManager">
     <frame x="0" y="23" width="2560" height="1353" extended-state="0" />
-    <editor active="true" />
     <layout>
-      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
+      <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
       <window_info anchor="bottom" id="TODO" order="6" />
       <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
       <window_info anchor="bottom" id="Version Control" order="7" weight="0.28850666" />
       <window_info anchor="bottom" id="Run" order="2" />
       <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
-      <window_info anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
+      <window_info active="true" anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
       <window_info id="Favorites" order="2" side_tool="true" />
       <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
@@ -350,19 +300,15 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="417">
-          <caret line="33" column="21" lean-forward="true" selection-start-line="33" selection-start-column="21" selection-end-line="33" selection-end-column="21" />
-          <folding>
-            <element signature="e#897#918#0" expanded="true" />
-            <element signature="e#948#969#0" expanded="true" />
-          </folding>
+        <state relative-caret-position="357">
+          <caret line="29" column="12" selection-start-line="29" selection-start-column="12" selection-end-line="29" selection-end-column="12" />
         </state>
       </provider>
     </entry>
     <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
       <provider selected="true" editor-type-id="text-editor">
         <state relative-caret-position="750">
-          <caret line="559" lean-forward="true" selection-start-line="559" selection-end-line="559" />
+          <caret line="559" selection-start-line="559" selection-end-line="559" />
           <folding>
             <element signature="e#14794#16797#0" expanded="true" />
           </folding>
@@ -371,20 +317,34 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="337">
-          <caret line="670" column="15" selection-start-line="670" selection-start-column="10" selection-end-line="670" selection-end-column="15" />
+        <state relative-caret-position="157">
+          <caret line="670" column="15" selection-start-line="670" selection-start-column="15" selection-end-line="670" selection-end-column="15" />
         </state>
       </provider>
     </entry>
     <entry file="file://$PROJECT_DIR$/saber/core/tensor_op.cpp">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="491">
-          <caret line="127" lean-forward="true" selection-start-line="127" selection-end-line="127" />
+        <state relative-caret-position="-4">
+          <caret line="113" column="5" selection-start-line="113" selection-start-column="5" selection-end-line="113" selection-end-column="5" />
           <folding>
-            <element signature="e#12586#12607#0" expanded="true" />
-            <element signature="e#12632#12656#0" expanded="true" />
-            <element signature="e#12686#12707#0" expanded="true" />
-            <element signature="e#12745#12766#0" expanded="true" />
+            <element signature="e#90#106#0" expanded="true" />
+            <element signature="e#268#289#0" expanded="true" />
+            <element signature="e#1629#1645#0" expanded="true" />
+            <element signature="e#1814#1835#0" expanded="true" />
+            <element signature="e#2886#2902#0" expanded="true" />
+            <element signature="e#3276#3297#0" expanded="true" />
+            <element signature="e#5075#5091#0" expanded="true" />
+            <element signature="e#5365#5386#0" expanded="true" />
+            <element signature="e#6578#6599#0" expanded="true" />
+            <element signature="e#8139#8155#0" expanded="true" />
+            <element signature="e#8800#8816#0" expanded="true" />
+            <element signature="e#9017#9038#0" expanded="true" />
+            <element signature="e#10133#10154#0" expanded="true" />
+            <element signature="e#12605#12626#0" expanded="true" />
+            <element signature="e#12651#12675#0" expanded="true" />
+            <element signature="e#12705#12726#0" expanded="true" />
+            <element signature="e#12756#12776#0" expanded="true" />
+            <element signature="e#13129#14820#0" expanded="true" />
           </folding>
         </state>
       </provider>
diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index 0da1a1106..b857eacdd 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -1,23 +1,7 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
-#define ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H
 
 #include "saber/funcs/impl/impl_pooling.h"
-#include "saber/funcs/impl/cuda/cudnn_helper.h"
 
 namespace anakin{
 
@@ -29,12 +13,12 @@ template <DataType OpDtype ,
     typename LayOutType_op,
     typename LayOutType_in,
     typename LayOutType_out>
-class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
  public ImplBase<
-    Tensor<NV, inDtype, LayOutType_in>, 
-    Tensor<NV, outDtype, LayOutType_out>,
-    Tensor<NV, OpDtype, LayOutType_op>,
-    PoolingParam<Tensor<NV, OpDtype, LayOutType_op>>> {
+    Tensor<BM, inDtype, LayOutType_in>, 
+    Tensor<BM, outDtype, LayOutType_out>,
+    Tensor<BM, OpDtype, LayOutType_op>,
+    PoolingParam<Tensor<BM, OpDtype, LayOutType_op>>> {
 public:
     typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
     typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
@@ -62,8 +46,8 @@ class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
                           std::vector<DataTensor_out*>& outputs,
                           PoolingParam<OpTensor> &param) {
-        const InDataType *in_data = inputs[0]->data();
-        OutDataType *out_data = outputs[0]->mutable_data();
+        const InDataType in_data = *(inputs[0]->data());
+        OutDataType out_data = *(outputs[0]->mutable_data());
         int input_n = inputs[0]->num();
         int input_c = inputs[0]->channel();
         int input_h = inputs[0]->height();
@@ -74,27 +58,29 @@ class VenderPooling<NV, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
         int pad_w = param.pad_w;
         int stride_h = param.stride_h;
         int stride_w = param.stride_w;
+        int is_avg_pooling;
         if(_pooling_type == Pooling_max){
-            int is_avg_pooling = 0;
+            is_avg_pooling = 0;
         } else {
-            int is_avg_pooling = 1;
+            is_avg_pooling = 1;
         }
+        _handle = get_bm_handle();
         BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, 
-                            input_n, input_c, input_h, input_w, kh, hw, pad_h, pad_w, 
+                            input_n, input_c, input_h, input_w, kh, kw, pad_h, pad_w, 
                             stride_h, stride_w, is_avg_pooling, 0,
-                            out_data, NULL, NULL));
+                            out_data, bm_mem_null, bm_mem_null));
         return SaberSuccess;
     }
 
 private:
     bm_handle_t _handle;
-    PoolType _pooling_type;
+    PoolingType _pooling_type;
 };
 
-template class VenderPooling<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+template class VenderPooling<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
 
 } //namespace saber
 
 } // namespace anakin
 
-#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index ce8e7f8f5..2a490c588 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -12,8 +12,6 @@ TEST(TestSaberFuncBM, test_func_pooling) {
 
     Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
 
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
@@ -104,8 +102,6 @@ TEST(TestSaberFuncBM, test_pooling_result) {
 
     Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
 
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;
@@ -180,8 +176,6 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
 
     Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
 
     typedef TargetWrapper<X86> X86_API;
     typedef TargetWrapper<BM> BM_API;

From ccfa11b050820f5f56ea3a863070b6b9d77214ed Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 13:18:23 +0800
Subject: [PATCH 180/318] Implement print_tensor_device for BM

---
 saber/core/tensor_op.cpp | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 6de80bce4..06ee5bd79 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -312,6 +312,42 @@ void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
     delete [] host_mem_input;
 }
 
+template <>
+void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor,  \
+    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {
+
+    LOG(INFO) << "BM device tensor data:" << tensor.size();
+
+    /*
+    const bm_device_mem_t* device_data_ptr = tensor.data();
+    unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
+    bm_flush(get_bm_handle());
+    float* device_data = (float*)bm_get_global_addr(gaddr);
+
+    for (int i = 0; i < tensor.size(); ++i) {
+        printf("%.2f ", device_data[i]);
+
+        if ((i + 1) % (4 * tensor.width()) == 0) {
+            printf("\n");
+        }
+    }*/
+
+    float *host_mem = new float[tensor.size()];
+    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+    bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
+
+    for (int i = 0; i < tensor.size(); ++i) {
+        printf("%.2f ", host_mem[i]);
+
+        if ((i + 1) % (4 * tensor.width()) == 0) {
+            printf("\n");
+        }
+    }
+    printf("\n");
+
+    delete [] host_mem;
+}
+
 #endif
 
 } //namespace saber

From 00384141404cb4022c6399e2e4454a26f66c6d30 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 13:25:48 +0800
Subject: [PATCH 181/318] Update BM tensor test

---
 saber/core/tensor_op.cpp               | 2 ++
 test/saber/bm/test_saber_tensor_BM.cpp | 9 ++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 06ee5bd79..219a41fd8 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -333,6 +333,8 @@ void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tenso
     }*/
 
     float *host_mem = new float[tensor.size()];
+    bm_flush(get_bm_handle());
+
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
     bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
 
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index d42665528..dfd8d90c9 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
-
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
 
@@ -65,17 +64,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     // host to device
     tdev1.copy_from(thost0);
-    //TODO: print tensor for BM device
-    //print_tensor_host(tdev1);
+    print_tensor_device(tdev1);
 
     // device to host
     thost1.copy_from(tdev1);
     print_tensor_host(thost1);
 
-    /*
-    // device to device
+    //device to device
     tdev1.copy_from(tdev0);
+    print_tensor_device(tdev1);
 
+    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed

From 9ca8735409b7760ce4a3032b8c533f7dd3f0402d Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Tue, 26 Jun 2018 13:38:28 +0800
Subject: [PATCH 182/318] fix pooling api error

---
 saber/funcs/impl/bm/vender_pooling.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index b857eacdd..108a70708 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -67,8 +67,7 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
         _handle = get_bm_handle();
         BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, 
                             input_n, input_c, input_h, input_w, kh, kw, pad_h, pad_w, 
-                            stride_h, stride_w, is_avg_pooling, 0,
-                            out_data, bm_mem_null, bm_mem_null));
+                            stride_h, stride_w, is_avg_pooling, out_data));
         return SaberSuccess;
     }
 

From 62565be2e6353c627e0774719512a1a0a925c463 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 14:50:10 +0800
Subject: [PATCH 183/318] Update pooling test

---
 test/saber/bm/test_saber_func_pooling_BM.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index 2a490c588..944ab6a18 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -9,8 +9,6 @@
 using namespace anakin::saber;
 
 TEST(TestSaberFuncBM, test_func_pooling) {
-
-    Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef TargetWrapper<X86> X86_API;
@@ -42,6 +40,8 @@ TEST(TestSaberFuncBM, test_func_pooling) {
 
     // start Reshape & doInfer
 
+    LOG(INFO) << "init env...";
+    Env<BM>::env_init();
     Context<BM> ctx1(0, 1, 1);
     int window_h = 2;
     int window_w = 2;
@@ -279,6 +279,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
 }
 
 int main(int argc, const char** argv) {
+    //TODO: init in another place
+    static bm_handle_t handle;
+    bmdnn_init(&handle);
     // initial logger
     //logger::init(argv[0]);
     InitTest();

From 7f1a4f3dd3deeb08abcfb711240ed59166795aba Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 15:01:16 +0800
Subject: [PATCH 184/318] Skip context init for BM

---
 saber/core/context.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/saber/core/context.h b/saber/core/context.h
index 292f38449..2147033f0 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -18,6 +18,7 @@
 
 #include "core/env.h"
 #include "saber/saber_types.h"
+#include <type_traits>
 
 #ifdef USE_BM
 #include "bmlib_runtime.h"
@@ -41,6 +42,11 @@ class Context final{
      * @param compute_stream_id
      */
     Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){
+        if(std::is_same<TargetType, BM>::value){
+            LOG(INFO) << "context init for BM";
+            return;
+        }
+
         CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!";
         if (device_id >= devs.size()){
             LOG(WARNING) << "device index exceeds the number of devices, set to default device(0)!";
@@ -64,6 +70,11 @@ class Context final{
     }
 
     Context(const Context<TargetType>& ctx){
+        if(std::is_same<TargetType, BM>::value){
+            LOG(INFO) << "context init for BM";
+            return;
+        }
+
         _device_id = ctx._device_id;
         _data_stream_id = ctx._data_stream_id;
         _compute_stream_id = ctx._compute_stream_id;

From 154d5ad1cf35693ef800953893447550e51363ce Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 15:09:02 +0800
Subject: [PATCH 185/318] remove flush action in print

---
 saber/core/tensor_op.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 219a41fd8..06ee5bd79 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -333,8 +333,6 @@ void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tenso
     }*/
 
     float *host_mem = new float[tensor.size()];
-    bm_flush(get_bm_handle());
-
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
     bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
 

From d84c51ef97d612094a7ed2948baa49e8e96e9760 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 16:09:58 +0800
Subject: [PATCH 186/318] ignore set_device for BM for now

---
 saber/core/impl/bm/bm_impl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index d2790d0a9..fa51bf2d7 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -47,7 +47,7 @@ void BM_API::get_device_count(int &count) {
 
 void BM_API::set_device(int id){
     //(bm_handle_t &handle, bool bmkernel_used, int id){
-    BMDNN_CHECK(bm_dev_request(&handle, 0, id));
+    //BMDNN_CHECK(bm_dev_request(&handle, 0, id));
 }
 
 //TODO: Do we have this functionality?

From fe303220195d1a223bcb0299ee2c402f587c57b6 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 16:26:19 +0800
Subject: [PATCH 187/318] Update logs for copy_from

---
 saber/core/tensor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index ff4728aa9..244b2a1c7 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -1012,7 +1012,7 @@ size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
 template<>
 template<> inline
 SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
-    LOG(INFO) << "BM copy_from";
+    LOG(INFO) << "BM copy_from X86";
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
 
     auto* device_data_ptr = mutable_data();
@@ -1023,7 +1023,7 @@ SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor
 template<>
 template<> inline
 SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
-    LOG(INFO) << "X86 copy_from";
+    LOG(INFO) << "X86 copy_from BM";
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
 
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());

From a6088e39d62c38c301df1027ae58f988a3fa9487 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 17:42:26 +0800
Subject: [PATCH 188/318] Initialize bm handle only in one place

---
 saber/core/impl/bm/bm_impl.cpp               | 4 ++--
 test/saber/bm/test_TargetWrapper_BM.cpp      | 6 +++---
 test/saber/bm/test_saber_buffer_BM.cpp       | 4 ----
 test/saber/bm/test_saber_func_pooling_BM.cpp | 3 ---
 test/saber/bm/test_saber_tensor_BM.cpp       | 4 ----
 5 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index fa51bf2d7..60e52088e 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -37,9 +37,9 @@ namespace saber{
 
 typedef TargetWrapper<BM, __device_target> BM_API;
 
-//TODO: check exception
-//static bm_handle_t handle = get_bm_handle();
+// Init handle only once in the lifetime
 static bm_handle_t handle;
+static bm_status_t init_handle{bmdnn_init(&handle)};
 
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
index b893183a2..8de77498a 100644
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ b/test/saber/bm/test_TargetWrapper_BM.cpp
@@ -4,9 +4,9 @@
 
 #ifdef USE_BM
 using namespace anakin::saber;
-static bm_handle_t handle;
+//static bm_handle_t handle;
 int main() {
-    bmdnn_init(&handle);
+    //bmdnn_init(&handle);
     typedef TargetWrapper<BM> API;
     //int dev_count = 0;
     //API::get_device_count(dev_count);
@@ -20,7 +20,7 @@ int main() {
     std::cout << "Start mem_free test." << std::endl;
     API::mem_free(pmem);
     std::cout << "End mem_free test." << std::endl;
-    bmdnn_deinit(handle);
+    //bmdnn_deinit(handle);
 }
 #endif
 
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 9910638fb..dce1fae15 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -132,10 +132,6 @@ TEST(TestSaberBufferBM, test_buffer_memcpy) {
 }
 
 int main(int argc, const char** argv) {
-    //TODO: init in another place
-    static bm_handle_t handle;
-    bmdnn_init(&handle);
-
     // initial logger
     logger::init(argv[0]);
     InitTest();
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index 944ab6a18..e988bc573 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -279,9 +279,6 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
 }
 
 int main(int argc, const char** argv) {
-    //TODO: init in another place
-    static bm_handle_t handle;
-    bmdnn_init(&handle);
     // initial logger
     //logger::init(argv[0]);
     InitTest();
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index dfd8d90c9..2dcd61c41 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -624,10 +624,6 @@ TEST(TestSaberTensorBM, test_tensor_base_type) {
 }*/
 
 int main(int argc, const char** argv) {
-    //TODO: init in another place
-    static bm_handle_t handle;
-    bmdnn_init(&handle);
-
     // initial logger
     logger::init(argv[0]);
     InitTest();

From 42d7ee0bfc9ad96ab8658b2db95d1542036787e0 Mon Sep 17 00:00:00 2001
From: lian <327842846@qq.com>
Date: Tue, 26 Jun 2018 10:46:30 +0000
Subject: [PATCH 189/318] chage tensor type_len

---
 .idea/workspace.xml                           |  70 +-
 saber/core/target_wrapper.h                   |   6 +-
 saber/core/tensor.h                           |   7 +-
 test/framework/core/base_types_test.cpp       | 143 ----
 test/framework/graph/graph_base_test.cpp      |  82 --
 test/saber/bm/test_saber_buffer_BM.h          |  20 -
 test/saber/bm/test_saber_context_BM.h         |  21 -
 test/saber/bm/test_saber_device_BM.cpp        |  20 -
 test/saber/bm/test_saber_device_BM.h          |  21 -
 test/saber/bm/test_saber_func_BM.h            |  38 -
 .../bm/test_saber_func_activation_BM.cpp      |  88 ---
 test/saber/bm/test_saber_func_conv_BM.cpp     | 725 ------------------
 test/saber/bm/test_saber_func_fc_BM.cpp       | 146 ----
 test/saber/bm/test_saber_shape_BM.cpp         | 126 ---
 test/saber/bm/test_saber_shape_BM.h           |  25 -
 test/saber/bm/test_saber_tensor_BM.cpp        |  47 +-
 16 files changed, 83 insertions(+), 1502 deletions(-)
 delete mode 100644 test/framework/core/base_types_test.cpp
 delete mode 100644 test/framework/graph/graph_base_test.cpp
 delete mode 100644 test/saber/bm/test_saber_buffer_BM.h
 delete mode 100644 test/saber/bm/test_saber_context_BM.h
 delete mode 100644 test/saber/bm/test_saber_device_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_device_BM.h
 delete mode 100644 test/saber/bm/test_saber_func_BM.h
 delete mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_shape_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_shape_BM.h

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index aec21f6ee..718ee2682 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -10,8 +10,21 @@
   </component>
   <component name="ChangeListManager">
     <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="">
-      <change beforePath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_pooling.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_pooling.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_pooling_BM.cpp" beforeDir="false" afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_func_pooling_BM.cpp" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/core/target_wrapper.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/target_wrapper.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/core/tensor.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/tensor.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/framework/core/base_types_test.cpp" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/framework/graph/graph_base_test.cpp" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_buffer_BM.h" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_context_BM.h" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_device_BM.cpp" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_device_BM.h" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_BM.h" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_activation_BM.cpp" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_conv_BM.cpp" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_fc_BM.cpp" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_shape_BM.cpp" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_shape_BM.h" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_tensor_BM.cpp" beforeDir="false" afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_tensor_BM.cpp" afterDir="false" />
     </list>
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
     <option name="TRACKING_ENABLED" value="true" />
@@ -31,14 +44,11 @@
           </provider>
         </entry>
       </file>
-      <file leaf-file-name="target_wrapper.h" pinned="false" current-in-tab="false">
+      <file leaf-file-name="target_wrapper.h" pinned="false" current-in-tab="true">
         <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="750">
-              <caret line="559" selection-start-line="559" selection-end-line="559" />
-              <folding>
-                <element signature="e#14794#16797#0" expanded="true" />
-              </folding>
+            <state relative-caret-position="523">
+              <caret line="523" column="37" selection-start-line="523" selection-start-column="37" selection-end-line="523" selection-end-column="37" />
             </state>
           </provider>
         </entry>
@@ -52,7 +62,7 @@
           </provider>
         </entry>
       </file>
-      <file leaf-file-name="tensor_op.cpp" pinned="false" current-in-tab="true">
+      <file leaf-file-name="tensor_op.cpp" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/saber/core/tensor_op.cpp">
           <provider selected="true" editor-type-id="text-editor">
             <state relative-caret-position="-4">
@@ -75,12 +85,17 @@
                 <element signature="e#12651#12675#0" expanded="true" />
                 <element signature="e#12705#12726#0" expanded="true" />
                 <element signature="e#12756#12776#0" expanded="true" />
-                <element signature="e#13129#14820#0" expanded="true" />
+                <element signature="e#13129#15921#0" expanded="true" />
               </folding>
             </state>
           </provider>
         </entry>
       </file>
+      <file leaf-file-name="resize.h" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/saber/funcs/resize.h">
+          <provider selected="true" editor-type-id="text-editor" />
+        </entry>
+      </file>
       <file leaf-file-name=".gitignore" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/.gitignore">
           <provider selected="true" editor-type-id="text-editor">
@@ -105,9 +120,9 @@
       <find>mem_set</find>
       <find>&amp;</find>
       <find>print</find>
-      <find>print_tensor_host</find>
       <find>AMD_API</find>
       <find>BM</find>
+      <find>print_tensor_host</find>
     </findStrings>
   </component>
   <component name="Git.Settings">
@@ -119,8 +134,8 @@
         <option value="$PROJECT_DIR$/saber/core/tensor.h" />
         <option value="$PROJECT_DIR$/.gitignore" />
         <option value="$PROJECT_DIR$/CMakeLists.txt" />
-        <option value="$PROJECT_DIR$/saber/core/target_wrapper.h" />
         <option value="$PROJECT_DIR$/saber/core/tensor_op.cpp" />
+        <option value="$PROJECT_DIR$/saber/core/target_wrapper.h" />
       </list>
     </option>
   </component>
@@ -203,23 +218,24 @@
       <option name="presentableId" value="Default" />
       <updated>1533519941069</updated>
       <workItem from="1533519943497" duration="1090000" />
-      <workItem from="1533533623166" duration="4753000" />
+      <workItem from="1533533623166" duration="5163000" />
     </task>
     <servers />
   </component>
   <component name="TimeTrackingManager">
-    <option name="totallyTimeSpent" value="5843000" />
+    <option name="totallyTimeSpent" value="6253000" />
   </component>
   <component name="ToolWindowManager">
     <frame x="0" y="23" width="2560" height="1353" extended-state="0" />
+    <editor active="true" />
     <layout>
-      <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
+      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
       <window_info anchor="bottom" id="TODO" order="6" />
       <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
       <window_info anchor="bottom" id="Version Control" order="7" weight="0.28850666" />
       <window_info anchor="bottom" id="Run" order="2" />
       <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
-      <window_info active="true" anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
+      <window_info anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
       <window_info id="Favorites" order="2" side_tool="true" />
       <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
@@ -305,16 +321,6 @@
         </state>
       </provider>
     </entry>
-    <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="750">
-          <caret line="559" selection-start-line="559" selection-end-line="559" />
-          <folding>
-            <element signature="e#14794#16797#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
     <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
       <provider selected="true" editor-type-id="text-editor">
         <state relative-caret-position="157">
@@ -344,10 +350,20 @@
             <element signature="e#12651#12675#0" expanded="true" />
             <element signature="e#12705#12726#0" expanded="true" />
             <element signature="e#12756#12776#0" expanded="true" />
-            <element signature="e#13129#14820#0" expanded="true" />
+            <element signature="e#13129#15921#0" expanded="true" />
           </folding>
         </state>
       </provider>
     </entry>
+    <entry file="file://$PROJECT_DIR$/saber/funcs/resize.h">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="523">
+          <caret line="523" column="37" selection-start-line="523" selection-start-column="37" selection-end-line="523" selection-end-column="37" />
+        </state>
+      </provider>
+    </entry>
   </component>
 </project>
\ No newline at end of file
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 648c85ed4..49a6e9364 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -521,7 +521,7 @@ struct TargetWrapper<BM, __device_target> {
     static void mem_alloc(void** ptr, size_t n);
 
     //template <typename void>
-    static void mem_free(void * ptr); 
+    static void mem_free(void * ptr);
     
     //template <typename void>
     static void mem_set(void* ptr, int value, size_t n);
@@ -546,7 +546,7 @@ struct TargetWrapper<BM, __device_target> {
         size_t count, __HtoD) {};
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoH);
+        size_t count, __DtoH) {};
 
     static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
         int src_dev, size_t count) {};
@@ -556,6 +556,8 @@ struct TargetWrapper<BM, __device_target> {
      * @return          currently activated device id
      */
     static int get_device_id();
+
+    static bm_handle_t get_handler();
 };
 
 #endif //USE_BM
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 244b2a1c7..93af6822f 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -1003,10 +1003,11 @@ class Tensor {
 };
 
 #ifdef USE_BM
-
+#ifndef BM_TENSOR_COPY
+#define BM_TENSOR_COPY
 template<> inline
 size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
-    return 1;
+    return 4;
 }
 
 template<>
@@ -1030,7 +1031,7 @@ SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor
     BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
     return SaberSuccess;
 }
-
+#endif
 #endif
 
 } //namespace saber
diff --git a/test/framework/core/base_types_test.cpp b/test/framework/core/base_types_test.cpp
deleted file mode 100644
index 0109493bf..000000000
--- a/test/framework/core/base_types_test.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-#include "core_test.h"
-#include "any.h"
-#include "singleton.h"
-#include "tls.h"
-#include "parameter.h"
-#include "thread_pool.h"
-
-#ifdef USE_CUDA
-#include "cuda_funcs.h"
-#include "sass_funcs.h"
-#endif
-
-#include "tensor.h"
-
-#ifdef USE_CUDA
-TEST(CoreComponentsTest, sass_test) {
-    LOG(INFO) << "test for cuda code function";
-    //anakin::saber::Tensor<3, RTCUDA, float, NCHW> ts;
-    //LOG(WARNING) << " tensor num " << ts.num();
-    //ts.set_offset(8);
-    //my_print();
-    LOG(INFO) << "test for sass code function 1";
-    invoke_test();
-    LOG(INFO) << "test for sass code function 2";
-    invoke_test_2();
-}
-#endif
-
-TEST(CoreComponentsTest, core_base_types_any_test) {
-    LOG(INFO) << "test for any class .";
-    LOG(WARNING) << " level 1 : base type int (set 42 to any)";
-    const int a = 42;
-    any any_a(42);
-    int result_a = any_cast<int>(any_a);
-
-    LOG(INFO) << "casted result : " <<  result_a;
-    LOG(WARNING) << " level 2 : base type float (set 42.8 to any)";
-    float b = 42.8;
-    any any_b = b;
-    float result_b = any_cast<float>(any_b);
-    LOG(INFO) << "casted result : " <<  result_b << " decide: ";
-
-    LOG(WARNING) << " level 3 : ptuple type (set PTuple<float> to any)";
-    PTuple<float> p_tuple_float(3.2f, 3.3f, 3.5f);
-    p_tuple_float.push_back(4.3); // push_back
-
-    any p_tuple_float_any = p_tuple_float;
-    auto result_p_tuple_float_any = any_cast<PTuple<float>>(p_tuple_float_any);
-
-    for (int i = 0; i < result_p_tuple_float_any.size(); i++) {
-        LOG(INFO) << " any casted PTuple<float>[" << i << "]: " << result_p_tuple_float_any[i];
-    }
-
-    struct target {
-        void print() {
-            LOG(INFO) << " target struct Successfully recovered.";
-        }
-    };
-
-    LOG(WARNING) << " level 5 : struct type";
-
-    target tg;
-
-    any any_tg = tg;
-
-    target result_tg = any_cast<target>(any_tg);
-
-    result_tg.print();
-
-    LOG(WARNING) << " level other : struct type";
-
-    any any_tg_copy = any_tg;
-
-    target result_tg_copy = any_cast<target>(any_tg);
-
-    result_tg_copy.print();
-}
-
-void at_exit_in_test() {
-    LOG(WARNING) << "core_base_types_singleton_test exit successfully!";
-}
-
-TEST(CoreComponentsTest, core_base_types_singleton_test) {
-    struct target {
-        target() {
-            LOG(INFO) << " singleton target constructed";
-        }
-    };
-    typedef Singleton<target, at_exit_in_test> sg_target;
-    sg_target::Global();
-}
-
-typedef AnakinThreadLocalVar<int> sg_tls;
-void thread_func_0() {
-    int* tmp = sg_tls::value();
-    *tmp = 3;
-    LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value());
-}
-void thread_func_1() {
-    int* tmp = sg_tls::value();
-    *tmp = 4;
-
-    LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value());
-}
-TEST(CoreComponentsTest, core_base_types_tls_test) {
-    LOG(INFO) << " Create tls var 0 , check in two thread.";
-    std::thread first(thread_func_0);
-    std::thread sec(thread_func_1);
-    first.join();
-    sec.join();
-    LOG(INFO) << " main thread var: " << *(sg_tls::value());
-}
-
-int thread_pool_func(int i) {
-    LOG(INFO) << " thread_pool_func input : " << i;
-    //std::this_thread::sleep_for(std::chrono::seconds(0));
-    return i;
-}
-
-TEST(CoreComponentsTest, core_base_types_thread_pool_test) {
-    LOG(INFO) << " Create thread pool with thread num = 12 ";
-    ThreadPool thread_pool_test(100);
-    thread_pool_test.launch();
-    std::function<int(int)> test = thread_pool_func;
-
-    for (int i = 0; i < 50; i++) {
-        // run async
-        auto ret = thread_pool_test.RunAsync(test, i);
-        LOG(INFO) << " return : " << ret.get();
-
-        // run sync
-        //auto sync_ret = thread_pool_test.RunSync(test, i);
-    }
-}
-
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/framework/graph/graph_base_test.cpp b/test/framework/graph/graph_base_test.cpp
deleted file mode 100644
index d42e86c02..000000000
--- a/test/framework/graph/graph_base_test.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-#include <string>
-#include "graph_test.h"
-#include "graph_base.h"
-
-using namespace anakin;
-using namespace anakin::graph;
-
-//! Usage sample
-class GraphTestClass : public GraphBase<std::string, int, int> {
-public:
-    GraphTestClass() {}
-    ~GraphTestClass() {}
-    virtual bool directed() {
-        return true;
-    };
-};
-class edge : public Arc<std::string, int> {
-public:
-    edge(std::string btm, std::string top, int weight): Arc<std::string, int>(btm, top, weight) {}
-    ~edge() {}
-};
-
-TEST(GraphTest, graph_base_test) {
-    LOG(INFO) << "test for graph base .";
-
-    GraphTestClass graph;
-    graph.add_vertex("a", 42);
-    graph.add_vertex("b", 43);
-    graph.add_vertex("c", 44);
-    graph.add_vertex("d", 45);
-    graph.add_vertex("e", 46);
-    graph.add_vertex("f", 47);
-
-    edge arc0("a", "b", 0);
-    edge arc1("b", "c", 1);
-    edge arc2("c", "d", 2);
-    edge arc3("d", "e", 3);
-    edge arc4("e", "f", 4);
-    edge arc5("f", "a", 5);
-
-    graph.add_in_arc(arc0);
-    graph.add_in_arc(arc1);
-    graph.add_in_arc(arc2);
-    graph.add_in_arc(arc3);
-    graph.add_in_arc(arc4);
-    graph.add_in_arc(arc5);
-    graph.add_out_arc(arc0);
-    graph.add_out_arc(arc1);
-    graph.add_out_arc(arc2);
-    graph.add_out_arc(arc3);
-    graph.add_out_arc(arc4);
-    graph.add_out_arc(arc5);
-
-    LOG(WARNING) << "Construction of graph.";
-    LOG(INFO) << graph.to_string();
-
-    LOG(WARNING) << "Remove a from graph.";
-    graph.remove("a");
-    LOG(INFO) << graph.to_string();
-
-    LOG(WARNING) << "Add arc: f->b to graph.";
-    edge arc_f_b("f", "b", 10);
-    graph.add_in_arc(arc_f_b);
-    graph.add_out_arc(arc_f_b);
-    LOG(INFO) << graph.to_string();
-
-    LOG(WARNING) << "Add vertex:a and arc: a->e to graph.";
-    graph.add_vertex("a", 47);
-    edge arc_a_e("a", "e", 10);
-    graph.add_out_arc(arc_a_e);
-    graph.add_in_arc(arc_a_e);
-    LOG(INFO) << graph.to_string();
-}
-
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h
deleted file mode 100644
index 8bbbe4511..000000000
--- a/test/saber/bm/test_saber_buffer_BM.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-
-using namespace anakin::test;
-
-class TestSaberBufferBM : public Test {
-public:
-    TestSaberBufferBM() {}
-    ~TestSaberBufferBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h
deleted file mode 100644
index 653ee11fd..000000000
--- a/test/saber/bm/test_saber_context_BM.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef SABER_TEST_SABER_CONTEXT_BM_H
-#define SABER_TEST_SABER_CONTEXT_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/context.h"
-
-using namespace anakin::test;
-
-class TestSaberContextBM : public Test {
-public:
-    TestSaberContextBM() {}
-    ~TestSaberContextBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //SABER_TEST_SABER_CONTEXT_BM_H
diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp
deleted file mode 100644
index 1c7086cf1..000000000
--- a/test/saber/bm/test_saber_device_BM.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "test_saber_device_BM.h"
-
-#ifdef USE_BM
-
-using namespace anakin::saber;
-
-TEST(TestSaberDeviceBM, test_BM_device) {
-    Device<BM> dev_BM;
-}
-
-#endif
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h
deleted file mode 100644
index 3a6d61236..000000000
--- a/test/saber/bm/test_saber_device_BM.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef SABER_TEST_SABER_DEVICE_BM_H
-#define SABER_TEST_SABER_DEVICE_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/device.h"
-
-using namespace anakin::test;
-
-class TestSaberDeviceBM : public Test {
-public:
-    TestSaberDeviceBM() {}
-    ~TestSaberDeviceBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //SABER_TEST_SABER_DEVICE_BM_H
diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h
deleted file mode 100644
index 61d27d6f9..000000000
--- a/test/saber/bm/test_saber_func_BM.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/tensor.h"
-#include <fstream>
-#include <vector>
-
-using namespace anakin::test;
-
-int read_file(std::vector<float> &results, const char* file_name) {
-
-    std::ifstream infile(file_name);
-    if (!infile.good()) {
-        std::cout << "Cannot open " << std::endl;
-        return false;
-    }
-    LOG(INFO)<<"found filename: "<<file_name;
-    std::string line;
-    while (std::getline(infile, line)) {
-        results.push_back((float)atof(line.c_str()));
-    }
-    return 0;
-}
-
-class TestSaberFuncBM : public Test {
-public:
-    TestSaberFuncBM() {}
-    ~TestSaberFuncBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp
deleted file mode 100644
index 523e94121..000000000
--- a/test/saber/bm/test_saber_func_activation_BM.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-#include "core/context.h"
-#include "funcs/activation.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-template <typename Tensor>
-void print_tensor_shape(std::string name, Tensor& t0) {
-
-    LOG(INFO) << name << " valid shape is ["
-              << t0.valid_shape()[0] << ", "
-              << t0.valid_shape()[1] << ", "
-              << t0.valid_shape()[2] << ", "
-              << t0.valid_shape()[3] << "].";
-
-    LOG(INFO) << name << " real shape is ["
-              << t0.shape()[0] << ", "
-              << t0.shape()[1] << ", "
-              << t0.shape()[2] << ", "
-              << t0.shape()[3] << "].";
-
-    LOG(INFO) << name << " offset is ["
-              << t0.offset()[0] << ", "
-              << t0.offset()[1] << ", "
-              << t0.offset()[2] << ", "
-              << t0.offset()[3] << "].";
-}
-
-TEST(TestSaberFuncBM, test_func_constructor) {
-
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1);
-    }
-
-    img_dev.copy_from(img_host);
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-
-    Context<BM> ctx1(0, 1, 1);
-
-    ActivationParam<TensorDf4> param(Active_relu, 0.1f, 0.1f);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Activation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> act;
-    act.compute_output_shape(input, output, param);
-    output_dev.re_alloc(output[0]->shape());
-
-    // init assume output tensor has been reshpaed by user.
-    act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-    act(input, output, param, ctx1);
-
-    print_tensor_device(output_dev);
-}
-
-int main(int argc, const char** argv) {
-    Env<BM>::env_init();
-    // initial logger
-    //logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
deleted file mode 100644
index 7881cdb97..000000000
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ /dev/null
@@ -1,725 +0,0 @@
-#include "core/context.h"
-#include "funcs/conv.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-//#include "cublas.h"
-
-using namespace anakin::saber;
-
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-
-template <typename Tensor>
-void print_tensor_shape(std::string name, Tensor &t0) {
-
-            LOG(INFO) << name << " valid shape is ["
-                      << t0.valid_shape()[0] << ", "
-                      << t0.valid_shape()[1] << ", "
-                      << t0.valid_shape()[2] << ", "
-                      << t0.valid_shape()[3] << "].";
-
-            LOG(INFO) << name << " real shape is ["
-                      << t0.shape()[0] << ", "
-                      << t0.shape()[1] << ", "
-                      << t0.shape()[2] << ", "
-                      << t0.shape()[3] << "].";
-
-            LOG(INFO) << name << " offset is ["
-                      << t0.offset()[0] << ", "
-                      << t0.offset()[1] << ", "
-                      << t0.offset()[2] << ", "
-                      << t0.offset()[3] << "].";
-}
-
-
-
-#if 1
-TEST(TestSaberFuncBM, test_depthwise_conv) {
-
-    int group = 2;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 2;
-    
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    bool bias_term = true;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-    
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 63 & i;
-    }
-
-    img_dev.copy_from(img_host);
-    
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-    
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorHf4 output_host;
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-    
-    ConvParam<TensorDf4> param(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
-    conv.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    output_host.re_alloc(output[0]->shape());
-
-    LOG(INFO) << "regular start with group = " << group;
-    // init assume output tensor has been reshpaed by user.
-    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-
-    conv(input, output, param, ctx1);
-
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-
-    output_dev.sync();
-    print_tensor_device(output_dev);
-
-//    param.group = 1;
-//    param.pad_h = 1;
-//    param.pad_w = 1;
-//
-//    LOG(INFO) << " param changed start with group = "<<param.group;
-//    conv(input, output, param, ctx1);
-//
-//    output_dev.sync();
-//    print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-TEST(TestSaberFuncBM, test_conv_param_change) {
-
-    int group = 4;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 4;
-
-    int img_num = 1;
-    int in_channels = 4;
-    int img_h = 65;
-    int img_w = 63;
-
-    bool bias_term = true;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorHf4 output_host;
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-
-    ConvParam<TensorDf4> param(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv;
-    conv.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    output_host.re_alloc(output[0]->shape());
-
-            LOG(INFO)<<"regular start with group = "<<group;
-    // init assume output tensor has been reshpaed by user.
-    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-
-    conv(input, output, param, ctx1);
-    output_dev.sync();
-//    print_tensor_device(output_dev);
-
-    param.group = 1;
-    param.pad_h = 1;
-    param.pad_w = 1;
-
-    LOG(INFO)<<" param changed start with group = "<<param.group;
-    conv(input, output, param, ctx1);
-
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-
-    output_dev.sync();
-//    print_tensor_device(output_dev);
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
-
-    int group = 1;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 2;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    bool bias_term = false;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    Shape img_s_sub(img_num, in_channels, 4, 4);
-
-    TensorDf4 t0;
-    TensorDf4 t1;
-
-    t0.share_sub_buffer(img_dev, img_s_sub, {0,0,0,0});
-    t1.share_sub_buffer(img_dev, img_s_sub, {0,0,4,4});
-
-    print_tensor_shape("t0", t0);
-    print_tensor_shape("t1", t1);
-
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-    Context<BM> ctx2(0, 2, 2);
-
-    TensorDf4 out0;
-    TensorDf4 out1;
-
-    ConvParam<TensorDf4> param0(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    ConvParam<TensorDf4> param1(group, pad_h, pad_w,
-                                stride_h, stride_w,
-                                dilation_h, dilation_w,
-                                &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input0, input1;
-    std::vector<TensorDf4*> output0, output1;
-
-    input0.push_back(&t0);
-    input1.push_back(&t1);
-
-    output0.push_back(&out0);
-    output1.push_back(&out1);
-
-    // FIXME ? where do i get output shape
-    output_dev.re_alloc(img_s);
-
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv0;
-    Conv<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> conv1;
-
-    conv0.compute_output_shape(input0, output0, param0);
-    conv1.compute_output_shape(input1, output1, param1);
-
-    out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0});
-    out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4});
-
-    conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1);
-    conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2);
-
-    conv0(input0, output0, param0, ctx1);
-    conv1(input1, output1, param1, ctx2);
-
-    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
-    output0[0]->record_event(cuda_stream1);
-
-    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
-    output1[0]->record_event(cuda_stream2);
-
-    out0.sync();
-    out1.sync();
-
-    print_tensor_device(output_dev);
-
-//    print_tensor_device(output_dev);
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-#endif
-
-TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
-
-    int group = 1;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 1;
-    int kernel_w = 1;
-    int out_channels = 128;
-
-    int img_num = 7;
-    int in_channels = 13;
-    int img_h = 32;
-    int img_w = 32;
-
-    bool bias_term = false;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 1;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-
-    ConvParam<TensorDf4> param(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Conv<BM, AK_FLOAT> conv;
-    conv.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \
-        << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]";
-    //LOG(INFO) << " blocks = [ " <<  i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; 
-    //选择k最小的那一组，如果一样，则选128*N，N最大的那一组
-    int k0 = i_div_up(out_channels, 128) * 128 - out_channels;
-    int k1 = i_div_up(out_channels, 64) * 64 - out_channels;
-    int k2 = i_div_up(out_channels, 32) * 32 - out_channels;
-    int kk = std::min(std::min(k0,k1),k2);
-    LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk;
-    if (kk == k0)
-        LOG(INFO) << "thread = [256,1,1] 128*128" ;
-    if (kk == k1)
-        LOG(INFO) << "thread = [128,1,1] 128*64" ;
-    if (kk == k2)
-        LOG(INFO) << "thread = [128,1,1] 128*32" ;
-
-    LOG(INFO) << "saber conv init";
-    conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
-
-    LOG(INFO) << "saber conv dispatch";
-    conv(input, output, param, ctx1);
-
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-
-    output_dev.sync();
-
-    SaberTimer<BM> t1;
-    int ts = 1;
-
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        conv(input, output, param, ctx1);
-        output_dev.sync();
-        t1.end(ctx1);
-    }
-
-    LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
-                         TensorDf4 &weights, int kernel_size, int stride, int pad,
-                         int in_channel, int out_channel, TensorDf4 &bias,
-                         anakin::saber::ImplEnum impl) {
-
-    ConvParam<TensorDf4> conv_param(1, pad, pad,
-                                    stride, stride,
-                                    1, 1,
-                                    &weights, &bias);
-    Conv<BM, AK_FLOAT> conv;
-    conv.compute_output_shape(inputs, outputs, conv_param);
-    outputs[0]->re_alloc(outputs[0]->shape());
-    Context<BM> ctx1(0, 1, 1);
-
-    SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1));
-
-    conv(inputs, outputs, conv_param, ctx1);
-    outputs[0]->record_event(ctx1.get_compute_stream());
-    outputs[0]->sync();
-
-    cudaDeviceSynchronize();
-
-    SaberTimer<BM> t1;
-    int ts = 100;
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        conv(inputs, outputs, conv_param, ctx1);
-        outputs[0]->record_event(ctx1.get_compute_stream());
-        outputs[0]->sync();
-        t1.end(ctx1);
-    }
-            LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
-
-    cudaDeviceSynchronize();
-}
-
-
-cublasHandle_t  cublas_handle;
-
-void caffe_gemm(const int M, const int N, const int K,\
-					 const float alpha, const float* A,\
-					 const float* B, const float beta, float* C) {
-    int lda = K;
-    int ldb = N;
-    CUBLAS_CHECK(cublasSgemm(cublas_handle,
-                             CUBLAS_OP_N,
-                             CUBLAS_OP_N,
-                             N, M, K,
-                             &alpha, B,
-                             ldb, A,
-                             lda, &beta,
-                             C, N));
-}
-
-TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
-    int img_num = 1;
-    int kernel = 1;
-
-//    int out_channels = 32;
-//    int in_channels = 128;
-//    int img_h = 52;
-//    int img_w = 112;
-//    int out_channels = 64;
-//    int in_channels = 256;
-//    int img_h = 26;
-//    int img_w = 56;
-    int out_channels = 128;
-    int in_channels = 512;
-    int img_h = 13;
-    int img_w = 28;
-
-//    int out_channels = 512;
-//    int in_channels = 128;
-//    int img_h = 13;
-//    int img_w = 28;
-
-    int pad = 0;
-    int stride = 1;
-    Context<BM> ctx1(0, 1, 1);
-
-    CUBLAS_CHECK(cublasCreate(&cublas_handle));
-    CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream()));
-
-    TensorDf4 weights;
-    weights.re_alloc({out_channels, in_channels, 1, 1});
-
-    TensorDf4 img;
-    img.re_alloc({1, in_channels, img_h, img_w});
-
-    TensorDf4 out;
-    out.re_alloc({1, out_channels, img_h, img_w});
-    TensorDf4 out_gemm;
-    out_gemm.re_alloc({1, out_channels, img_h, img_w});
-
-    fill_tensor_device_rand(weights, -1.f, 1.f);
-    fill_tensor_device_rand(img, -1.f, 1.f);
-
-    LOG(INFO) << "img_num: " << img_num;
-    LOG(INFO) << "kernel: " << kernel;
-    LOG(INFO) << "out_channels: " << out_channels;
-    LOG(INFO) << "in_channels: " << in_channels;
-    LOG(INFO) << "img_h: " << img_h;
-    LOG(INFO) << "img_w: " << img_w;
-    LOG(INFO) << "pad: " << pad;
-    LOG(INFO) << "stride: " << stride;
-
-    TensorDf4 bias;
-
-    std::vector<TensorDf4*> input_v;
-    std::vector<TensorDf4*> output_gemm_v, output_v;
-
-    input_v.push_back(&img);
-    output_v.push_back(&out);
-    output_gemm_v.push_back(&out_gemm);
-    cudaDeviceSynchronize();
-    test_conv_fp32_speed(input_v, output_v,
-                         weights, kernel, stride, pad,
-            in_channels, out_channels, bias,
-            SABER_IMPL);
-    cudaDeviceSynchronize();
-    caffe_gemm(out_channels, img_h * img_w, in_channels,\
-					 1.f, weights.data(),\
-					 img.data(), 0.f, out_gemm.mutable_data());
-    cudaDeviceSynchronize();
-    SaberTimer<BM> t1;
-    int ts = 100;
-
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        caffe_gemm(out_channels, img_h * img_w, in_channels,\
-					 1.f, weights.data(),\
-					 img.data(), 0.f, out_gemm.mutable_data());
-        out_gemm.record_event(ctx1.get_compute_stream());
-        out_gemm.sync();
-        t1.end(ctx1);
-    }
-    LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
-
-    cudaDeviceSynchronize();
-//    print_tensor_device(out);
-//    print_tensor_device(out_gemm);
-    TensorHf4 out_host;
-    TensorHf4 out_gemm_host;
-    out_host.re_alloc(out.shape());
-    out_host.copy_from(out);
-
-    out_gemm_host.re_alloc(out_gemm.shape());
-    out_gemm_host.copy_from(out_gemm);
-    double max_r, max_d;
-    tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d);
-    LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d;
-}
-
-int main(int argc, const char** argv){
-    anakin::saber::Env<BM>::env_init();
-
-    // initial logger
-    //logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
deleted file mode 100644
index 869ff1bfd..000000000
--- a/test/saber/bm/test_saber_func_fc_BM.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-#include "core/context.h"
-#include "funcs/fc.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-typedef TargetWrapper<BM> API;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef TensorDf4::Dtype ftype;
-
-void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
-                const TensorHf4& bias, TensorHf4& tout) {
-
-    int m = tin.num();
-    int k = tin.valid_size() / m;
-    int n = weight.valid_size() / k;
-    bool bias_term = bias.valid_size() > 0;
-
-    const float* din = tin.data();
-    const float* w = weight.data();
-    float* dout = tout.mutable_data();
-
-    for (int i = 0; i < m; ++i) {
-        float* pdout = dout + i * n;
-        const float* pdin = din + i * k;
-
-        for (int j = 0; j < n; ++j) {
-            if (bias_term) {
-                pdout[j] = bias.data()[j];
-            } else {
-                pdout[j] = 0;
-            }
-
-            for (int l = 0; l < k; ++l) {
-                pdout[j] += pdin[l] * w[l * n + j];
-            }
-        }
-    }
-}
-
-TEST(TestSaberFuncBM, test_func_fc) {
-
-    int test_iter = 100;
-    int w_in = 7;
-    int h_in = 7;
-    int ch_in = 512;
-    int num_in = 1;
-
-    int num_out = 4096;
-    int axis = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_out = {num_in, num_out, 1, 1};
-
-    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
-    TensorDf4 weight(sh_w);
-    Shape sh_b{1, 1, 1, num_out};
-    TensorDf4 bias(sh_b);
-    fill_tensor_device_const(weight, 1.f);
-    fill_tensor_device_const(bias, 1.f);
-
-    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
-              ch_in << ", height=" << h_in << ", width=" << w_in;
-
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-
-    TensorDf4 tdin;
-    TensorDf4 tdout;
-    tdin.re_alloc(shape_in);
-    fill_tensor_device_const(tdin, 1.f);
-    input_dev_4d.push_back(&tdin);
-    output_dev_4d.push_back(&tdout);
-
-    // start Reshape & doInfer
-    Context<BM> ctx_dev(0, 1, 1);
-
-    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
-
-    Fc<BM, AK_FLOAT> fc;
-
-    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
-              shape_out[2] << ", " << shape_out[3];
-
-    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
-
-    LOG(INFO) << "re-alloc tensor buffer";
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
-    Shape va_sh = tdout.valid_shape();
-    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
-              va_sh[2] << ", " << va_sh[3];
-    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
-
-    LOG(INFO) << "FC initialization";
-    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
-
-    LOG(INFO) << "FC compute";
-    SaberTimer<BM> t1;
-    t1.clear();
-    t1.start(ctx_dev);
-
-    for (int i = 0; i < test_iter; ++i) {
-        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
-        //cudaDeviceSynchronize();
-    }
-
-    t1.end(ctx_dev);
-    float ts = t1.get_average_ms();
-    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
-    //print_tensor_device(*output_dev_4d[0]);
-
-    //! check result
-    TensorHf4 thin(shape_in);
-    TensorHf4 thout(shape_out);
-    TensorHf4 thw(sh_w);
-    TensorHf4 thb(sh_b);
-    thin.copy_from(tdin);
-    thw.copy_from(weight);
-    thb.copy_from(bias);
-    fc_compute(thin, thw, thb, thout);
-    //print_tensor_host(thout);
-
-    TensorHf4 thout_d(shape_out);
-    thout_d.copy_from(tdout);
-    double max_ratio = 0;
-    double max_diff = 0;
-    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
-    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
-    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
-
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    Env<BM>::env_init();
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp
deleted file mode 100644
index 18479cd18..000000000
--- a/test/saber/bm/test_saber_shape_BM.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "test_saber_shape_BM.h"
-#include "shape.h"
-#include "anakin_config.h"
-
-#ifdef USE_OPENMP
-#include <omp.h>
-#include <core/shape.h>
-#endif
-
-using namespace anakin;
-using namespace saber;
-
-
-TEST(TestSaberShapeBM, test_saber_shape) {
-
-    int dim = 4;
-    Shape sh4d0{0, 0, 0, 0};
-    CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error";
-
-    for (int i = 0; i < dim; ++i) {
-        CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error";
-    }
-
-    CHECK_EQ(sh4d0.count(), 0) << "check shape count error";
-
-    int N = 1;
-    int C = 3;
-    int H = 11;
-    int W = 11;
-    std::vector<int> sh_size = {N, C, H, W};
-    //Shape sh4d1(sh_size);
-    Shape sh4d1(N, C, H, W);
-    LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size();
-    CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!";
-    //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!";
-
-    CHECK_EQ(sh4d1[0], N) << "get shape size error";
-    CHECK_EQ(sh4d1[1], C) << "get shape size error";
-    CHECK_EQ(sh4d1[2], H) << "get shape size error";
-    CHECK_EQ(sh4d1[3], W) << "get shape size error";
-
-    //CHECK_EQ(sh4d2[0], N) << "get shape size error";
-    //CHECK_EQ(sh4d2[1], C) << "get shape size error";
-    //CHECK_EQ(sh4d2[2], H) << "get shape size error";
-    //CHECK_EQ(sh4d2[3], W) << "get shape size error";
-
-    CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed";
-
-    C = 10;
-    sh4d1[1] = C;
-    CHECK_EQ(sh4d1[1], C) << "set shape size error";
-
-    bool is_equal = (sh4d0 == sh4d1);
-    CHECK_EQ(is_equal, false) << "check shape is_equal failed";
-
-    sh4d0 = sh4d1;
-    CHECK_EQ(sh4d1[0], N) << "constructor failed";
-    CHECK_EQ(sh4d1[1], C) << "get shape size error";
-    CHECK_EQ(sh4d1[2], H) << "get shape size error";
-    CHECK_EQ(sh4d1[3], W) << "get shape size error";
-
-    Shape sh4d3 = sh4d1;
-    CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error";
-
-    Shape sh4d4(sh4d1);
-    CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error";
-
-    Shape sh1d0{0};
-    //std::vector<int> sh1d_size = {W};
-
-    //Shape sh1d1(sh1d_size);
-    //Shape sh1d0{W};
-    Shape sh1d1(W);
-
-    Shape sh1d3 = sh1d1;
-    Shape sh1d4(sh1d1);
-
-    CHECK_EQ(sh1d0.dims(), 1) << "shape dim error";
-
-    CHECK_EQ(sh1d0.count(), 0) << "shape size error";
-
-    CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error";
-
-    CHECK_EQ(sh1d1[0], W) << "get shape size error";
-
-    //CHECK_EQ(sh1d2.count(0), W) << "shape dim error";
-
-    CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error";
-
-    CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error";
-
-    CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error";
-
-    Shape sh0{2, 2, 3, 4};
-    Shape sh1{2, 1, 1, 24};
-    Shape sh2{2, 2, 3, 4};
-    Shape sh3{1, 1, 2, 3};
-
-    CHECK_EQ(sh0 == sh2, true) << "error ==";
-    CHECK_EQ(sh3 < sh0, true) << "error <";
-    CHECK_EQ(sh3 >= sh0, false) << "error >=";
-    CHECK_EQ(sh3 > sh0, false) << "error >";
-    CHECK_EQ(sh0 > sh3, true) << "error >";
-    CHECK_EQ(sh0 < sh1, false) << "error <";
-    CHECK_EQ(sh0 <= sh2, true) << "error <=";
-    CHECK_EQ(sh0 >= sh2, true) << "error >=";
-
-    Shape sh001 = Shape::zero(2);
-    Shape sh002 = Shape::zero(3);
-
-    if (sh001 > sh002) {
-        LOG(ERROR) << "error <";
-    }
-
-}
-
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
-
diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h
deleted file mode 100644
index a2ca02c9b..000000000
--- a/test/saber/bm/test_saber_shape_BM.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "saber/core/shape.h"
-
-using namespace anakin::test;
-
-class TestSaberShapeBM : public Test {
-public:
-    TestSaberShapeBM() {}
-    ~TestSaberShapeBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-protected:
-    std::string name;
-    std::string _test;
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
-
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 2dcd61c41..69b1ccbfc 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -9,7 +9,9 @@ typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
 typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
 
+static bm_handle_t handle;
 TEST(TestSaberTensorBM, test_tensor_constructor) {
+    bmdnn_init(&handle);
 
     //! test empty constructor
     LOG(INFO) << "test default (empty) constructor";
@@ -28,13 +30,13 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     //! test tensor re_alloc function on tensor with data
     LOG(INFO) << "|--test tensor re_alloc function on tensor with data";
-    Shape sh1(1, 2, 4, 4);
+    Shape sh1(2, 4, 4, 2);
     thost0.re_alloc(sh1);
     tdev0.re_alloc(sh1);
     LOG(INFO) << "|--tensor size of host: " << thost0.size();
     LOG(INFO) << "|--tensor size of device: " << tdev0.size();
-    CHECK_EQ(thost0.size(), 32) << "error with tensor size";
-    CHECK_EQ(tdev0.size(), 32) << "error with tensor size";
+    CHECK_EQ(thost0.size(), 64) << "error with tensor size";
+    CHECK_EQ(tdev0.size(), 64) << "error with tensor size";
 
     //! test tensor shape() function
     LOG(INFO) << "|--test tensor shape() function";
@@ -45,9 +47,9 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
               << thost0.height() << ", width = " << thost0.width();
 
     //! test tensor mutable_data() function
-    LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 1.f";
-    fill_tensor_host_const(thost0, 1.f);
-    LOG(INFO) << "|--test tensor data() function, show the const data, 1.f";
+    LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 2.f";
+    fill_tensor_host_const(thost0, 2.f);
+    LOG(INFO) << "|--test tensor data() function, show the const data, 2.f";
     print_tensor_host(thost0);
 
     //! test tensor constructor with shape
@@ -55,6 +57,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
+
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
 
@@ -64,17 +67,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     // host to device
     tdev1.copy_from(thost0);
-    print_tensor_device(tdev1);
+    //TODO: print tensor for BM device
+    //print_tensor_host(tdev1);
 
     // device to host
     thost1.copy_from(tdev1);
     print_tensor_host(thost1);
 
-    //device to device
+    
+    // device to device
     tdev1.copy_from(tdev0);
-    print_tensor_device(tdev1);
 
-    /*
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed
@@ -97,22 +100,35 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
     dev_data_ptr = static_cast<dtype*>(tmp_pt_dev);
-    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
+//    bm_memcpy_d2s(handle,host_data_ptr,dev_data_ptr)
+//    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
     LOG(INFO) << "|--construct host tensor from host data ptr";
     TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from host data ptr";
-    TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+
+//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+
+    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+
     print_tensor_host(thost3);
-    print_tensor_device(tdev3);
-    //cudaDeviceSynchronize();
 
+    TensorHf4 thost_lian(sh1);
+    thost_lian.copy_from(tdev3);
+    print_tensor_host(thost_lian);
+
+    thost_lian.copy_from(thost3);
+    print_tensor_host(thost_lian);
+
+    //cudaDeviceSynchronize();
+    //
+/*
     LOG(INFO) << "|--construct host tensor from device data ptr";
     TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from device data ptr";
     TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     print_tensor_host(thost4);
     print_tensor_device(tdev4);
-
+/*
     //BM_API::stream_t dev_stream0;
     //BM_API::create_stream_with_flag(dev_stream0, 1);
     //cudaDeviceSynchronize();
@@ -202,6 +218,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
     print_tensor_host(thost4);
      */
+//    bmdnn_deinit(handle);
 }
 
 /*

From e048078e03e8b7b8858159019ff0d2684dca4249 Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 21:12:57 +0800
Subject: [PATCH 190/318] Implement conv for BM

---
 saber/funcs/impl/bm/vender_conv.h         |  41 +-
 test/saber/bm/test_saber_func_conv_BM.cpp | 730 ++++++++++++++++++++++
 2 files changed, 767 insertions(+), 4 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index a0a3b3fb5..778094886 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -62,10 +62,43 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         int pad_w = param.pad_w;
         int stride_h = param.stride_h;
         int stride_w = param.stride_w;
-        BMDNN_CHECK(bmdnn_conv_forward(_handle, in_data, weights, bias,
-                                    input_n, input_c, input_h, input_w, group, output_c,
-                                    kh, kw, pad_h, pad_w, stride_h, stride_w, 1, 0, 0, 
-                                    out_data, NULL));
+
+        bm_tensor_4d_t input_shape = {
+            input_n,
+            input_c,
+            input_h,
+            input_w
+        };
+
+        bm_tensor_4d_t output_shape = {
+            input_n,
+            output_c,
+            input_h,
+            input_w
+        };
+
+        bm_kernel_param_t kernel_param = {
+            group,
+            output_c,
+            input_c,
+            kh,
+            kw
+        };
+
+        bm_conv_param_t conv_param = {
+            stride_h,
+            stride_w,
+            pad_h,
+            pad_w,
+            kh,
+            kw,
+            0
+        };
+
+        _handle = get_bm_handle();
+        BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape, 
+                                    kernel_param, output_shape, conv_param, 1, *out_data));
+                                    
         return SaberSuccess;
     }
 
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
new file mode 100644
index 000000000..025a1074c
--- /dev/null
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -0,0 +1,730 @@
+#include "core/context.h"
+#include "funcs/conv.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+//#include "cublas.h"
+
+using namespace anakin::saber;
+
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+template <typename Tensor>
+void print_tensor_shape(std::string name, Tensor &t0) {
+
+            LOG(INFO) << name << " valid shape is ["
+                      << t0.valid_shape()[0] << ", "
+                      << t0.valid_shape()[1] << ", "
+                      << t0.valid_shape()[2] << ", "
+                      << t0.valid_shape()[3] << "].";
+
+            LOG(INFO) << name << " real shape is ["
+                      << t0.shape()[0] << ", "
+                      << t0.shape()[1] << ", "
+                      << t0.shape()[2] << ", "
+                      << t0.shape()[3] << "].";
+
+            LOG(INFO) << name << " offset is ["
+                      << t0.offset()[0] << ", "
+                      << t0.offset()[1] << ", "
+                      << t0.offset()[2] << ", "
+                      << t0.offset()[3] << "].";
+}
+
+//Round a / b to nearest higher integer value
+inline int i_div_up(int a, int b)
+{
+    return (a % b != 0) ? (a / b + 1) : (a / b);
+}
+
+#if 1
+TEST(TestSaberFuncBM, test_depthwise_conv) {
+
+    int group = 2;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 3;
+    int kernel_w = 3;
+    int out_channels = 2;
+    
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    bool bias_term = true;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+    
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 63 & i;
+    }
+
+    img_dev.copy_from(img_host);
+    
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+    
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorHf4 output_host;
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+    
+    ConvParam<TensorDf4> param(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
+    conv.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    output_host.re_alloc(output[0]->shape());
+
+    LOG(INFO) << "regular start with group = " << group;
+    // init assume output tensor has been reshpaed by user.
+    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+    conv(input, output, param, ctx1);
+
+    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    //output[0]->record_event(cuda_stream);
+
+    //output_dev.sync();
+    print_tensor_device(output_dev);
+
+//    param.group = 1;
+//    param.pad_h = 1;
+//    param.pad_w = 1;
+//
+//    LOG(INFO) << " param changed start with group = "<<param.group;
+//    conv(input, output, param, ctx1);
+//
+//    output_dev.sync();
+//    print_tensor_device(output_dev);
+    //cudaDeviceSynchronize();
+    //CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_conv_param_change) {
+
+    int group = 4;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 3;
+    int kernel_w = 3;
+    int out_channels = 4;
+
+    int img_num = 1;
+    int in_channels = 4;
+    int img_h = 65;
+    int img_w = 63;
+
+    bool bias_term = true;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorHf4 output_host;
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+
+    ConvParam<TensorDf4> param(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
+    conv.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    output_host.re_alloc(output[0]->shape());
+
+            LOG(INFO)<<"regular start with group = "<<group;
+    // init assume output tensor has been reshpaed by user.
+    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+    conv(input, output, param, ctx1);
+    //output_dev.sync();
+//    print_tensor_device(output_dev);
+
+    param.group = 1;
+    param.pad_h = 1;
+    param.pad_w = 1;
+
+    LOG(INFO)<<" param changed start with group = "<<param.group;
+    conv(input, output, param, ctx1);
+
+    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    //output[0]->record_event(cuda_stream);
+
+    //output_dev.sync();
+//    print_tensor_device(output_dev);
+    //cudaDeviceSynchronize();
+    //CUDA_CHECK(cudaPeekAtLastError());
+}
+
+TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
+
+    int group = 1;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 3;
+    int kernel_w = 3;
+    int out_channels = 2;
+
+    int img_num = 1;
+    int in_channels = 2;
+    int img_h = 8;
+    int img_w = 8;
+
+    bool bias_term = false;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 0x7f & i;
+    }
+
+    img_dev.copy_from(img_host);
+
+    Shape img_s_sub(img_num, in_channels, 4, 4);
+
+    TensorDf4 t0;
+    TensorDf4 t1;
+
+    t0.share_sub_buffer(img_dev, img_s_sub, {0,0,0,0});
+    t1.share_sub_buffer(img_dev, img_s_sub, {0,0,4,4});
+
+    print_tensor_shape("t0", t0);
+    print_tensor_shape("t1", t1);
+
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+    Context<BM> ctx2(0, 2, 2);
+
+    TensorDf4 out0;
+    TensorDf4 out1;
+
+    ConvParam<TensorDf4> param0(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    ConvParam<TensorDf4> param1(group, pad_h, pad_w,
+                                stride_h, stride_w,
+                                dilation_h, dilation_w,
+                                &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input0, input1;
+    std::vector<TensorDf4*> output0, output1;
+
+    input0.push_back(&t0);
+    input1.push_back(&t1);
+
+    output0.push_back(&out0);
+    output1.push_back(&out1);
+
+    // FIXME ? where do i get output shape
+    output_dev.re_alloc(img_s);
+
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv0;
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv1;
+
+    conv0.compute_output_shape(input0, output0, param0);
+    conv1.compute_output_shape(input1, output1, param1);
+
+    out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0});
+    out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4});
+
+    conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1);
+    conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2);
+
+    conv0(input0, output0, param0, ctx1);
+    conv1(input1, output1, param1, ctx2);
+
+    /*
+    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
+    output0[0]->record_event(cuda_stream1);
+
+    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
+    output1[0]->record_event(cuda_stream2);
+
+    out0.sync();
+    out1.sync();
+    */
+    print_tensor_device(output_dev);
+
+//    print_tensor_device(output_dev);
+
+    //cudaDeviceSynchronize();
+    //CUDA_CHECK(cudaPeekAtLastError());
+}
+#endif
+
+TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
+
+    int group = 1;
+    int pad_h = 1;
+    int pad_w = 1;
+    int stride_h = 1;
+    int stride_w = 1;
+    int dilation_h = 1;
+    int dilation_w = 1;
+
+    int kernel_h = 1;
+    int kernel_w = 1;
+    int out_channels = 128;
+
+    int img_num = 7;
+    int in_channels = 13;
+    int img_h = 32;
+    int img_w = 32;
+
+    bool bias_term = false;
+
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << " img_num = " << img_num;
+    LOG(INFO) << " in_channels = " << in_channels;
+    LOG(INFO) << " img_h = " << img_h;
+    LOG(INFO) << " img_w = " << img_w;
+    LOG(INFO) << " group = " << group;
+    LOG(INFO) << " pad_h = " << pad_h;
+    LOG(INFO) << " pad_w = " << pad_w;
+    LOG(INFO) << " stride_h = " << stride_h;
+    LOG(INFO) << " stride_w = " << stride_w;
+    LOG(INFO) << " dilation_h = " << dilation_h;
+    LOG(INFO) << " dilation_w = " << dilation_w;
+    LOG(INFO) << " kernel_h = " << kernel_h;
+    LOG(INFO) << " kernel_w = " << kernel_w;
+    LOG(INFO) << " out_channels = " << out_channels;
+    Shape img_s(img_num, in_channels, img_h, img_w);
+    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
+    Shape bias_s(1, out_channels, 1, 1);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    for (int i = 0; i < img_host.size(); ++i) {
+        img_host.mutable_data()[i] = 1;
+    }
+
+    img_dev.copy_from(img_host);
+
+    TensorHf4 weights_host;
+    TensorDf4 weights_dev;
+
+    weights_host.re_alloc(weights_s);
+    weights_dev.re_alloc(weights_s);
+
+    fill_tensor_host_const(weights_host, 1.f);
+    weights_dev.copy_from(weights_host);
+
+    TensorHf4 bias_host;
+    TensorDf4 bias_dev;
+
+    if (bias_term) {
+        bias_host.re_alloc(bias_s);
+        bias_dev.re_alloc(bias_s);
+
+        fill_tensor_host_const(bias_host, 1.f);
+        bias_dev.copy_from(bias_host);
+    }
+
+    TensorDf4 output_dev;
+
+    // start Reshape & doInfer
+    Context<BM> ctx1(0, 1, 1);
+
+    ConvParam<TensorDf4> param(group, pad_h, pad_w,
+                               stride_h, stride_w,
+                               dilation_h, dilation_w,
+                               &weights_dev, &bias_dev);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Conv<BM, AK_BM> conv;
+    conv.compute_output_shape(input, output, param);
+
+    output_dev.re_alloc(output[0]->shape());
+    LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \
+        << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]";
+    //LOG(INFO) << " blocks = [ " <<  i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; 
+    //选择k最小的那一组，如果一样，则选128*N，N最大的那一组
+    int k0 = i_div_up(out_channels, 128) * 128 - out_channels;
+    int k1 = i_div_up(out_channels, 64) * 64 - out_channels;
+    int k2 = i_div_up(out_channels, 32) * 32 - out_channels;
+    int kk = std::min(std::min(k0,k1),k2);
+    LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk;
+    if (kk == k0)
+        LOG(INFO) << "thread = [256,1,1] 128*128" ;
+    if (kk == k1)
+        LOG(INFO) << "thread = [128,1,1] 128*64" ;
+    if (kk == k2)
+        LOG(INFO) << "thread = [128,1,1] 128*32" ;
+
+    LOG(INFO) << "saber conv init";
+    conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
+
+    LOG(INFO) << "saber conv dispatch";
+    conv(input, output, param, ctx1);
+
+    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    //output[0]->record_event(cuda_stream);
+
+    //output_dev.sync();
+
+    SaberTimer<BM> t1;
+    int ts = 1;
+
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        conv(input, output, param, ctx1);
+        output_dev.sync();
+        t1.end(ctx1);
+    }
+
+    LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
+
+    //cudaDeviceSynchronize();
+    //CUDA_CHECK(cudaPeekAtLastError());
+}
+
+void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
+                         TensorDf4 &weights, int kernel_size, int stride, int pad,
+                         int in_channel, int out_channel, TensorDf4 &bias,
+                         anakin::saber::ImplEnum impl) {
+
+    ConvParam<TensorDf4> conv_param(1, pad, pad,
+                                    stride, stride,
+                                    1, 1,
+                                    &weights, &bias);
+    Conv<BM, AK_BM> conv;
+    conv.compute_output_shape(inputs, outputs, conv_param);
+    outputs[0]->re_alloc(outputs[0]->shape());
+    Context<BM> ctx1(0, 1, 1);
+
+    SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1));
+
+    conv(inputs, outputs, conv_param, ctx1);
+    outputs[0]->record_event(ctx1.get_compute_stream());
+    outputs[0]->sync();
+
+    //cudaDeviceSynchronize();
+
+    SaberTimer<BM> t1;
+    int ts = 100;
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        conv(inputs, outputs, conv_param, ctx1);
+        outputs[0]->record_event(ctx1.get_compute_stream());
+        outputs[0]->sync();
+        t1.end(ctx1);
+    }
+            LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
+
+    //cudaDeviceSynchronize();
+}
+
+
+cublasHandle_t  cublas_handle;
+
+void caffe_gemm(const int M, const int N, const int K,\
+					 const float alpha, const float* A,\
+					 const float* B, const float beta, float* C) {
+    int lda = K;
+    int ldb = N;
+    CUBLAS_CHECK(cublasSgemm(cublas_handle,
+                             CUBLAS_OP_N,
+                             CUBLAS_OP_N,
+                             N, M, K,
+                             &alpha, B,
+                             ldb, A,
+                             lda, &beta,
+                             C, N));
+}
+
+TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
+    int img_num = 1;
+    int kernel = 1;
+
+//    int out_channels = 32;
+//    int in_channels = 128;
+//    int img_h = 52;
+//    int img_w = 112;
+//    int out_channels = 64;
+//    int in_channels = 256;
+//    int img_h = 26;
+//    int img_w = 56;
+    int out_channels = 128;
+    int in_channels = 512;
+    int img_h = 13;
+    int img_w = 28;
+
+//    int out_channels = 512;
+//    int in_channels = 128;
+//    int img_h = 13;
+//    int img_w = 28;
+
+    int pad = 0;
+    int stride = 1;
+    Context<BM> ctx1(0, 1, 1);
+
+    CUBLAS_CHECK(cublasCreate(&cublas_handle));
+    CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream()));
+
+    TensorDf4 weights;
+    weights.re_alloc({out_channels, in_channels, 1, 1});
+
+    TensorDf4 img;
+    img.re_alloc({1, in_channels, img_h, img_w});
+
+    TensorDf4 out;
+    out.re_alloc({1, out_channels, img_h, img_w});
+    TensorDf4 out_gemm;
+    out_gemm.re_alloc({1, out_channels, img_h, img_w});
+
+    fill_tensor_device_rand(weights, -1.f, 1.f);
+    fill_tensor_device_rand(img, -1.f, 1.f);
+
+    LOG(INFO) << "img_num: " << img_num;
+    LOG(INFO) << "kernel: " << kernel;
+    LOG(INFO) << "out_channels: " << out_channels;
+    LOG(INFO) << "in_channels: " << in_channels;
+    LOG(INFO) << "img_h: " << img_h;
+    LOG(INFO) << "img_w: " << img_w;
+    LOG(INFO) << "pad: " << pad;
+    LOG(INFO) << "stride: " << stride;
+
+    TensorDf4 bias;
+
+    std::vector<TensorDf4*> input_v;
+    std::vector<TensorDf4*> output_gemm_v, output_v;
+
+    input_v.push_back(&img);
+    output_v.push_back(&out);
+    output_gemm_v.push_back(&out_gemm);
+    //cudaDeviceSynchronize();
+    test_conv_fp32_speed(input_v, output_v,
+                         weights, kernel, stride, pad,
+            in_channels, out_channels, bias,
+            SABER_IMPL);
+    //cudaDeviceSynchronize();
+    caffe_gemm(out_channels, img_h * img_w, in_channels,\
+					 1.f, weights.data(),\
+					 img.data(), 0.f, out_gemm.mutable_data());
+    //cudaDeviceSynchronize();
+    SaberTimer<BM> t1;
+    int ts = 100;
+
+    for (int i = 0; i < ts; ++i) {
+        t1.start(ctx1);
+        caffe_gemm(out_channels, img_h * img_w, in_channels,\
+					 1.f, weights.data(),\
+					 img.data(), 0.f, out_gemm.mutable_data());
+        out_gemm.record_event(ctx1.get_compute_stream());
+        out_gemm.sync();
+        t1.end(ctx1);
+    }
+    LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
+
+    //cudaDeviceSynchronize();
+//    print_tensor_device(out);
+//    print_tensor_device(out_gemm);
+    TensorHf4 out_host;
+    TensorHf4 out_gemm_host;
+    out_host.re_alloc(out.shape());
+    out_host.copy_from(out);
+
+    out_gemm_host.re_alloc(out_gemm.shape());
+    out_gemm_host.copy_from(out_gemm);
+    double max_r, max_d;
+    tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d);
+    LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d;
+}
+
+int main(int argc, const char** argv){
+    anakin::saber::Env<BM>::env_init();
+
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+

From a394d60ba35ce000ab105ae80e0f92c0b7bce5aa Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Tue, 26 Jun 2018 21:21:24 +0800
Subject: [PATCH 191/318] Comment out last conv test for now

---
 test/saber/bm/test_saber_func_conv_BM.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
index 025a1074c..9a25d00b3 100644
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -601,7 +601,7 @@ void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4
     //cudaDeviceSynchronize();
 }
 
-
+/*
 cublasHandle_t  cublas_handle;
 
 void caffe_gemm(const int M, const int N, const int K,\
@@ -717,7 +717,7 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d);
     LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d;
 }
-
+*/
 int main(int argc, const char** argv){
     anakin::saber::Env<BM>::env_init();
 

From 6f752bbf52e8c358454d505fc8d98b9f1e8111f4 Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Tue, 26 Jun 2018 13:42:52 +0000
Subject: [PATCH 192/318] Modify sync_memcpy & add bm_mem_from_device

---
 saber/core/impl/bm/bm_impl.cpp                   | 16 ++++++++++------
 saber/core/target_wrapper.h                      |  2 +-
 .../impl/bm/base/include/bmlib/bmlib_runtime.h   |  3 +++
 test/saber/bm/test_saber_buffer_BM.cpp           | 10 ++++++----
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 60e52088e..ef26884b2 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -81,16 +81,20 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
 //static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
 //    size_t count, __DtoD) {};
 
-//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-//    size_t count, __HtoD) {};
+void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+    size_t count, __HtoD) {
+    handle = get_bm_handle(); 
+    BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), bm_mem_from_system(src)));
+    LOG(INFO) << "BM sync_memcpy: host to device, finished";
+};
 
 void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __DtoH) {
     handle = get_bm_handle(); 
-    //auto* dev_ptr = const_cast<bm_device_mem_t *>(src);
-    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
-    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *src));
-    LOG(INFO) << "End sync_memcpy process";
+    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), bm_mem_from_device(src)));
+    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
+    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(reinterpret_cast<struct bm_mem_desc *>(src))));
+    LOG(INFO) << "BM sync_memcpy: device to host, finished";
 };
 
 //static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 49a6e9364..475fbba84 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -543,7 +543,7 @@ struct TargetWrapper<BM, __device_target> {
         size_t count, __DtoD) {};
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __HtoD) {};
+        size_t count, __HtoD);
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
         size_t count, __DtoH) {};
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
index 932b17138..7d537401c 100644
--- a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
+++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
@@ -148,6 +148,9 @@ bm_status_t bm_memset_device(
 bm_device_mem_t bm_mem_from_system(
     void *              system_addr);
 
+bm_device_mem_t bm_mem_from_device(
+    void *              device_addr);
+	
 /*
 *brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to
 device mem if need_copy is true
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index dce1fae15..555e22675 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -5,7 +5,7 @@
 using namespace anakin::saber;
 
 int get_bm_size() {
-    return 1;
+    return 4;
 }
 
 template <DataType Ddatatype, DataType Hdatatype>
@@ -27,7 +27,7 @@ void test_buffer() {
     x86_ptr = static_cast<Hdtype*>(tmp_x86);
 
     for (int i = 0; i < n0; i++) {
-        x86_ptr[i] = static_cast<Hdtype>(i);
+        x86_ptr[i] = static_cast<Hdtype>(100);
     }
 
     void* tmp_bm;
@@ -105,6 +105,7 @@ void test_buffer() {
     for (int i = 0; i < 10; i++) {
 	std::cout << "x86: " << x86_buf2_ptr[i] << std::endl;
     }
+    */
 
     const Hdtype* bm_buf1_ptr = static_cast<const Hdtype*>(bm_buf1.get_data());
     for (int i = 0; i < 10; i++) {
@@ -115,16 +116,17 @@ void test_buffer() {
     LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); 
     LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype);
     LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype);
-    */
+    
 
     x86_buf1.re_alloc(bm_buf1.get_capacity());
     x86_buf1.sync_copy_from(bm_buf1);
     LOG(INFO) << "deep copy from device buffer to host buffer: ";
     ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
 
-    for (int i = 0; i < 30; i++) {
+    for (int i = 0; i < 10; i++) {
         std::cout << ptr1[i] << std::endl;
     }
+
 }
 
 TEST(TestSaberBufferBM, test_buffer_memcpy) {

From 8925da303d5a923b078b9b4624f642808c9f7468 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 09:39:38 +0800
Subject: [PATCH 193/318] Update BM conv params

---
 saber/funcs/impl/bm/vender_conv.h | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 778094886..530eef528 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -36,6 +36,8 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             ConvParam<OpTensor>& param, Context<BM>& ctx) {
+
+        _handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -50,18 +52,26 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         const InDataType *weight = (const InDataType *) param.weight()->data();
         const InDataType *bias = (const InDataType *) param.bias()->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+
         int input_n = inputs[0]->num();
         int input_c = inputs[0]->channel();
         int input_h = inputs[0]->height();
         int input_w = inputs[0]->width();
-        int group = param.group;
+
+        int output_n = outputs[0]->num();
         int output_c = outputs[0]->channel();
+        int output_h = outputs[0]->height();
+        int output_w = outputs[0]->width();
+
+        int group = param.group;
         int kh = param.weight()->height();
         int kw = param.weight()->width();
         int pad_h = param.pad_h;
         int pad_w = param.pad_w;
         int stride_h = param.stride_h;
         int stride_w = param.stride_w;
+        int dilation_h = param.dilation_h;
+        int dilation_w = param.dilation_w;
 
         bm_tensor_4d_t input_shape = {
             input_n,
@@ -71,10 +81,10 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         };
 
         bm_tensor_4d_t output_shape = {
-            input_n,
+            output_n,
             output_c,
-            input_h,
-            input_w
+            output_h,
+            output_w
         };
 
         bm_kernel_param_t kernel_param = {
@@ -90,12 +100,11 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
             stride_w,
             pad_h,
             pad_w,
-            kh,
-            kw,
+            dilation_h,
+            dilation_w,
             0
         };
 
-        _handle = get_bm_handle();
         BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape, 
                                     kernel_param, output_shape, conv_param, 1, *out_data));
                                     
@@ -103,7 +112,7 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
     }
 
 private:
-    cudnnHandle_t _handle;
+    bm_handle_t _handle;
 };
 
 }

From 3b8ceed27d6dea361118dc95885be5299b0b32b4 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 09:41:52 +0800
Subject: [PATCH 194/318] Init handle in init function

---
 saber/funcs/impl/bm/vender_pooling.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index 108a70708..6e5de79a4 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -35,6 +35,8 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
     virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
                   std::vector<DataTensor_out*>& outputs,
                   PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
+
+        _handle = get_bm_handle();
         return create(inputs, outputs, pooling_param, ctx);
     }
 
@@ -64,7 +66,7 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
         } else {
             is_avg_pooling = 1;
         }
-        _handle = get_bm_handle();
+
         BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, 
                             input_n, input_c, input_h, input_w, kh, kw, pad_h, pad_w, 
                             stride_h, stride_w, is_avg_pooling, out_data));

From abb123e6ab654c1f80c9db6f94a0c709ae139735 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 10:07:03 +0800
Subject: [PATCH 195/318] Include BM conv implementation

---
 saber/funcs/conv.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h
index 5a58bb01c..1626d38a9 100644
--- a/saber/funcs/conv.h
+++ b/saber/funcs/conv.h
@@ -29,6 +29,10 @@
 #include "saber/funcs/impl/impl_conv.h"
 #endif
 
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_conv.h"
+#endif
+
 #ifdef USE_ARM_PLACE
 //#include "saber/funcs/impl/arm/saber_conv.h"
 #endif

From 27ba06b87ed8dbca624e06ed5f6c3b9b99a1f2c1 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 10:12:18 +0800
Subject: [PATCH 196/318] remove unecessary include

---
 saber/funcs/impl/bm/vender_conv.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 530eef528..924bf736c 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -1,8 +1,7 @@
 #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 
-#include "saber/funcs/impl/impl_conv.h"
-#include "saber/funcs/impl/bm/bmdnn_api.h"   
+#include "saber/funcs/impl/impl_conv.h" 
 
 namespace anakin{
 

From 88d7ced5fbd1ee9ee0f1289b2841f7a74ea9313c Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 10:26:09 +0800
Subject: [PATCH 197/318] empty create function

---
 saber/funcs/impl/bm/vender_conv.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 924bf736c..14e52af8e 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -42,7 +42,9 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
 
     virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
-                            ConvParam<OpTensor>& param, Context<BM>& ctx);
+                            ConvParam<OpTensor>& param, Context<BM>& ctx) {
+        
+    }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
                           std::vector<DataTensor_out*>& outputs,

From 7d9bc02dd45bb1ba86cb816a0c4ca382c6446cd1 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 11:18:00 +0800
Subject: [PATCH 198/318] unit test for BM conv

---
 saber/funcs/impl/bm/vender_conv.h         |  6 +-
 test/saber/bm/test_saber_func_conv_BM.cpp | 88 ++---------------------
 2 files changed, 8 insertions(+), 86 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 14e52af8e..220b8a14e 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -1,7 +1,7 @@
 #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
 
-#include "saber/funcs/impl/impl_conv.h" 
+#include "saber/funcs/impl/impl_conv.h"
 
 namespace anakin{
 
@@ -74,6 +74,8 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         int dilation_h = param.dilation_h;
         int dilation_w = param.dilation_w;
 
+        bool with_bias = param.bias()->size() > 0;
+
         bm_tensor_4d_t input_shape = {
             input_n,
             input_c,
@@ -107,7 +109,7 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         };
 
         BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape, 
-                                    kernel_param, output_shape, conv_param, 1, *out_data));
+                                    kernel_param, output_shape, conv_param, with_bias, *out_data));
                                     
         return SaberSuccess;
     }
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
index 9a25d00b3..554bcf843 100644
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -147,18 +147,6 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
 
     //output_dev.sync();
     print_tensor_device(output_dev);
-
-//    param.group = 1;
-//    param.pad_h = 1;
-//    param.pad_w = 1;
-//
-//    LOG(INFO) << " param changed start with group = "<<param.group;
-//    conv(input, output, param, ctx1);
-//
-//    output_dev.sync();
-//    print_tensor_device(output_dev);
-    //cudaDeviceSynchronize();
-    //CUDA_CHECK(cudaPeekAtLastError());
 }
 
 TEST(TestSaberFuncBM, test_conv_param_change) {
@@ -263,7 +251,6 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
 
     conv(input, output, param, ctx1);
     //output_dev.sync();
-//    print_tensor_device(output_dev);
 
     param.group = 1;
     param.pad_h = 1;
@@ -272,15 +259,11 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
     LOG(INFO)<<" param changed start with group = "<<param.group;
     conv(input, output, param, ctx1);
 
-    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    //output[0]->record_event(cuda_stream);
+    //print_tensor_device(output_dev);
 
-    //output_dev.sync();
-//    print_tensor_device(output_dev);
-    //cudaDeviceSynchronize();
-    //CUDA_CHECK(cudaPeekAtLastError());
 }
 
+/*
 TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
 
     int group = 1;
@@ -411,16 +394,6 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     conv0(input0, output0, param0, ctx1);
     conv1(input1, output1, param1, ctx2);
 
-    /*
-    cudaStream_t cuda_stream1 = ctx1.get_compute_stream();
-    output0[0]->record_event(cuda_stream1);
-
-    cudaStream_t cuda_stream2 = ctx2.get_compute_stream();
-    output1[0]->record_event(cuda_stream2);
-
-    out0.sync();
-    out1.sync();
-    */
     print_tensor_device(output_dev);
 
 //    print_tensor_device(output_dev);
@@ -428,6 +401,7 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     //cudaDeviceSynchronize();
     //CUDA_CHECK(cudaPeekAtLastError());
 }
+*/
 #endif
 
 TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
@@ -561,8 +535,6 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
 
     LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
 
-    //cudaDeviceSynchronize();
-    //CUDA_CHECK(cudaPeekAtLastError());
 }
 
 void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
@@ -601,23 +573,6 @@ void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4
     //cudaDeviceSynchronize();
 }
 
-/*
-cublasHandle_t  cublas_handle;
-
-void caffe_gemm(const int M, const int N, const int K,\
-					 const float alpha, const float* A,\
-					 const float* B, const float beta, float* C) {
-    int lda = K;
-    int ldb = N;
-    CUBLAS_CHECK(cublasSgemm(cublas_handle,
-                             CUBLAS_OP_N,
-                             CUBLAS_OP_N,
-                             N, M, K,
-                             &alpha, B,
-                             ldb, A,
-                             lda, &beta,
-                             C, N));
-}
 
 TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     int img_num = 1;
@@ -645,9 +600,6 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     int stride = 1;
     Context<BM> ctx1(0, 1, 1);
 
-    CUBLAS_CHECK(cublasCreate(&cublas_handle));
-    CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream()));
-
     TensorDf4 weights;
     weights.re_alloc({out_channels, in_channels, 1, 1});
 
@@ -684,40 +636,8 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
                          weights, kernel, stride, pad,
             in_channels, out_channels, bias,
             SABER_IMPL);
-    //cudaDeviceSynchronize();
-    caffe_gemm(out_channels, img_h * img_w, in_channels,\
-					 1.f, weights.data(),\
-					 img.data(), 0.f, out_gemm.mutable_data());
-    //cudaDeviceSynchronize();
-    SaberTimer<BM> t1;
-    int ts = 100;
-
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        caffe_gemm(out_channels, img_h * img_w, in_channels,\
-					 1.f, weights.data(),\
-					 img.data(), 0.f, out_gemm.mutable_data());
-        out_gemm.record_event(ctx1.get_compute_stream());
-        out_gemm.sync();
-        t1.end(ctx1);
-    }
-    LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
-
-    //cudaDeviceSynchronize();
-//    print_tensor_device(out);
-//    print_tensor_device(out_gemm);
-    TensorHf4 out_host;
-    TensorHf4 out_gemm_host;
-    out_host.re_alloc(out.shape());
-    out_host.copy_from(out);
-
-    out_gemm_host.re_alloc(out_gemm.shape());
-    out_gemm_host.copy_from(out_gemm);
-    double max_r, max_d;
-    tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d);
-    LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d;
 }
-*/
+
 int main(int argc, const char** argv){
     anakin::saber::Env<BM>::env_init();
 

From 5ce905615a9cb22b34fb63a9e71ec51d18f523c3 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 11:26:06 +0800
Subject: [PATCH 199/318] Update BM tensor print function

---
 saber/core/tensor_op.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 06ee5bd79..6a5d58f03 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -339,7 +339,7 @@ void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tenso
     for (int i = 0; i < tensor.size(); ++i) {
         printf("%.2f ", host_mem[i]);
 
-        if ((i + 1) % (4 * tensor.width()) == 0) {
+        if ((i + 1) % tensor.width() == 0){
             printf("\n");
         }
     }

From 838a2856dfd32a5cc8951aa6cfb1a6cf6f91536c Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Wed, 27 Jun 2018 05:17:48 +0000
Subject: [PATCH 200/318] modify activation op, test pass

---
 saber/funcs/impl/bm/vender_activation.h       | 15 ++-
 .../bm/test_saber_func_activation_BM.cpp      | 91 +++++++++++++++++++
 test/saber/bm/test_saber_func_pooling_BM.cpp  |  2 +-
 3 files changed, 99 insertions(+), 9 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp

diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
index c4baf8365..ec27ac054 100644
--- a/saber/funcs/impl/bm/vender_activation.h
+++ b/saber/funcs/impl/bm/vender_activation.h
@@ -27,7 +27,7 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderActivation(): _handle(NULL), _active_type(NULL) {}
+    VenderActivation(): _handle(NULL), _active_type(Active_relu) {}
 
     ~VenderActivation() {}
 
@@ -35,6 +35,7 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
                             std::vector<DataTensor_out *>& outputs,
                             ActivationParam<OpTensor>& param, Context<BM>& ctx) {
         // not sure
+	_handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -49,14 +50,15 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             ActivationParam<OpTensor>& param) {
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        const InDataType in_data = *(inputs[0]->data());
+        OutDataType out_data = *(outputs[0]->mutable_data());
         int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width();
         int input_n = inputs[0]->num();
 
+        _active_type = param.active;
         switch (_active_type) {
             case Active_relu:
-                BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, input_n, input_dim, out_data));
+                BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, 0.0, input_n, input_dim, out_data));
                 break;
             case Active_sigmoid:
                 BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data));
@@ -64,9 +66,6 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
             case Active_tanh:
                 BMDNN_CHECK(bmdnn_tanh_forward(_handle, in_data, input_n, input_dim, out_data));
                 break;
-            case Active_elu:
-                BMDNN_CHECK(bmdnn_elu_forward(_handle, 1.0, in_data, input_n, input_dim, out_data));
-                break;
         }
         return SaberSuccess;
     }
@@ -76,7 +75,7 @@ class VenderActivation<BM, OpDtype, inDtype, outDtype,\
     ActiveType _active_type;
 };
 
-template class VenderActivation<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+template class VenderActivation<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
 } // namespace saber
 
 } // namespace anakin
diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp
new file mode 100644
index 000000000..42f33e58d
--- /dev/null
+++ b/test/saber/bm/test_saber_func_activation_BM.cpp
@@ -0,0 +1,91 @@
+#include "core/context.h"
+#include "funcs/activation.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+template <typename Tensor>
+void print_tensor_shape(std::string name, Tensor& t0) {
+
+    LOG(INFO) << name << " valid shape is ["
+              << t0.valid_shape()[0] << ", "
+              << t0.valid_shape()[1] << ", "
+              << t0.valid_shape()[2] << ", "
+              << t0.valid_shape()[3] << "].";
+
+    LOG(INFO) << name << " real shape is ["
+              << t0.shape()[0] << ", "
+              << t0.shape()[1] << ", "
+              << t0.shape()[2] << ", "
+              << t0.shape()[3] << "].";
+
+    LOG(INFO) << name << " offset is ["
+              << t0.offset()[0] << ", "
+              << t0.offset()[1] << ", "
+              << t0.offset()[2] << ", "
+              << t0.offset()[3] << "].";
+}
+
+TEST(TestSaberFuncBM, test_func_constructor) {
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    int img_num = 1;
+    int in_channels = 1;
+    int img_h = 8;
+    int img_w = 8;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+
+    int sign = -1;
+    for (int i = 0; i < img_host.size(); ++i) {
+	sign = i % 2 ? -1 : 1;
+        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * sign);
+    }
+
+    img_dev.copy_from(img_host);
+    TensorDf4 output_dev;
+    print_tensor_device(img_dev);
+
+    // start Reshape & doInfer
+
+    Context<BM> ctx1(0, 1, 1);
+
+    ActivationParam<TensorDf4> param(Active_relu);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Activation<BM, AK_BM, AK_BM, AK_BM, NCHW> act;
+    act.compute_output_shape(input, output, param);
+    output_dev.re_alloc(output[0]->shape());
+
+    // init assume output tensor has been reshpaed by user.
+    act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+    act(input, output, param, ctx1);
+
+    print_tensor_device(output_dev);
+}
+
+int main(int argc, const char** argv) {
+    Env<BM>::env_init();
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index e988bc573..fb1a7398d 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -80,7 +80,7 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     pooling(input, output, param, ctx1);
 
     SaberTimer<BM> t1;
-    int ts = 1000;
+    int ts = 100;
 
     for (int i = 0; i < ts; ++i) {
         t1.start(ctx1);

From 272ef52bc9ea245097be3f8690f5a1dfd4695f02 Mon Sep 17 00:00:00 2001
From: hlzy <327842846@qq.com>
Date: Wed, 27 Jun 2018 01:28:34 -0400
Subject: [PATCH 201/318] tensor_test

---
 test/saber/bm/test_saber_tensor_BM.cpp | 49 ++++++++++++++------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 69b1ccbfc..de787908b 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -8,6 +8,8 @@ typedef TargetWrapper<BM> BM_API;
 typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
 typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 typedef TensorHf4::Dtype dtype;
+typedef TensorDf4::Dtype dtype2;
+
 
 static bm_handle_t handle;
 TEST(TestSaberTensorBM, test_tensor_constructor) {
@@ -47,7 +49,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
               << thost0.height() << ", width = " << thost0.width();
 
     //! test tensor mutable_data() function
-    LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 2.f";
+    LOG(INFO) << "|--xxxxxxxxtest tensor mutable_data() function, write tensor data buffer with 2.f";
     fill_tensor_host_const(thost0, 2.f);
     LOG(INFO) << "|--test tensor data() function, show the const data, 2.f";
     print_tensor_host(thost0);
@@ -88,7 +90,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) <<
               "test tensor constructor with data, if target is different, create buffer, and copy the data";
     dtype* host_data_ptr;
-    dtype* dev_data_ptr;
+//    dtype2* dev_data_ptr;
     void* tmp_pt_host;
     void* tmp_pt_dev;
     X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count());
@@ -98,26 +100,28 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
         host_data_ptr[i] = i;
     }
 
-    BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count());
-    dev_data_ptr = static_cast<dtype*>(tmp_pt_dev);
-//    bm_memcpy_d2s(handle,host_data_ptr,dev_data_ptr)
-//    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
+    BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype2) * sh1.count());
+//    dev_data_ptr = static_cast<dtype2*>(tmp_pt_dev);
+//    bm_memcpy_d2s(handle,*dev_data_ptr,bm_mem_from_system(const_cast<float *>(host_data_ptr)));
+
+//---    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
+
     LOG(INFO) << "|--construct host tensor from host data ptr";
     TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from host data ptr";
 
-//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
-
     TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
 
     print_tensor_host(thost3);
 
-    TensorHf4 thost_lian(sh1);
-    thost_lian.copy_from(tdev3);
-    print_tensor_host(thost_lian);
+    print_tensor_device(tdev3);
 
-    thost_lian.copy_from(thost3);
-    print_tensor_host(thost_lian);
+//    TensorHf4 thost_lian(sh1);
+//    thost_lian.copy_from(tdev3);
+//    print_tensor_host(thost_lian);
+//
+//    thost_lian.copy_from(thost3);
+//    print_tensor_host(thost_lian);
 
     //cudaDeviceSynchronize();
     //
@@ -128,16 +132,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
     print_tensor_host(thost4);
     print_tensor_device(tdev4);
-/*
+*/
+
     //BM_API::stream_t dev_stream0;
     //BM_API::create_stream_with_flag(dev_stream0, 1);
     //cudaDeviceSynchronize();
-
+/*
     //! test tensor copy constructor
     LOG(INFO) << "test tensor copy constructor";
     LOG(INFO) << "|--normal copy constructor";
-    TensorHf4 thost5(thost4);
-    TensorDf4 tdev5(tdev4);
+//    TensorHf4 thost5(thost4);
+//    TensorDf4 tdev5(tdev4);
 
     LOG(INFO) << "|--push back to vector";
     std::vector<TensorHf4> vthost;
@@ -146,18 +151,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     vthost.push_back(thost1);
     vthost.push_back(thost2);
     vthost.push_back(thost3);
-    vthost.push_back(thost4);
-    vthost.push_back(thost5);
+//    vthost.push_back(thost4);
+//    vthost.push_back(thost5);
     vtdev.push_back(tdev0);
     vtdev.push_back(tdev1);
     vtdev.push_back(tdev2);
     vtdev.push_back(tdev3);
-    vtdev.push_back(tdev4);
-    vtdev.push_back(tdev5);
+//   vtdev.push_back(tdev4);
+//    vtdev.push_back(tdev5);
     print_tensor_host(vthost[5]);
     print_tensor_device(vtdev[5]);
     //cudaDeviceSynchronize();
-
+/*
     //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied
     LOG(INFO) << "test share_from function";
     TensorHf4 thost6, thost7;

From 033a6ab2122e9cbae215f87d9374cafab1f3893d Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Wed, 27 Jun 2018 06:14:17 +0000
Subject: [PATCH 202/318] Fix sync_memcpy functions & test_saber_buffer_BM all
 passes

---
 saber/core/impl/bm/bm_impl.cpp         | 28 ++++++++++++++++++--------
 saber/core/target_wrapper.h            |  4 ++--
 test/saber/bm/test_saber_buffer_BM.cpp | 24 ++++++----------------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index ef26884b2..a50994a60 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -78,27 +78,39 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
     //BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
-//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-//    size_t count, __DtoD) {};
+void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+    size_t count, __DtoD) {
+    handle = get_bm_handle(); 
+    //BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
+    BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count));
+    LOG(INFO) << "BM sync_memcpy: device to device, finished";
+};
 
 void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __HtoD) {
     handle = get_bm_handle(); 
-    BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), bm_mem_from_system(src)));
+    BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src)));
+    for(int i=0; i<10; i++)
+	std::cout << "HtoD src: " << *((float *)(src)+i) << std::endl;
+    
     LOG(INFO) << "BM sync_memcpy: host to device, finished";
 };
 
 void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __DtoH) {
     handle = get_bm_handle(); 
-    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), bm_mem_from_device(src)));
-    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
-    //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(reinterpret_cast<struct bm_mem_desc *>(src))));
+    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
+    for(int i=0; i<10; i++)
+        std::cout << "DtoH dst: " << *((float *)(dst)+i) << std::endl;
+
     LOG(INFO) << "BM sync_memcpy: device to host, finished";
 };
 
-//static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-//    int src_dev, size_t count) {};
+void BM_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+    int src_dev, size_t count) { 
+
+    LOG(INFO) << "BM sync_memcpy_p2p: temporarily no used";
+};
 
 
 //! target wrapper
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 475fbba84..5c802fa9e 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -540,7 +540,7 @@ struct TargetWrapper<BM, __device_target> {
     // brief create event, empty function for bitmain target
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoD) {};
+        size_t count, __DtoD);
 
     static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
         size_t count, __HtoD);
@@ -549,7 +549,7 @@ struct TargetWrapper<BM, __device_target> {
         size_t count, __DtoH) {};
 
     static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-        int src_dev, size_t count) {};
+        int src_dev, size_t count);
 
     /**
      * \brief device target return currently used device id
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
index 555e22675..f8c8f46bb 100644
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ b/test/saber/bm/test_saber_buffer_BM.cpp
@@ -27,7 +27,7 @@ void test_buffer() {
     x86_ptr = static_cast<Hdtype*>(tmp_x86);
 
     for (int i = 0; i < n0; i++) {
-        x86_ptr[i] = static_cast<Hdtype>(100);
+        x86_ptr[i] = static_cast<Hdtype>(i);
     }
 
     void* tmp_bm;
@@ -97,25 +97,13 @@ void test_buffer() {
     }
 
     CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect";
+    bm_buf1.sync_copy_from(x86_buf2); 
     LOG(INFO) << "deep copy from host buffer to device buffer";
-    bm_buf1.sync_copy_from(x86_buf2);
-    
-    /*
-    const Hdtype* x86_buf2_ptr = static_cast<const Hdtype*>(x86_buf2.get_data());
-    for (int i = 0; i < 10; i++) {
-	std::cout << "x86: " << x86_buf2_ptr[i] << std::endl;
-    }
-    */
-
-    const Hdtype* bm_buf1_ptr = static_cast<const Hdtype*>(bm_buf1.get_data());
-    for (int i = 0; i < 10; i++) {
-	std::cout << "bm: " << bm_buf1_ptr[i] << std::endl;
-    }
 
-    LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count();
-    LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); 
-    LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype);
-    LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype);
+    //LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count();
+    //LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); 
+    //LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype);
+    //LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype);
     
 
     x86_buf1.re_alloc(bm_buf1.get_capacity());

From 9bba50ebdd0e98a11f0835b9f18a33f7d3c38a6d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 14:22:38 +0800
Subject: [PATCH 203/318] Implement BM softmax

---
 saber/funcs/impl/bm/vender_softmax.h         | 106 ++++++++++
 test/saber/bm/test_saber_func_softmax_BM.cpp | 194 +++++++++++++++++++
 test/saber/bm/test_saber_func_softmax_BM.h   |  21 ++
 3 files changed, 321 insertions(+)
 create mode 100644 saber/funcs/impl/bm/vender_softmax.h
 create mode 100644 test/saber/bm/test_saber_func_softmax_BM.cpp
 create mode 100644 test/saber/bm/test_saber_func_softmax_BM.h

diff --git a/saber/funcs/impl/bm/vender_softmax.h b/saber/funcs/impl/bm/vender_softmax.h
new file mode 100644
index 000000000..fb2595e87
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_softmax.h
@@ -0,0 +1,106 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H
+
+#include "saber/funcs/impl/impl_softmax.h"
+#include "saber/saber_funcs_param.h"
+#include "saber/saber_types.h"
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderSoftmax<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        SoftmaxParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderSoftmax(): _handle(NULL) {}
+    ~VenderSoftmax() {}
+
+    /**
+     * \brief initial all bmdnn resources here
+     * @param inputs
+     * @param outputs
+     * @param param
+     * @param ctx
+     */
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            SoftmaxParam<OpTensor>& param, Context<BM>& ctx) {
+
+        _handle = get_bm_handle();
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            SoftmaxParam<OpTensor>& param, Context<BM> &ctx) {
+
+    }
+
+    //call cudnnConvolutionForward here
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          SoftmaxParam<OpTensor> &param){
+
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+
+        /*
+        int outer_num = inputs[0]->count(0, param.axis);
+        int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims());
+
+        int N = outer_num;
+        int K = inputs[0]->valid_shape()[param.axis];
+        int H = inner_num;
+        int W = 1;
+
+        const int stride_w = 1;
+        const int stride_h = W * stride_w;
+        const int stride_c = H * stride_h;
+        const int stride_n = K * stride_c;
+         */
+
+        bmdnn_softmax_forward(
+                _handle,
+                *in_data,
+                input_n,
+                input_c,
+                input_h * input_w,
+                *out_data
+        );
+
+        return SaberSuccess;
+    }
+
+private:
+    bm_handle_t _handle;
+};
+
+} //namespace saber
+
+} //namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H
diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
new file mode 100644
index 000000000..2da0d2e62
--- /dev/null
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -0,0 +1,194 @@
+#include "core/context.h"
+#include "funcs/softmax.h"
+#include "test_saber_func_softmax_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    typedef TensorDf4::Dtype dtype;
+
+    int test_iter = 1000;
+
+    int softmax_axis = 3; // channel
+    int w_in = 3;
+    int h_in = 225;
+    int ch_in = 40;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = shape_in;
+
+    SoftmaxParam<TensorDf4> param(softmax_axis);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    LOG(INFO) << "softmax axis= " << param.axis;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i % 4;
+    }
+
+    TensorDf4 tdin, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    input_dev_4d.push_back(&tdin);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    Softmax<BM, AK_BM> softmax_dev;
+
+    typedef std::vector<Shape> Shape_v;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    output_dev_4d.push_back(&tdout);
+    softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "softmax initialized to cudnn impl";
+    softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "cudnn softmax compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+    }
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter);
+
+    LOG(INFO) << "softmax initialized to saber impl";
+    softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev);
+
+    LOG(INFO) << "saber softmax compute";
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+    }
+
+    t1.end(ctx_dev);
+    ts = t1.get_average_ms();
+    printf("saber softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter);
+    //print_tensor_device(*output_dev_4d[0]);
+}
+
+TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    typedef TensorDf4::Dtype dtype;
+
+    int test_iter = 1;
+
+    int softmax_axis = 3; // channel
+    int w_in = 3;
+    int h_in = 10;
+    int ch_in = 10;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in / 2, h_in / 2, w_in};
+    Shape shape_out = shape_in_roi;
+
+    SoftmaxParam<TensorDf4> param(softmax_axis);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    LOG(INFO) << "softmax axis= " << param.axis;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = (i % 3);
+    }
+
+    TensorDf4 tdin, tdin_roi, tdout, tdout_roi;
+    tdin.re_alloc(shape_in);
+    tdout.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi.share_sub_buffer(tdin, shape_in_roi, Shape(0, 0, 0, 0));
+    input_dev_4d.push_back(&tdin_roi);
+    output_dev_4d.push_back(&tdout_roi);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    Softmax<BM, AK_BM> softmax_dev;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->share_sub_buffer(tdout, shape_in_roi, Shape(0, 0, 0, 0));
+    //output_dev_4d[0]->reshape(output_dev_4d[0]->valid_shape());
+
+    LOG(INFO) << "softmax initialization";
+    softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev);
+
+    LOG(INFO) << "softmax compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+    }
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    printf("total time : %.4f, avg time : %.4f\n", ts, ts / test_iter);
+    print_tensor_device(*output_dev_4d[0]);
+
+    TensorDf4 troi(output_dev_4d[0]->valid_shape());
+    troi.copy_from(*output_dev_4d[0]);
+    print_tensor_device(troi);
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_func_softmax_BM.h b/test/saber/bm/test_saber_func_softmax_BM.h
new file mode 100644
index 000000000..d5c5b6986
--- /dev/null
+++ b/test/saber/bm/test_saber_func_softmax_BM.h
@@ -0,0 +1,21 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/tensor.h"
+
+using namespace anakin::test;
+
+class TestSaberFuncSoftmaxBM : public Test {
+public:
+    TestSaberFuncSoftmaxBM() {}
+    ~TestSaberFuncSoftmaxBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H

From 1a8861bc993a096e922313a3c01bfecba37b29a2 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 14:53:56 +0800
Subject: [PATCH 204/318] only print in DEBUG

---
 saber/core/impl/bm/bm_impl.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index a50994a60..4d24dedf0 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -90,8 +90,11 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __HtoD) {
     handle = get_bm_handle(); 
     BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src)));
+
+    #ifdef DEBUG
     for(int i=0; i<10; i++)
-	std::cout << "HtoD src: " << *((float *)(src)+i) << std::endl;
+	    LOG(INFO) << "HtoD src: " << *((float *)(src)+i);
+    #endif
     
     LOG(INFO) << "BM sync_memcpy: host to device, finished";
 };
@@ -100,8 +103,11 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __DtoH) {
     handle = get_bm_handle(); 
     BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
+
+    #ifdef DEBUG
     for(int i=0; i<10; i++)
-        std::cout << "DtoH dst: " << *((float *)(dst)+i) << std::endl;
+        LOG(INFO) << "DtoH dst: " << *((float *)(dst)+i);
+    #endif
 
     LOG(INFO) << "BM sync_memcpy: device to host, finished";
 };

From 2103811c6bb83874eb2ebb56c997bb98d087663b Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 15:05:37 +0800
Subject: [PATCH 205/318] reduce iteration

---
 test/saber/bm/test_saber_func_softmax_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index 2da0d2e62..8176a9e51 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -16,7 +16,7 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
     typedef TensorDf4::Dtype dtype;
 
-    int test_iter = 1000;
+    int test_iter = 10;
 
     int softmax_axis = 3; // channel
     int w_in = 3;

From 67e9bbd1702271b7d09e9b5006b2ca5190fe32e0 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 15:11:57 +0800
Subject: [PATCH 206/318] Revert "reduce iteration"

This reverts commit 635ff4260496f98657440461c7f251c2b6a4c907.
---
 test/saber/bm/test_saber_func_softmax_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index 8176a9e51..2da0d2e62 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -16,7 +16,7 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
     typedef TensorDf4::Dtype dtype;
 
-    int test_iter = 10;
+    int test_iter = 1000;
 
     int softmax_axis = 3; // channel
     int w_in = 3;

From ceccee48718582feaa580453dfcbba7221f1bd33 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Wed, 27 Jun 2018 08:19:32 +0000
Subject: [PATCH 207/318] modify fc op, compile error

---
 saber/funcs/impl/bm/vender_fc.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 82dd6000c..5004ad349 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -1,6 +1,5 @@
 #ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H
 #define ANAKIN_SABER_FUNCS_BMDNN_FC_H
-
 #include "saber/funcs/impl/impl_fc.h"
 
 namespace anakin{
@@ -34,6 +33,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
+        _handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -46,10 +46,10 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param){
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data();
-        const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        const InDataType in_data = *(inputs[0]->data());
+        const InDataType weights = *(InDataType*)(param.weights->get_buf()->get_data());
+        const InDataType bias = *(InDataType*)(param.bias->get_buf()->get_data());
+        OutDataType out_data = *(outputs[0]->mutable_data());
         int batch_size = inputs[0]->num();
         int input_len = inputs[0]->channel();
         int output_len = param.num_output;
@@ -64,7 +64,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     bm_handle_t _handle;
 };
 
-template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
+template class VenderFc<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
 } //namespace saber
 
 } //namespace anakin

From 944214d6cdf62ff1399b1b1e5d86a05093f3bc7d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 27 Jun 2018 16:53:20 +0800
Subject: [PATCH 208/318] Update for BM softmax

---
 saber/funcs/impl/bm/vender_softmax.h         | 14 +++++++-----
 test/saber/bm/test_saber_func_softmax_BM.cpp | 23 ++++++++++----------
 test/saber/bm/test_saber_func_softmax_BM.h   | 21 ------------------
 3 files changed, 20 insertions(+), 38 deletions(-)
 delete mode 100644 test/saber/bm/test_saber_func_softmax_BM.h

diff --git a/saber/funcs/impl/bm/vender_softmax.h b/saber/funcs/impl/bm/vender_softmax.h
index fb2595e87..55612f66a 100644
--- a/saber/funcs/impl/bm/vender_softmax.h
+++ b/saber/funcs/impl/bm/vender_softmax.h
@@ -63,12 +63,13 @@ class VenderSoftmax<BM, OpDtype, inDtype, outDtype,\
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
 
+        /*
         int input_n = inputs[0]->num();
         int input_c = inputs[0]->channel();
         int input_h = inputs[0]->height();
         int input_w = inputs[0]->width();
+        */
 
-        /*
         int outer_num = inputs[0]->count(0, param.axis);
         int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims());
 
@@ -77,18 +78,19 @@ class VenderSoftmax<BM, OpDtype, inDtype, outDtype,\
         int H = inner_num;
         int W = 1;
 
+        /*
         const int stride_w = 1;
         const int stride_h = W * stride_w;
         const int stride_c = H * stride_h;
         const int stride_n = K * stride_c;
-         */
-
+        */
+        
         bmdnn_softmax_forward(
                 _handle,
                 *in_data,
-                input_n,
-                input_c,
-                input_h * input_w,
+                N,
+                K,
+                H * W,
                 *out_data
         );
 
diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index 2da0d2e62..6c38c7534 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -1,15 +1,16 @@
 #include "core/context.h"
 #include "funcs/softmax.h"
-#include "test_saber_func_softmax_BM.h"
+#include "test_saber_func_BM.h"
 #include "tensor_op.h"
 #include "saber_types.h"
 #include <vector>
 
 using namespace anakin::saber;
 
-TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
-    Env<BM>::env_init();
+TEST(TestSaberFuncBM, test_func_softmax_BM) {
+
+    //Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
@@ -74,8 +75,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
     for (int i = 0; i < test_iter; ++i) {
         softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
+        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        //output_dev_4d[0]->sync();
     }
 
     t1.end(ctx_dev);
@@ -91,8 +92,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
 
     for (int i = 0; i < test_iter; ++i) {
         softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
+        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        //output_dev_4d[0]->sync();
     }
 
     t1.end(ctx_dev);
@@ -101,9 +102,9 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) {
     //print_tensor_device(*output_dev_4d[0]);
 }
 
-TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) {
+TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) {
 
-    Env<BM>::env_init();
+    //Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
@@ -170,8 +171,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) {
 
     for (int i = 0; i < test_iter; ++i) {
         softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
+        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        //output_dev_4d[0]->sync();
     }
 
     t1.end(ctx_dev);
diff --git a/test/saber/bm/test_saber_func_softmax_BM.h b/test/saber/bm/test_saber_func_softmax_BM.h
deleted file mode 100644
index d5c5b6986..000000000
--- a/test/saber/bm/test_saber_func_softmax_BM.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/tensor.h"
-
-using namespace anakin::test;
-
-class TestSaberFuncSoftmaxBM : public Test {
-public:
-    TestSaberFuncSoftmaxBM() {}
-    ~TestSaberFuncSoftmaxBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H

From 6d5c486ca7fd214a30fa7d354b25d252a24f9322 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Wed, 27 Jun 2018 17:39:42 +0800
Subject: [PATCH 209/318] xRevert "modify fc op, compile error"

This reverts commit 2997faf062e8ef4bf6310c425ab369059fec335d.
---
 saber/funcs/impl/bm/vender_fc.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 5004ad349..82dd6000c 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -1,5 +1,6 @@
 #ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H
 #define ANAKIN_SABER_FUNCS_BMDNN_FC_H
+
 #include "saber/funcs/impl/impl_fc.h"
 
 namespace anakin{
@@ -33,7 +34,6 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
-        _handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -46,10 +46,10 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param){
-        const InDataType in_data = *(inputs[0]->data());
-        const InDataType weights = *(InDataType*)(param.weights->get_buf()->get_data());
-        const InDataType bias = *(InDataType*)(param.bias->get_buf()->get_data());
-        OutDataType out_data = *(outputs[0]->mutable_data());
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data();
+        const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
         int batch_size = inputs[0]->num();
         int input_len = inputs[0]->channel();
         int output_len = param.num_output;
@@ -64,7 +64,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     bm_handle_t _handle;
 };
 
-template class VenderFc<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
+template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
 } //namespace saber
 
 } //namespace anakin

From 8a7a8d713100a6867d2cec13bc78d49cd88320af Mon Sep 17 00:00:00 2001
From: hlzy <327842846@qq.com>
Date: Wed, 27 Jun 2018 07:46:39 -0400
Subject: [PATCH 210/318] change tensor_test_bm

---
 .idea/workspace.xml | 86 ++++++++++++++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 32 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 718ee2682..210061337 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -9,23 +9,7 @@
     </configurations>
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="">
-      <change beforePath="$PROJECT_DIR$/saber/core/target_wrapper.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/target_wrapper.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/core/tensor.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/core/tensor.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/framework/core/base_types_test.cpp" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/framework/graph/graph_base_test.cpp" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_buffer_BM.h" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_context_BM.h" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_device_BM.cpp" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_device_BM.h" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_BM.h" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_activation_BM.cpp" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_conv_BM.cpp" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_func_fc_BM.cpp" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_shape_BM.cpp" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_shape_BM.h" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test/saber/bm/test_saber_tensor_BM.cpp" beforeDir="false" afterPath="$PROJECT_DIR$/test/saber/bm/test_saber_tensor_BM.cpp" afterDir="false" />
-    </list>
+    <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="" />
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
     <option name="TRACKING_ENABLED" value="true" />
     <option name="SHOW_DIALOG" value="false" />
@@ -38,17 +22,36 @@
       <file leaf-file-name="saber_funcs_param.h" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="357">
-              <caret line="29" column="12" selection-start-line="29" selection-start-column="12" selection-end-line="29" selection-end-column="12" />
+            <state relative-caret-position="402">
+              <caret line="32" column="12" selection-start-line="32" selection-start-column="12" selection-end-line="32" selection-end-column="12" />
             </state>
           </provider>
         </entry>
       </file>
-      <file leaf-file-name="target_wrapper.h" pinned="false" current-in-tab="true">
+      <file leaf-file-name="target_wrapper.h" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="523">
+            <state relative-caret-position="4423">
               <caret line="523" column="37" selection-start-line="523" selection-start-column="37" selection-end-line="523" selection-end-column="37" />
+              <folding>
+                <element signature="e#14794#16896#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="conv.h" pinned="false" current-in-tab="true">
+        <entry file="file://$PROJECT_DIR$/saber/funcs/conv.h">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="225">
+              <caret line="17" column="29" lean-forward="true" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
+              <folding>
+                <element signature="e#854#946#0" expanded="true" />
+                <element signature="e#976#1015#0" expanded="true" />
+                <element signature="e#1038#1082#0" expanded="true" />
+                <element signature="e#1112#1158#0" expanded="true" />
+                <element signature="e#1212#1259#0" expanded="true" />
+              </folding>
             </state>
           </provider>
         </entry>
@@ -136,6 +139,7 @@
         <option value="$PROJECT_DIR$/CMakeLists.txt" />
         <option value="$PROJECT_DIR$/saber/core/tensor_op.cpp" />
         <option value="$PROJECT_DIR$/saber/core/target_wrapper.h" />
+        <option value="$PROJECT_DIR$/saber/funcs/conv.h" />
       </list>
     </option>
   </component>
@@ -176,14 +180,13 @@
             <path>
               <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
               <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-              <item name="saber" type="462c0819:PsiDirectoryNode" />
-              <item name="core" type="462c0819:PsiDirectoryNode" />
+              <item name="test" type="462c0819:PsiDirectoryNode" />
             </path>
             <path>
               <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
               <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+              <item name="test" type="462c0819:PsiDirectoryNode" />
               <item name="saber" type="462c0819:PsiDirectoryNode" />
-              <item name="funcs" type="462c0819:PsiDirectoryNode" />
             </path>
           </expand>
           <select />
@@ -218,24 +221,23 @@
       <option name="presentableId" value="Default" />
       <updated>1533519941069</updated>
       <workItem from="1533519943497" duration="1090000" />
-      <workItem from="1533533623166" duration="5163000" />
+      <workItem from="1533533623166" duration="5735000" />
     </task>
     <servers />
   </component>
   <component name="TimeTrackingManager">
-    <option name="totallyTimeSpent" value="6253000" />
+    <option name="totallyTimeSpent" value="6825000" />
   </component>
   <component name="ToolWindowManager">
     <frame x="0" y="23" width="2560" height="1353" extended-state="0" />
-    <editor active="true" />
     <layout>
-      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
+      <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
       <window_info anchor="bottom" id="TODO" order="6" />
       <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
       <window_info anchor="bottom" id="Version Control" order="7" weight="0.28850666" />
       <window_info anchor="bottom" id="Run" order="2" />
       <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
-      <window_info anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
+      <window_info active="true" anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
       <window_info id="Favorites" order="2" side_tool="true" />
       <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
@@ -316,8 +318,8 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="357">
-          <caret line="29" column="12" selection-start-line="29" selection-start-column="12" selection-end-line="29" selection-end-column="12" />
+        <state relative-caret-position="402">
+          <caret line="32" column="12" selection-start-line="32" selection-start-column="12" selection-end-line="32" selection-end-column="12" />
         </state>
       </provider>
     </entry>
@@ -360,8 +362,28 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="523">
+        <state relative-caret-position="4423">
           <caret line="523" column="37" selection-start-line="523" selection-start-column="37" selection-end-line="523" selection-end-column="37" />
+          <folding>
+            <element signature="e#14794#16896#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/saber/funcs/softmax.h">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/saber/funcs/conv.h">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="225">
+          <caret line="17" column="29" lean-forward="true" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
+          <folding>
+            <element signature="e#854#946#0" expanded="true" />
+            <element signature="e#976#1015#0" expanded="true" />
+            <element signature="e#1038#1082#0" expanded="true" />
+            <element signature="e#1112#1158#0" expanded="true" />
+            <element signature="e#1212#1259#0" expanded="true" />
+          </folding>
         </state>
       </provider>
     </entry>

From 5aef6ab4ed3be970ea72e4e835caad9330ebe436 Mon Sep 17 00:00:00 2001
From: hlzy <327842846@qq.com>
Date: Wed, 27 Jun 2018 20:19:31 -0400
Subject: [PATCH 211/318] tensor test update

---
 saber/core/tensor.h                    | 30 ++++++++++++
 test/saber/bm/test_saber_tensor_BM.cpp | 67 +++++++++++++++-----------
 2 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 93af6822f..6b2b2a0f8 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -77,6 +77,36 @@ class Tensor {
         _is_subbuf = false;
     }
 
+#ifdef USE_BM
+    /**
+     * \brief Constructor with allocated data ptr and entire memory shape. only for BM
+    */
+    template <typename Dtype_s,typename TargetType_t>
+    Tensor(Dtype_s* data_ptr, TargetType_t target, int id, Shape shape) {
+        CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \
+            "shape dims is not matched to layout type";
+        _shape = shape;
+        _valid_shape = shape;
+        _offset = Shape::zero(shape.dims());
+
+        if(typeid(Dtype_s) == typeid(AK_FLOAT))
+        {
+        std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
+            std::make_shared<Buffer<TargetType_t>>(&bm_mem_from_system(const_cast<Dtype_s *>(data_ptr)), shape.count() * _type_len(), id);
+
+        BufferMemShare(_buf, buf_from_date);
+        }
+        else
+        {
+        std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
+            std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len(), id);
+
+        BufferMemShare(_buf, buf_from_date);
+        }
+        _is_subbuf = false;
+    }
+#endif
+
     /**
      * \brief Copy constructor, shallow copy.
      */
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index de787908b..423ffe221 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -32,7 +32,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     //! test tensor re_alloc function on tensor with data
     LOG(INFO) << "|--test tensor re_alloc function on tensor with data";
-    Shape sh1(2, 4, 4, 2);
+    Shape sh1(1, 4, 4, 4);
     thost0.re_alloc(sh1);
     tdev0.re_alloc(sh1);
     LOG(INFO) << "|--tensor size of host: " << thost0.size();
@@ -59,7 +59,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     TensorHf4 thost1(sh1);
     TensorDf4 tdev1(sh1);
 
-
     //! test tensor copy_from() function
     LOG(INFO) << "test copy_from() function, input tensor could be any target";
 
@@ -69,17 +68,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     // host to device
     tdev1.copy_from(thost0);
-    //TODO: print tensor for BM device
-    //print_tensor_host(tdev1);
+    print_tensor_device(tdev1);
 
     // device to host
     thost1.copy_from(tdev1);
     print_tensor_host(thost1);
 
-    
-    // device to device
+    LOG(INFO) << "test copy_from() function device to device";
+
     tdev1.copy_from(tdev0);
+    print_tensor_device(tdev1);
 
+    
     //! test tensor constructor with shape and real_shape
     LOG(INFO) << "test tensor constructor with shape and real_shape";
     //! constructor with 3 shapes is removed
@@ -90,7 +90,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) <<
               "test tensor constructor with data, if target is different, create buffer, and copy the data";
     dtype* host_data_ptr;
-//    dtype2* dev_data_ptr;
+    dtype2* dev_data_ptr;
     void* tmp_pt_host;
     void* tmp_pt_dev;
     X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count());
@@ -101,16 +101,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     }
 
     BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype2) * sh1.count());
-//    dev_data_ptr = static_cast<dtype2*>(tmp_pt_dev);
-//    bm_memcpy_d2s(handle,*dev_data_ptr,bm_mem_from_system(const_cast<float *>(host_data_ptr)));
-
+    dev_data_ptr = static_cast<dtype2*>(tmp_pt_dev);
 //---    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
-
+    BM_API::sync_memcpy(dev_data_ptr,0,host_data_ptr,0,0,__HtoD());
     LOG(INFO) << "|--construct host tensor from host data ptr";
     TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
     LOG(INFO) << "|--constructor device tensor from host data ptr";
 
-    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+
+    TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+
 
     print_tensor_host(thost3);
 
@@ -125,24 +126,30 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     //cudaDeviceSynchronize();
     //
-/*
+
     LOG(INFO) << "|--construct host tensor from device data ptr";
-    TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
-    LOG(INFO) << "|--constructor device tensor from device data ptr";
-    TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
-    print_tensor_host(thost4);
-    print_tensor_device(tdev4);
-*/
+    TensorHf4 thost4(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+
+    TensorDf4 tdev4(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
+
+//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
+
+//    TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
+//    LOG(INFO) << "|--constructor device tensor from device data ptr";
+//    TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
+//    print_tensor_host(thost4);
+//    print_tensor_device(tdev4);
+
 
     //BM_API::stream_t dev_stream0;
     //BM_API::create_stream_with_flag(dev_stream0, 1);
     //cudaDeviceSynchronize();
-/*
+
     //! test tensor copy constructor
     LOG(INFO) << "test tensor copy constructor";
     LOG(INFO) << "|--normal copy constructor";
-//    TensorHf4 thost5(thost4);
-//    TensorDf4 tdev5(tdev4);
+    TensorHf4 thost5(thost4);
+    TensorDf4 tdev5(tdev4);
 
     LOG(INFO) << "|--push back to vector";
     std::vector<TensorHf4> vthost;
@@ -151,18 +158,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     vthost.push_back(thost1);
     vthost.push_back(thost2);
     vthost.push_back(thost3);
-//    vthost.push_back(thost4);
-//    vthost.push_back(thost5);
+    vthost.push_back(thost4);
+    vthost.push_back(thost5);
     vtdev.push_back(tdev0);
     vtdev.push_back(tdev1);
     vtdev.push_back(tdev2);
     vtdev.push_back(tdev3);
-//   vtdev.push_back(tdev4);
-//    vtdev.push_back(tdev5);
+    vtdev.push_back(tdev4);
+    vtdev.push_back(tdev5);
     print_tensor_host(vthost[5]);
     print_tensor_device(vtdev[5]);
     //cudaDeviceSynchronize();
-/*
+
     //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied
     LOG(INFO) << "test share_from function";
     TensorHf4 thost6, thost7;
@@ -174,7 +181,9 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     Shape sh2(1, 2, 2, 2);
     Shape offset(0, 0, 1, 1);
     LOG(INFO) << "|--shared host";
+
     thost6.share_sub_buffer(thost4, sh2, offset);
+
     LOG(INFO) << "|--copied host";
     tdev6.share_from(thost4);
     LOG(INFO) << "|--copied device";
@@ -182,6 +191,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
     LOG(INFO) << "|--shared device";
     tdev7.share_from(tdev4);
 
+
     LOG(INFO) << "|--change data in shared tensor";
 
     //Shape sh_real = thost6.shape();
@@ -222,8 +232,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) {
 
     LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
     print_tensor_host(thost4);
-     */
-//    bmdnn_deinit(handle);
+    bmdnn_deinit(handle);
 }
 
 /*

From 6af8e17b94e435fde4f4207e02e5473dcd7dff22 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 09:42:35 +0800
Subject: [PATCH 212/318] Add back missing files

---
 test/saber/bm/test_saber_buffer_BM.h    |  20 ++++
 test/saber/bm/test_saber_context_BM.h   |  21 ++++
 test/saber/bm/test_saber_device_BM.cpp  |  20 ++++
 test/saber/bm/test_saber_device_BM.h    |  21 ++++
 test/saber/bm/test_saber_func_BM.h      |  38 ++++++
 test/saber/bm/test_saber_func_fc_BM.cpp | 146 ++++++++++++++++++++++++
 test/saber/bm/test_saber_shape_BM.cpp   | 126 ++++++++++++++++++++
 test/saber/bm/test_saber_shape_BM.h     |  25 ++++
 8 files changed, 417 insertions(+)
 create mode 100644 test/saber/bm/test_saber_buffer_BM.h
 create mode 100644 test/saber/bm/test_saber_context_BM.h
 create mode 100644 test/saber/bm/test_saber_device_BM.cpp
 create mode 100644 test/saber/bm/test_saber_device_BM.h
 create mode 100644 test/saber/bm/test_saber_func_BM.h
 create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp
 create mode 100644 test/saber/bm/test_saber_shape_BM.cpp
 create mode 100644 test/saber/bm/test_saber_shape_BM.h

diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h
new file mode 100644
index 000000000..8bbbe4511
--- /dev/null
+++ b/test/saber/bm/test_saber_buffer_BM.h
@@ -0,0 +1,20 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+
+using namespace anakin::test;
+
+class TestSaberBufferBM : public Test {
+public:
+    TestSaberBufferBM() {}
+    ~TestSaberBufferBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h
new file mode 100644
index 000000000..653ee11fd
--- /dev/null
+++ b/test/saber/bm/test_saber_context_BM.h
@@ -0,0 +1,21 @@
+#ifndef SABER_TEST_SABER_CONTEXT_BM_H
+#define SABER_TEST_SABER_CONTEXT_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/context.h"
+
+using namespace anakin::test;
+
+class TestSaberContextBM : public Test {
+public:
+    TestSaberContextBM() {}
+    ~TestSaberContextBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //SABER_TEST_SABER_CONTEXT_BM_H
diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp
new file mode 100644
index 000000000..1c7086cf1
--- /dev/null
+++ b/test/saber/bm/test_saber_device_BM.cpp
@@ -0,0 +1,20 @@
+#include "test_saber_device_BM.h"
+
+#ifdef USE_BM
+
+using namespace anakin::saber;
+
+TEST(TestSaberDeviceBM, test_BM_device) {
+    Device<BM> dev_BM;
+}
+
+#endif
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h
new file mode 100644
index 000000000..3a6d61236
--- /dev/null
+++ b/test/saber/bm/test_saber_device_BM.h
@@ -0,0 +1,21 @@
+#ifndef SABER_TEST_SABER_DEVICE_BM_H
+#define SABER_TEST_SABER_DEVICE_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/device.h"
+
+using namespace anakin::test;
+
+class TestSaberDeviceBM : public Test {
+public:
+    TestSaberDeviceBM() {}
+    ~TestSaberDeviceBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //SABER_TEST_SABER_DEVICE_BM_H
diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h
new file mode 100644
index 000000000..61d27d6f9
--- /dev/null
+++ b/test/saber/bm/test_saber_func_BM.h
@@ -0,0 +1,38 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "core/tensor.h"
+#include <fstream>
+#include <vector>
+
+using namespace anakin::test;
+
+int read_file(std::vector<float> &results, const char* file_name) {
+
+    std::ifstream infile(file_name);
+    if (!infile.good()) {
+        std::cout << "Cannot open " << std::endl;
+        return false;
+    }
+    LOG(INFO)<<"found filename: "<<file_name;
+    std::string line;
+    while (std::getline(infile, line)) {
+        results.push_back((float)atof(line.c_str()));
+    }
+    return 0;
+}
+
+class TestSaberFuncBM : public Test {
+public:
+    TestSaberFuncBM() {}
+    ~TestSaberFuncBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
new file mode 100644
index 000000000..869ff1bfd
--- /dev/null
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -0,0 +1,146 @@
+#include "core/context.h"
+#include "funcs/fc.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+typedef TargetWrapper<BM> API;
+typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef TensorDf4::Dtype ftype;
+
+void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
+                const TensorHf4& bias, TensorHf4& tout) {
+
+    int m = tin.num();
+    int k = tin.valid_size() / m;
+    int n = weight.valid_size() / k;
+    bool bias_term = bias.valid_size() > 0;
+
+    const float* din = tin.data();
+    const float* w = weight.data();
+    float* dout = tout.mutable_data();
+
+    for (int i = 0; i < m; ++i) {
+        float* pdout = dout + i * n;
+        const float* pdin = din + i * k;
+
+        for (int j = 0; j < n; ++j) {
+            if (bias_term) {
+                pdout[j] = bias.data()[j];
+            } else {
+                pdout[j] = 0;
+            }
+
+            for (int l = 0; l < k; ++l) {
+                pdout[j] += pdin[l] * w[l * n + j];
+            }
+        }
+    }
+}
+
+TEST(TestSaberFuncBM, test_func_fc) {
+
+    int test_iter = 100;
+    int w_in = 7;
+    int h_in = 7;
+    int ch_in = 512;
+    int num_in = 1;
+
+    int num_out = 4096;
+    int axis = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = {num_in, num_out, 1, 1};
+
+    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
+    TensorDf4 weight(sh_w);
+    Shape sh_b{1, 1, 1, num_out};
+    TensorDf4 bias(sh_b);
+    fill_tensor_device_const(weight, 1.f);
+    fill_tensor_device_const(bias, 1.f);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    TensorDf4 tdin;
+    TensorDf4 tdout;
+    tdin.re_alloc(shape_in);
+    fill_tensor_device_const(tdin, 1.f);
+    input_dev_4d.push_back(&tdin);
+    output_dev_4d.push_back(&tdout);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
+
+    Fc<BM, AK_FLOAT> fc;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
+    Shape va_sh = tdout.valid_shape();
+    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
+              va_sh[2] << ", " << va_sh[3];
+    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
+
+    LOG(INFO) << "FC initialization";
+    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
+
+    LOG(INFO) << "FC compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
+        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        output_dev_4d[0]->sync();
+        //cudaDeviceSynchronize();
+    }
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
+    //print_tensor_device(*output_dev_4d[0]);
+
+    //! check result
+    TensorHf4 thin(shape_in);
+    TensorHf4 thout(shape_out);
+    TensorHf4 thw(sh_w);
+    TensorHf4 thb(sh_b);
+    thin.copy_from(tdin);
+    thw.copy_from(weight);
+    thb.copy_from(bias);
+    fc_compute(thin, thw, thb, thout);
+    //print_tensor_host(thout);
+
+    TensorHf4 thout_d(shape_out);
+    thout_d.copy_from(tdout);
+    double max_ratio = 0;
+    double max_diff = 0;
+    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
+    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
+    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
+
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp
new file mode 100644
index 000000000..18479cd18
--- /dev/null
+++ b/test/saber/bm/test_saber_shape_BM.cpp
@@ -0,0 +1,126 @@
+#include "test_saber_shape_BM.h"
+#include "shape.h"
+#include "anakin_config.h"
+
+#ifdef USE_OPENMP
+#include <omp.h>
+#include <core/shape.h>
+#endif
+
+using namespace anakin;
+using namespace saber;
+
+
+TEST(TestSaberShapeBM, test_saber_shape) {
+
+    int dim = 4;
+    Shape sh4d0{0, 0, 0, 0};
+    CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error";
+
+    for (int i = 0; i < dim; ++i) {
+        CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error";
+    }
+
+    CHECK_EQ(sh4d0.count(), 0) << "check shape count error";
+
+    int N = 1;
+    int C = 3;
+    int H = 11;
+    int W = 11;
+    std::vector<int> sh_size = {N, C, H, W};
+    //Shape sh4d1(sh_size);
+    Shape sh4d1(N, C, H, W);
+    LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size();
+    CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!";
+    //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!";
+
+    CHECK_EQ(sh4d1[0], N) << "get shape size error";
+    CHECK_EQ(sh4d1[1], C) << "get shape size error";
+    CHECK_EQ(sh4d1[2], H) << "get shape size error";
+    CHECK_EQ(sh4d1[3], W) << "get shape size error";
+
+    //CHECK_EQ(sh4d2[0], N) << "get shape size error";
+    //CHECK_EQ(sh4d2[1], C) << "get shape size error";
+    //CHECK_EQ(sh4d2[2], H) << "get shape size error";
+    //CHECK_EQ(sh4d2[3], W) << "get shape size error";
+
+    CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed";
+
+    C = 10;
+    sh4d1[1] = C;
+    CHECK_EQ(sh4d1[1], C) << "set shape size error";
+
+    bool is_equal = (sh4d0 == sh4d1);
+    CHECK_EQ(is_equal, false) << "check shape is_equal failed";
+
+    sh4d0 = sh4d1;
+    CHECK_EQ(sh4d1[0], N) << "constructor failed";
+    CHECK_EQ(sh4d1[1], C) << "get shape size error";
+    CHECK_EQ(sh4d1[2], H) << "get shape size error";
+    CHECK_EQ(sh4d1[3], W) << "get shape size error";
+
+    Shape sh4d3 = sh4d1;
+    CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error";
+
+    Shape sh4d4(sh4d1);
+    CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error";
+
+    Shape sh1d0{0};
+    //std::vector<int> sh1d_size = {W};
+
+    //Shape sh1d1(sh1d_size);
+    //Shape sh1d0{W};
+    Shape sh1d1(W);
+
+    Shape sh1d3 = sh1d1;
+    Shape sh1d4(sh1d1);
+
+    CHECK_EQ(sh1d0.dims(), 1) << "shape dim error";
+
+    CHECK_EQ(sh1d0.count(), 0) << "shape size error";
+
+    CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error";
+
+    CHECK_EQ(sh1d1[0], W) << "get shape size error";
+
+    //CHECK_EQ(sh1d2.count(0), W) << "shape dim error";
+
+    CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error";
+
+    CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error";
+
+    CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error";
+
+    Shape sh0{2, 2, 3, 4};
+    Shape sh1{2, 1, 1, 24};
+    Shape sh2{2, 2, 3, 4};
+    Shape sh3{1, 1, 2, 3};
+
+    CHECK_EQ(sh0 == sh2, true) << "error ==";
+    CHECK_EQ(sh3 < sh0, true) << "error <";
+    CHECK_EQ(sh3 >= sh0, false) << "error >=";
+    CHECK_EQ(sh3 > sh0, false) << "error >";
+    CHECK_EQ(sh0 > sh3, true) << "error >";
+    CHECK_EQ(sh0 < sh1, false) << "error <";
+    CHECK_EQ(sh0 <= sh2, true) << "error <=";
+    CHECK_EQ(sh0 >= sh2, true) << "error >=";
+
+    Shape sh001 = Shape::zero(2);
+    Shape sh002 = Shape::zero(3);
+
+    if (sh001 > sh002) {
+        LOG(ERROR) << "error <";
+    }
+
+}
+
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
+
diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h
new file mode 100644
index 000000000..a2ca02c9b
--- /dev/null
+++ b/test/saber/bm/test_saber_shape_BM.h
@@ -0,0 +1,25 @@
+#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+
+#include "utils/unit_test/aktest.h"
+#include "utils/logger/logger.h"
+#include "saber/core/shape.h"
+
+using namespace anakin::test;
+
+class TestSaberShapeBM : public Test {
+public:
+    TestSaberShapeBM() {}
+    ~TestSaberShapeBM() {}
+
+protected:
+    virtual void setup() {}
+    virtual void teardown() {}
+
+protected:
+    std::string name;
+    std::string _test;
+};
+
+#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
+

From 1d98f9fc69f57cb65ae622016daf1d5817822840 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 09:43:19 +0800
Subject: [PATCH 213/318] Add back missing files

---
 test/framework/core/base_types_test.cpp  | 143 +++++++++++++++++++++++
 test/framework/graph/graph_base_test.cpp |  82 +++++++++++++
 2 files changed, 225 insertions(+)
 create mode 100644 test/framework/core/base_types_test.cpp
 create mode 100644 test/framework/graph/graph_base_test.cpp

diff --git a/test/framework/core/base_types_test.cpp b/test/framework/core/base_types_test.cpp
new file mode 100644
index 000000000..0109493bf
--- /dev/null
+++ b/test/framework/core/base_types_test.cpp
@@ -0,0 +1,143 @@
+#include "core_test.h"
+#include "any.h"
+#include "singleton.h"
+#include "tls.h"
+#include "parameter.h"
+#include "thread_pool.h"
+
+#ifdef USE_CUDA
+#include "cuda_funcs.h"
+#include "sass_funcs.h"
+#endif
+
+#include "tensor.h"
+
+#ifdef USE_CUDA
+TEST(CoreComponentsTest, sass_test) {
+    LOG(INFO) << "test for cuda code function";
+    //anakin::saber::Tensor<3, RTCUDA, float, NCHW> ts;
+    //LOG(WARNING) << " tensor num " << ts.num();
+    //ts.set_offset(8);
+    //my_print();
+    LOG(INFO) << "test for sass code function 1";
+    invoke_test();
+    LOG(INFO) << "test for sass code function 2";
+    invoke_test_2();
+}
+#endif
+
+TEST(CoreComponentsTest, core_base_types_any_test) {
+    LOG(INFO) << "test for any class .";
+    LOG(WARNING) << " level 1 : base type int (set 42 to any)";
+    const int a = 42;
+    any any_a(42);
+    int result_a = any_cast<int>(any_a);
+
+    LOG(INFO) << "casted result : " <<  result_a;
+    LOG(WARNING) << " level 2 : base type float (set 42.8 to any)";
+    float b = 42.8;
+    any any_b = b;
+    float result_b = any_cast<float>(any_b);
+    LOG(INFO) << "casted result : " <<  result_b << " decide: ";
+
+    LOG(WARNING) << " level 3 : ptuple type (set PTuple<float> to any)";
+    PTuple<float> p_tuple_float(3.2f, 3.3f, 3.5f);
+    p_tuple_float.push_back(4.3); // push_back
+
+    any p_tuple_float_any = p_tuple_float;
+    auto result_p_tuple_float_any = any_cast<PTuple<float>>(p_tuple_float_any);
+
+    for (int i = 0; i < result_p_tuple_float_any.size(); i++) {
+        LOG(INFO) << " any casted PTuple<float>[" << i << "]: " << result_p_tuple_float_any[i];
+    }
+
+    struct target {
+        void print() {
+            LOG(INFO) << " target struct Successfully recovered.";
+        }
+    };
+
+    LOG(WARNING) << " level 5 : struct type";
+
+    target tg;
+
+    any any_tg = tg;
+
+    target result_tg = any_cast<target>(any_tg);
+
+    result_tg.print();
+
+    LOG(WARNING) << " level other : struct type";
+
+    any any_tg_copy = any_tg;
+
+    target result_tg_copy = any_cast<target>(any_tg);
+
+    result_tg_copy.print();
+}
+
+void at_exit_in_test() {
+    LOG(WARNING) << "core_base_types_singleton_test exit successfully!";
+}
+
+TEST(CoreComponentsTest, core_base_types_singleton_test) {
+    struct target {
+        target() {
+            LOG(INFO) << " singleton target constructed";
+        }
+    };
+    typedef Singleton<target, at_exit_in_test> sg_target;
+    sg_target::Global();
+}
+
+typedef AnakinThreadLocalVar<int> sg_tls;
+void thread_func_0() {
+    int* tmp = sg_tls::value();
+    *tmp = 3;
+    LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value());
+}
+void thread_func_1() {
+    int* tmp = sg_tls::value();
+    *tmp = 4;
+
+    LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value());
+}
+TEST(CoreComponentsTest, core_base_types_tls_test) {
+    LOG(INFO) << " Create tls var 0 , check in two thread.";
+    std::thread first(thread_func_0);
+    std::thread sec(thread_func_1);
+    first.join();
+    sec.join();
+    LOG(INFO) << " main thread var: " << *(sg_tls::value());
+}
+
+int thread_pool_func(int i) {
+    LOG(INFO) << " thread_pool_func input : " << i;
+    //std::this_thread::sleep_for(std::chrono::seconds(0));
+    return i;
+}
+
+TEST(CoreComponentsTest, core_base_types_thread_pool_test) {
+    LOG(INFO) << " Create thread pool with thread num = 12 ";
+    ThreadPool thread_pool_test(100);
+    thread_pool_test.launch();
+    std::function<int(int)> test = thread_pool_func;
+
+    for (int i = 0; i < 50; i++) {
+        // run async
+        auto ret = thread_pool_test.RunAsync(test, i);
+        LOG(INFO) << " return : " << ret.get();
+
+        // run sync
+        //auto sync_ret = thread_pool_test.RunSync(test, i);
+    }
+}
+
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
diff --git a/test/framework/graph/graph_base_test.cpp b/test/framework/graph/graph_base_test.cpp
new file mode 100644
index 000000000..d42e86c02
--- /dev/null
+++ b/test/framework/graph/graph_base_test.cpp
@@ -0,0 +1,82 @@
+#include <string>
+#include "graph_test.h"
+#include "graph_base.h"
+
+using namespace anakin;
+using namespace anakin::graph;
+
+//! Usage sample
+class GraphTestClass : public GraphBase<std::string, int, int> {
+public:
+    GraphTestClass() {}
+    ~GraphTestClass() {}
+    virtual bool directed() {
+        return true;
+    };
+};
+class edge : public Arc<std::string, int> {
+public:
+    edge(std::string btm, std::string top, int weight): Arc<std::string, int>(btm, top, weight) {}
+    ~edge() {}
+};
+
+TEST(GraphTest, graph_base_test) {
+    LOG(INFO) << "test for graph base .";
+
+    GraphTestClass graph;
+    graph.add_vertex("a", 42);
+    graph.add_vertex("b", 43);
+    graph.add_vertex("c", 44);
+    graph.add_vertex("d", 45);
+    graph.add_vertex("e", 46);
+    graph.add_vertex("f", 47);
+
+    edge arc0("a", "b", 0);
+    edge arc1("b", "c", 1);
+    edge arc2("c", "d", 2);
+    edge arc3("d", "e", 3);
+    edge arc4("e", "f", 4);
+    edge arc5("f", "a", 5);
+
+    graph.add_in_arc(arc0);
+    graph.add_in_arc(arc1);
+    graph.add_in_arc(arc2);
+    graph.add_in_arc(arc3);
+    graph.add_in_arc(arc4);
+    graph.add_in_arc(arc5);
+    graph.add_out_arc(arc0);
+    graph.add_out_arc(arc1);
+    graph.add_out_arc(arc2);
+    graph.add_out_arc(arc3);
+    graph.add_out_arc(arc4);
+    graph.add_out_arc(arc5);
+
+    LOG(WARNING) << "Construction of graph.";
+    LOG(INFO) << graph.to_string();
+
+    LOG(WARNING) << "Remove a from graph.";
+    graph.remove("a");
+    LOG(INFO) << graph.to_string();
+
+    LOG(WARNING) << "Add arc: f->b to graph.";
+    edge arc_f_b("f", "b", 10);
+    graph.add_in_arc(arc_f_b);
+    graph.add_out_arc(arc_f_b);
+    LOG(INFO) << graph.to_string();
+
+    LOG(WARNING) << "Add vertex:a and arc: a->e to graph.";
+    graph.add_vertex("a", 47);
+    edge arc_a_e("a", "e", 10);
+    graph.add_out_arc(arc_a_e);
+    graph.add_in_arc(arc_a_e);
+    LOG(INFO) << graph.to_string();
+}
+
+
+int main(int argc, const char** argv) {
+    // initial logger
+    logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}

From 360433a5d2e92ecc480714c65fbbe63305c563fd Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 11:01:56 +0800
Subject: [PATCH 214/318] Implement BM scale

---
 .idea/workspace.xml                |  74 +++++++++++++++----
 saber/funcs/activation.h           |   4 +
 saber/funcs/impl/bm/vender_scale.h | 114 +++++++++++++++++++++++++++++
 3 files changed, 179 insertions(+), 13 deletions(-)
 create mode 100644 saber/funcs/impl/bm/vender_scale.h

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 210061337..d3f2e6bde 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -9,7 +9,10 @@
     </configurations>
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="" />
+    <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="">
+      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_scale.h" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/saber/funcs/activation.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/funcs/activation.h" afterDir="false" />
+    </list>
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
     <option name="TRACKING_ENABLED" value="true" />
     <option name="SHOW_DIALOG" value="false" />
@@ -40,11 +43,11 @@
           </provider>
         </entry>
       </file>
-      <file leaf-file-name="conv.h" pinned="false" current-in-tab="true">
+      <file leaf-file-name="conv.h" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/saber/funcs/conv.h">
           <provider selected="true" editor-type-id="text-editor">
             <state relative-caret-position="225">
-              <caret line="17" column="29" lean-forward="true" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
+              <caret line="17" column="29" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
               <folding>
                 <element signature="e#854#946#0" expanded="true" />
                 <element signature="e#976#1015#0" expanded="true" />
@@ -56,11 +59,24 @@
           </provider>
         </entry>
       </file>
+      <file leaf-file-name="activation.h" pinned="false" current-in-tab="true">
+        <entry file="file://$PROJECT_DIR$/saber/funcs/activation.h">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="525">
+              <caret line="36" column="50" lean-forward="true" selection-start-line="36" selection-start-column="50" selection-end-line="36" selection-end-column="50" />
+              <folding>
+                <element signature="e#1124#1174#0" expanded="true" />
+                <element signature="e#1197#1242#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
       <file leaf-file-name="tensor.h" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="157">
-              <caret line="670" column="15" selection-start-line="670" selection-start-column="15" selection-end-line="670" selection-end-column="15" />
+            <state relative-caret-position="-1028">
+              <caret line="700" column="15" selection-start-line="700" selection-start-column="15" selection-end-line="700" selection-end-column="15" />
             </state>
           </provider>
         </entry>
@@ -140,6 +156,7 @@
         <option value="$PROJECT_DIR$/saber/core/tensor_op.cpp" />
         <option value="$PROJECT_DIR$/saber/core/target_wrapper.h" />
         <option value="$PROJECT_DIR$/saber/funcs/conv.h" />
+        <option value="$PROJECT_DIR$/saber/funcs/activation.h" />
       </list>
     </option>
   </component>
@@ -177,6 +194,18 @@
               <item name="Anakin" type="462c0819:PsiDirectoryNode" />
               <item name="saber" type="462c0819:PsiDirectoryNode" />
             </path>
+            <path>
+              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
+              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+              <item name="saber" type="462c0819:PsiDirectoryNode" />
+              <item name="core" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
+              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+              <item name="saber" type="462c0819:PsiDirectoryNode" />
+              <item name="funcs" type="462c0819:PsiDirectoryNode" />
+            </path>
             <path>
               <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
               <item name="Anakin" type="462c0819:PsiDirectoryNode" />
@@ -186,7 +215,14 @@
               <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
               <item name="Anakin" type="462c0819:PsiDirectoryNode" />
               <item name="test" type="462c0819:PsiDirectoryNode" />
-              <item name="saber" type="462c0819:PsiDirectoryNode" />
+              <item name="framework" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
+              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
+              <item name="test" type="462c0819:PsiDirectoryNode" />
+              <item name="framework" type="462c0819:PsiDirectoryNode" />
+              <item name="core" type="462c0819:PsiDirectoryNode" />
             </path>
           </expand>
           <select />
@@ -221,23 +257,24 @@
       <option name="presentableId" value="Default" />
       <updated>1533519941069</updated>
       <workItem from="1533519943497" duration="1090000" />
-      <workItem from="1533533623166" duration="5735000" />
+      <workItem from="1533533623166" duration="6388000" />
     </task>
     <servers />
   </component>
   <component name="TimeTrackingManager">
-    <option name="totallyTimeSpent" value="6825000" />
+    <option name="totallyTimeSpent" value="7478000" />
   </component>
   <component name="ToolWindowManager">
     <frame x="0" y="23" width="2560" height="1353" extended-state="0" />
+    <editor active="true" />
     <layout>
-      <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
+      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
       <window_info anchor="bottom" id="TODO" order="6" />
       <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
       <window_info anchor="bottom" id="Version Control" order="7" weight="0.28850666" />
       <window_info anchor="bottom" id="Run" order="2" />
       <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
-      <window_info active="true" anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
+      <window_info anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
       <window_info id="Favorites" order="2" side_tool="true" />
       <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
@@ -325,8 +362,8 @@
     </entry>
     <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="157">
-          <caret line="670" column="15" selection-start-line="670" selection-start-column="15" selection-end-line="670" selection-end-column="15" />
+        <state relative-caret-position="-1028">
+          <caret line="700" column="15" selection-start-line="700" selection-start-column="15" selection-end-line="700" selection-end-column="15" />
         </state>
       </provider>
     </entry>
@@ -376,7 +413,7 @@
     <entry file="file://$PROJECT_DIR$/saber/funcs/conv.h">
       <provider selected="true" editor-type-id="text-editor">
         <state relative-caret-position="225">
-          <caret line="17" column="29" lean-forward="true" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
+          <caret line="17" column="29" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
           <folding>
             <element signature="e#854#946#0" expanded="true" />
             <element signature="e#976#1015#0" expanded="true" />
@@ -387,5 +424,16 @@
         </state>
       </provider>
     </entry>
+    <entry file="file://$PROJECT_DIR$/saber/funcs/activation.h">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="525">
+          <caret line="36" column="50" lean-forward="true" selection-start-line="36" selection-start-column="50" selection-end-line="36" selection-end-column="50" />
+          <folding>
+            <element signature="e#1124#1174#0" expanded="true" />
+            <element signature="e#1197#1242#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
   </component>
 </project>
\ No newline at end of file
diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h
index af7c87f9e..86b4cf8c1 100644
--- a/saber/funcs/activation.h
+++ b/saber/funcs/activation.h
@@ -37,6 +37,10 @@
 #include "saber/funcs/impl/arm/saber_activation.h"
 #endif
 
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_scale.h"
+#endif
+
 namespace anakin {
 namespace saber {
 
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
new file mode 100644
index 000000000..e019f1b21
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -0,0 +1,114 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
+
+#include "saber/funcs/impl/impl_scale.h"
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class SaberScale<BM, OpDtype, inDtype, outDtype,\
+    LayOutType_op, LayOutType_in, LayOutType_out> : \
+    public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        ScaleParam<Tensor<BM, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    SaberScale()
+    {}
+
+    ~SaberScale() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ScaleParam<OpTensor>& param, Context<BM>& ctx) {
+
+        _handle = get_bm_handle();
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            ScaleParam<OpTensor>& param, Context<BM> &ctx) {
+
+    }
+    
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          ScaleParam<OpTensor>& param) {
+
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+
+        int axis = (param.num_axes == 0) ? 0 : param.axis;
+        int num_axes = param.num_axes >=0 ? param.num_axes : inputs[0]->shape().dims() - axis;
+
+        int outer_dim = inputs[0]->count(0, axis);
+        int inner_dim = inputs[0]->count(axis + num_axes, inputs[0]->shape().dims());
+        int scale_dim = inputs[0]->count(axis, axis + num_axes);
+        if (inputs.size() == 1) {
+            CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid";
+        }
+
+        OpDataType* scale_data = param.scale_w[0];
+        bmdnn_scale_forward(
+                _handle,
+                //input
+                *in_data,
+                *scale_data,
+                input_n,
+                input_c,
+                input_h,
+                input_w,
+                scale_dim,
+                inner_dim,
+                0,
+                //output
+                new bm_device_mem_t(),
+                *out_data
+        );
+
+        if (param.bias_term) {
+            OpDataType* bias_data = param.scale_b[0];
+            bmdnn_bias_forward(
+                    _handle,
+                    //input
+                    *out_data,
+                    *bias_data,
+                    outer_dim,
+                    inner_dim,
+                    //output
+                    *out_data
+            );
+        }
+
+        return SaberSuccess;
+    }
+private:
+    bm_handle_t _handle;
+};
+
+}
+
+}
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H

From 702b59c759b8e8654b4ead367c3e0cf5c2369726 Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Thu, 28 Jun 2018 11:05:25 +0800
Subject: [PATCH 215/318] pooling test

---
 saber/core/tensor_op.cpp                     |  2 +-
 saber/funcs/impl/bm/vender_pooling.h         |  4 +--
 test/saber/bm/test_saber_func_pooling_BM.cpp | 28 +++++++-------------
 3 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 6a5d58f03..9fbd1f090 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -337,7 +337,7 @@ void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tenso
     bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
 
     for (int i = 0; i < tensor.size(); ++i) {
-        printf("%.2f ", host_mem[i]);
+        printf("%.2f\t", host_mem[i]);
 
         if ((i + 1) % tensor.width() == 0){
             printf("\n");
diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
index 6e5de79a4..1bdcfdecb 100644
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ b/saber/funcs/impl/bm/vender_pooling.h
@@ -28,7 +28,7 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderPooling() : _handle(NULL), _pooling_type(NULL) {}
+    VenderPooling() : _handle(NULL) {}
 
     ~VenderPooling() {}
 
@@ -61,7 +61,7 @@ class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in
         int stride_h = param.stride_h;
         int stride_w = param.stride_w;
         int is_avg_pooling;
-        if(_pooling_type == Pooling_max){
+        if(param.pooling_type == Pooling_max){
             is_avg_pooling = 0;
         } else {
             is_avg_pooling = 1;
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index fb1a7398d..7edfc677b 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -49,16 +49,6 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     int pad_w = 1;
     int stride_h = 1;
     int stride_w = 1;
-    LOG(INFO) << " img_num: " << img_num;
-    LOG(INFO) << " in_channels: " << in_channels;
-    LOG(INFO) << " img_h: " << img_h;
-    LOG(INFO) << " img_w: " << img_w;
-    LOG(INFO) << " window_h: " << window_h;
-    LOG(INFO) << " window_w: " << window_w;
-    LOG(INFO) << " pad_h: " << pad_h;
-    LOG(INFO) << " pad_w: " << pad_w;
-    LOG(INFO) << " stride_h: " << stride_h;
-    LOG(INFO) << " stride_w: " << stride_w;
 
     PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
                                   , stride_h, stride_w, Pooling_max);
@@ -80,7 +70,7 @@ TEST(TestSaberFuncBM, test_func_pooling) {
     pooling(input, output, param, ctx1);
 
     SaberTimer<BM> t1;
-    int ts = 100;
+    int ts = 10;
 
     for (int i = 0; i < ts; ++i) {
         t1.start(ctx1);
@@ -100,7 +90,6 @@ TEST(TestSaberFuncBM, test_func_pooling) {
 
 TEST(TestSaberFuncBM, test_pooling_result) {
 
-    Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef TargetWrapper<X86> X86_API;
@@ -109,7 +98,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
 
     int img_num = 1;
-    int in_channels = 2;
+    int in_channels = 1;
     int img_h = 8;
     int img_w = 8;
 
@@ -122,7 +111,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     img_dev.re_alloc(img_s);
 
     for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
+        img_host.mutable_data()[i] = rand() % 20;
     }
 
     img_dev.copy_from(img_host);
@@ -150,8 +139,8 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     LOG(INFO) << " stride_h: " << stride_h;
     LOG(INFO) << " stride_w: " << stride_w;
 
-    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
-                                  , stride_h, stride_w, Pooling_max);
+    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w,
+                                  stride_h, stride_w, Pooling_average_include_padding);
 
     std::vector<TensorDf4*> input;
     std::vector<TensorDf4*> output;
@@ -169,12 +158,14 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     pooling(input, output, param, ctx1);
 
     output_dev.sync();
+    LOG(INFO) << "tensor data before pooling: ";
+    print_tensor_device(img_dev);
+    LOG(INFO) << "tensor data after pooling: ";
     print_tensor_device(output_dev);
 }
 
 TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
 
-    Env<BM>::env_init();
     typedef TargetWrapper<BM> API;
 
     typedef TargetWrapper<X86> X86_API;
@@ -275,12 +266,13 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     out0.sync();
     out1.sync();
 
-    print_tensor_device(output_dev);
+    /* print_tensor_device(output_dev); */
 }
 
 int main(int argc, const char** argv) {
     // initial logger
     //logger::init(argv[0]);
+    Env<BM>::env_init();
     InitTest();
     RUN_ALL_TESTS(argv[0]);
     return 0;

From 4cabbce9d6eac9a42ab7d0354e6881106aa343ac Mon Sep 17 00:00:00 2001
From: "weihao.huang" <weihao.huang@bitmain.com>
Date: Thu, 28 Jun 2018 03:22:08 +0000
Subject: [PATCH 216/318] Fix d2d mem copy

---
 saber/core/impl/bm/bm_impl.cpp               | 2 +-
 test/saber/bm/test_saber_func_softmax_BM.cpp | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 4d24dedf0..e73e355b7 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -81,7 +81,7 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
 void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     size_t count, __DtoD) {
     handle = get_bm_handle(); 
-    //BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
+    //BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
     BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count));
     LOG(INFO) << "BM sync_memcpy: device to device, finished";
 };
diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index 6c38c7534..d7707fad7 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -17,7 +17,7 @@ TEST(TestSaberFuncBM, test_func_softmax_BM) {
 
     typedef TensorDf4::Dtype dtype;
 
-    int test_iter = 1000;
+    int test_iter = 10;
 
     int softmax_axis = 3; // channel
     int w_in = 3;
@@ -182,12 +182,13 @@ TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) {
 
     TensorDf4 troi(output_dev_4d[0]->valid_shape());
     troi.copy_from(*output_dev_4d[0]);
-    print_tensor_device(troi);
+    //print_tensor_device(troi);
 }
 
 int main(int argc, const char** argv) {
     // initial logger
     //logger::init(argv[0]);
+    Env<BM>::env_init();
     InitTest();
     RUN_ALL_TESTS(argv[0]);
     return 0;

From 4b125ce3303a266f9455c72e2353646c0cd8a05d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 11:39:02 +0800
Subject: [PATCH 217/318] Add batch norm operation

---
 saber/funcs/activation.h                |   6 ++
 saber/funcs/batch_norm.h                | 115 ++++++++++++++++++++++++
 saber/funcs/impl/bm/vender_batch_norm.h |  63 +++++++++++++
 saber/funcs/impl/bm/vender_scale.h      |   6 +-
 saber/funcs/impl/impl_batch_norm.h      |  14 +++
 5 files changed, 201 insertions(+), 3 deletions(-)
 create mode 100644 saber/funcs/batch_norm.h
 create mode 100644 saber/funcs/impl/bm/vender_batch_norm.h
 create mode 100644 saber/funcs/impl/impl_batch_norm.h

diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h
index 86b4cf8c1..bd8a644ca 100644
--- a/saber/funcs/activation.h
+++ b/saber/funcs/activation.h
@@ -79,9 +79,15 @@ class Activation : public BaseFunc<
     virtual SaberStatus init_impl(ImplEnum implenum) override {
         switch (implenum) {
             case VENDER_IMPL:
+<<<<<<< HEAD:saber/funcs/activation.h
                 //this->_impl.push_back(new VenderActivation <TargetType,
                 this->_impl.push_back(new VenderActivation <TargetType,
                         OpDtype>);
+=======
+                this->_impl.push_back(new VenderScale <TargetType,
+                OpDtype, inDtype, outDtype,
+                LayOutType_op, LayOutType_in, LayOutType_out>);
+>>>>>>> Add batch norm operation:saber/funcs/scale.h
                 return SaberSuccess;
 
             case SABER_IMPL:
diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h
new file mode 100644
index 000000000..604687303
--- /dev/null
+++ b/saber/funcs/batch_norm.h
@@ -0,0 +1,115 @@
+#ifndef ANAKIN_SABER_FUNCS_BATCH_NORM_H
+#define ANAKIN_SABER_FUNCS_BATCH_NORM_H
+
+#include "saber/core/tensor.h"
+#include "saber/funcs/base.h"
+#include "saber/saber_funcs_param.h"
+#include "saber/funcs/impl/impl_base.h"
+#include "saber/funcs/impl/impl_batch_norm.h"
+
+#ifdef NVIDIA_GPU
+//todo
+#include "saber/funcs/impl/impl_batch_norm.h"
+#endif
+
+#ifdef USE_X86_PLACE
+//todo
+#include "saber/funcs/impl/impl_batch_norm.h"
+#endif
+
+#ifdef USE_ARM_PLACE
+//todo
+#include "saber/funcs/impl/impl_batch_norm.h"
+#endif
+
+#ifdef USE_BM
+#include "saber/funcs/impl/bm/vender_batch_norm.h"
+#endif
+
+namespace anakin {
+namespace saber {
+
+#ifdef USE_BM
+template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_BM,
+        DataType outDtype = AK_BM,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#else
+template <typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_FLOAT,
+        DataType outDtype = AK_FLOAT,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW
+>
+#endif
+class BatchNorm : public BaseFunc<
+        Tensor<TargetType, inDtype, LayOutType_in>,
+        Tensor<TargetType, outDtype, LayOutType_out>,
+        Tensor<TargetType, OpDtype, LayOutType_op>,
+        ImplBase,
+        BatchNormParam
+> {
+public:
+    using BaseFunc<
+            Tensor<TargetType, inDtype, LayOutType_in>,
+            Tensor<TargetType, outDtype, LayOutType_out>,
+            Tensor<TargetType, OpDtype, LayOutType_op>,
+            ImplBase,
+            BatchNormParam>::BaseFunc;
+
+    BatchNorm() = default;
+
+    typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
+    typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
+    typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
+    typedef BatchNormParam<OpTensor> Param_t;
+    typedef std::vector<InDataTensor *> Input_v;
+    typedef std::vector<OutDataTensor *> Output_v;
+    typedef std::vector<Shape> Shape_v;
+
+    virtual SaberStatus compute_output_shape(const Input_v &input,
+                                             Output_v &output, Param_t &param) override {
+
+        Shape output_shape = (input[0]->valid_shape());
+        return output[0]->set_shape(output_shape);
+    }
+
+    virtual SaberStatus init_impl(ImplEnum implenum) override {
+        switch (implenum) {
+            case VENDER_IMPL:
+                this->_impl.push_back(new VenderBatchNorm <TargetType,
+                OpDtype, inDtype, outDtype,
+                LayOutType_op, LayOutType_in, LayOutType_out>);
+                return SaberSuccess;
+
+            case SABER_IMPL:
+                return SaberUnImplError;
+
+            default:
+                return SaberUnImplError;
+        }
+    }
+
+private:
+
+    virtual void pick_best_static() override {
+        if (true) // some condition?
+            this->_best_impl = this->_impl[0];
+    }
+
+    virtual void pick_best_specify(ImplEnum implenum) override {
+        this->_best_impl = this->_impl[0];
+    }
+
+};
+
+} // namespace saber
+} // namespace anakin
+
+#endif
diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h
new file mode 100644
index 000000000..cf767cd22
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_batch_norm.h
@@ -0,0 +1,63 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H
+
+#include "saber/funcs/impl/impl_batch_norm.h"
+
+namespace anakin{
+
+namespace saber {
+
+template <DataType OpDtype ,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+ public ImplBase<
+    Tensor<BM, inDtype, LayOutType_in>, 
+    Tensor<BM, outDtype, LayOutType_out>,
+    Tensor<BM, OpDtype, LayOutType_op>,
+    BatchNormParam<Tensor<BM, OpDtype, LayOutType_op>>> {
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    VenderBatchNorm() : _handle(NULL) {}
+
+    ~VenderBatchNorm() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
+                  std::vector<DataTensor_out*>& outputs,
+                  BatchNormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
+
+        _handle = get_bm_handle();
+        return create(inputs, outputs, batch_norm_param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
+                std::vector<DataTensor_out*>& outputs,
+                BatchNormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                          BatchNormParam<OpTensor> &param) {
+
+        return SaberSuccess;
+    }
+
+private:
+    bm_handle_t _handle;
+};
+
+} //namespace saber
+
+} // namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index e019f1b21..9ed364173 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -13,7 +13,7 @@ template <DataType OpDtype,
     typename LayOutType_op,
     typename LayOutType_in,
     typename LayOutType_out>
-class SaberScale<BM, OpDtype, inDtype, outDtype,\
+class VenderScale<BM, OpDtype, inDtype, outDtype,\
     LayOutType_op, LayOutType_in, LayOutType_out> : \
     public ImplBase<
         Tensor<BM, inDtype, LayOutType_in>,
@@ -29,10 +29,10 @@ class SaberScale<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    SaberScale()
+    VenderScale()
     {}
 
-    ~SaberScale() {}
+    ~VenderScale() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
diff --git a/saber/funcs/impl/impl_batch_norm.h b/saber/funcs/impl/impl_batch_norm.h
new file mode 100644
index 000000000..5a09220c7
--- /dev/null
+++ b/saber/funcs/impl/impl_batch_norm.h
@@ -0,0 +1,14 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H
+#define ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H
+
+#include "saber/funcs/impl/impl_macro.h"
+namespace anakin{
+
+namespace saber{
+
+DEFINE_OP_CLASS(BatchNorm, BatchnormParam);
+
+}
+}
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H

From 1bda81da5d8f762d475fcb2737a2e4a7e4eb45e7 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 14:22:34 +0800
Subject: [PATCH 218/318] Implement batch norm for BM

---
 saber/funcs/impl/bm/vender_batch_norm.h | 31 +++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h
index cf767cd22..917dc7219 100644
--- a/saber/funcs/impl/bm/vender_batch_norm.h
+++ b/saber/funcs/impl/bm/vender_batch_norm.h
@@ -49,6 +49,37 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
                           std::vector<DataTensor_out*>& outputs,
                           BatchNormParam<OpTensor> &param) {
 
+        const InDataType *in_data = (const InDataType *) inputs[0]->data();
+        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+
+        OpDataType eps = param.eps;
+        OpDataType scale = param.scale;
+
+        bm_device_mem_t mean_ma = bm_mem_from_system(&param.mean);
+        bm_device_mem_t variance_ma = bm_mem_from_system(&param.variance);
+
+        bmdnn_batchnorm_forward_inference(
+                _handle,
+                //input
+                *in_data,
+                mean_ma,
+                variance_ma,
+                scale,
+                new bm_device_mem_t(),
+                eps,
+                input_n,
+                input_c,
+                input_h,
+                input_w,
+                //output
+                *out_data
+        );
+
         return SaberSuccess;
     }
 

From 0c2f59beadad56e24d9470579c3a7c64af7eb471 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 14:57:18 +0800
Subject: [PATCH 219/318] Use template specifications instead of macro

---
 saber/funcs/activation.h                     |  8 +-------
 saber/funcs/batch_norm.h                     | 11 -----------
 test/saber/bm/test_saber_func_conv_BM.cpp    |  4 ++--
 test/saber/bm/test_saber_func_pooling_BM.cpp |  8 ++++----
 test/saber/bm/test_saber_func_softmax_BM.cpp |  4 ++--
 5 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h
index bd8a644ca..74941e639 100644
--- a/saber/funcs/activation.h
+++ b/saber/funcs/activation.h
@@ -38,7 +38,7 @@
 #endif
 
 #ifdef USE_BM
-#include "saber/funcs/impl/bm/vender_scale.h"
+#include "saber/funcs/impl/bm/vender_activation.h"
 #endif
 
 namespace anakin {
@@ -79,15 +79,9 @@ class Activation : public BaseFunc<
     virtual SaberStatus init_impl(ImplEnum implenum) override {
         switch (implenum) {
             case VENDER_IMPL:
-<<<<<<< HEAD:saber/funcs/activation.h
                 //this->_impl.push_back(new VenderActivation <TargetType,
                 this->_impl.push_back(new VenderActivation <TargetType,
                         OpDtype>);
-=======
-                this->_impl.push_back(new VenderScale <TargetType,
-                OpDtype, inDtype, outDtype,
-                LayOutType_op, LayOutType_in, LayOutType_out>);
->>>>>>> Add batch norm operation:saber/funcs/scale.h
                 return SaberSuccess;
 
             case SABER_IMPL:
diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h
index 604687303..f8cf3e693 100644
--- a/saber/funcs/batch_norm.h
+++ b/saber/funcs/batch_norm.h
@@ -29,16 +29,6 @@
 namespace anakin {
 namespace saber {
 
-#ifdef USE_BM
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_BM,
-        DataType outDtype = AK_BM,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW
->
-#else
 template <typename TargetType,
         DataType OpDtype,
         DataType inDtype = AK_FLOAT,
@@ -47,7 +37,6 @@ template <typename TargetType,
         typename LayOutType_in = NCHW,
         typename LayOutType_out = NCHW
 >
-#endif
 class BatchNorm : public BaseFunc<
         Tensor<TargetType, inDtype, LayOutType_in>,
         Tensor<TargetType, outDtype, LayOutType_out>,
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
index 554bcf843..35ffc6006 100644
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -492,7 +492,7 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Conv<BM, AK_BM> conv;
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
     conv.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -546,7 +546,7 @@ void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4
                                     stride, stride,
                                     1, 1,
                                     &weights, &bias);
-    Conv<BM, AK_BM> conv;
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
     conv.compute_output_shape(inputs, outputs, conv_param);
     outputs[0]->re_alloc(outputs[0]->shape());
     Context<BM> ctx1(0, 1, 1);
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
index 7edfc677b..943ed130b 100644
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ b/test/saber/bm/test_saber_func_pooling_BM.cpp
@@ -148,7 +148,7 @@ TEST(TestSaberFuncBM, test_pooling_result) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_BM> pooling;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling;
     pooling.compute_output_shape(input, output, param);
 
     output_dev.re_alloc(output[0]->shape());
@@ -234,9 +234,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
     input.push_back(&img_dev);
     output.push_back(&output_dev);
 
-    Pooling<BM, AK_BM> pooling;
-    Pooling<BM, AK_BM> pooling0;
-    Pooling<BM, AK_BM> pooling1;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling0;
+    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling1;
 
     pooling.compute_output_shape(input,output,  param);
 
diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
index d7707fad7..645d081f1 100644
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ b/test/saber/bm/test_saber_func_softmax_BM.cpp
@@ -52,7 +52,7 @@ TEST(TestSaberFuncBM, test_func_softmax_BM) {
     // start Reshape & doInfer
     Context<BM> ctx_dev(0, 1, 1);
 
-    Softmax<BM, AK_BM> softmax_dev;
+    Softmax<BM, AK_BM, AK_BM, AK_BM, NCHW> softmax_dev;
 
     typedef std::vector<Shape> Shape_v;
 
@@ -150,7 +150,7 @@ TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) {
     // start Reshape & doInfer
     Context<BM> ctx_dev(0, 1, 1);
 
-    Softmax<BM, AK_BM> softmax_dev;
+    Softmax<BM, AK_BM, AK_BM, AK_BM, NCHW> softmax_dev;
 
     LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
               shape_out[2] << ", " << shape_out[3];

From 03dfabbd7044185b033c3a84d3d90b5ff43dacff Mon Sep 17 00:00:00 2001
From: "tong.liu" <tong.liu@bitmain.com>
Date: Thu, 28 Jun 2018 15:54:53 +0800
Subject: [PATCH 220/318] conv test

---
 test/saber/bm/test_saber_func_conv_BM.cpp | 283 +++++++---------------
 1 file changed, 92 insertions(+), 191 deletions(-)

diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
index 35ffc6006..75663cb8a 100644
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ b/test/saber/bm/test_saber_func_conv_BM.cpp
@@ -4,7 +4,6 @@
 #include "tensor_op.h"
 #include "saber_types.h"
 #include <vector>
-//#include "cublas.h"
 
 using namespace anakin::saber;
 
@@ -39,10 +38,10 @@ inline int i_div_up(int a, int b)
     return (a % b != 0) ? (a / b + 1) : (a / b);
 }
 
-#if 1
-TEST(TestSaberFuncBM, test_depthwise_conv) {
 
-    int group = 2;
+TEST(TestSaberFuncBM, test_conv_result) {
+
+    int group = 1;
     int pad_h = 1;
     int pad_w = 1;
     int stride_h = 1;
@@ -52,30 +51,30 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
 
     int kernel_h = 3;
     int kernel_w = 3;
-    int out_channels = 2;
+    int out_channels = 1;
     
     int img_num = 1;
-    int in_channels = 2;
+    int in_channels = 1;
     int img_h = 8;
     int img_w = 8;
 
     bool bias_term = true;
 
     LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
+    LOG(INFO) << "  img_num = " << img_num;
+    LOG(INFO) << "  in_channels = " << in_channels;
+    LOG(INFO) << "  img_h = " << img_h;
+    LOG(INFO) << "  img_w = " << img_w;
+    LOG(INFO) << "  group = " << group;
+    LOG(INFO) << "  pad_h = " << pad_h;
+    LOG(INFO) << "  pad_w = " << pad_w;
+    LOG(INFO) << "  stride_h = " << stride_h;
+    LOG(INFO) << "  stride_w = " << stride_w;
+    LOG(INFO) << "  dilation_h = " << dilation_h;
+    LOG(INFO) << "  dilation_w = " << dilation_w;
+    LOG(INFO) << "  kernel_h = " << kernel_h;
+    LOG(INFO) << "  kernel_w = " << kernel_w;
+    LOG(INFO) << "  out_channels = " << out_channels;
 
     Shape img_s(img_num, in_channels, img_h, img_w);
     Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
@@ -88,7 +87,7 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
     img_dev.re_alloc(img_s);
 
     for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 63 & i;
+        img_host.mutable_data()[i] = i;
     }
 
     img_dev.copy_from(img_host);
@@ -142,10 +141,9 @@ TEST(TestSaberFuncBM, test_depthwise_conv) {
 
     conv(input, output, param, ctx1);
 
-    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    //output[0]->record_event(cuda_stream);
+    output_dev.sync();
 
-    //output_dev.sync();
+    print_tensor_device(img_dev);
     print_tensor_device(output_dev);
 }
 
@@ -165,27 +163,11 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
 
     int img_num = 1;
     int in_channels = 4;
-    int img_h = 65;
-    int img_w = 63;
+    int img_h = 64;
+    int img_w = 64;
 
     bool bias_term = true;
 
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-
     Shape img_s(img_num, in_channels, img_h, img_w);
     Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
     Shape bias_s(1, out_channels, 1, 1);
@@ -197,7 +179,7 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
     img_dev.re_alloc(img_s);
 
     for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
+        img_host.mutable_data()[i] = i;
     }
 
     img_dev.copy_from(img_host);
@@ -245,25 +227,20 @@ TEST(TestSaberFuncBM, test_conv_param_change) {
     output_dev.re_alloc(output[0]->shape());
     output_host.re_alloc(output[0]->shape());
 
-            LOG(INFO)<<"regular start with group = "<<group;
+    LOG(INFO) << "regular start with group = " << group;
     // init assume output tensor has been reshpaed by user.
     conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
 
     conv(input, output, param, ctx1);
-    //output_dev.sync();
 
     param.group = 1;
     param.pad_h = 1;
     param.pad_w = 1;
 
-    LOG(INFO)<<" param changed start with group = "<<param.group;
+    LOG(INFO) << " param changed start with group = " << param.group;
     conv(input, output, param, ctx1);
-
-    //print_tensor_device(output_dev);
-
 }
 
-/*
 TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
 
     int group = 1;
@@ -283,7 +260,7 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     int img_h = 8;
     int img_w = 8;
 
-    bool bias_term = false;
+    bool bias_term = true;
 
     LOG(INFO) << "conv param: ";
     LOG(INFO) << " img_num = " << img_num;
@@ -376,7 +353,6 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
     output0.push_back(&out0);
     output1.push_back(&out1);
 
-    // FIXME ? where do i get output shape
     output_dev.re_alloc(img_s);
 
     Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv0;
@@ -393,16 +369,7 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
 
     conv0(input0, output0, param0, ctx1);
     conv1(input1, output1, param1, ctx2);
-
-    print_tensor_device(output_dev);
-
-//    print_tensor_device(output_dev);
-
-    //cudaDeviceSynchronize();
-    //CUDA_CHECK(cudaPeekAtLastError());
 }
-*/
-#endif
 
 TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
 
@@ -414,32 +381,29 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
     int dilation_h = 1;
     int dilation_w = 1;
 
-    int kernel_h = 1;
-    int kernel_w = 1;
+    int kernel_h = 3;
+    int kernel_w = 3;
     int out_channels = 128;
 
-    int img_num = 7;
-    int in_channels = 13;
+    int img_num = 64;
+    int in_channels = 4;
     int img_h = 32;
     int img_w = 32;
 
-    bool bias_term = false;
+    bool bias_term = true;
 
     LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
+    LOG(INFO) << "  img_num = " << img_num;
+    LOG(INFO) << "  in_channels = " << in_channels;
+    LOG(INFO) << "  out_channels = " << out_channels;
+    LOG(INFO) << "  img_h = " << img_h;
+    LOG(INFO) << "  img_w = " << img_w;
+    LOG(INFO) << "  group = " << group;
+    LOG(INFO) << "  pad = " << pad_h;
+    LOG(INFO) << "  stride = " << stride_h;
+    LOG(INFO) << "  dilation = " << dilation_h;
+    LOG(INFO) << "  kernel_h = " << kernel_h;
+    LOG(INFO) << "  kernel_w = " << kernel_w;
     Shape img_s(img_num, in_channels, img_h, img_w);
     Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
     Shape bias_s(1, out_channels, 1, 1);
@@ -498,149 +462,86 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
     output_dev.re_alloc(output[0]->shape());
     LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \
         << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]";
-    //LOG(INFO) << " blocks = [ " <<  i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; 
-    //选择k最小的那一组，如果一样，则选128*N，N最大的那一组
-    int k0 = i_div_up(out_channels, 128) * 128 - out_channels;
-    int k1 = i_div_up(out_channels, 64) * 64 - out_channels;
-    int k2 = i_div_up(out_channels, 32) * 32 - out_channels;
-    int kk = std::min(std::min(k0,k1),k2);
-    LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk;
-    if (kk == k0)
-        LOG(INFO) << "thread = [256,1,1] 128*128" ;
-    if (kk == k1)
-        LOG(INFO) << "thread = [128,1,1] 128*64" ;
-    if (kk == k2)
-        LOG(INFO) << "thread = [128,1,1] 128*32" ;
 
     LOG(INFO) << "saber conv init";
-    conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
-
-    LOG(INFO) << "saber conv dispatch";
-    conv(input, output, param, ctx1);
-
-    //cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    //output[0]->record_event(cuda_stream);
+    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
 
-    //output_dev.sync();
+    /* conv(input, output, param, ctx1); */
+    /* output_dev.sync(); */
 
+    LOG(INFO) << "saber conv dispatch";
     SaberTimer<BM> t1;
-    int ts = 1;
-
+    int ts = 100;
+    t1.start(ctx1);
     for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
         conv(input, output, param, ctx1);
         output_dev.sync();
-        t1.end(ctx1);
     }
-
-    LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms";
-
+    t1.end(ctx1);
+    LOG(INFO) << "elapse time: " << t1.get_average_ms()/ts << " ms";
 }
 
-void test_conv_fp32_speed(std::vector<TensorDf4*> &inputs, std::vector<TensorDf4*> &outputs,
-                         TensorDf4 &weights, int kernel_size, int stride, int pad,
-                         int in_channel, int out_channel, TensorDf4 &bias,
-                         anakin::saber::ImplEnum impl) {
-
-    ConvParam<TensorDf4> conv_param(1, pad, pad,
-                                    stride, stride,
-                                    1, 1,
-                                    &weights, &bias);
-    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
-    conv.compute_output_shape(inputs, outputs, conv_param);
-    outputs[0]->re_alloc(outputs[0]->shape());
-    Context<BM> ctx1(0, 1, 1);
-
-    SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1));
-
-    conv(inputs, outputs, conv_param, ctx1);
-    outputs[0]->record_event(ctx1.get_compute_stream());
-    outputs[0]->sync();
-
-    //cudaDeviceSynchronize();
-
-    SaberTimer<BM> t1;
-    int ts = 100;
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        conv(inputs, outputs, conv_param, ctx1);
-        outputs[0]->record_event(ctx1.get_compute_stream());
-        outputs[0]->sync();
-        t1.end(ctx1);
-    }
-            LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms";
-
-    //cudaDeviceSynchronize();
-}
-
-
 TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
     int img_num = 1;
     int kernel = 1;
-
-//    int out_channels = 32;
-//    int in_channels = 128;
-//    int img_h = 52;
-//    int img_w = 112;
-//    int out_channels = 64;
-//    int in_channels = 256;
-//    int img_h = 26;
-//    int img_w = 56;
     int out_channels = 128;
     int in_channels = 512;
-    int img_h = 13;
-    int img_w = 28;
-
-//    int out_channels = 512;
-//    int in_channels = 128;
-//    int img_h = 13;
-//    int img_w = 28;
-
+    int img_h = 32;
+    int img_w = 32;
     int pad = 0;
     int stride = 1;
-    Context<BM> ctx1(0, 1, 1);
 
     TensorDf4 weights;
+    TensorDf4 bias;
     weights.re_alloc({out_channels, in_channels, 1, 1});
+    bias.re_alloc({1, out_channels, 1, 1});
 
-    TensorDf4 img;
+    TensorDf4 img, out;
     img.re_alloc({1, in_channels, img_h, img_w});
 
-    TensorDf4 out;
-    out.re_alloc({1, out_channels, img_h, img_w});
-    TensorDf4 out_gemm;
-    out_gemm.re_alloc({1, out_channels, img_h, img_w});
-
     fill_tensor_device_rand(weights, -1.f, 1.f);
+    fill_tensor_device_rand(bias, -1.f, 1.f);
     fill_tensor_device_rand(img, -1.f, 1.f);
 
-    LOG(INFO) << "img_num: " << img_num;
-    LOG(INFO) << "kernel: " << kernel;
-    LOG(INFO) << "out_channels: " << out_channels;
-    LOG(INFO) << "in_channels: " << in_channels;
-    LOG(INFO) << "img_h: " << img_h;
-    LOG(INFO) << "img_w: " << img_w;
-    LOG(INFO) << "pad: " << pad;
-    LOG(INFO) << "stride: " << stride;
+    LOG(INFO) << "conv param: ";
+    LOG(INFO) << "  img_num: " << img_num;
+    LOG(INFO) << "  kernel: " << kernel;
+    LOG(INFO) << "  out_channels: " << out_channels;
+    LOG(INFO) << "  in_channels: " << in_channels;
+    LOG(INFO) << "  img_h: " << img_h;
+    LOG(INFO) << "  img_w: " << img_w;
+    LOG(INFO) << "  pad: " << pad;
+    LOG(INFO) << "  stride: " << stride;
 
-    TensorDf4 bias;
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img);
+    output.push_back(&out);
+
+    ConvParam<TensorDf4> conv_param(1, pad, pad,
+                                    stride, stride,
+                                    1, 1,
+                                    &weights, &bias);
+    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
+    conv.compute_output_shape(input, output, conv_param);
+    out.re_alloc(output[0]->shape());
+    Context<BM> ctx1(0, 1, 1);
+    conv.init(input, output, conv_param, SPECIFY, VENDER_IMPL, ctx1);
 
-    std::vector<TensorDf4*> input_v;
-    std::vector<TensorDf4*> output_gemm_v, output_v;
-
-    input_v.push_back(&img);
-    output_v.push_back(&out);
-    output_gemm_v.push_back(&out_gemm);
-    //cudaDeviceSynchronize();
-    test_conv_fp32_speed(input_v, output_v,
-                         weights, kernel, stride, pad,
-            in_channels, out_channels, bias,
-            SABER_IMPL);
+    SaberTimer<BM> t1;
+    int ts = 100;
+    t1.start(ctx1);
+    for (int i = 0; i < ts; ++i) {
+        conv(input, output, conv_param, ctx1);
+        out.sync();
+    }
+    t1.end(ctx1);
+    LOG(INFO) << "elapse time: " << t1.get_average_ms()/ts << " ms";
 }
 
 int main(int argc, const char** argv){
-    anakin::saber::Env<BM>::env_init();
-
+    Env<BM>::env_init();
     // initial logger
     //logger::init(argv[0]);
     InitTest();

From bdb0dac1c5b452a33cc892ba599490f1a8d3073a Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 16:41:18 +0800
Subject: [PATCH 221/318] Add test for batch norm

---
 saber/funcs/batch_norm.h                      |  6 +-
 saber/funcs/impl/bm/vender_batch_norm.h       | 16 ++--
 saber/funcs/impl/bm/vender_scale.h            |  3 +-
 .../bm/test_saber_func_batch_norm_BM.cpp      | 81 +++++++++++++++++++
 4 files changed, 95 insertions(+), 11 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_batch_norm_BM.cpp

diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h
index f8cf3e693..2e817c734 100644
--- a/saber/funcs/batch_norm.h
+++ b/saber/funcs/batch_norm.h
@@ -42,7 +42,7 @@ class BatchNorm : public BaseFunc<
         Tensor<TargetType, outDtype, LayOutType_out>,
         Tensor<TargetType, OpDtype, LayOutType_op>,
         ImplBase,
-        BatchNormParam
+        BatchnormParam
 > {
 public:
     using BaseFunc<
@@ -50,14 +50,14 @@ class BatchNorm : public BaseFunc<
             Tensor<TargetType, outDtype, LayOutType_out>,
             Tensor<TargetType, OpDtype, LayOutType_op>,
             ImplBase,
-            BatchNormParam>::BaseFunc;
+            BatchnormParam>::BaseFunc;
 
     BatchNorm() = default;
 
     typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
     typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
     typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
-    typedef BatchNormParam<OpTensor> Param_t;
+    typedef BatchnormParam<OpTensor> Param_t;
     typedef std::vector<InDataTensor *> Input_v;
     typedef std::vector<OutDataTensor *> Output_v;
     typedef std::vector<Shape> Shape_v;
diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h
index 917dc7219..e362a256f 100644
--- a/saber/funcs/impl/bm/vender_batch_norm.h
+++ b/saber/funcs/impl/bm/vender_batch_norm.h
@@ -18,7 +18,7 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
     Tensor<BM, inDtype, LayOutType_in>, 
     Tensor<BM, outDtype, LayOutType_out>,
     Tensor<BM, OpDtype, LayOutType_op>,
-    BatchNormParam<Tensor<BM, OpDtype, LayOutType_op>>> {
+    BatchnormParam<Tensor<BM, OpDtype, LayOutType_op>>> {
 public:
     typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
     typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
@@ -34,7 +34,7 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
 
     virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
                   std::vector<DataTensor_out*>& outputs,
-                  BatchNormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
+                  BatchnormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
 
         _handle = get_bm_handle();
         return create(inputs, outputs, batch_norm_param, ctx);
@@ -42,12 +42,12 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
 
     virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
                 std::vector<DataTensor_out*>& outputs,
-                BatchNormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
+                BatchnormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
     }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
                           std::vector<DataTensor_out*>& outputs,
-                          BatchNormParam<OpTensor> &param) {
+                          BatchnormParam<OpTensor> &param) {
 
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
@@ -57,12 +57,14 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
         int input_h = inputs[0]->height();
         int input_w = inputs[0]->width();
 
-        OpDataType eps = param.eps;
-        OpDataType scale = param.scale;
+        float eps = param.eps;
+        float scale = param.scale;
 
         bm_device_mem_t mean_ma = bm_mem_from_system(&param.mean);
         bm_device_mem_t variance_ma = bm_mem_from_system(&param.variance);
 
+        bm_device_mem_t* variance_holder = new bm_device_mem_t();
+
         bmdnn_batchnorm_forward_inference(
                 _handle,
                 //input
@@ -70,7 +72,7 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
                 mean_ma,
                 variance_ma,
                 scale,
-                new bm_device_mem_t(),
+                *variance_holder,
                 eps,
                 input_n,
                 input_c,
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 9ed364173..b47716a03 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -70,6 +70,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
             CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid";
         }
 
+        bm_device_mem_t* scale_extension = new bm_device_mem_t();
         OpDataType* scale_data = param.scale_w[0];
         bmdnn_scale_forward(
                 _handle,
@@ -84,7 +85,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
                 inner_dim,
                 0,
                 //output
-                new bm_device_mem_t(),
+                *scale_extension,
                 *out_data
         );
 
diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
new file mode 100644
index 000000000..659d0f699
--- /dev/null
+++ b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
@@ -0,0 +1,81 @@
+#include "core/context.h"
+#include "funcs/batch_norm.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+
+TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
+
+    typedef TargetWrapper<BM> API;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+    typedef TensorDf4::Dtype dtype;
+
+    //Input / output tensor
+    Shape shape_in(1, 1, 2, 2);
+    Shape shape_out = shape_in;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = 10;
+    }
+
+    TensorDf4 tdin, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    input_dev_4d.push_back(&tdin);
+
+    //Batch norm param
+    std::vector<float> mean;
+    mean.push_back(10);
+
+    std::vector<float> variance;
+    variance.push_back(0);
+
+    float scale_in = 1;
+    float eps_in = float(1e-5);
+
+    BatchnormParam<TensorDf4> param(mean, variance, scale_in);
+
+    //BatachNorm
+    BatchNorm<BM, AK_BM, AK_BM, AK_BM, NCHW> batchNorm;
+
+    output_dev_4d.push_back(&tdout);
+    batchNorm.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "batch norm initialized to bm impl";
+    Context<BM> ctx_dev(0, 1, 1);
+    batchNorm.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "bm batch norm compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    batchNorm(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts);
+
+    print_tensor_device(*output_dev_4d[0]);
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    //Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+

From be07f103143b27be04847f753f7a1c9b87f95412 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 17:50:07 +0800
Subject: [PATCH 222/318] Use specialization

---
 .idea/workspace.xml | 92 ++++++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index d3f2e6bde..a3065b63c 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -9,10 +9,7 @@
     </configurations>
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="">
-      <change afterPath="$PROJECT_DIR$/saber/funcs/impl/bm/vender_scale.h" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/saber/funcs/activation.h" beforeDir="false" afterPath="$PROJECT_DIR$/saber/funcs/activation.h" afterDir="false" />
-    </list>
+    <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="" />
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
     <option name="TRACKING_ENABLED" value="true" />
     <option name="SHOW_DIALOG" value="false" />
@@ -22,11 +19,22 @@
   </component>
   <component name="FileEditorManager">
     <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
-      <file leaf-file-name="saber_funcs_param.h" pinned="false" current-in-tab="false">
+      <file leaf-file-name="saber_funcs_param.h" pinned="false" current-in-tab="true">
         <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="402">
-              <caret line="32" column="12" selection-start-line="32" selection-start-column="12" selection-end-line="32" selection-end-column="12" />
+            <state relative-caret-position="135">
+              <caret line="9" column="64" selection-start-line="9" selection-start-column="64" selection-end-line="9" selection-end-column="64" />
+              <folding>
+                <element signature="e#897#918#0" expanded="true" />
+                <element signature="e#948#969#0" expanded="true" />
+                <element signature="e#2779#2800#0" expanded="true" />
+                <element signature="e#6280#6301#0" expanded="true" />
+                <element signature="e#8615#8636#0" expanded="true" />
+                <element signature="e#9435#9456#0" expanded="true" />
+                <element signature="e#10395#10416#0" expanded="true" />
+                <element signature="e#12109#12130#0" expanded="true" />
+                <element signature="e#13826#13841#0" expanded="true" />
+              </folding>
             </state>
           </provider>
         </entry>
@@ -48,25 +56,22 @@
           <provider selected="true" editor-type-id="text-editor">
             <state relative-caret-position="225">
               <caret line="17" column="29" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
-              <folding>
-                <element signature="e#854#946#0" expanded="true" />
-                <element signature="e#976#1015#0" expanded="true" />
-                <element signature="e#1038#1082#0" expanded="true" />
-                <element signature="e#1112#1158#0" expanded="true" />
-                <element signature="e#1212#1259#0" expanded="true" />
-              </folding>
             </state>
           </provider>
         </entry>
       </file>
-      <file leaf-file-name="activation.h" pinned="false" current-in-tab="true">
+      <file leaf-file-name="activation.h" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/saber/funcs/activation.h">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="525">
-              <caret line="36" column="50" lean-forward="true" selection-start-line="36" selection-start-column="50" selection-end-line="36" selection-end-column="50" />
+            <state relative-caret-position="435">
+              <caret line="29" column="6" selection-start-line="29" selection-start-column="6" selection-end-line="29" selection-end-column="6" />
               <folding>
+                <element signature="e#836#940#0" expanded="true" />
+                <element signature="e#970#1020#0" expanded="true" />
+                <element signature="e#1044#1094#0" expanded="true" />
                 <element signature="e#1124#1174#0" expanded="true" />
-                <element signature="e#1197#1242#0" expanded="true" />
+                <element signature="e#1197#1247#0" expanded="true" />
+                <element signature="e#1302#1349#0" expanded="true" />
               </folding>
             </state>
           </provider>
@@ -142,6 +147,7 @@
       <find>AMD_API</find>
       <find>BM</find>
       <find>print_tensor_host</find>
+      <find>batch</find>
     </findStrings>
   </component>
   <component name="Git.Settings">
@@ -257,24 +263,24 @@
       <option name="presentableId" value="Default" />
       <updated>1533519941069</updated>
       <workItem from="1533519943497" duration="1090000" />
-      <workItem from="1533533623166" duration="6388000" />
+      <workItem from="1533533623166" duration="6760000" />
     </task>
     <servers />
   </component>
   <component name="TimeTrackingManager">
-    <option name="totallyTimeSpent" value="7478000" />
+    <option name="totallyTimeSpent" value="7850000" />
   </component>
   <component name="ToolWindowManager">
     <frame x="0" y="23" width="2560" height="1353" extended-state="0" />
     <editor active="true" />
     <layout>
-      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
+      <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
       <window_info anchor="bottom" id="TODO" order="6" />
       <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
       <window_info anchor="bottom" id="Version Control" order="7" weight="0.28850666" />
       <window_info anchor="bottom" id="Run" order="2" />
       <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
-      <window_info anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
+      <window_info active="true" anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
       <window_info id="Favorites" order="2" side_tool="true" />
       <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
@@ -353,13 +359,6 @@
     <entry file="file://$PROJECT_DIR$/CMakeLists.txt">
       <provider selected="true" editor-type-id="text-editor" />
     </entry>
-    <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="402">
-          <caret line="32" column="12" selection-start-line="32" selection-start-column="12" selection-end-line="32" selection-end-column="12" />
-        </state>
-      </provider>
-    </entry>
     <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
       <provider selected="true" editor-type-id="text-editor">
         <state relative-caret-position="-1028">
@@ -414,23 +413,38 @@
       <provider selected="true" editor-type-id="text-editor">
         <state relative-caret-position="225">
           <caret line="17" column="29" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
-          <folding>
-            <element signature="e#854#946#0" expanded="true" />
-            <element signature="e#976#1015#0" expanded="true" />
-            <element signature="e#1038#1082#0" expanded="true" />
-            <element signature="e#1112#1158#0" expanded="true" />
-            <element signature="e#1212#1259#0" expanded="true" />
-          </folding>
         </state>
       </provider>
     </entry>
     <entry file="file://$PROJECT_DIR$/saber/funcs/activation.h">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="525">
-          <caret line="36" column="50" lean-forward="true" selection-start-line="36" selection-start-column="50" selection-end-line="36" selection-end-column="50" />
+        <state relative-caret-position="435">
+          <caret line="29" column="6" selection-start-line="29" selection-start-column="6" selection-end-line="29" selection-end-column="6" />
           <folding>
+            <element signature="e#836#940#0" expanded="true" />
+            <element signature="e#970#1020#0" expanded="true" />
+            <element signature="e#1044#1094#0" expanded="true" />
             <element signature="e#1124#1174#0" expanded="true" />
-            <element signature="e#1197#1242#0" expanded="true" />
+            <element signature="e#1197#1247#0" expanded="true" />
+            <element signature="e#1302#1349#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="135">
+          <caret line="9" column="64" selection-start-line="9" selection-start-column="64" selection-end-line="9" selection-end-column="64" />
+          <folding>
+            <element signature="e#897#918#0" expanded="true" />
+            <element signature="e#948#969#0" expanded="true" />
+            <element signature="e#2779#2800#0" expanded="true" />
+            <element signature="e#6280#6301#0" expanded="true" />
+            <element signature="e#8615#8636#0" expanded="true" />
+            <element signature="e#9435#9456#0" expanded="true" />
+            <element signature="e#10395#10416#0" expanded="true" />
+            <element signature="e#12109#12130#0" expanded="true" />
+            <element signature="e#13826#13841#0" expanded="true" />
           </folding>
         </state>
       </provider>

From b1ae58a4b29f3cf13c5d8e8f74a56fa8b9190414 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 18:45:39 +0800
Subject: [PATCH 223/318] Update batch norm test for BM

---
 test/saber/bm/test_saber_func_batch_norm_BM.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
index 659d0f699..0453f818a 100644
--- a/test/saber/bm/test_saber_func_batch_norm_BM.cpp
+++ b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
@@ -23,7 +23,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
 
     Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
     for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = 10;
+        thin.mutable_data()[i] = 1+i;
     }
 
     TensorDf4 tdin, tdout;
@@ -31,9 +31,12 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
     tdin.copy_from(thin);
     input_dev_4d.push_back(&tdin);
 
+    LOG(INFO) << "Input tensor is:";
+    print_tensor_device(*input_dev_4d[0]);
+
     //Batch norm param
     std::vector<float> mean;
-    mean.push_back(10);
+    mean.push_back(1);
 
     std::vector<float> variance;
     variance.push_back(0);
@@ -65,7 +68,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
 
     t1.end(ctx_dev);
     float ts = t1.get_average_ms();
-    printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts);
+    printf("bm batch norm total time : %.4f, avg time : %.4f\n", ts, ts);
 
     print_tensor_device(*output_dev_4d[0]);
 }

From 0d454d59080c602250332423f913475b2cd88b5a Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 18:45:39 +0800
Subject: [PATCH 224/318] Update batch norm test for BM

---
 saber/core/common.h                        |   1 +
 saber/funcs/impl/bm/vender_scale.h         |  52 +++------
 test/saber/bm/test_saber_func_scale_BM.cpp | 121 +++++++++++++++++++++
 3 files changed, 136 insertions(+), 38 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_scale_BM.cpp

diff --git a/saber/core/common.h b/saber/core/common.h
index a2110533b..3462a77e2 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -180,6 +180,7 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 
 #include "bmlib_runtime.h"
 #include "bmdnn_api.h"
+#include "bmdnn_ext_api.h"
 #include "bmlib_utils.h"
 
 #define BMDNN_CHECK(condition) \
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index b47716a03..13f1d6322 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -29,8 +29,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    VenderScale()
-    {}
+    VenderScale() {}
 
     ~VenderScale() {}
 
@@ -52,8 +51,8 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
                           std::vector<DataTensor_out*>& outputs,
                           ScaleParam<OpTensor>& param) {
 
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
+        const InDataType in_data = *(inputs[0]->data());
+        OutDataType out_data = *(outputs[0]->mutable_data());
 
         int input_n = inputs[0]->num();
         int input_c = inputs[0]->channel();
@@ -66,43 +65,21 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         int outer_dim = inputs[0]->count(0, axis);
         int inner_dim = inputs[0]->count(axis + num_axes, inputs[0]->shape().dims());
         int scale_dim = inputs[0]->count(axis, axis + num_axes);
-        if (inputs.size() == 1) {
-            CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid";
-        }
+        /* if (inputs.size() == 1) { */
+        /*     CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; */
+        /* } */
 
-        bm_device_mem_t* scale_extension = new bm_device_mem_t();
-        OpDataType* scale_data = param.scale_w[0];
-        bmdnn_scale_forward(
-                _handle,
-                //input
-                *in_data,
-                *scale_data,
-                input_n,
-                input_c,
-                input_h,
-                input_w,
-                scale_dim,
-                inner_dim,
-                0,
-                //output
-                *scale_extension,
-                *out_data
-        );
+        OpDataType scale_data = param.scale_w;
+        BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, scale_data,
+                input_n, input_c, input_h, input_w,
+                scale_dim, inner_dim, 0,
+                bm_mem_null(), out_data));
 
         if (param.bias_term) {
-            OpDataType* bias_data = param.scale_b[0];
-            bmdnn_bias_forward(
-                    _handle,
-                    //input
-                    *out_data,
-                    *bias_data,
-                    outer_dim,
-                    inner_dim,
-                    //output
-                    *out_data
-            );
+            OpDataType bias_data = param.scale_b;
+            BMDNN_CHECK(bmdnn_bias_forward(_handle, in_data, bias_data,
+                    outer_dim, inner_dim, out_data));
         }
-
         return SaberSuccess;
     }
 private:
@@ -110,6 +87,5 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
 };
 
 }
-
 }
 #endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
new file mode 100644
index 000000000..c746a67ff
--- /dev/null
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -0,0 +1,121 @@
+#include "core/context.h"
+#include "funcs/scale.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+template <typename Tensor>
+void print_tensor_shape(std::string name, Tensor& t0) {
+
+    LOG(INFO) << name << " valid shape is ["
+              << t0.valid_shape()[0] << ", "
+              << t0.valid_shape()[1] << ", "
+              << t0.valid_shape()[2] << ", "
+              << t0.valid_shape()[3] << "].";
+
+    LOG(INFO) << name << " real shape is ["
+              << t0.shape()[0] << ", "
+              << t0.shape()[1] << ", "
+              << t0.shape()[2] << ", "
+              << t0.shape()[3] << "].";
+
+    LOG(INFO) << name << " offset is ["
+              << t0.offset()[0] << ", "
+              << t0.offset()[1] << ", "
+              << t0.offset()[2] << ", "
+              << t0.offset()[3] << "].";
+}
+void fill_vector_rand(std::vector<float>& vec) {
+    for (int i = 0; i < vec.size(); i++) {
+        vec[i] = rand() *1.0f/RAND_MAX - 0.5;
+    }
+}
+void print_vector_data(std::vector<float>& vec) {
+    for (int i = 0; i < vec.size(); i++) {
+        printf("%d, %f\n", i, vec[i]);
+    }
+}
+
+void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_term, int scale_dim) {
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    int img_num = n;
+    int in_channels = c;
+    int img_h = h;
+    int img_w = w;
+
+    Shape img_s(img_num, in_channels, img_h, img_w);
+
+    TensorHf4 img_host;
+    TensorDf4 img_dev;
+
+    img_host.re_alloc(img_s);
+    img_dev.re_alloc(img_s);
+    fill_tensor_host_rand(img_host, -0.5, 0.5);
+    img_dev.copy_from(img_host);
+
+    TensorDf4 output_dev;
+
+    Context<BM> ctx1(0, 1, 1);
+    std::vector<float> scale_w;
+    std::vector<float> scale_b;
+    scale_w.resize(scale_dim);
+    fill_vector_rand(scale_w);
+    if (bias_term) {
+        scale_b.resize(scale_dim);
+        fill_vector_rand(scale_b);
+    }
+
+    ScaleParam<TensorDf4> param(bm_mem_from_system(&scale_w[0]), 
+                                bm_mem_from_system(&scale_b[0]), 
+                                bias_term, axis, num_axes);
+
+    std::vector<TensorDf4*> input;
+    std::vector<TensorDf4*> output;
+
+    input.push_back(&img_dev);
+    output.push_back(&output_dev);
+
+    Scale<BM, AK_BM, AK_BM, AK_BM, NCHW> scale;
+    scale.compute_output_shape(input, output, param);
+    output_dev.re_alloc(output[0]->valid_shape());
+
+    // init assume output tensor has been reshpaed by user.
+    scale.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
+    scale(input, output, param, ctx1);
+
+    output_dev.sync();
+    LOG(INFO) << "input data: ";
+    print_tensor_device(img_dev);
+    LOG(INFO) << "output data: ";
+    print_tensor_device(output_dev);
+    LOG(INFO) << "scale_w data: ";
+    print_vector_data(scale_w);
+    if (bias_term) {
+        LOG(INFO) << "scale_b data: ";
+        print_vector_data(scale_b);
+    }
+}
+
+TEST(TestSaberFuncBM, test_func_constructor_elt) {
+    test_scale(2, 2, 4, 4, 1, 1, false, 2);
+    /* test_scale(2, 2, 4, 4, 1, 1, true, 2); */
+    /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
+    /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
+    /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */
+    /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */
+}
+
+
+int main(int argc, const char** argv) {
+    Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+

From 1f4c082148a9d6a0547ecef8f36464abfded4801 Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 20:34:39 +0800
Subject: [PATCH 225/318] Update BM batch norm test

---
 saber/funcs/impl/bm/vender_batch_norm.h         | 6 +++---
 test/saber/bm/test_saber_func_batch_norm_BM.cpp | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h
index e362a256f..4f433a4a9 100644
--- a/saber/funcs/impl/bm/vender_batch_norm.h
+++ b/saber/funcs/impl/bm/vender_batch_norm.h
@@ -59,9 +59,9 @@ class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_
 
         float eps = param.eps;
         float scale = param.scale;
-
-        bm_device_mem_t mean_ma = bm_mem_from_system(&param.mean);
-        bm_device_mem_t variance_ma = bm_mem_from_system(&param.variance);
+        
+        bm_device_mem_t mean_ma = bm_mem_from_system(&param.mean[0]);
+        bm_device_mem_t variance_ma = bm_mem_from_system(&param.variance[0]);
 
         bm_device_mem_t* variance_holder = new bm_device_mem_t();
 
diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
index 0453f818a..395eb525f 100644
--- a/test/saber/bm/test_saber_func_batch_norm_BM.cpp
+++ b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
@@ -39,7 +39,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
     mean.push_back(1);
 
     std::vector<float> variance;
-    variance.push_back(0);
+    variance.push_back(0.001);
 
     float scale_in = 1;
     float eps_in = float(1e-5);

From b96989fde3f085807bcf0b5bccd4aaec41788f3a Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 21:15:41 +0800
Subject: [PATCH 226/318] Use vender scale for test

---
 test/saber/bm/test_saber_func_scale_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index c746a67ff..6b0e309d8 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -86,7 +86,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
     output_dev.re_alloc(output[0]->valid_shape());
 
     // init assume output tensor has been reshpaed by user.
-    scale.init(input, output, param, SPECIFY, SABER_IMPL, ctx1);
+    scale.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
     scale(input, output, param, ctx1);
 
     output_dev.sync();

From ebfcd88c129558266953db6f4cc569705276e0cc Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 21:21:42 +0800
Subject: [PATCH 227/318] Update BM scale

---
 saber/funcs/impl/bm/vender_scale.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 13f1d6322..ce32e898e 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -70,10 +70,11 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         /* } */
 
         OpDataType scale_data = param.scale_w;
+        bm_device_mem_t* scale_extension = new bm_device_mem_t();
         BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, scale_data,
                 input_n, input_c, input_h, input_w,
                 scale_dim, inner_dim, 0,
-                bm_mem_null(), out_data));
+                *scale_extension, out_data));
 
         if (param.bias_term) {
             OpDataType bias_data = param.scale_b;

From 4d5dfaad25494c44078126f3bbb6d02831b3f5cf Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 21:27:00 +0800
Subject: [PATCH 228/318] update BM bias input

---
 saber/funcs/impl/bm/vender_scale.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index ce32e898e..8ecaa1c38 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -78,7 +78,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
 
         if (param.bias_term) {
             OpDataType bias_data = param.scale_b;
-            BMDNN_CHECK(bmdnn_bias_forward(_handle, in_data, bias_data,
+            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bias_data,
                     outer_dim, inner_dim, out_data));
         }
         return SaberSuccess;

From d6f5cbbc899bfbff5db2518158d073e57b84cce8 Mon Sep 17 00:00:00 2001
From: "Guangzhi (Frank) Xie" <guangzhi.xie@berkeley.edu>
Date: Thu, 28 Jun 2018 21:37:46 +0800
Subject: [PATCH 229/318] BM scale test with bias

---
 test/saber/bm/test_saber_func_scale_BM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index 6b0e309d8..d6833bb9a 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -104,7 +104,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
 
 TEST(TestSaberFuncBM, test_func_constructor_elt) {
     test_scale(2, 2, 4, 4, 1, 1, false, 2);
-    /* test_scale(2, 2, 4, 4, 1, 1, true, 2); */
+    test_scale(2, 2, 4, 4, 1, 1, true, 2);
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */

From 3645cb3cebe2e68ad8dd005d6464ed35a173490b Mon Sep 17 00:00:00 2001
From: liuhong03 <hongliu104@gmail.com>
Date: Thu, 28 Jun 2018 21:21:39 -0400
Subject: [PATCH 230/318] fix bias in scale

---
 saber/funcs/impl/bm/vender_scale.h         | 33 ++++++++++++++++++----
 test/saber/bm/test_saber_func_scale_BM.cpp |  6 ++--
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 8ecaa1c38..5f8b6d3bb 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -70,17 +70,40 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         /* } */
 
         OpDataType scale_data = param.scale_w;
-        bm_device_mem_t* scale_extension = new bm_device_mem_t();
+        bm_device_mem_t* data_extension = new bm_device_mem_t();
+        int size = input_n * input_c * input_h * input_w;
+        bm_malloc_device_byte(_handle, data_extension, size * sizeof(float));
         BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, scale_data,
                 input_n, input_c, input_h, input_w,
                 scale_dim, inner_dim, 0,
-                *scale_extension, out_data));
-
+                *data_extension, out_data));
+        
         if (param.bias_term) {
             OpDataType bias_data = param.scale_b;
-            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bias_data,
-                    outer_dim, inner_dim, out_data));
+            float* host_bias = new float[scale_dim];
+            float* host_extension = new float[size];
+            printf(".........\n");
+//        bm_device_mem_t temp;;
+//        bm_malloc_device_byte(_handle, &temp, scale_dim * sizeof(float));
+//            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), temp);
+//            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), reinterpret_cast<bm_device_mem_t>(param.scale_b));
+            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), bm_mem_from_device(&bias_data));
+            int dim = inner_dim * scale_dim;
+            host_bias[0] = 1;
+            host_bias[1] = 2;
+            for (int i = 0; i < size; ++i) {
+                 int bias_dim = (i % dim) / inner_dim;
+                 host_extension[i] = host_bias[bias_dim];
+                 printf("%f, ", host_extension[i]);
+            }
+            printf("\n");
+            bm_memcpy_s2d(_handle, *data_extension, bm_mem_from_system(const_cast<float *>(host_extension)));
+            delete [] host_bias;
+            delete [] host_extension; 
+            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, *data_extension,
+                    outer_dim, scale_dim * inner_dim, out_data));
         }
+        bm_free_device(_handle, *data_extension);
         return SaberSuccess;
     }
 private:
diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index d6833bb9a..cf0a1ad91 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -66,6 +66,8 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
     std::vector<float> scale_b;
     scale_w.resize(scale_dim);
     fill_vector_rand(scale_w);
+    scale_w[0] = 0;
+    scale_w[1] = 0;
     if (bias_term) {
         scale_b.resize(scale_dim);
         fill_vector_rand(scale_b);
@@ -103,8 +105,8 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
 }
 
 TEST(TestSaberFuncBM, test_func_constructor_elt) {
-    test_scale(2, 2, 4, 4, 1, 1, false, 2);
-    test_scale(2, 2, 4, 4, 1, 1, true, 2);
+//    test_scale(1, 2, 1, 2, 1, 1, false, 2);
+    test_scale(1, 2, 1, 2, 1, 1, true, 2);
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */

From df2b2b2290ea13bb9fb5c314e82d4cbcca3e8862 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 10:19:07 +0800
Subject: [PATCH 231/318] Update BM scale ops

---
 saber/funcs/impl/bm/vender_scale.h         | 21 ++++++++-------------
 test/saber/bm/test_saber_func_scale_BM.cpp |  4 ++--
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 5f8b6d3bb..64c4d22a2 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -69,25 +69,19 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         /*     CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; */
         /* } */
 
-        OpDataType scale_data = param.scale_w;
+        float* scale_data = &param.scale_w[0];
         bm_device_mem_t* data_extension = new bm_device_mem_t();
         int size = input_n * input_c * input_h * input_w;
         bm_malloc_device_byte(_handle, data_extension, size * sizeof(float));
-        BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, scale_data,
+        BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, bm_mem_from_system(scale_data),
                 input_n, input_c, input_h, input_w,
                 scale_dim, inner_dim, 0,
                 *data_extension, out_data));
         
         if (param.bias_term) {
-            OpDataType bias_data = param.scale_b;
-            float* host_bias = new float[scale_dim];
+            float* host_bias = &param.scale_b[0];
             float* host_extension = new float[size];
             printf(".........\n");
-//        bm_device_mem_t temp;;
-//        bm_malloc_device_byte(_handle, &temp, scale_dim * sizeof(float));
-//            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), temp);
-//            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), reinterpret_cast<bm_device_mem_t>(param.scale_b));
-            bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), bm_mem_from_device(&bias_data));
             int dim = inner_dim * scale_dim;
             host_bias[0] = 1;
             host_bias[1] = 2;
@@ -97,11 +91,12 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
                  printf("%f, ", host_extension[i]);
             }
             printf("\n");
-            bm_memcpy_s2d(_handle, *data_extension, bm_mem_from_system(const_cast<float *>(host_extension)));
-            delete [] host_bias;
-            delete [] host_extension; 
-            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, *data_extension,
+
+            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension),
                     outer_dim, scale_dim * inner_dim, out_data));
+
+            delete [] host_bias;
+            delete [] host_extension;
         }
         bm_free_device(_handle, *data_extension);
         return SaberSuccess;
diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index cf0a1ad91..066ba194b 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -73,8 +73,8 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
         fill_vector_rand(scale_b);
     }
 
-    ScaleParam<TensorDf4> param(bm_mem_from_system(&scale_w[0]), 
-                                bm_mem_from_system(&scale_b[0]), 
+    ScaleParam<TensorDf4> param(scale_w,
+                                scale_b,
                                 bias_term, axis, num_axes);
 
     std::vector<TensorDf4*> input;

From d345de4b92be1fdbc835126d654bceb8d5bab716 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 10:28:54 +0800
Subject: [PATCH 232/318] cleanup

---
 saber/funcs/impl/bm/vender_scale.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 64c4d22a2..e1acd3bfa 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -83,8 +83,8 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
             float* host_extension = new float[size];
             printf(".........\n");
             int dim = inner_dim * scale_dim;
-            host_bias[0] = 1;
-            host_bias[1] = 2;
+            //host_bias[0] = 1;
+            //host_bias[1] = 2;
             for (int i = 0; i < size; ++i) {
                  int bias_dim = (i % dim) / inner_dim;
                  host_extension[i] = host_bias[bias_dim];

From fb6af6bbf7a5e574b4df8578eb7ca944f927ab3f Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 10:58:57 +0800
Subject: [PATCH 233/318] Update BM scale test

---
 test/saber/bm/test_saber_func_scale_BM.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index 066ba194b..d4e40d44b 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -33,6 +33,11 @@ void fill_vector_rand(std::vector<float>& vec) {
         vec[i] = rand() *1.0f/RAND_MAX - 0.5;
     }
 }
+void fill_vector_const(std::vector<float>& vec, float num) {
+    for (int i = 0; i < vec.size(); i++) {
+        vec[i] = num;
+    }
+}
 void print_vector_data(std::vector<float>& vec) {
     for (int i = 0; i < vec.size(); i++) {
         printf("%d, %f\n", i, vec[i]);
@@ -56,7 +61,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
 
     img_host.re_alloc(img_s);
     img_dev.re_alloc(img_s);
-    fill_tensor_host_rand(img_host, -0.5, 0.5);
+    fill_tensor_host_const(img_host, 1);
     img_dev.copy_from(img_host);
 
     TensorDf4 output_dev;
@@ -65,12 +70,10 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
     std::vector<float> scale_w;
     std::vector<float> scale_b;
     scale_w.resize(scale_dim);
-    fill_vector_rand(scale_w);
-    scale_w[0] = 0;
-    scale_w[1] = 0;
+    fill_vector_const(scale_w, 2);
     if (bias_term) {
         scale_b.resize(scale_dim);
-        fill_vector_rand(scale_b);
+        fill_vector_const(scale_b, 0);
     }
 
     ScaleParam<TensorDf4> param(scale_w,
@@ -105,7 +108,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
 }
 
 TEST(TestSaberFuncBM, test_func_constructor_elt) {
-//    test_scale(1, 2, 1, 2, 1, 1, false, 2);
+    test_scale(1, 2, 1, 2, 1, 1, false, 2);
     test_scale(1, 2, 1, 2, 1, 1, true, 2);
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
     /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */

From e7d2d043abd4d5a0d8d9feef387f275b1c0a0ea2 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 13:52:16 +0800
Subject: [PATCH 234/318] cleanup

---
 saber/funcs/impl/bm/vender_scale.h      |   8 +-
 test/saber/bm/test_saber_func_fc_BM.cpp | 146 ------------------------
 2 files changed, 3 insertions(+), 151 deletions(-)
 delete mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index e1acd3bfa..e2e6fb900 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -81,16 +81,14 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         if (param.bias_term) {
             float* host_bias = &param.scale_b[0];
             float* host_extension = new float[size];
-            printf(".........\n");
+            //printf(".........\n");
             int dim = inner_dim * scale_dim;
-            //host_bias[0] = 1;
-            //host_bias[1] = 2;
             for (int i = 0; i < size; ++i) {
                  int bias_dim = (i % dim) / inner_dim;
                  host_extension[i] = host_bias[bias_dim];
-                 printf("%f, ", host_extension[i]);
+                 //printf("%f, ", host_extension[i]);
             }
-            printf("\n");
+            //printf("\n");
 
             BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension),
                     outer_dim, scale_dim * inner_dim, out_data));
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
deleted file mode 100644
index 869ff1bfd..000000000
--- a/test/saber/bm/test_saber_func_fc_BM.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-#include "core/context.h"
-#include "funcs/fc.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-typedef TargetWrapper<BM> API;
-typedef Tensor<BM, AK_FLOAT, NCHW> TensorDf4;
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef TensorDf4::Dtype ftype;
-
-void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
-                const TensorHf4& bias, TensorHf4& tout) {
-
-    int m = tin.num();
-    int k = tin.valid_size() / m;
-    int n = weight.valid_size() / k;
-    bool bias_term = bias.valid_size() > 0;
-
-    const float* din = tin.data();
-    const float* w = weight.data();
-    float* dout = tout.mutable_data();
-
-    for (int i = 0; i < m; ++i) {
-        float* pdout = dout + i * n;
-        const float* pdin = din + i * k;
-
-        for (int j = 0; j < n; ++j) {
-            if (bias_term) {
-                pdout[j] = bias.data()[j];
-            } else {
-                pdout[j] = 0;
-            }
-
-            for (int l = 0; l < k; ++l) {
-                pdout[j] += pdin[l] * w[l * n + j];
-            }
-        }
-    }
-}
-
-TEST(TestSaberFuncBM, test_func_fc) {
-
-    int test_iter = 100;
-    int w_in = 7;
-    int h_in = 7;
-    int ch_in = 512;
-    int num_in = 1;
-
-    int num_out = 4096;
-    int axis = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_out = {num_in, num_out, 1, 1};
-
-    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
-    TensorDf4 weight(sh_w);
-    Shape sh_b{1, 1, 1, num_out};
-    TensorDf4 bias(sh_b);
-    fill_tensor_device_const(weight, 1.f);
-    fill_tensor_device_const(bias, 1.f);
-
-    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
-              ch_in << ", height=" << h_in << ", width=" << w_in;
-
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-
-    TensorDf4 tdin;
-    TensorDf4 tdout;
-    tdin.re_alloc(shape_in);
-    fill_tensor_device_const(tdin, 1.f);
-    input_dev_4d.push_back(&tdin);
-    output_dev_4d.push_back(&tdout);
-
-    // start Reshape & doInfer
-    Context<BM> ctx_dev(0, 1, 1);
-
-    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
-
-    Fc<BM, AK_FLOAT> fc;
-
-    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
-              shape_out[2] << ", " << shape_out[3];
-
-    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
-
-    LOG(INFO) << "re-alloc tensor buffer";
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
-    Shape va_sh = tdout.valid_shape();
-    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
-              va_sh[2] << ", " << va_sh[3];
-    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
-
-    LOG(INFO) << "FC initialization";
-    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
-
-    LOG(INFO) << "FC compute";
-    SaberTimer<BM> t1;
-    t1.clear();
-    t1.start(ctx_dev);
-
-    for (int i = 0; i < test_iter; ++i) {
-        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
-        output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        output_dev_4d[0]->sync();
-        //cudaDeviceSynchronize();
-    }
-
-    t1.end(ctx_dev);
-    float ts = t1.get_average_ms();
-    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
-    //print_tensor_device(*output_dev_4d[0]);
-
-    //! check result
-    TensorHf4 thin(shape_in);
-    TensorHf4 thout(shape_out);
-    TensorHf4 thw(sh_w);
-    TensorHf4 thb(sh_b);
-    thin.copy_from(tdin);
-    thw.copy_from(weight);
-    thb.copy_from(bias);
-    fc_compute(thin, thw, thb, thout);
-    //print_tensor_host(thout);
-
-    TensorHf4 thout_d(shape_out);
-    thout_d.copy_from(tdout);
-    double max_ratio = 0;
-    double max_diff = 0;
-    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
-    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
-    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
-
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    Env<BM>::env_init();
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-

From 9729c00c7f1fae67b50ccd71a91df272c3c7d1e8 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 14:57:30 +0800
Subject: [PATCH 235/318] flush before next operation

---
 saber/funcs/impl/bm/vender_scale.h         | 4 +---
 test/saber/bm/test_saber_func_scale_BM.cpp | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index e2e6fb900..4e9402a43 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -81,15 +81,13 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
         if (param.bias_term) {
             float* host_bias = &param.scale_b[0];
             float* host_extension = new float[size];
-            //printf(".........\n");
             int dim = inner_dim * scale_dim;
             for (int i = 0; i < size; ++i) {
                  int bias_dim = (i % dim) / inner_dim;
                  host_extension[i] = host_bias[bias_dim];
-                 //printf("%f, ", host_extension[i]);
             }
-            //printf("\n");
 
+            bm_flush(get_bm_handle());
             BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension),
                     outer_dim, scale_dim * inner_dim, out_data));
 
diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
index d4e40d44b..a20b61cbb 100644
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ b/test/saber/bm/test_saber_func_scale_BM.cpp
@@ -73,7 +73,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te
     fill_vector_const(scale_w, 2);
     if (bias_term) {
         scale_b.resize(scale_dim);
-        fill_vector_const(scale_b, 0);
+        fill_vector_const(scale_b, 3);
     }
 
     ScaleParam<TensorDf4> param(scale_w,

From ff9f16c187098e5c7c78bcb65022cfff6bbfe6e7 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 29 Jun 2018 15:54:06 +0800
Subject: [PATCH 236/318] check BM conv bias

---
 saber/funcs/impl/bm/vender_conv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 220b8a14e..7243fd6a4 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -51,7 +51,6 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
                           ConvParam<OpTensor>& param) {
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
         const InDataType *weight = (const InDataType *) param.weight()->data();
-        const InDataType *bias = (const InDataType *) param.bias()->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
 
         int input_n = inputs[0]->num();
@@ -75,6 +74,7 @@ class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
         int dilation_w = param.dilation_w;
 
         bool with_bias = param.bias()->size() > 0;
+        const InDataType *bias = with_bias? (const InDataType *) param.bias()->data() : &bm_mem_null();
 
         bm_tensor_4d_t input_shape = {
             input_n,

From 16bba5b0817df7577a325398eb9ece0c78044d80 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 5 Jul 2018 13:47:59 +0800
Subject: [PATCH 237/318] Update BM tensor test

---
 test/saber/bm/test_saber_tensor_BM.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
index 423ffe221..2400e73c3 100644
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ b/test/saber/bm/test_saber_tensor_BM.cpp
@@ -415,12 +415,12 @@ TEST(TestSaberTensorBM, test_tensor_deepcopy) {
     td21.copy_from(td01);
     print_tensor_device(td21);
     //cudaDeviceSynchronize();
-}
+}*/
 
 TEST(TestSaberTensorBM, test_tensor_shape) {
-    typedef Tensor<X86, AK_BM, NCHW> Tensor4_0;
-    typedef Tensor<X86, AK_BM, NHWC> Tensor4_1;
-    typedef Tensor<X86, AK_BM, HW> Tensor2;
+    typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4_0;
+    typedef Tensor<X86, AK_FLOAT, NHWC> Tensor4_1;
+    typedef Tensor<X86, AK_FLOAT, HW> Tensor2;
 
     int nin = 2;
     int cin = 2;
@@ -562,8 +562,8 @@ TEST(TestSaberTensorBM, test_tensor_op) {
     Shape sh{1, 2, 2, 10};
     TensorDf4 td1(sh);
     TensorHf4 th1(sh);
-    Tensor<BM, AK_INT8, NCHW> td2(sh);
-    Tensor<X86, AK_INT8, NCHW> th2(sh);
+    Tensor<BM, AK_BM, NCHW> td2(sh);
+    Tensor<X86, AK_FLOAT, NCHW> th2(sh);
     LOG(INFO) << "testing host fill tensor with const 1.";
     fill_tensor_host_const(th1, 1.f);
     LOG(INFO) << "data type: float";
@@ -616,9 +616,9 @@ TEST(TestSaberTensorBM, test_tensor_op) {
 TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) {
     Shape sh{1, 1, 2, 10};
     Tensor<BM, AK_BM, NCHW> td1(sh);
-    Tensor<X86, AK_BM, NCHW> th1(sh);
-    Tensor<BM, AK_INT8, NCHW> td2;
-    Tensor<X86, AK_INT8, NCHW> th2;
+    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
+    Tensor<BM, AK_BM, NCHW> td2;
+    Tensor<X86, AK_FLOAT, NCHW> th2;
     td2.set_shape(sh);
     th2.set_shape(sh);
     LOG(INFO) << "testing host fill tensor with const 1.";
@@ -641,7 +641,7 @@ TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) {
 TEST(TestSaberTensorBM, test_tensor_base_type) {
     Shape sh(1, 3, 10, 10);
     Tensor<BM, AK_BM, NCHW> td1(sh);
-    Tensor<X86, AK_BM, NCHW> th1(sh);
+    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
     fill_tensor_host_rand(th1, 0.f, 255.f);
     td1.copy_from(th1);
     TensorBase* tb1;
@@ -652,7 +652,7 @@ TEST(TestSaberTensorBM, test_tensor_base_type) {
     Shape sh11 = th1.valid_shape();
     LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \
               ", h=" << sh11[2] << ", w=" << sh11[3];
-}*/
+}
 
 int main(int argc, const char** argv) {
     // initial logger

From 8059ee1ee3e25d6fe5af8c08698ee167a1194127 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 10 Jul 2018 11:13:29 +0800
Subject: [PATCH 238/318] Implement fc for BM

---
 saber/funcs/impl/bm/vender_fc.h         |  22 ++--
 test/saber/bm/test_saber_func_fc_BM.cpp | 147 ++++++++++++++++++++++++
 2 files changed, 160 insertions(+), 9 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp

diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
index 82dd6000c..c0cd7ea66 100644
--- a/saber/funcs/impl/bm/vender_fc.h
+++ b/saber/funcs/impl/bm/vender_fc.h
@@ -34,6 +34,7 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param, Context<BM>& ctx){
+        _handle = get_bm_handle();
         return create(inputs, outputs, param, ctx);
     }
 
@@ -47,16 +48,20 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
                             std::vector<DataTensor_out *>& outputs,
                             FcParam<OpTensor>& param){
         const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data();
-        const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data();
+        const InDataType *weights = (const InDataType *) param.weights->data();
+        const InDataType *bias = (const InDataType *) param.bias->data();
         OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
-        int batch_size = inputs[0]->num();
-        int input_len = inputs[0]->channel();
+        int batch_size = inputs[0]->count_valid(0, param.axis);
+        int input_len = inputs[0]->count_valid(param.axis, inputs[0]->dims());
         int output_len = param.num_output;
-        int is_transpose = param.is_transpose_weights ? 1 : 0;
-        BMDNN_CHECK(bmdnn_fc_forward(_handle, in_data, weights, bias,
-                                    batch_size, output_len, input_len, is_transpose, 1, 0,
-                                    out_data));
+        if (output_len <= 0) {
+            int weight_size = param.weights->valid_size();
+            output_len = weight_size / input_len;
+        }
+
+        BMDNN_CHECK(bmdnn_fc_forward(_handle, *in_data, *weights, *bias,
+                                    batch_size, output_len, input_len, param.is_transpose_weights, 1, 0,
+                                    *out_data));
         return SaberSuccess;
     };
 
@@ -64,7 +69,6 @@ class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, Lay
     bm_handle_t _handle;
 };
 
-template class VenderFc<BM, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>;
 } //namespace saber
 
 } //namespace anakin
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
new file mode 100644
index 000000000..7b56033e6
--- /dev/null
+++ b/test/saber/bm/test_saber_func_fc_BM.cpp
@@ -0,0 +1,147 @@
+#include "core/context.h"
+#include "funcs/fc.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+typedef TargetWrapper<BM> API;
+typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+typedef TensorDf4::Dtype ftype;
+
+void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
+                const TensorHf4& bias, TensorHf4& tout) {
+
+    int m = tin.num();
+    int k = tin.valid_size() / m;
+    int n = weight.valid_size() / k;
+    bool bias_term = bias.valid_size() > 0;
+
+    const float* din = tin.data();
+    const float* w = weight.data();
+    float* dout = tout.mutable_data();
+
+    for (int i = 0; i < m; ++i) {
+        float* pdout = dout + i * n;
+        const float* pdin = din + i * k;
+
+        for (int j = 0; j < n; ++j) {
+            if (bias_term) {
+                pdout[j] = bias.data()[j];
+            } else {
+                pdout[j] = 0;
+            }
+
+            for (int l = 0; l < k; ++l) {
+                pdout[j] += pdin[l] * w[l * n + j];
+            }
+        }
+    }
+}
+
+TEST(TestSaberFuncBM, test_func_fc) {
+
+    int test_iter = 10;
+    int w_in = 7;
+    int h_in = 7;
+    int ch_in = 1024;
+    int num_in = 4;
+
+    int num_out = 4096;
+    int axis = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = {num_in, num_out, 1, 1};
+
+    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
+    TensorDf4 weight(sh_w);
+    Shape sh_b{1, 1, 1, num_out};
+    TensorDf4 bias(sh_b);
+    fill_tensor_device_const(weight, 1.f);
+    fill_tensor_device_const(bias, 1.f);
+
+    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
+              ch_in << ", height=" << h_in << ", width=" << w_in;
+
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+
+    TensorDf4 tdin;
+    TensorDf4 tdout;
+    tdin.re_alloc(shape_in);
+    fill_tensor_device_const(tdin, 1.f);
+    input_dev_4d.push_back(&tdin);
+    output_dev_4d.push_back(&tdout);
+
+    // start Reshape & doInfer
+    Context<BM> ctx_dev(0, 1, 1);
+
+    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
+
+    Fc<BM, AK_BM, AK_BM, AK_BM, NCHW> fc;
+
+    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
+              shape_out[2] << ", " << shape_out[3];
+
+    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
+
+    LOG(INFO) << "re-alloc tensor buffer";
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
+    Shape va_sh = tdout.valid_shape();
+    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
+              va_sh[2] << ", " << va_sh[3];
+    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
+
+    LOG(INFO) << "FC initialization";
+    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
+
+    LOG(INFO) << "FC compute";
+    SaberTimer<BM> t1;
+    t1.clear();
+    t1.start(ctx_dev);
+
+    for (int i = 0; i < test_iter; ++i) {
+        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
+        bm_flush(get_bm_handle());
+        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+        //output_dev_4d[0]->sync();
+        //cudaDeviceSynchronize();
+    }
+
+    t1.end(ctx_dev);
+    float ts = t1.get_average_ms();
+    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
+    //print_tensor_device(*output_dev_4d[0]);
+
+    //! check result
+    TensorHf4 thin(shape_in);
+    TensorHf4 thout(shape_out);
+    TensorHf4 thw(sh_w);
+    TensorHf4 thb(sh_b);
+    thin.copy_from(tdin);
+    thw.copy_from(weight);
+    thb.copy_from(bias);
+    fc_compute(thin, thw, thb, thout);
+    //print_tensor_host(thout);
+
+    TensorHf4 thout_d(shape_out);
+    thout_d.copy_from(tdout);
+    double max_ratio = 0;
+    double max_diff = 0;
+    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
+    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
+    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
+
+}
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    Env<BM>::env_init();
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+

From a93a77eaacc554835dacbb2a22d407da80478b70 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 11 Jul 2018 10:08:45 +0800
Subject: [PATCH 239/318] Implement eltwise for BM

---
 saber/funcs/impl/bm/vender_eltwise.h | 118 +++++++++++++++++++++++++++
 saber/funcs/impl/bm/vender_scale.h   |   2 +-
 2 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 saber/funcs/impl/bm/vender_eltwise.h

diff --git a/saber/funcs/impl/bm/vender_eltwise.h b/saber/funcs/impl/bm/vender_eltwise.h
new file mode 100644
index 000000000..62ac2c436
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_eltwise.h
@@ -0,0 +1,118 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H
+#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H
+
+#include "saber/funcs/impl/impl_eltwise.h"
+
+namespace anakin {
+
+namespace saber {
+
+template <DataType OpDtype,
+            DataType inDtype,
+            DataType outDtype,
+            typename LayOutType_op,
+            typename LayOutType_in,
+            typename LayOutType_out>
+class SaberEltwise<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+public ImplBase<
+        Tensor<BM, inDtype, LayOutType_in>,
+        Tensor<BM, outDtype, LayOutType_out>,
+        Tensor<BM, OpDtype, LayOutType_op>,
+        EltwiseParam<Tensor<BM, OpDtype, LayOutType_op>>> {
+public:
+    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
+
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+
+    SaberEltwise() {}
+
+    ~SaberEltwise() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
+                         std::vector<DataTensor_out*>& outputs,
+                         EltwiseParam<OpTensor> &param,
+                         Context<BM> &ctx) {
+        _handle = get_bm_handle();
+        return create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
+                           std::vector<DataTensor_out*>& outputs,
+                           EltwiseParam<OpTensor> &param,
+                           Context<BM> &ctx) {
+        return SaberSuccess;
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                             std::vector<DataTensor_out*>& outputs,
+                             EltwiseParam<OpTensor> &param) {
+
+        int op_ = 0;
+        switch (param.operation) {
+            case Eltwise_prod:
+                op_ = 0;
+                break;
+            case Eltwise_sum:
+                op_ = 1;
+                break;
+            case Eltwise_max:
+                op_ = 2;
+                break;
+            default:
+                return SaberUnImplError;
+        }
+
+        //int input_size = inputs.size();
+        //CHECK_GE(input_size, 2) << "Input size should >= 2!";
+
+        OutDataType out_data = *(outputs[0]->mutable_data());
+        int input_n = inputs[0]->num();
+        int input_c = inputs[0]->channel();
+        int input_h = inputs[0]->height();
+        int input_w = inputs[0]->width();
+
+        std::vector<float> coeff_ = param.coeff;
+        if (coeff_.size() != inputs.size()) {
+            for (int j=0; j<(inputs.size() - coeff_.size()); j++) {
+                coeff_.push_back(1);
+            }
+        }
+
+        bm_device_mem_t* mask_data = new bm_device_mem_t();
+
+        int flag_first = 1;
+        for (int i=0; i<inputs.size(); i++){
+            const InDataType in_data = *(inputs[i]->data());
+            bmdnn_eltwise_forward(
+                    _handle,
+                    op_,
+                    flag_first,
+                    coeff_[i],
+                    i,
+                    in_data,
+                    out_data,
+                    input_n,
+                    input_c * input_h * input_w,
+                    *mask_data,
+                    out_data);
+
+            bm_flush(_handle);
+            flag_first = 0;
+        }
+        bm_free_device(_handle, *mask_data);
+        return SaberSuccess;
+    }
+
+private:
+    bm_handle_t _handle;
+};
+
+} //namespace saber
+
+} //namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
index 4e9402a43..2876e8005 100644
--- a/saber/funcs/impl/bm/vender_scale.h
+++ b/saber/funcs/impl/bm/vender_scale.h
@@ -87,7 +87,7 @@ class VenderScale<BM, OpDtype, inDtype, outDtype,\
                  host_extension[i] = host_bias[bias_dim];
             }
 
-            bm_flush(get_bm_handle());
+            bm_flush(_handle);
             BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension),
                     outer_dim, scale_dim * inner_dim, out_data));
 

From 0eee023a4072fe0f95dd20f2b05271d6da8d927b Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 11 Jul 2018 11:27:17 +0800
Subject: [PATCH 240/318] Add test for BM eltwise

---
 saber/funcs/impl/bm/vender_eltwise.h         |  13 +-
 test/saber/bm/test_saber_func_eltwise_BM.cpp | 627 +++++++++++++++++++
 2 files changed, 634 insertions(+), 6 deletions(-)
 create mode 100644 test/saber/bm/test_saber_func_eltwise_BM.cpp

diff --git a/saber/funcs/impl/bm/vender_eltwise.h b/saber/funcs/impl/bm/vender_eltwise.h
index 62ac2c436..050fc4d43 100644
--- a/saber/funcs/impl/bm/vender_eltwise.h
+++ b/saber/funcs/impl/bm/vender_eltwise.h
@@ -13,7 +13,7 @@ template <DataType OpDtype,
             typename LayOutType_op,
             typename LayOutType_in,
             typename LayOutType_out>
-class SaberEltwise<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
+class VenderEltwise<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
 public ImplBase<
         Tensor<BM, inDtype, LayOutType_in>,
         Tensor<BM, outDtype, LayOutType_out>,
@@ -28,9 +28,9 @@ public ImplBase<
     typedef typename DataTensor_out::Dtype OutDataType;
     typedef typename OpTensor::Dtype OpDataType;
 
-    SaberEltwise() {}
+    VenderEltwise() {}
 
-    ~SaberEltwise() {}
+    ~VenderEltwise() {}
 
     virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
                          std::vector<DataTensor_out*>& outputs,
@@ -44,7 +44,6 @@ public ImplBase<
                            std::vector<DataTensor_out*>& outputs,
                            EltwiseParam<OpTensor> &param,
                            Context<BM> &ctx) {
-        return SaberSuccess;
     }
 
     virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
@@ -77,7 +76,8 @@ public ImplBase<
 
         std::vector<float> coeff_ = param.coeff;
         if (coeff_.size() != inputs.size()) {
-            for (int j=0; j<(inputs.size() - coeff_.size()); j++) {
+            int diff = inputs.size() - coeff_.size();
+            for (int j=0; j<diff; j++) {
                 coeff_.push_back(1);
             }
         }
@@ -103,7 +103,8 @@ public ImplBase<
             bm_flush(_handle);
             flag_first = 0;
         }
-        bm_free_device(_handle, *mask_data);
+
+        //bm_free_device(_handle, *mask_data);
         return SaberSuccess;
     }
 
diff --git a/test/saber/bm/test_saber_func_eltwise_BM.cpp b/test/saber/bm/test_saber_func_eltwise_BM.cpp
new file mode 100644
index 000000000..da931510b
--- /dev/null
+++ b/test/saber/bm/test_saber_func_eltwise_BM.cpp
@@ -0,0 +1,627 @@
+#include "core/context.h"
+#include "funcs/eltwise.h"
+#include "test_saber_func_BM.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include <vector>
+
+using namespace anakin::saber;
+
+/*
+TEST(TestSaberFuncBM, test_func_prod) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_prod;
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    int w_in = 10;
+    int h_in = 2;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = shape_in;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin0(shape_in);
+    Tensor<X86, AK_FLOAT, NCHW> thin1(shape_in);
+    Tensor<X86, AK_FLOAT, NCHW> thin2(shape_in);
+    for (int i = 0; i < thin0.size(); ++i) {
+        thin0.mutable_data()[i] = i;
+    }
+    for (int i = 0; i < thin1.size(); ++i) {
+        thin1.mutable_data()[i] = i + 1;
+    }
+    for (int i = 0; i < thin2.size(); ++i) {
+        thin2.mutable_data()[i] = 1;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin0, tdin1, tdin2, tdout;
+    tdin0.re_alloc(shape_in);
+    tdin1.re_alloc(shape_in);
+    tdin2.re_alloc(shape_in);
+    tdin0.copy_from(thin0);
+    tdin1.copy_from(thin1);
+    tdin2.copy_from(thin2);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin0);
+    input_dev_4d.push_back(&tdin1);
+    input_dev_4d.push_back(&tdin2);
+    output_dev_4d.push_back(&tdout);
+
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param);
+
+    //SABER_CHECK(eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param));
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout{num_in, ch_in, h_in, w_in};
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+
+
+    TensorHf4 th_for_print;
+    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
+    th_for_print.copy_from(*output_dev_4d[0]);
+    print_tensor_host(th_for_print);
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+*/
+
+TEST(TestSaberFuncBM, test_func_sum) {
+
+    Env<BM>::env_init();
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_sum;
+
+    int w_in = 10;
+    int h_in = 2;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = shape_in;
+
+    // Host Tensor
+    TensorHf4 thin1(shape_in);
+    TensorHf4 thin2(shape_in);
+
+    for (int i = 0; i < thin1.size(); ++i) {
+        thin1.mutable_data()[i] = 1.0;
+    }
+
+    for (int i = 0; i < thin2.size(); ++i) {
+        thin2.mutable_data()[i] = 2.0;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin0, tdin1, tdout;
+    tdin0.re_alloc(shape_in);
+    tdin1.re_alloc(shape_in);
+    tdin0.copy_from(thin1);
+    tdin1.copy_from(thin2);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin0);
+    input_dev_4d.push_back(&tdin1);
+    input_dev_4d.push_back(&tdin1);
+    output_dev_4d.push_back(&tdout);
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: " << sh[0] << ", " << sh[1] << \
+              ", " << sh[2] << ", " << sh[3];
+    Shape shout{num_in, ch_in, h_in, w_in};
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+    print_tensor_device(*output_dev_4d[0]);
+}
+
+TEST(TestSaberFuncBM, test_func_max) {
+
+    Env<BM>::env_init();
+
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_max;
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    int w_in = 10;
+    int h_in = 2;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_out = shape_in;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin0(shape_in);
+    Tensor<X86, AK_FLOAT, NCHW> thin1(shape_in);
+    Tensor<X86, AK_FLOAT, NCHW> thin2(shape_in);
+    for (int i = 0; i < thin0.size(); ++i) {
+        thin0.mutable_data()[i] = i;
+    }
+    for (int i = 0; i < thin1.size(); ++i) {
+        thin1.mutable_data()[i] = i + 2;
+    }
+    for (int i = 0; i < thin2.size(); ++i) {
+        thin2.mutable_data()[i] = i + 1;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin0, tdin1, tdin2, tdout;
+    tdin0.re_alloc(shape_in);
+    tdin1.re_alloc(shape_in);
+    tdin2.re_alloc(shape_in);
+    tdin0.copy_from(thin0);
+    tdin1.copy_from(thin1);
+    tdin2.copy_from(thin2);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin0);
+    input_dev_4d.push_back(&tdin1);
+    input_dev_4d.push_back(&tdin2);
+    output_dev_4d.push_back(&tdout);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout{num_in, ch_in, h_in, w_in};
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+    print_tensor_device(*output_dev_4d[0]);
+
+}
+
+/*   0   1   2   3   4
+ *  10  11  12  13  14   (tdin_roi1, c=0)
+ *   (tdin_roi0, c=0)   25  26  27  28  29
+ *                      35  36  37  38  39
+ * =======================================
+ *  40  41  42  43  44
+ *  50  51  52  53  54   (tdin_roi1, c=1)
+ *   (tdin_roi0, c=1)   65  66  67  68  69
+ *                      75  76  77  78  79
+ */
+/*
+TEST(TestSaberFuncBM, test_func_prod_roi) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_prod;
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    int w_in = 10;
+    int h_in = 4;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
+    Shape off0{0, 0, 0, 0};
+    Shape off1{0, 0, 2, 5};
+    Shape shape_out = shape_in_roi;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
+    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
+    tdout.re_alloc(shape_out);
+
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin_roi0);
+    input_dev_4d.push_back(&tdin_roi1);
+    input_dev_4d.push_back(&tdin_roi1);
+    output_dev_4d.push_back(&tdout);
+
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout(shape_in_roi);
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+
+
+    TensorHf4 th_for_print;
+    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
+    th_for_print.copy_from(*output_dev_4d[0]);
+    print_tensor_host(th_for_print);
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+*/
+
+/*   0   1   2   3   4
+ *  10  11  12  13  14   (tdin_roi1, c=0)
+ *   (tdin_roi0, c=0)   25  26  27  28  29
+ *                      35  36  37  38  39
+ * =======================================
+ *  40  41  42  43  44
+ *  50  51  52  53  54   (tdin_roi1, c=1)
+ *   (tdin_roi0, c=1)   65  66  67  68  69
+ *                      75  76  77  78  79
+ */
+/*
+TEST(TestSaberFuncBM, test_func_sum_roi_new) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_sum;
+
+    int w_in = 10;
+    int h_in = 4;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
+
+    Shape off0{0, 0, 0, 0};
+    Shape off1{0, 0, 2, 5};
+    Shape shape_out = shape_in_roi;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
+    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin_roi0);
+    input_dev_4d.push_back(&tdin_roi1);
+//    input_dev_4d.push_back(&tdin_roi1);
+//    input_dev_4d.push_back(&tdin_roi1);
+    output_dev_4d.push_back(&tdout);
+
+//    Shape shape_coeff(1, 1, 1, input_dev_4d.size());
+//    TensorHf4 thcoeff(shape_coeff);
+//    for (int i = 0; i < thcoeff.size(); ++i) {
+//        thcoeff.mutable_data()[i] = 1;
+//    }
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout(shape_in_roi);
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    print_tensor_device(*input_dev_4d[0]);
+    print_tensor_device(*input_dev_4d[1]);
+    cudaDeviceSynchronize();
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+*/
+/*
+TEST(TestSaberFuncBM, test_func_sum_roi) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_sum;
+
+    int w_in = 10;
+    int h_in = 4;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
+    Shape off0{0, 0, 0, 0};
+    Shape off1{0, 0, 2, 5};
+    Shape shape_out = shape_in_roi;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
+    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin_roi0);
+    input_dev_4d.push_back(&tdin_roi1);
+    output_dev_4d.push_back(&tdout);
+
+    //Shape shape_coeff(1, 1, 1, 3);
+    Shape shape_coeff(1, 1, 1, input_dev_4d.size());
+    TensorHf4 thcoeff(shape_coeff);
+
+    for (int i = 0; i < thcoeff.size(); ++i) {
+        thcoeff.mutable_data()[i] = i;
+    }
+    TensorDf4 tdcoeff;
+    tdcoeff.re_alloc(shape_coeff);
+    tdcoeff.copy_from(thcoeff);
+
+    EltwiseParam<TensorDf4> param(elt_type, &tdcoeff);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout(shape_in_roi);
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+
+    TensorHf4 th_for_print;
+    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
+    th_for_print.copy_from(*output_dev_4d[0]);
+    print_tensor_host(th_for_print);
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+*/
+
+/*
+TEST(TestSaberFuncBM, test_func_max_roi) {
+
+    Env<BM>::env_init();
+    typedef TargetWrapper<BM> API;
+
+    typedef TargetWrapper<BM> BM_API;
+    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
+    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
+
+    EltwiseType elt_type = Eltwise_max;
+
+    int w_in = 10;
+    int h_in = 4;
+    int ch_in = 2;
+    int num_in = 1;
+
+    Shape shape_in(num_in, ch_in, h_in, w_in);
+    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
+    Shape off0{0, 0, 0, 0};
+    Shape off1{0, 0, 2, 5};
+    Shape shape_out = shape_in_roi;
+
+    // Host Tensor
+    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
+    for (int i = 0; i < thin.size(); ++i) {
+        thin.mutable_data()[i] = i;
+    }
+
+    // Device Tensor
+    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
+    tdin.re_alloc(shape_in);
+    tdin.copy_from(thin);
+    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
+    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
+    tdout.re_alloc(shape_out);
+
+    // Device Vector of Tensor
+    std::vector<TensorDf4*> input_dev_4d;
+    std::vector<TensorDf4*> output_dev_4d;
+    input_dev_4d.push_back(&tdin_roi0);
+    input_dev_4d.push_back(&tdin_roi1);
+    output_dev_4d.push_back(&tdout);
+
+    EltwiseParam<TensorDf4> param(elt_type);
+
+    Context<BM> ctx_dev(0, 1, 1);
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
+
+    LOG(INFO) << "eltwise compute output shape";
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
+
+    // Verify output shape
+    Shape sh = output_dev_4d[0]->valid_shape();
+    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
+        ", " << sh[2] << ", " << sh[3];
+    Shape shout(shape_in_roi);
+    CHECK_EQ(shout == sh, true) << "compute shape error";
+
+    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
+
+    LOG(INFO) << "eltwise initialization";
+    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
+
+    LOG(INFO) << "eltwise compute";
+    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
+
+    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
+    output_dev_4d[0]->sync();
+    print_tensor_device(*output_dev_4d[0]);
+    cudaDeviceSynchronize();
+
+    TensorHf4 th_for_print;
+    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
+    th_for_print.copy_from(*output_dev_4d[0]);
+    print_tensor_host(th_for_print);
+
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+*/
+
+int main(int argc, const char** argv) {
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}

From 340332933dcfbba2bb59750e3375686b7babf00b Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 11 Jul 2018 11:40:43 +0800
Subject: [PATCH 241/318] test eltwise PROD for BM

---
 test/saber/bm/test_saber_func_eltwise_BM.cpp | 21 +++-----------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/test/saber/bm/test_saber_func_eltwise_BM.cpp b/test/saber/bm/test_saber_func_eltwise_BM.cpp
index da931510b..643f4e026 100644
--- a/test/saber/bm/test_saber_func_eltwise_BM.cpp
+++ b/test/saber/bm/test_saber_func_eltwise_BM.cpp
@@ -7,7 +7,7 @@
 
 using namespace anakin::saber;
 
-/*
+
 TEST(TestSaberFuncBM, test_func_prod) {
 
     Env<BM>::env_init();
@@ -63,12 +63,10 @@ TEST(TestSaberFuncBM, test_func_prod) {
 
 
     Context<BM> ctx_dev(0, 1, 1);
-    Eltwise<BM, AK_BM> eltwise_dev;
+    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
 
     LOG(INFO) << "eltwise compute output shape";
-    eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param);
-
-    //SABER_CHECK(eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param));
+    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
 
     // Verify output shape
     Shape sh = output_dev_4d[0]->valid_shape();
@@ -86,22 +84,9 @@ TEST(TestSaberFuncBM, test_func_prod) {
     LOG(INFO) << "eltwise compute";
     eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
 
-
-    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-    output_dev_4d[0]->sync();
     print_tensor_device(*output_dev_4d[0]);
-    cudaDeviceSynchronize();
-
-
-    TensorHf4 th_for_print;
-    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
-    th_for_print.copy_from(*output_dev_4d[0]);
-    print_tensor_host(th_for_print);
-
-    CUDA_CHECK(cudaPeekAtLastError());
 }
 
-*/
 
 TEST(TestSaberFuncBM, test_func_sum) {
 

From 51cb5a2d244c8370fa7e98b96c6ad078df255b9e Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 6 Aug 2018 15:41:18 +0800
Subject: [PATCH 242/318] update gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index bbcdfd1d6..c7164ff15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,6 @@ android_build
 ios_build
 gpu_build
 output
+
+.idea
+.vs_code

From 2994f0b750256e217ef90ae2998cc546698a3ff6 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 6 Aug 2018 15:41:38 +0800
Subject: [PATCH 243/318] update gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index c7164ff15..89827b8e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,3 +40,4 @@ output
 
 .idea
 .vs_code
+.idea/workspace.xml

From f8c859aaef55972a84842b46b63cb49d20d978b5 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 14:34:41 +0800
Subject: [PATCH 244/318] Refactor according to new template scheme

---
 .gitignore                                    |   1 +
 saber/core/impl/bm/bm_impl.cpp                |  28 +-
 saber/core/impl/bm/temsor_op_bm.cpp           | 103 +++
 saber/core/target_wrapper.h                   |  16 +-
 saber/core/tensor.h                           |  45 +-
 saber/core/tensor_op.cpp                      | 181 -----
 saber/core/tensor_op.h                        |  10 -
 saber/funcs/batch_norm.h                      | 104 ---
 saber/funcs/impl/bm/vender_activation.h       |  82 ---
 saber/funcs/impl/bm/vender_batch_norm.h       |  96 ---
 saber/funcs/impl/bm/vender_conv.cpp           | 102 +++
 saber/funcs/impl/bm/vender_conv.h             | 113 +--
 saber/funcs/impl/bm/vender_conv_act.h         | 198 ------
 saber/funcs/impl/bm/vender_conv_act_pooling.h | 176 -----
 saber/funcs/impl/bm/vender_eltwise.h          | 119 ----
 saber/funcs/impl/bm/vender_fc.h               |  76 --
 saber/funcs/impl/bm/vender_pooling.h          |  87 ---
 saber/funcs/impl/bm/vender_scale.h            | 106 ---
 saber/funcs/impl/bm/vender_softmax.h          | 108 ---
 test/saber/bm/test_TargetWrapper_BM.cpp       |  26 -
 test/saber/bm/test_saber_buffer_BM.cpp        | 130 ----
 test/saber/bm/test_saber_buffer_BM.h          |  20 -
 test/saber/bm/test_saber_context_BM.cpp       |  32 -
 test/saber/bm/test_saber_context_BM.h         |  21 -
 test/saber/bm/test_saber_device_BM.cpp        |  20 -
 test/saber/bm/test_saber_device_BM.h          |  21 -
 test/saber/bm/test_saber_func_BM.h            |  38 -
 .../bm/test_saber_func_activation_BM.cpp      |  91 ---
 .../bm/test_saber_func_batch_norm_BM.cpp      |  84 ---
 test/saber/bm/test_saber_func_conv_BM.cpp     | 551 ---------------
 test/saber/bm/test_saber_func_eltwise_BM.cpp  | 612 ----------------
 test/saber/bm/test_saber_func_fc_BM.cpp       | 147 ----
 test/saber/bm/test_saber_func_pooling_BM.cpp  | 280 --------
 test/saber/bm/test_saber_func_scale_BM.cpp    | 126 ----
 test/saber/bm/test_saber_func_softmax_BM.cpp  | 196 ------
 test/saber/bm/test_saber_shape_BM.cpp         | 126 ----
 test/saber/bm/test_saber_shape_BM.h           |  25 -
 test/saber/bm/test_saber_tensor_BM.cpp        | 664 ------------------
 test/saber/bm/test_saber_tensor_BM.h          |  21 -
 39 files changed, 251 insertions(+), 4731 deletions(-)
 create mode 100644 saber/core/impl/bm/temsor_op_bm.cpp
 delete mode 100644 saber/funcs/batch_norm.h
 delete mode 100644 saber/funcs/impl/bm/vender_activation.h
 delete mode 100644 saber/funcs/impl/bm/vender_batch_norm.h
 create mode 100644 saber/funcs/impl/bm/vender_conv.cpp
 delete mode 100644 saber/funcs/impl/bm/vender_conv_act.h
 delete mode 100644 saber/funcs/impl/bm/vender_conv_act_pooling.h
 delete mode 100644 saber/funcs/impl/bm/vender_eltwise.h
 delete mode 100644 saber/funcs/impl/bm/vender_fc.h
 delete mode 100644 saber/funcs/impl/bm/vender_pooling.h
 delete mode 100644 saber/funcs/impl/bm/vender_scale.h
 delete mode 100644 saber/funcs/impl/bm/vender_softmax.h
 delete mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_buffer_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_buffer_BM.h
 delete mode 100644 test/saber/bm/test_saber_context_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_context_BM.h
 delete mode 100644 test/saber/bm/test_saber_device_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_device_BM.h
 delete mode 100644 test/saber/bm/test_saber_func_BM.h
 delete mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_batch_norm_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_eltwise_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_pooling_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_scale_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_func_softmax_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_shape_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_shape_BM.h
 delete mode 100644 test/saber/bm/test_saber_tensor_BM.cpp
 delete mode 100644 test/saber/bm/test_saber_tensor_BM.h

diff --git a/.gitignore b/.gitignore
index 89827b8e4..036a1564d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,3 +41,4 @@ output
 .idea
 .vs_code
 .idea/workspace.xml
+.vscode/settings.json
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index e73e355b7..6ee9d6dcd 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -78,16 +78,18 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
     //BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
-void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-    size_t count, __DtoD) {
+void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, __DtoD) {
     handle = get_bm_handle(); 
     //BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
     BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count));
     LOG(INFO) << "BM sync_memcpy: device to device, finished";
 };
 
-void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-    size_t count, __HtoD) {
+void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, __HtoD) {
     handle = get_bm_handle(); 
     BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src)));
 
@@ -99,8 +101,9 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     LOG(INFO) << "BM sync_memcpy: host to device, finished";
 };
 
-void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-    size_t count, __DtoH) {
+void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, __DtoH) {
     handle = get_bm_handle(); 
     BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
 
@@ -112,22 +115,23 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
     LOG(INFO) << "BM sync_memcpy: device to host, finished";
 };
 
-void BM_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-    int src_dev, size_t count) { 
+void BM_API::sync_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count) { 
 
-    LOG(INFO) << "BM sync_memcpy_p2p: temporarily no used";
+    LOG(ERROR) << "BM sync_memcpy_p2p: temporarily no used";
 };
 
-
-//! target wrapper
+//! BM TargetWrapper
 template struct TargetWrapper<BM, __device_target>;
 
 //! BM Buffer
 template class Buffer<BM>;
 
 //! BM Tensor
-INSTANTIATE_TENSOR(BM, AK_BM, NCHW);
+template class Tensor<BM>;
 
+//! BM Env
 template struct Env<BM>;
 
 #endif //USE_BM
diff --git a/saber/core/impl/bm/temsor_op_bm.cpp b/saber/core/impl/bm/temsor_op_bm.cpp
new file mode 100644
index 000000000..5c1530339
--- /dev/null
+++ b/saber/core/impl/bm/temsor_op_bm.cpp
@@ -0,0 +1,103 @@
+#include "saber/core/tensor_op.h"
+
+#ifdef USE_BM
+
+#include <random>
+
+namespace anakin{
+
+namespace saber{
+
+template<>
+void fill_tensor_rand<Tensor<BM>>(Tensor<BM>& tensor, \
+    typename Tensor<BM>::API::stream_t stream) {
+
+    float *host_mem_input = new float[tensor.size()];
+    for (int i = 0; i < tensor.size(); ++i) {
+        host_mem_input[i] = static_cast<float>(rand());
+    }
+
+    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+
+    delete [] host_mem_input;
+}
+
+template<>
+void fill_tensor_rand(Tensor<BM>& tensor, float vstart, \
+    float vend, typename Tensor<BM>::API::stream_t stream = NULL){
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(0, 1.f);
+
+    float *host_mem_input = new float[tensor.size()];
+    for (int i = 0; i < tensor.size(); ++i) {
+        float random_num = vstart + (vend - vstart) * dis(gen);
+        host_mem_input[i] = random_num;
+    }
+
+    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+
+    delete [] host_mem_input;
+}
+
+template<>
+void fill_tensor_const(Tensor<BM>& tensor, float value, \
+    typename Tensor<BM>::API::stream_t stream = NULL){
+
+    float *host_mem_input = new float[tensor.size()];
+    for (int i = 0; i < tensor.size(); ++i) {
+        host_mem_input[i] = value;
+    }
+
+    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+
+    delete [] host_mem_input;
+}
+
+template <>
+void print_tensor<Tensor<BM>>(Tensor<BM>& tensor,  \
+    typename Tensor<BM>::API::stream_t stream) {
+
+    LOG(INFO) << "BM device tensor data:" << tensor.size();
+
+    /*
+    const bm_device_mem_t* device_data_ptr = tensor.data();
+    unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
+    bm_flush(get_bm_handle());
+    float* device_data = (float*)bm_get_global_addr(gaddr);
+
+    for (int i = 0; i < tensor.size(); ++i) {
+        printf("%.2f ", device_data[i]);
+
+        if ((i + 1) % (4 * tensor.width()) == 0) {
+            printf("\n");
+        }
+    }*/
+
+    float *host_mem = new float[tensor.size()];
+    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+    bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
+
+    for (int i = 0; i < tensor.size(); ++i) {
+        printf("%.2f\t", host_mem[i]);
+
+        if ((i + 1) % tensor.width() == 0){
+            printf("\n");
+        }
+    }
+    printf("\n");
+
+    delete [] host_mem;
+}
+
+
+
+} //namespace saber
+
+} //namespace anakin
+
+#endif //USE_BM
\ No newline at end of file
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 5c802fa9e..6a7aafce9 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -539,17 +539,21 @@ struct TargetWrapper<BM, __device_target> {
     static void sync_stream(event_t event, stream_t stream) {}
     // brief create event, empty function for bitmain target
 
-    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+    static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
         size_t count, __DtoD);
 
-    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+    static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
         size_t count, __HtoD);
 
-    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-        size_t count, __DtoH) {};
+    static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, __DtoH);
 
-    static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-        int src_dev, size_t count);
+    static void sync_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count);
 
     /**
      * \brief device target return currently used device id
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 6b2b2a0f8..017187a3a 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -77,36 +77,6 @@ class Tensor {
         _is_subbuf = false;
     }
 
-#ifdef USE_BM
-    /**
-     * \brief Constructor with allocated data ptr and entire memory shape. only for BM
-    */
-    template <typename Dtype_s,typename TargetType_t>
-    Tensor(Dtype_s* data_ptr, TargetType_t target, int id, Shape shape) {
-        CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \
-            "shape dims is not matched to layout type";
-        _shape = shape;
-        _valid_shape = shape;
-        _offset = Shape::zero(shape.dims());
-
-        if(typeid(Dtype_s) == typeid(AK_FLOAT))
-        {
-        std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
-            std::make_shared<Buffer<TargetType_t>>(&bm_mem_from_system(const_cast<Dtype_s *>(data_ptr)), shape.count() * _type_len(), id);
-
-        BufferMemShare(_buf, buf_from_date);
-        }
-        else
-        {
-        std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
-            std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len(), id);
-
-        BufferMemShare(_buf, buf_from_date);
-        }
-        _is_subbuf = false;
-    }
-#endif
-
     /**
      * \brief Copy constructor, shallow copy.
      */
@@ -789,8 +759,8 @@ class Tensor {
     }
 
 #ifdef USE_BM
-    template <typename NewTargetType_t, DataType NewDataType_t, typename NewLayOutType_t>
-    SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
+    template <typename TargetType_t>
+    SaberStatus copy_from(const Tensor<TargetType_t>& tensor) {
         LOG(WARNING) << "Invalid: copy_from is not allowed for current type.";
         return SaberInvalidValue;
     }
@@ -1035,16 +1005,12 @@ class Tensor {
 #ifdef USE_BM
 #ifndef BM_TENSOR_COPY
 #define BM_TENSOR_COPY
-template<> inline
-size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
-    return 4;
-}
-
 template<>
 template<> inline
-SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
+SaberStatus Tensor<BM>::copy_from<X86>(const Tensor<X86>& tensor) {
     LOG(INFO) << "BM copy_from X86";
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
+    CHECK_EQ(tensor.get_dtype(), AK_FLOAT) << "host data type should be AK_FLOAT";
 
     auto* device_data_ptr = mutable_data();
     BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
@@ -1053,9 +1019,10 @@ SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor
 
 template<>
 template<> inline
-SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
+SaberStatus Tensor<X86>::copy_from<BM>(const Tensor<BM>& tensor) {
     LOG(INFO) << "X86 copy_from BM";
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
+    CHECK_EQ(_dtype, AK_FLOAT) << "host data type should be AK_FLOAT";
 
     auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
     BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp
index 443884d11..4bd5c84c3 100644
--- a/saber/core/tensor_op.cpp
+++ b/saber/core/tensor_op.cpp
@@ -253,192 +253,11 @@ FILL_TENSOR_HOST(NVHX86)
 FILL_TENSOR_HOST(ARM)
 #endif
 
-#ifdef USE_ARM_PLACE
-FILL_TENSOR_HOST(BM)
-#endif
-
 template void tensor_cmp_host<float>(const float* src1, const float* src2, \
                                      int size, double& max_ratio, double& max_diff);
 template void tensor_cmp_host<char>(const char* src1, const char* src2, int size, \
                                     double& max_ratio, double& max_diff);
 
-#ifdef USE_BM
-
-        template<>
-void fill_tensor_device_rand<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor, \
-    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {
-
-    float *host_mem_input = new float[tensor.size()];
-    for (int i = 0; i < tensor.size(); ++i) {
-        host_mem_input[i] = static_cast<float>(rand());
-    }
-
-    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
-
-    delete [] host_mem_input;
-}
-
-void fill_tensor_device_rand(Tensor<BM, AK_BM, NCHW>& tensor, float vstart, \
-    float vend, typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL){
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> dis(0, 1.f);
-
-    float *host_mem_input = new float[tensor.size()];
-    for (int i = 0; i < tensor.size(); ++i) {
-        float random_num = vstart + (vend - vstart) * dis(gen);
-        host_mem_input[i] = random_num;
-    }
-
-    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
-
-    delete [] host_mem_input;
-}
-
-void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
-    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL){
-
-    float *host_mem_input = new float[tensor.size()];
-    for (int i = 0; i < tensor.size(); ++i) {
-        host_mem_input[i] = value;
-    }
-
-    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
-
-    delete [] host_mem_input;
-}
-
-template <>
-void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor,  \
-    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {
-
-    LOG(INFO) << "BM device tensor data:" << tensor.size();
-
-    /*
-    const bm_device_mem_t* device_data_ptr = tensor.data();
-    unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
-    bm_flush(get_bm_handle());
-    float* device_data = (float*)bm_get_global_addr(gaddr);
-
-    for (int i = 0; i < tensor.size(); ++i) {
-        printf("%.2f ", device_data[i]);
-
-        if ((i + 1) % (4 * tensor.width()) == 0) {
-            printf("\n");
-        }
-    }*/
-
-    float *host_mem = new float[tensor.size()];
-    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
-    bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
-
-    for (int i = 0; i < tensor.size(); ++i) {
-        printf("%.2f\t", host_mem[i]);
-
-        if ((i + 1) % tensor.width() == 0){
-            printf("\n");
-        }
-    }
-    printf("\n");
-
-    delete [] host_mem;
-}
-
-#endif
-
-
-#ifdef USE_BM
-
-template<>
-void fill_tensor_device_rand<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor, \
-    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {
-
-    float *host_mem_input = new float[tensor.size()];
-    for (int i = 0; i < tensor.size(); ++i) {
-        host_mem_input[i] = static_cast<float>(rand());
-    }
-
-    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
-
-    delete [] host_mem_input;
-}
-
-void fill_tensor_device_rand(Tensor<BM, AK_BM, NCHW>& tensor, float vstart, \
-    float vend, typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL){
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> dis(0, 1.f);
-
-    float *host_mem_input = new float[tensor.size()];
-    for (int i = 0; i < tensor.size(); ++i) {
-        float random_num = vstart + (vend - vstart) * dis(gen);
-        host_mem_input[i] = random_num;
-    }
-
-    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
-
-    delete [] host_mem_input;
-}
-
-void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
-    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL){
-
-    float *host_mem_input = new float[tensor.size()];
-    for (int i = 0; i < tensor.size(); ++i) {
-        host_mem_input[i] = value;
-    }
-
-    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
-
-    delete [] host_mem_input;
-}
-
-template <>
-void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor,  \
-    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {
-
-    LOG(INFO) << "BM device tensor data:" << tensor.size();
-
-    /*
-    const bm_device_mem_t* device_data_ptr = tensor.data();
-    unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
-    bm_flush(get_bm_handle());
-    float* device_data = (float*)bm_get_global_addr(gaddr);
-
-    for (int i = 0; i < tensor.size(); ++i) {
-        printf("%.2f ", device_data[i]);
-
-        if ((i + 1) % (4 * tensor.width()) == 0) {
-            printf("\n");
-        }
-    }*/
-
-    float *host_mem = new float[tensor.size()];
-    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
-    bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
-
-    for (int i = 0; i < tensor.size(); ++i) {
-        printf("%.2f\t", host_mem[i]);
-
-        if ((i + 1) % tensor.width() == 0){
-            printf("\n");
-        }
-    }
-    printf("\n");
-
-    delete [] host_mem;
-}
-
-#endif
-
 } //namespace saber
 
 } //namespace anakin
diff --git a/saber/core/tensor_op.h b/saber/core/tensor_op.h
index e8cb2f42d..4c3d5974f 100644
--- a/saber/core/tensor_op.h
+++ b/saber/core/tensor_op.h
@@ -154,16 +154,6 @@ class DataTensorTransformHelper{
 
 #endif
 
-#ifdef USE_BM
-
-void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
-    typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL);
-
-void fill_tensor_device_rand(Tensor<BM, AK_BM, NCHW>& tensor, float vstart, \
-    float vend, typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream = NULL);
-
-#endif
-
 } // namespace saber
 
 } // namespace anakin
diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h
deleted file mode 100644
index 2e817c734..000000000
--- a/saber/funcs/batch_norm.h
+++ /dev/null
@@ -1,104 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_BATCH_NORM_H
-#define ANAKIN_SABER_FUNCS_BATCH_NORM_H
-
-#include "saber/core/tensor.h"
-#include "saber/funcs/base.h"
-#include "saber/saber_funcs_param.h"
-#include "saber/funcs/impl/impl_base.h"
-#include "saber/funcs/impl/impl_batch_norm.h"
-
-#ifdef NVIDIA_GPU
-//todo
-#include "saber/funcs/impl/impl_batch_norm.h"
-#endif
-
-#ifdef USE_X86_PLACE
-//todo
-#include "saber/funcs/impl/impl_batch_norm.h"
-#endif
-
-#ifdef USE_ARM_PLACE
-//todo
-#include "saber/funcs/impl/impl_batch_norm.h"
-#endif
-
-#ifdef USE_BM
-#include "saber/funcs/impl/bm/vender_batch_norm.h"
-#endif
-
-namespace anakin {
-namespace saber {
-
-template <typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_FLOAT,
-        DataType outDtype = AK_FLOAT,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW
->
-class BatchNorm : public BaseFunc<
-        Tensor<TargetType, inDtype, LayOutType_in>,
-        Tensor<TargetType, outDtype, LayOutType_out>,
-        Tensor<TargetType, OpDtype, LayOutType_op>,
-        ImplBase,
-        BatchnormParam
-> {
-public:
-    using BaseFunc<
-            Tensor<TargetType, inDtype, LayOutType_in>,
-            Tensor<TargetType, outDtype, LayOutType_out>,
-            Tensor<TargetType, OpDtype, LayOutType_op>,
-            ImplBase,
-            BatchnormParam>::BaseFunc;
-
-    BatchNorm() = default;
-
-    typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
-    typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
-    typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
-    typedef BatchnormParam<OpTensor> Param_t;
-    typedef std::vector<InDataTensor *> Input_v;
-    typedef std::vector<OutDataTensor *> Output_v;
-    typedef std::vector<Shape> Shape_v;
-
-    virtual SaberStatus compute_output_shape(const Input_v &input,
-                                             Output_v &output, Param_t &param) override {
-
-        Shape output_shape = (input[0]->valid_shape());
-        return output[0]->set_shape(output_shape);
-    }
-
-    virtual SaberStatus init_impl(ImplEnum implenum) override {
-        switch (implenum) {
-            case VENDER_IMPL:
-                this->_impl.push_back(new VenderBatchNorm <TargetType,
-                OpDtype, inDtype, outDtype,
-                LayOutType_op, LayOutType_in, LayOutType_out>);
-                return SaberSuccess;
-
-            case SABER_IMPL:
-                return SaberUnImplError;
-
-            default:
-                return SaberUnImplError;
-        }
-    }
-
-private:
-
-    virtual void pick_best_static() override {
-        if (true) // some condition?
-            this->_best_impl = this->_impl[0];
-    }
-
-    virtual void pick_best_specify(ImplEnum implenum) override {
-        this->_best_impl = this->_impl[0];
-    }
-
-};
-
-} // namespace saber
-} // namespace anakin
-
-#endif
diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
deleted file mode 100644
index ec27ac054..000000000
--- a/saber/funcs/impl/bm/vender_activation.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_BMDNN_ACT_H
-#define ANAKIN_SABER_FUNCS_BMDNN_ACT_H
-#include "saber/funcs/impl/impl_activation.h"
-namespace anakin {
-
-namespace saber {
-
-template <DataType OpDtype ,
-    DataType inDtype,
-    DataType outDtype,
-    typename LayOutType_op,
-    typename LayOutType_in,
-    typename LayOutType_out>
-class VenderActivation<BM, OpDtype, inDtype, outDtype,\
-    LayOutType_op, LayOutType_in, LayOutType_out> : \
-    public ImplBase<
-        Tensor<BM, inDtype, LayOutType_in>, 
-        Tensor<BM, outDtype, LayOutType_out>,
-        Tensor<BM, OpDtype, LayOutType_op>,
-        ActivationParam<Tensor<BM, OpDtype, LayOutType_op> > > 
-{
-public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
-    VenderActivation(): _handle(NULL), _active_type(Active_relu) {}
-
-    ~VenderActivation() {}
-
-    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ActivationParam<OpTensor>& param, Context<BM>& ctx) {
-        // not sure
-	_handle = get_bm_handle();
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ActivationParam<OpTensor>& param, Context<BM>& ctx) {
-        // not sure
-        return SaberSuccess;
-    }
-
-    //call bmdnn activation funcs here
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ActivationParam<OpTensor>& param) {
-        const InDataType in_data = *(inputs[0]->data());
-        OutDataType out_data = *(outputs[0]->mutable_data());
-        int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width();
-        int input_n = inputs[0]->num();
-
-        _active_type = param.active;
-        switch (_active_type) {
-            case Active_relu:
-                BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, 0.0, input_n, input_dim, out_data));
-                break;
-            case Active_sigmoid:
-                BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data));
-                break;
-            case Active_tanh:
-                BMDNN_CHECK(bmdnn_tanh_forward(_handle, in_data, input_n, input_dim, out_data));
-                break;
-        }
-        return SaberSuccess;
-    }
-
-private:
-    bm_handle_t _handle;
-    ActiveType _active_type;
-};
-
-template class VenderActivation<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
-} // namespace saber
-
-} // namespace anakin
-#endif //ANAKIN_SABER_FUNCS_BMDNN_ACT_H
diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h
deleted file mode 100644
index 4f433a4a9..000000000
--- a/saber/funcs/impl/bm/vender_batch_norm.h
+++ /dev/null
@@ -1,96 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H
-#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H
-
-#include "saber/funcs/impl/impl_batch_norm.h"
-
-namespace anakin{
-
-namespace saber {
-
-template <DataType OpDtype ,
-    DataType inDtype,
-    DataType outDtype,
-    typename LayOutType_op,
-    typename LayOutType_in,
-    typename LayOutType_out>
-class VenderBatchNorm<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
- public ImplBase<
-    Tensor<BM, inDtype, LayOutType_in>, 
-    Tensor<BM, outDtype, LayOutType_out>,
-    Tensor<BM, OpDtype, LayOutType_op>,
-    BatchnormParam<Tensor<BM, OpDtype, LayOutType_op>>> {
-public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
-    VenderBatchNorm() : _handle(NULL) {}
-
-    ~VenderBatchNorm() {}
-
-    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
-                  std::vector<DataTensor_out*>& outputs,
-                  BatchnormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
-
-        _handle = get_bm_handle();
-        return create(inputs, outputs, batch_norm_param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
-                std::vector<DataTensor_out*>& outputs,
-                BatchnormParam<OpTensor> &batch_norm_param, Context<BM> &ctx) {
-    }
-
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                          std::vector<DataTensor_out*>& outputs,
-                          BatchnormParam<OpTensor> &param) {
-
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
-
-        int input_n = inputs[0]->num();
-        int input_c = inputs[0]->channel();
-        int input_h = inputs[0]->height();
-        int input_w = inputs[0]->width();
-
-        float eps = param.eps;
-        float scale = param.scale;
-        
-        bm_device_mem_t mean_ma = bm_mem_from_system(&param.mean[0]);
-        bm_device_mem_t variance_ma = bm_mem_from_system(&param.variance[0]);
-
-        bm_device_mem_t* variance_holder = new bm_device_mem_t();
-
-        bmdnn_batchnorm_forward_inference(
-                _handle,
-                //input
-                *in_data,
-                mean_ma,
-                variance_ma,
-                scale,
-                *variance_holder,
-                eps,
-                input_n,
-                input_c,
-                input_h,
-                input_w,
-                //output
-                *out_data
-        );
-
-        return SaberSuccess;
-    }
-
-private:
-    bm_handle_t _handle;
-};
-
-} //namespace saber
-
-} // namespace anakin
-
-#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H
diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
new file mode 100644
index 000000000..e1e6ec1ec
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -0,0 +1,102 @@
+
+#include "saber/funcs/impl/bm/vender_conv.h"
+
+namespace anakin
+{
+namespace saber
+{
+
+// FP32 part
+template <>
+SaberStatus VenderConv2D<BM, AK_BM>::
+    init(const std::vector<Tensor<BM> *> &inputs,
+         std::vector<Tensor<BM> *> &outputs,
+         ConvParam<BM> &param, Context<BM> &ctx)
+{
+
+    _handle = get_bm_handle();
+    return create(inputs, outputs, param, ctx);
+}
+
+template <>
+SaberStatus VenderConv2D<BM, AK_BM>::
+    create(const std::vector<Tensor<BM> *> &inputs,
+           std::vector<Tensor<BM> *> &outputs,
+           ConvParam<BM> &param, Context<BM> &ctx)
+{
+}
+
+template <>
+SaberStatus VenderConv2D<BM, AK_BM>::\
+    dispatch(const std::vector<Tensor<BM> *> &inputs,
+                             std::vector<Tensor<BM> *> &outputs,
+                             ConvParam<BM> &param)
+{
+
+    /*const bm_device_mem_t *in_data = (const bm_device_mem_t *)inputs[0]->data();
+    const bm_device_mem_t *weight = (const bm_device_mem_t *)param.weight()->data();
+    bm_device_mem_t *out_data = (bm_device_mem_t *)outputs[0]->mutable_data();
+
+    int input_n = inputs[0]->num();
+    int input_c = inputs[0]->channel();
+    int input_h = inputs[0]->height();
+    int input_w = inputs[0]->width();
+
+    int output_n = outputs[0]->num();
+    int output_c = outputs[0]->channel();
+    int output_h = outputs[0]->height();
+    int output_w = outputs[0]->width();
+
+    int group = param.group;
+    int kh = param.weight()->height();
+    int kw = param.weight()->width();
+    int pad_h = param.pad_h;
+    int pad_w = param.pad_w;
+    int stride_h = param.stride_h;
+    int stride_w = param.stride_w;
+    int dilation_h = param.dilation_h;
+    int dilation_w = param.dilation_w;
+
+    bool with_bias = param.bias()->size() > 0;
+    const bm_device_mem_t *bias = with_bias ? (const bm_device_mem_t *)param.bias()->data() : &bm_mem_null();
+
+    bm_tensor_4d_t input_shape = {
+        input_n,
+        input_c,
+        input_h,
+        input_w};
+
+    bm_tensor_4d_t output_shape = {
+        output_n,
+        output_c,
+        output_h,
+        output_w};
+
+    bm_kernel_param_t kernel_param = {
+        group,
+        output_c,
+        input_c,
+        kh,
+        kw};
+
+    bm_conv_param_t conv_param = {
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        dilation_h,
+        dilation_w,
+        0};
+
+    BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape,
+                                   kernel_param, output_shape, conv_param, with_bias, *out_data));*/
+
+    return SaberSuccess;
+}
+
+// INT8 part
+// TODO:
+
+template class VenderConv2D<BM, AK_BM>;
+} // namespace saber
+} // namespace anakin
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
index 7243fd6a4..fbb56d359 100644
--- a/saber/funcs/impl/bm/vender_conv.h
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -7,112 +7,25 @@ namespace anakin{
 
 namespace saber{
 
-template <DataType OpDtype,
-    DataType inDtype,
-    DataType outDtype,
-    typename LayOutType_op,
-    typename LayOutType_in,
-    typename LayOutType_out>
-class VenderConv2D<BM, OpDtype, inDtype, outDtype,\
-    LayOutType_op, LayOutType_in, LayOutType_out> : \
-    public ImplBase<
-        Tensor<BM, inDtype, LayOutType_in>,
-        Tensor<BM, outDtype, LayOutType_out>,
-        Tensor<BM, OpDtype, LayOutType_op>,
-        ConvParam<Tensor<BM, OpDtype, LayOutType_op> > >
-{
+template <DataType OpDtype>
+class VenderConv2D<BM, OpDtype> : public ImplBase<
+        BM, OpDtype, ConvParam<BM> > {
+            
 public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
     VenderConv2D(): _handle(NULL) {}
     ~VenderConv2D() {}
 
-    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ConvParam<OpTensor>& param, Context<BM>& ctx) {
-
-        _handle = get_bm_handle();
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ConvParam<OpTensor>& param, Context<BM>& ctx) {
-        
-    }
-
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                          std::vector<DataTensor_out*>& outputs,
-                          ConvParam<OpTensor>& param) {
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        const InDataType *weight = (const InDataType *) param.weight()->data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
-
-        int input_n = inputs[0]->num();
-        int input_c = inputs[0]->channel();
-        int input_h = inputs[0]->height();
-        int input_w = inputs[0]->width();
-
-        int output_n = outputs[0]->num();
-        int output_c = outputs[0]->channel();
-        int output_h = outputs[0]->height();
-        int output_w = outputs[0]->width();
-
-        int group = param.group;
-        int kh = param.weight()->height();
-        int kw = param.weight()->width();
-        int pad_h = param.pad_h;
-        int pad_w = param.pad_w;
-        int stride_h = param.stride_h;
-        int stride_w = param.stride_w;
-        int dilation_h = param.dilation_h;
-        int dilation_w = param.dilation_w;
-
-        bool with_bias = param.bias()->size() > 0;
-        const InDataType *bias = with_bias? (const InDataType *) param.bias()->data() : &bm_mem_null();
-
-        bm_tensor_4d_t input_shape = {
-            input_n,
-            input_c,
-            input_h,
-            input_w
-        };
-
-        bm_tensor_4d_t output_shape = {
-            output_n,
-            output_c,
-            output_h,
-            output_w
-        };
-
-        bm_kernel_param_t kernel_param = {
-            group,
-            output_c,
-            input_c,
-            kh,
-            kw
-        };
+    virtual SaberStatus init(const std::vector<Tensor<BM> *>& inputs,
+                             std::vector<Tensor<BM> *>& outputs,
+                             ConvParam<BM>& param, Context<BM>& ctx);
 
-        bm_conv_param_t conv_param = {
-            stride_h,
-            stride_w,
-            pad_h,
-            pad_w,
-            dilation_h,
-            dilation_w,
-            0
-        };
+    virtual SaberStatus create(const std::vector<Tensor<BM> *>& inputs,
+                               std::vector<Tensor<BM> *>& outputs,
+                               ConvParam<BM>& param, Context<BM>& ctx);
 
-        BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape, 
-                                    kernel_param, output_shape, conv_param, with_bias, *out_data));
-                                    
-        return SaberSuccess;
-    }
+    virtual SaberStatus dispatch(const std::vector<Tensor<BM>*>& inputs,
+                                 std::vector<Tensor<BM>*>& outputs,
+                                 ConvParam<BM>& param);
 
 private:
     bm_handle_t _handle;
diff --git a/saber/funcs/impl/bm/vender_conv_act.h b/saber/funcs/impl/bm/vender_conv_act.h
deleted file mode 100644
index 4d9c9f3bb..000000000
--- a/saber/funcs/impl/bm/vender_conv_act.h
+++ /dev/null
@@ -1,198 +0,0 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H
-#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H
-
-#include "saber/funcs/impl/impl_conv_act.h"
-#include "saber/funcs/impl/cuda/cudnn_helper.h"   
-#include <cudnn.h>
-
-namespace anakin{
-
-namespace saber{
-
-template <DataType OpDtype,
-    DataType inDtype,
-    DataType outDtype,
-    typename LayOutType_op,
-    typename LayOutType_in,
-    typename LayOutType_out>
-class VenderConv2DAct<BM, OpDtype, inDtype, outDtype,\
-    LayOutType_op, LayOutType_in, LayOutType_out> : \
-    public ImplBase<
-        Tensor<BM, inDtype, LayOutType_in>,
-        Tensor<BM, outDtype, LayOutType_out>,
-        Tensor<BM, OpDtype, LayOutType_op>,
-        ConvActiveParam<Tensor<BM, OpDtype, LayOutType_op> > >
-{
-public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
-    VenderConv2DAct()
-            : _handle(NULL)
-            , _workspaceData(NULL)
-            , _workspace(NULL)
-            , _conv_descs(NULL)
-            , _input_descs(NULL)
-            , _output_descs(NULL)
-            , _filter_desc(NULL)
-            , _workspace_fwd_sizes(0)
-            , _workspaceSizeInBytes(0)
-            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
-            , _input_nchw_descs(NULL)
-            , _output_nchw_descs(NULL)
-            , x8_data(NULL)
-            , y8_data(NULL)
-            , x8_data_size(0)
-            , y8_data_size(0)
-    {}
-
-    ~VenderConv2DAct() {
-
-        if (_conv_descs) {
-            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
-        }
-        if (_input_descs) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
-        }
-        if (_output_descs) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
-        }
-        if (_filter_desc) {
-            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
-        }
-        if (_handle != NULL) {
-            CUDNN_CHECK(cudnnDestroy(_handle));
-        }
-        if (_workspaceData != NULL) {
-            cudaFree(_workspaceData);
-        }
-        if (_input_nchw_descs != NULL) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs));
-        }
-        if (_output_nchw_descs != NULL) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs));
-        }
-        if (x8_data != NULL) {
-            CUDA_CHECK(cudaFree(x8_data));
-        }
-        if (y8_data != NULL) {
-            CUDA_CHECK(cudaFree(y8_data));
-        }
-    }
-
-    /**
-     * [Create description] Init all cudnn resource here
-     * @AuthorHTL
-     * @DateTime  2018-02-01T16:13:06+0800
-     * @param     inputs                    [description]
-     * @param     outputs                   [description]
-     * @param     conv_param                [conv parameters]
-     */
-    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ConvActiveParam<OpTensor>& param, Context<BM>& ctx) {
-        // ---- init cudnn resources ----
-
-        _workspaceSizeInBytes = 0;
-        _workspaceData = NULL;
-
-        _workspace_fwd_sizes = 0;
-
-        this->_ctx = ctx;
-        // ---- get cuda resources ----
-
-        cudaStream_t cuda_stream;
-        cuda_stream = ctx.get_compute_stream();
-
-        CUDNN_CHECK(cudnnCreate(&_handle));
-        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
-
-        _workspace = NULL;
-
-        int in_channels = inputs[0]->channel();
-
-        // ---- create cudnn Descs ----
-        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
-
-        cudnn::createTensorDesc<InDataType>(&_input_descs);
-        cudnn::createTensorDesc<OutDataType>(&_output_descs);
-        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
-        cudnn::create_activation_des<InDataType>(&_active_descs);
-
-        if (param.conv_param.bias()->size() > 0) {
-            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
-        }
-
-        cudnnCreateTensorDescriptor(&_input_nchw_descs);
-        cudnnCreateTensorDescriptor(&_output_nchw_descs);
-
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ConvActiveParam<OpTensor>& param, Context<BM>& ctx);
-    //call cudnnConvolutionForward here
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                          std::vector<DataTensor_out*>& outputs,
-                          ConvActiveParam<OpTensor>& param);
-private:
-    cudnnHandle_t _handle;
-    cudnnConvolutionFwdAlgo_t _fwd_algo;
-
-    cudnnTensorDescriptor_t _input_descs;
-    cudnnTensorDescriptor_t _output_descs;
-    cudnnTensorDescriptor_t _bias_desc;
-
-    cudnnFilterDescriptor_t _filter_desc;
-
-    cudnnConvolutionDescriptor_t _conv_descs;
-
-    size_t _workspace_fwd_sizes;
-    size_t _workspaceSizeInBytes;  // size of underlying storage
-
-    void *_workspaceData;  // underlying storage
-    void *_workspace;  // aliases into workspaceData
-
-    const bool _use_tensor_core = true;
-    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
-    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-
-    // activation descriptor
-    cudnnActivationDescriptor_t _active_descs;
-
-    // create transform descriptor
-    cudnnTensorDescriptor_t _input_nchw_descs;
-    cudnnTensorDescriptor_t _output_nchw_descs;
-
-    void *x8_data;
-    void *y8_data;
-
-    int x8_data_size;
-    int y8_data_size;
-};
-
-
-}
-
-}
-#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H
diff --git a/saber/funcs/impl/bm/vender_conv_act_pooling.h b/saber/funcs/impl/bm/vender_conv_act_pooling.h
deleted file mode 100644
index e602a693d..000000000
--- a/saber/funcs/impl/bm/vender_conv_act_pooling.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. 
-*/
-
-#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H
-#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H
-
-#include "saber/funcs/impl/impl_conv_act_pooling.h"
-#include "saber/funcs/impl/cuda/cudnn_helper.h"   
-#include <cudnn.h>
-
-namespace anakin{
-
-namespace saber{
-
-template <DataType OpDtype,
-    DataType inDtype,
-    DataType outDtype,
-    typename LayOutType_op,
-    typename LayOutType_in,
-    typename LayOutType_out>
-class VenderConv2DActPooling<BM, OpDtype, inDtype, outDtype,\
-    LayOutType_op, LayOutType_in, LayOutType_out> : \
-    public ImplBase<
-        Tensor<BM, inDtype, LayOutType_in>,
-        Tensor<BM, outDtype, LayOutType_out>,
-        Tensor<BM, OpDtype, LayOutType_op>,
-        ConvActivePoolingParam<Tensor<BM, OpDtype, LayOutType_op> > >
-{
-public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
-    VenderConv2DActPooling()
-            : _handle(NULL)
-            , _workspaceData(NULL)
-            , _workspace(NULL)
-            , _conv_descs(NULL)
-            , _input_descs(NULL)
-            , _output_descs(NULL)
-            , _filter_desc(NULL)
-            , _workspace_fwd_sizes(0)
-            , _workspaceSizeInBytes(0)
-            , _fwd_algo((cudnnConvolutionFwdAlgo_t)0)
-    {}
-    ~VenderConv2DActPooling() {
-
-        if (_conv_descs) {
-            CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs));
-        }
-        if (_input_descs) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs));
-        }
-        if (_output_descs) {
-            CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs));
-        }
-        if (_filter_desc) {
-            CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc));
-        }
-        if (_handle != NULL) {
-            CUDNN_CHECK(cudnnDestroy(_handle));
-        }
-        if (_workspaceData != NULL) {
-            cudaFree(_workspaceData);
-        }
-    }
-
-    /**
-     * [Create description] Init all cudnn resource here
-     * @AuthorHTL
-     * @DateTime  2018-02-01T16:13:06+0800
-     * @param     inputs                    [description]
-     * @param     outputs                   [description]
-     * @param     conv_param                [conv parameters]
-     */
-    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ConvActivePoolingParam<OpTensor>& param, Context<BM>& ctx) {
-        // ---- init cudnn resources ----
-
-        _workspaceSizeInBytes = 0;
-        _workspaceData = NULL;
-
-        _workspace_fwd_sizes = 0;
-
-        this->_ctx = ctx;
-        // ---- get cuda resources ----
-
-        cudaStream_t cuda_stream;
-        cuda_stream = ctx.get_compute_stream();
-
-        CUDNN_CHECK(cudnnCreate(&_handle));
-        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
-
-        _workspace = NULL;
-
-        int in_channels = inputs[0]->channel();
-
-        // ---- create cudnn Descs ----
-        cudnn::createFilterDesc<OpDataType>(&_filter_desc);
-
-        cudnn::createTensorDesc<InDataType>(&_input_descs);
-        cudnn::createTensorDesc<InDataType>(&_inner_descs);
-        cudnn::createTensorDesc<OutDataType>(&_output_descs);
-        cudnn::createConvolutionDesc<OpDataType>(&_conv_descs);
-        if (param.has_activation) {
-            cudnn::create_activation_des<InDataType>(&_active_descs);
-        }
-        if (param.has_pooling) {
-            cudnn::create_pooling_des<InDataType>(&_pooling_descs);
-        }
-        if (param.conv_param.bias()->size() > 0) {
-            cudnn::createTensorDesc<OpDataType>(&_bias_desc);
-        }
-
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ConvActivePoolingParam<OpTensor>& param, Context<BM>& ctx);
-    //call cudnnConvolutionForward here
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                          std::vector<DataTensor_out*>& outputs,
-                          ConvActivePoolingParam<OpTensor>& param);
-private:
-    cudnnHandle_t _handle;
-    cudnnConvolutionFwdAlgo_t _fwd_algo;
-
-    cudnnTensorDescriptor_t _input_descs;
-    cudnnTensorDescriptor_t _output_descs;
-    cudnnTensorDescriptor_t _inner_descs;
-    cudnnTensorDescriptor_t _bias_desc;
-
-    cudnnFilterDescriptor_t _filter_desc;
-
-    cudnnConvolutionDescriptor_t _conv_descs;
-    cudnnPoolingDescriptor_t _pooling_descs;
-
-    size_t _workspace_fwd_sizes;
-    size_t _workspaceSizeInBytes;  // size of underlying storage
-
-    void *_workspaceData;  // underlying storage
-    void *_workspace;  // aliases into workspaceData
-
-    const bool _use_tensor_core = true;
-    const size_t _workspace_limit_bytes = 64 * 1024 * 1024;
-    const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-
-    // activation descriptor
-    cudnnActivationDescriptor_t _active_descs;
-
-    Shape _inner_shape;
-    DataTensor_out _inner_tensor;
-};
-
-
-}
-
-}
-#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H
diff --git a/saber/funcs/impl/bm/vender_eltwise.h b/saber/funcs/impl/bm/vender_eltwise.h
deleted file mode 100644
index 050fc4d43..000000000
--- a/saber/funcs/impl/bm/vender_eltwise.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H
-#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H
-
-#include "saber/funcs/impl/impl_eltwise.h"
-
-namespace anakin {
-
-namespace saber {
-
-template <DataType OpDtype,
-            DataType inDtype,
-            DataType outDtype,
-            typename LayOutType_op,
-            typename LayOutType_in,
-            typename LayOutType_out>
-class VenderEltwise<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
-public ImplBase<
-        Tensor<BM, inDtype, LayOutType_in>,
-        Tensor<BM, outDtype, LayOutType_out>,
-        Tensor<BM, OpDtype, LayOutType_op>,
-        EltwiseParam<Tensor<BM, OpDtype, LayOutType_op>>> {
-public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
-    VenderEltwise() {}
-
-    ~VenderEltwise() {}
-
-    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
-                         std::vector<DataTensor_out*>& outputs,
-                         EltwiseParam<OpTensor> &param,
-                         Context<BM> &ctx) {
-        _handle = get_bm_handle();
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
-                           std::vector<DataTensor_out*>& outputs,
-                           EltwiseParam<OpTensor> &param,
-                           Context<BM> &ctx) {
-    }
-
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                             std::vector<DataTensor_out*>& outputs,
-                             EltwiseParam<OpTensor> &param) {
-
-        int op_ = 0;
-        switch (param.operation) {
-            case Eltwise_prod:
-                op_ = 0;
-                break;
-            case Eltwise_sum:
-                op_ = 1;
-                break;
-            case Eltwise_max:
-                op_ = 2;
-                break;
-            default:
-                return SaberUnImplError;
-        }
-
-        //int input_size = inputs.size();
-        //CHECK_GE(input_size, 2) << "Input size should >= 2!";
-
-        OutDataType out_data = *(outputs[0]->mutable_data());
-        int input_n = inputs[0]->num();
-        int input_c = inputs[0]->channel();
-        int input_h = inputs[0]->height();
-        int input_w = inputs[0]->width();
-
-        std::vector<float> coeff_ = param.coeff;
-        if (coeff_.size() != inputs.size()) {
-            int diff = inputs.size() - coeff_.size();
-            for (int j=0; j<diff; j++) {
-                coeff_.push_back(1);
-            }
-        }
-
-        bm_device_mem_t* mask_data = new bm_device_mem_t();
-
-        int flag_first = 1;
-        for (int i=0; i<inputs.size(); i++){
-            const InDataType in_data = *(inputs[i]->data());
-            bmdnn_eltwise_forward(
-                    _handle,
-                    op_,
-                    flag_first,
-                    coeff_[i],
-                    i,
-                    in_data,
-                    out_data,
-                    input_n,
-                    input_c * input_h * input_w,
-                    *mask_data,
-                    out_data);
-
-            bm_flush(_handle);
-            flag_first = 0;
-        }
-
-        //bm_free_device(_handle, *mask_data);
-        return SaberSuccess;
-    }
-
-private:
-    bm_handle_t _handle;
-};
-
-} //namespace saber
-
-} //namespace anakin
-
-#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h
deleted file mode 100644
index c0cd7ea66..000000000
--- a/saber/funcs/impl/bm/vender_fc.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H
-#define ANAKIN_SABER_FUNCS_BMDNN_FC_H
-
-#include "saber/funcs/impl/impl_fc.h"
-
-namespace anakin{
-
-namespace saber{
-
-template <DataType OpDtype,
-        DataType inDtype,
-        DataType outDtype,
-        typename LayOutType_op,
-        typename LayOutType_in,
-        typename LayOutType_out>
-class VenderFc<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>: \
-    public ImplBase<
-        Tensor<BM, inDtype, LayOutType_in>, \
-        Tensor<BM, outDtype, LayOutType_out>, \
-        Tensor<BM, OpDtype, LayOutType_op>, \
-        FcParam<Tensor<BM, OpDtype, LayOutType_op>>> {
-
-public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
-    VenderFc(): _handle(NULL) {};
-    ~VenderFc() {}
-
-    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            FcParam<OpTensor>& param, Context<BM>& ctx){
-        _handle = get_bm_handle();
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            FcParam<OpTensor>& param, Context<BM>& ctx){
-        return SaberSuccess;
-    }
-
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            FcParam<OpTensor>& param){
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        const InDataType *weights = (const InDataType *) param.weights->data();
-        const InDataType *bias = (const InDataType *) param.bias->data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
-        int batch_size = inputs[0]->count_valid(0, param.axis);
-        int input_len = inputs[0]->count_valid(param.axis, inputs[0]->dims());
-        int output_len = param.num_output;
-        if (output_len <= 0) {
-            int weight_size = param.weights->valid_size();
-            output_len = weight_size / input_len;
-        }
-
-        BMDNN_CHECK(bmdnn_fc_forward(_handle, *in_data, *weights, *bias,
-                                    batch_size, output_len, input_len, param.is_transpose_weights, 1, 0,
-                                    *out_data));
-        return SaberSuccess;
-    };
-
-private:
-    bm_handle_t _handle;
-};
-
-} //namespace saber
-
-} //namespace anakin
-
-#endif // ANAKIN_SABER_FUNCS_BMDNN_FC_H
diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h
deleted file mode 100644
index 1bdcfdecb..000000000
--- a/saber/funcs/impl/bm/vender_pooling.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H
-#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H
-
-#include "saber/funcs/impl/impl_pooling.h"
-
-namespace anakin{
-
-namespace saber {
-
-template <DataType OpDtype ,
-    DataType inDtype,
-    DataType outDtype,
-    typename LayOutType_op,
-    typename LayOutType_in,
-    typename LayOutType_out>
-class VenderPooling<BM, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>:\
- public ImplBase<
-    Tensor<BM, inDtype, LayOutType_in>, 
-    Tensor<BM, outDtype, LayOutType_out>,
-    Tensor<BM, OpDtype, LayOutType_op>,
-    PoolingParam<Tensor<BM, OpDtype, LayOutType_op>>> {
-public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
-    VenderPooling() : _handle(NULL) {}
-
-    ~VenderPooling() {}
-
-    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
-                  std::vector<DataTensor_out*>& outputs,
-                  PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
-
-        _handle = get_bm_handle();
-        return create(inputs, outputs, pooling_param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
-                std::vector<DataTensor_out*>& outputs,
-                PoolingParam<OpTensor> &pooling_param, Context<BM> &ctx) {
-    }
-
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                          std::vector<DataTensor_out*>& outputs,
-                          PoolingParam<OpTensor> &param) {
-        const InDataType in_data = *(inputs[0]->data());
-        OutDataType out_data = *(outputs[0]->mutable_data());
-        int input_n = inputs[0]->num();
-        int input_c = inputs[0]->channel();
-        int input_h = inputs[0]->height();
-        int input_w = inputs[0]->width();
-        int kh = param.window_h;
-        int kw = param.window_w;
-        int pad_h = param.pad_h;
-        int pad_w = param.pad_w;
-        int stride_h = param.stride_h;
-        int stride_w = param.stride_w;
-        int is_avg_pooling;
-        if(param.pooling_type == Pooling_max){
-            is_avg_pooling = 0;
-        } else {
-            is_avg_pooling = 1;
-        }
-
-        BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, 
-                            input_n, input_c, input_h, input_w, kh, kw, pad_h, pad_w, 
-                            stride_h, stride_w, is_avg_pooling, out_data));
-        return SaberSuccess;
-    }
-
-private:
-    bm_handle_t _handle;
-    PoolingType _pooling_type;
-};
-
-template class VenderPooling<BM, AK_BM, AK_BM, AK_BM, NCHW, NCHW, NCHW>;
-
-} //namespace saber
-
-} // namespace anakin
-
-#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H
diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h
deleted file mode 100644
index 2876e8005..000000000
--- a/saber/funcs/impl/bm/vender_scale.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
-#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
-
-#include "saber/funcs/impl/impl_scale.h"
-
-namespace anakin{
-
-namespace saber{
-
-template <DataType OpDtype,
-    DataType inDtype,
-    DataType outDtype,
-    typename LayOutType_op,
-    typename LayOutType_in,
-    typename LayOutType_out>
-class VenderScale<BM, OpDtype, inDtype, outDtype,\
-    LayOutType_op, LayOutType_in, LayOutType_out> : \
-    public ImplBase<
-        Tensor<BM, inDtype, LayOutType_in>,
-        Tensor<BM, outDtype, LayOutType_out>,
-        Tensor<BM, OpDtype, LayOutType_op>,
-        ScaleParam<Tensor<BM, OpDtype, LayOutType_op> > >
-{
-public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
-    VenderScale() {}
-
-    ~VenderScale() {}
-
-    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ScaleParam<OpTensor>& param, Context<BM>& ctx) {
-
-        _handle = get_bm_handle();
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            ScaleParam<OpTensor>& param, Context<BM> &ctx) {
-
-    }
-    
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                          std::vector<DataTensor_out*>& outputs,
-                          ScaleParam<OpTensor>& param) {
-
-        const InDataType in_data = *(inputs[0]->data());
-        OutDataType out_data = *(outputs[0]->mutable_data());
-
-        int input_n = inputs[0]->num();
-        int input_c = inputs[0]->channel();
-        int input_h = inputs[0]->height();
-        int input_w = inputs[0]->width();
-
-        int axis = (param.num_axes == 0) ? 0 : param.axis;
-        int num_axes = param.num_axes >=0 ? param.num_axes : inputs[0]->shape().dims() - axis;
-
-        int outer_dim = inputs[0]->count(0, axis);
-        int inner_dim = inputs[0]->count(axis + num_axes, inputs[0]->shape().dims());
-        int scale_dim = inputs[0]->count(axis, axis + num_axes);
-        /* if (inputs.size() == 1) { */
-        /*     CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; */
-        /* } */
-
-        float* scale_data = &param.scale_w[0];
-        bm_device_mem_t* data_extension = new bm_device_mem_t();
-        int size = input_n * input_c * input_h * input_w;
-        bm_malloc_device_byte(_handle, data_extension, size * sizeof(float));
-        BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, bm_mem_from_system(scale_data),
-                input_n, input_c, input_h, input_w,
-                scale_dim, inner_dim, 0,
-                *data_extension, out_data));
-        
-        if (param.bias_term) {
-            float* host_bias = &param.scale_b[0];
-            float* host_extension = new float[size];
-            int dim = inner_dim * scale_dim;
-            for (int i = 0; i < size; ++i) {
-                 int bias_dim = (i % dim) / inner_dim;
-                 host_extension[i] = host_bias[bias_dim];
-            }
-
-            bm_flush(_handle);
-            BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension),
-                    outer_dim, scale_dim * inner_dim, out_data));
-
-            delete [] host_bias;
-            delete [] host_extension;
-        }
-        bm_free_device(_handle, *data_extension);
-        return SaberSuccess;
-    }
-private:
-    bm_handle_t _handle;
-};
-
-}
-}
-#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H
diff --git a/saber/funcs/impl/bm/vender_softmax.h b/saber/funcs/impl/bm/vender_softmax.h
deleted file mode 100644
index 55612f66a..000000000
--- a/saber/funcs/impl/bm/vender_softmax.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H
-#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H
-
-#include "saber/funcs/impl/impl_softmax.h"
-#include "saber/saber_funcs_param.h"
-#include "saber/saber_types.h"
-
-namespace anakin{
-
-namespace saber{
-
-template <DataType OpDtype,
-    DataType inDtype,
-    DataType outDtype,
-    typename LayOutType_op,
-    typename LayOutType_in,
-    typename LayOutType_out>
-class VenderSoftmax<BM, OpDtype, inDtype, outDtype,\
-    LayOutType_op, LayOutType_in, LayOutType_out> : \
-    public ImplBase<
-        Tensor<BM, inDtype, LayOutType_in>,
-        Tensor<BM, outDtype, LayOutType_out>,
-        Tensor<BM, OpDtype, LayOutType_op>,
-        SoftmaxParam<Tensor<BM, OpDtype, LayOutType_op> > >
-{
-public:
-    typedef Tensor<BM, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<BM, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<BM, OpDtype, LayOutType_op> OpTensor;
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-
-    VenderSoftmax(): _handle(NULL) {}
-    ~VenderSoftmax() {}
-
-    /**
-     * \brief initial all bmdnn resources here
-     * @param inputs
-     * @param outputs
-     * @param param
-     * @param ctx
-     */
-    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            SoftmaxParam<OpTensor>& param, Context<BM>& ctx) {
-
-        _handle = get_bm_handle();
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            SoftmaxParam<OpTensor>& param, Context<BM> &ctx) {
-
-    }
-
-    //call cudnnConvolutionForward here
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                          std::vector<DataTensor_out*>& outputs,
-                          SoftmaxParam<OpTensor> &param){
-
-        const InDataType *in_data = (const InDataType *) inputs[0]->data();
-        OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data();
-
-        /*
-        int input_n = inputs[0]->num();
-        int input_c = inputs[0]->channel();
-        int input_h = inputs[0]->height();
-        int input_w = inputs[0]->width();
-        */
-
-        int outer_num = inputs[0]->count(0, param.axis);
-        int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims());
-
-        int N = outer_num;
-        int K = inputs[0]->valid_shape()[param.axis];
-        int H = inner_num;
-        int W = 1;
-
-        /*
-        const int stride_w = 1;
-        const int stride_h = W * stride_w;
-        const int stride_c = H * stride_h;
-        const int stride_n = K * stride_c;
-        */
-        
-        bmdnn_softmax_forward(
-                _handle,
-                *in_data,
-                N,
-                K,
-                H * W,
-                *out_data
-        );
-
-        return SaberSuccess;
-    }
-
-private:
-    bm_handle_t _handle;
-};
-
-} //namespace saber
-
-} //namespace anakin
-
-#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H
diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp
deleted file mode 100644
index 8de77498a..000000000
--- a/test/saber/bm/test_TargetWrapper_BM.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "saber_types.h"
-#include "target_wrapper.h"
-#include <iostream>
-
-#ifdef USE_BM
-using namespace anakin::saber;
-//static bm_handle_t handle;
-int main() {
-    //bmdnn_init(&handle);
-    typedef TargetWrapper<BM> API;
-    //int dev_count = 0;
-    //API::get_device_count(dev_count);
-    //std::cout << "dev_count: " << dev_count << std::endl;
-    
-    //bm_device_mem_t *pmem = new bm_device_mem_t();
-    void* pmem;
-    std::cout << "mem addr before mem_alloc: " << pmem << std::endl;
-    API::mem_alloc(&pmem, 3*200*400);
-    std::cout << "mem addr after  mem_alloc: " << pmem << std::endl;
-    std::cout << "Start mem_free test." << std::endl;
-    API::mem_free(pmem);
-    std::cout << "End mem_free test." << std::endl;
-    //bmdnn_deinit(handle);
-}
-#endif
-
diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp
deleted file mode 100644
index f8c8f46bb..000000000
--- a/test/saber/bm/test_saber_buffer_BM.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-#include "test_saber_buffer_BM.h"
-#include "saber/core/buffer.h"
-#include "saber/core/data_traits.h"
-
-using namespace anakin::saber;
-
-int get_bm_size() {
-    return 4;
-}
-
-template <DataType Ddatatype, DataType Hdatatype>
-void test_buffer() {
-
-    typedef TargetWrapper<X86> X86_API;
-    typedef TargetWrapper<BM> BM_API;
-    typedef typename DataTrait<Ddatatype>::dtype Ddtype;
-    typedef typename DataTrait<Hdatatype>::dtype Hdtype;
-    typedef Buffer<X86> BufferH;
-    typedef Buffer<BM> BufferD;
-
-    int n0 = 1024;
-    int n1 = 2048;
-
-    void* tmp_x86;
-    Hdtype* x86_ptr;
-    X86_API::mem_alloc(&tmp_x86, sizeof(Hdtype) * n0);
-    x86_ptr = static_cast<Hdtype*>(tmp_x86);
-
-    for (int i = 0; i < n0; i++) {
-        x86_ptr[i] = static_cast<Hdtype>(i);
-    }
-
-    void* tmp_bm;
-    Ddtype* bm_ptr;
-    BM_API::mem_alloc(&tmp_bm, get_bm_size() * n0);
-    bm_ptr = static_cast<Ddtype*>(tmp_bm);
-
-    LOG(INFO) << "Buffer: test default(empty) constructor";
-    BufferH x86_buf0;
-    BufferD bm_buf0;
-
-    LOG(INFO) << "Buffer: test constructor with data size";
-    BufferH x86_buf1(n0 * sizeof(Hdtype));
-    BufferD bm_buf1(n0 * sizeof(Ddtype));
-
-    LOG(INFO) << "Buffer: test constructor with data pointer, size and device id";
-    BufferH x86_buf2(x86_ptr, n0 * sizeof(Hdtype), X86_API::get_device_id());
-    BufferD bm_buf2(bm_ptr, n0 * get_bm_size(), BM_API::get_device_id());
-
-    LOG(INFO) << "Buffer: test copy constructor";
-    BufferH x86_buf3(x86_buf2);
-    LOG(INFO) << "BM Buffer copy constructor";
-    LOG(INFO) << "bm target id: " << BM_API::get_device_id();
-    LOG(INFO) << "bm buffer target id: " << bm_buf2.get_id();
-    BufferD bm_buf3(bm_buf2);
-    CHECK_EQ(x86_buf3.get_count(), x86_buf2.get_count()) << \
-            "shared buffer should have same data count";
-    CHECK_EQ(bm_buf3.get_count(), bm_buf2.get_count()) << \
-            "shared buffer should have same data count";
-
-    LOG(INFO) << "Buffer: test operator =";
-    x86_buf0 = x86_buf2;
-    bm_buf0 = bm_buf2;
-    CHECK_EQ(x86_buf0.get_count(), x86_buf2.get_count()) << \
-            "shared buffer should have same data count";
-    CHECK_EQ(bm_buf0.get_count(), bm_buf2.get_count()) << \
-            "shared buffer should have same data count";
-
-    LOG(INFO) << "Buffer: test re_alloc";
-    x86_buf1.re_alloc(n1 * sizeof(Hdtype));
-    bm_buf1.re_alloc(n1 * sizeof(Ddtype));
-    CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Hdtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
-    CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Ddtype)) << "buffer count error";
-    CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Ddtype)) << "buffer capacity error";
-    x86_buf1.re_alloc(n0 * sizeof(Hdtype));
-    bm_buf1.re_alloc(n0 * sizeof(Ddtype));
-    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
-    CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error";
-    CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error";
-
-    LOG(INFO) << "Buffer: test get_id()";
-    LOG(INFO) << "X86 device id: " << x86_buf0.get_id() << \
-              ", bm device id: " << bm_buf0.get_id();
-    CHECK_EQ(X86_API::get_device_id(), x86_buf0.get_id()) << "x86 device id error";
-    CHECK_EQ(BM_API::get_device_id(), bm_buf0.get_id()) << "bm device id error";
-
-    LOG(INFO) << "Buffer: test deep_cpy()";
-    x86_buf1.sync_copy_from(x86_buf2);
-    LOG(INFO) << "deep copy between two host buffer: ";
-    const Hdtype* ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
-    const Hdtype* ptr2 = static_cast<const Hdtype*>(x86_buf2.get_data());
-
-    for (int i = 0; i < 10; i++) {
-        std::cout << ptr1[i] << std::endl;
-    }
-
-    CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect";
-    bm_buf1.sync_copy_from(x86_buf2); 
-    LOG(INFO) << "deep copy from host buffer to device buffer";
-
-    //LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count();
-    //LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); 
-    //LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype);
-    //LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype);
-    
-
-    x86_buf1.re_alloc(bm_buf1.get_capacity());
-    x86_buf1.sync_copy_from(bm_buf1);
-    LOG(INFO) << "deep copy from device buffer to host buffer: ";
-    ptr1 = static_cast<const Hdtype*>(x86_buf1.get_data());
-
-    for (int i = 0; i < 10; i++) {
-        std::cout << ptr1[i] << std::endl;
-    }
-
-}
-
-TEST(TestSaberBufferBM, test_buffer_memcpy) {
-    test_buffer<AK_BM, AK_FLOAT>();
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h
deleted file mode 100644
index 8bbbe4511..000000000
--- a/test/saber/bm/test_saber_buffer_BM.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-
-using namespace anakin::test;
-
-class TestSaberBufferBM : public Test {
-public:
-    TestSaberBufferBM() {}
-    ~TestSaberBufferBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H
diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp
deleted file mode 100644
index f2df59c88..000000000
--- a/test/saber/bm/test_saber_context_BM.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#include "test_saber_context_BM.h"
-
-#ifdef USE_BM
-
-using namespace anakin::saber;
-
-TEST(TestSaberContextBM, test_BM_context) {
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-    typename API::event_t event;
-    API::create_event(event);
-    LOG(INFO) << "test context constructor";
-    Context<BM> ctx0;
-    Context<BM> ctx1(0, 1, 1);
-
-    //for BM no need to test stream as it is not in use
-}
-
-#endif
-
-int main(int argc, const char** argv) {
-    //TODO: init in another place
-    static bm_handle_t handle;
-    bmdnn_init(&handle);
-    
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h
deleted file mode 100644
index 653ee11fd..000000000
--- a/test/saber/bm/test_saber_context_BM.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef SABER_TEST_SABER_CONTEXT_BM_H
-#define SABER_TEST_SABER_CONTEXT_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/context.h"
-
-using namespace anakin::test;
-
-class TestSaberContextBM : public Test {
-public:
-    TestSaberContextBM() {}
-    ~TestSaberContextBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //SABER_TEST_SABER_CONTEXT_BM_H
diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp
deleted file mode 100644
index 1c7086cf1..000000000
--- a/test/saber/bm/test_saber_device_BM.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "test_saber_device_BM.h"
-
-#ifdef USE_BM
-
-using namespace anakin::saber;
-
-TEST(TestSaberDeviceBM, test_BM_device) {
-    Device<BM> dev_BM;
-}
-
-#endif
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h
deleted file mode 100644
index 3a6d61236..000000000
--- a/test/saber/bm/test_saber_device_BM.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef SABER_TEST_SABER_DEVICE_BM_H
-#define SABER_TEST_SABER_DEVICE_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/device.h"
-
-using namespace anakin::test;
-
-class TestSaberDeviceBM : public Test {
-public:
-    TestSaberDeviceBM() {}
-    ~TestSaberDeviceBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //SABER_TEST_SABER_DEVICE_BM_H
diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h
deleted file mode 100644
index 61d27d6f9..000000000
--- a/test/saber/bm/test_saber_func_BM.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/tensor.h"
-#include <fstream>
-#include <vector>
-
-using namespace anakin::test;
-
-int read_file(std::vector<float> &results, const char* file_name) {
-
-    std::ifstream infile(file_name);
-    if (!infile.good()) {
-        std::cout << "Cannot open " << std::endl;
-        return false;
-    }
-    LOG(INFO)<<"found filename: "<<file_name;
-    std::string line;
-    while (std::getline(infile, line)) {
-        results.push_back((float)atof(line.c_str()));
-    }
-    return 0;
-}
-
-class TestSaberFuncBM : public Test {
-public:
-    TestSaberFuncBM() {}
-    ~TestSaberFuncBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp
deleted file mode 100644
index 42f33e58d..000000000
--- a/test/saber/bm/test_saber_func_activation_BM.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-#include "core/context.h"
-#include "funcs/activation.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-template <typename Tensor>
-void print_tensor_shape(std::string name, Tensor& t0) {
-
-    LOG(INFO) << name << " valid shape is ["
-              << t0.valid_shape()[0] << ", "
-              << t0.valid_shape()[1] << ", "
-              << t0.valid_shape()[2] << ", "
-              << t0.valid_shape()[3] << "].";
-
-    LOG(INFO) << name << " real shape is ["
-              << t0.shape()[0] << ", "
-              << t0.shape()[1] << ", "
-              << t0.shape()[2] << ", "
-              << t0.shape()[3] << "].";
-
-    LOG(INFO) << name << " offset is ["
-              << t0.offset()[0] << ", "
-              << t0.offset()[1] << ", "
-              << t0.offset()[2] << ", "
-              << t0.offset()[3] << "].";
-}
-
-TEST(TestSaberFuncBM, test_func_constructor) {
-
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 1;
-    int img_h = 8;
-    int img_w = 8;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    int sign = -1;
-    for (int i = 0; i < img_host.size(); ++i) {
-	sign = i % 2 ? -1 : 1;
-        img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * sign);
-    }
-
-    img_dev.copy_from(img_host);
-    TensorDf4 output_dev;
-    print_tensor_device(img_dev);
-
-    // start Reshape & doInfer
-
-    Context<BM> ctx1(0, 1, 1);
-
-    ActivationParam<TensorDf4> param(Active_relu);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Activation<BM, AK_BM, AK_BM, AK_BM, NCHW> act;
-    act.compute_output_shape(input, output, param);
-    output_dev.re_alloc(output[0]->shape());
-
-    // init assume output tensor has been reshpaed by user.
-    act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-    act(input, output, param, ctx1);
-
-    print_tensor_device(output_dev);
-}
-
-int main(int argc, const char** argv) {
-    Env<BM>::env_init();
-    // initial logger
-    //logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp
deleted file mode 100644
index 395eb525f..000000000
--- a/test/saber/bm/test_saber_func_batch_norm_BM.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-#include "core/context.h"
-#include "funcs/batch_norm.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-
-TEST(TestSaberFuncBM, test_func_batch_norm_BM) {
-
-    typedef TargetWrapper<BM> API;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-    typedef TensorDf4::Dtype dtype;
-
-    //Input / output tensor
-    Shape shape_in(1, 1, 2, 2);
-    Shape shape_out = shape_in;
-
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-
-    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
-    for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = 1+i;
-    }
-
-    TensorDf4 tdin, tdout;
-    tdin.re_alloc(shape_in);
-    tdin.copy_from(thin);
-    input_dev_4d.push_back(&tdin);
-
-    LOG(INFO) << "Input tensor is:";
-    print_tensor_device(*input_dev_4d[0]);
-
-    //Batch norm param
-    std::vector<float> mean;
-    mean.push_back(1);
-
-    std::vector<float> variance;
-    variance.push_back(0.001);
-
-    float scale_in = 1;
-    float eps_in = float(1e-5);
-
-    BatchnormParam<TensorDf4> param(mean, variance, scale_in);
-
-    //BatachNorm
-    BatchNorm<BM, AK_BM, AK_BM, AK_BM, NCHW> batchNorm;
-
-    output_dev_4d.push_back(&tdout);
-    batchNorm.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-    LOG(INFO) << "re-alloc tensor buffer";
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
-
-    LOG(INFO) << "batch norm initialized to bm impl";
-    Context<BM> ctx_dev(0, 1, 1);
-    batchNorm.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
-
-    LOG(INFO) << "bm batch norm compute";
-    SaberTimer<BM> t1;
-    t1.clear();
-    t1.start(ctx_dev);
-
-    batchNorm(input_dev_4d, output_dev_4d, param, ctx_dev);
-
-    t1.end(ctx_dev);
-    float ts = t1.get_average_ms();
-    printf("bm batch norm total time : %.4f, avg time : %.4f\n", ts, ts);
-
-    print_tensor_device(*output_dev_4d[0]);
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    //Env<BM>::env_init();
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp
deleted file mode 100644
index 75663cb8a..000000000
--- a/test/saber/bm/test_saber_func_conv_BM.cpp
+++ /dev/null
@@ -1,551 +0,0 @@
-#include "core/context.h"
-#include "funcs/conv.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-template <typename Tensor>
-void print_tensor_shape(std::string name, Tensor &t0) {
-
-            LOG(INFO) << name << " valid shape is ["
-                      << t0.valid_shape()[0] << ", "
-                      << t0.valid_shape()[1] << ", "
-                      << t0.valid_shape()[2] << ", "
-                      << t0.valid_shape()[3] << "].";
-
-            LOG(INFO) << name << " real shape is ["
-                      << t0.shape()[0] << ", "
-                      << t0.shape()[1] << ", "
-                      << t0.shape()[2] << ", "
-                      << t0.shape()[3] << "].";
-
-            LOG(INFO) << name << " offset is ["
-                      << t0.offset()[0] << ", "
-                      << t0.offset()[1] << ", "
-                      << t0.offset()[2] << ", "
-                      << t0.offset()[3] << "].";
-}
-
-//Round a / b to nearest higher integer value
-inline int i_div_up(int a, int b)
-{
-    return (a % b != 0) ? (a / b + 1) : (a / b);
-}
-
-
-TEST(TestSaberFuncBM, test_conv_result) {
-
-    int group = 1;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 1;
-    
-    int img_num = 1;
-    int in_channels = 1;
-    int img_h = 8;
-    int img_w = 8;
-
-    bool bias_term = true;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << "  img_num = " << img_num;
-    LOG(INFO) << "  in_channels = " << in_channels;
-    LOG(INFO) << "  img_h = " << img_h;
-    LOG(INFO) << "  img_w = " << img_w;
-    LOG(INFO) << "  group = " << group;
-    LOG(INFO) << "  pad_h = " << pad_h;
-    LOG(INFO) << "  pad_w = " << pad_w;
-    LOG(INFO) << "  stride_h = " << stride_h;
-    LOG(INFO) << "  stride_w = " << stride_w;
-    LOG(INFO) << "  dilation_h = " << dilation_h;
-    LOG(INFO) << "  dilation_w = " << dilation_w;
-    LOG(INFO) << "  kernel_h = " << kernel_h;
-    LOG(INFO) << "  kernel_w = " << kernel_w;
-    LOG(INFO) << "  out_channels = " << out_channels;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-    
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = i;
-    }
-
-    img_dev.copy_from(img_host);
-    
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-    
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorHf4 output_host;
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-    
-    ConvParam<TensorDf4> param(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
-    conv.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    output_host.re_alloc(output[0]->shape());
-
-    LOG(INFO) << "regular start with group = " << group;
-    // init assume output tensor has been reshpaed by user.
-    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-
-    conv(input, output, param, ctx1);
-
-    output_dev.sync();
-
-    print_tensor_device(img_dev);
-    print_tensor_device(output_dev);
-}
-
-TEST(TestSaberFuncBM, test_conv_param_change) {
-
-    int group = 4;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 4;
-
-    int img_num = 1;
-    int in_channels = 4;
-    int img_h = 64;
-    int img_w = 64;
-
-    bool bias_term = true;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorHf4 output_host;
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-
-    ConvParam<TensorDf4> param(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
-    conv.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    output_host.re_alloc(output[0]->shape());
-
-    LOG(INFO) << "regular start with group = " << group;
-    // init assume output tensor has been reshpaed by user.
-    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-
-    conv(input, output, param, ctx1);
-
-    param.group = 1;
-    param.pad_h = 1;
-    param.pad_w = 1;
-
-    LOG(INFO) << " param changed start with group = " << param.group;
-    conv(input, output, param, ctx1);
-}
-
-TEST(TestSaberFuncBM, test_conv_share_sub_tensor) {
-
-    int group = 1;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 2;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    bool bias_term = true;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << " img_num = " << img_num;
-    LOG(INFO) << " in_channels = " << in_channels;
-    LOG(INFO) << " img_h = " << img_h;
-    LOG(INFO) << " img_w = " << img_w;
-    LOG(INFO) << " group = " << group;
-    LOG(INFO) << " pad_h = " << pad_h;
-    LOG(INFO) << " pad_w = " << pad_w;
-    LOG(INFO) << " stride_h = " << stride_h;
-    LOG(INFO) << " stride_w = " << stride_w;
-    LOG(INFO) << " dilation_h = " << dilation_h;
-    LOG(INFO) << " dilation_w = " << dilation_w;
-    LOG(INFO) << " kernel_h = " << kernel_h;
-    LOG(INFO) << " kernel_w = " << kernel_w;
-    LOG(INFO) << " out_channels = " << out_channels;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    Shape img_s_sub(img_num, in_channels, 4, 4);
-
-    TensorDf4 t0;
-    TensorDf4 t1;
-
-    t0.share_sub_buffer(img_dev, img_s_sub, {0,0,0,0});
-    t1.share_sub_buffer(img_dev, img_s_sub, {0,0,4,4});
-
-    print_tensor_shape("t0", t0);
-    print_tensor_shape("t1", t1);
-
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-    Context<BM> ctx2(0, 2, 2);
-
-    TensorDf4 out0;
-    TensorDf4 out1;
-
-    ConvParam<TensorDf4> param0(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    ConvParam<TensorDf4> param1(group, pad_h, pad_w,
-                                stride_h, stride_w,
-                                dilation_h, dilation_w,
-                                &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input0, input1;
-    std::vector<TensorDf4*> output0, output1;
-
-    input0.push_back(&t0);
-    input1.push_back(&t1);
-
-    output0.push_back(&out0);
-    output1.push_back(&out1);
-
-    output_dev.re_alloc(img_s);
-
-    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv0;
-    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv1;
-
-    conv0.compute_output_shape(input0, output0, param0);
-    conv1.compute_output_shape(input1, output1, param1);
-
-    out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0});
-    out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4});
-
-    conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1);
-    conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2);
-
-    conv0(input0, output0, param0, ctx1);
-    conv1(input1, output1, param1, ctx2);
-}
-
-TEST(TestSaberFuncBM, test_conv_fp32_speed_test) {
-
-    int group = 1;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-    int dilation_h = 1;
-    int dilation_w = 1;
-
-    int kernel_h = 3;
-    int kernel_w = 3;
-    int out_channels = 128;
-
-    int img_num = 64;
-    int in_channels = 4;
-    int img_h = 32;
-    int img_w = 32;
-
-    bool bias_term = true;
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << "  img_num = " << img_num;
-    LOG(INFO) << "  in_channels = " << in_channels;
-    LOG(INFO) << "  out_channels = " << out_channels;
-    LOG(INFO) << "  img_h = " << img_h;
-    LOG(INFO) << "  img_w = " << img_w;
-    LOG(INFO) << "  group = " << group;
-    LOG(INFO) << "  pad = " << pad_h;
-    LOG(INFO) << "  stride = " << stride_h;
-    LOG(INFO) << "  dilation = " << dilation_h;
-    LOG(INFO) << "  kernel_h = " << kernel_h;
-    LOG(INFO) << "  kernel_w = " << kernel_w;
-    Shape img_s(img_num, in_channels, img_h, img_w);
-    Shape weights_s(out_channels, in_channels, kernel_h, kernel_w);
-    Shape bias_s(1, out_channels, 1, 1);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 1;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorHf4 weights_host;
-    TensorDf4 weights_dev;
-
-    weights_host.re_alloc(weights_s);
-    weights_dev.re_alloc(weights_s);
-
-    fill_tensor_host_const(weights_host, 1.f);
-    weights_dev.copy_from(weights_host);
-
-    TensorHf4 bias_host;
-    TensorDf4 bias_dev;
-
-    if (bias_term) {
-        bias_host.re_alloc(bias_s);
-        bias_dev.re_alloc(bias_s);
-
-        fill_tensor_host_const(bias_host, 1.f);
-        bias_dev.copy_from(bias_host);
-    }
-
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-    Context<BM> ctx1(0, 1, 1);
-
-    ConvParam<TensorDf4> param(group, pad_h, pad_w,
-                               stride_h, stride_w,
-                               dilation_h, dilation_w,
-                               &weights_dev, &bias_dev);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
-    conv.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \
-        << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]";
-
-    LOG(INFO) << "saber conv init";
-    conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-
-    /* conv(input, output, param, ctx1); */
-    /* output_dev.sync(); */
-
-    LOG(INFO) << "saber conv dispatch";
-    SaberTimer<BM> t1;
-    int ts = 100;
-    t1.start(ctx1);
-    for (int i = 0; i < ts; ++i) {
-        conv(input, output, param, ctx1);
-        output_dev.sync();
-    }
-    t1.end(ctx1);
-    LOG(INFO) << "elapse time: " << t1.get_average_ms()/ts << " ms";
-}
-
-TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) {
-    int img_num = 1;
-    int kernel = 1;
-    int out_channels = 128;
-    int in_channels = 512;
-    int img_h = 32;
-    int img_w = 32;
-    int pad = 0;
-    int stride = 1;
-
-    TensorDf4 weights;
-    TensorDf4 bias;
-    weights.re_alloc({out_channels, in_channels, 1, 1});
-    bias.re_alloc({1, out_channels, 1, 1});
-
-    TensorDf4 img, out;
-    img.re_alloc({1, in_channels, img_h, img_w});
-
-    fill_tensor_device_rand(weights, -1.f, 1.f);
-    fill_tensor_device_rand(bias, -1.f, 1.f);
-    fill_tensor_device_rand(img, -1.f, 1.f);
-
-    LOG(INFO) << "conv param: ";
-    LOG(INFO) << "  img_num: " << img_num;
-    LOG(INFO) << "  kernel: " << kernel;
-    LOG(INFO) << "  out_channels: " << out_channels;
-    LOG(INFO) << "  in_channels: " << in_channels;
-    LOG(INFO) << "  img_h: " << img_h;
-    LOG(INFO) << "  img_w: " << img_w;
-    LOG(INFO) << "  pad: " << pad;
-    LOG(INFO) << "  stride: " << stride;
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img);
-    output.push_back(&out);
-
-    ConvParam<TensorDf4> conv_param(1, pad, pad,
-                                    stride, stride,
-                                    1, 1,
-                                    &weights, &bias);
-    Conv<BM, AK_BM, AK_BM, AK_BM, NCHW> conv;
-    conv.compute_output_shape(input, output, conv_param);
-    out.re_alloc(output[0]->shape());
-    Context<BM> ctx1(0, 1, 1);
-    conv.init(input, output, conv_param, SPECIFY, VENDER_IMPL, ctx1);
-
-    SaberTimer<BM> t1;
-    int ts = 100;
-    t1.start(ctx1);
-    for (int i = 0; i < ts; ++i) {
-        conv(input, output, conv_param, ctx1);
-        out.sync();
-    }
-    t1.end(ctx1);
-    LOG(INFO) << "elapse time: " << t1.get_average_ms()/ts << " ms";
-}
-
-int main(int argc, const char** argv){
-    Env<BM>::env_init();
-    // initial logger
-    //logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_eltwise_BM.cpp b/test/saber/bm/test_saber_func_eltwise_BM.cpp
deleted file mode 100644
index 643f4e026..000000000
--- a/test/saber/bm/test_saber_func_eltwise_BM.cpp
+++ /dev/null
@@ -1,612 +0,0 @@
-#include "core/context.h"
-#include "funcs/eltwise.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-
-TEST(TestSaberFuncBM, test_func_prod) {
-
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    EltwiseType elt_type = Eltwise_prod;
-
-    EltwiseParam<TensorDf4> param(elt_type);
-
-    int w_in = 10;
-    int h_in = 2;
-    int ch_in = 2;
-    int num_in = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_out = shape_in;
-
-    // Host Tensor
-    Tensor<X86, AK_FLOAT, NCHW> thin0(shape_in);
-    Tensor<X86, AK_FLOAT, NCHW> thin1(shape_in);
-    Tensor<X86, AK_FLOAT, NCHW> thin2(shape_in);
-    for (int i = 0; i < thin0.size(); ++i) {
-        thin0.mutable_data()[i] = i;
-    }
-    for (int i = 0; i < thin1.size(); ++i) {
-        thin1.mutable_data()[i] = i + 1;
-    }
-    for (int i = 0; i < thin2.size(); ++i) {
-        thin2.mutable_data()[i] = 1;
-    }
-
-    // Device Tensor
-    TensorDf4 tdin0, tdin1, tdin2, tdout;
-    tdin0.re_alloc(shape_in);
-    tdin1.re_alloc(shape_in);
-    tdin2.re_alloc(shape_in);
-    tdin0.copy_from(thin0);
-    tdin1.copy_from(thin1);
-    tdin2.copy_from(thin2);
-    tdout.re_alloc(shape_out);
-
-    // Device Vector of Tensor
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-    input_dev_4d.push_back(&tdin0);
-    input_dev_4d.push_back(&tdin1);
-    input_dev_4d.push_back(&tdin2);
-    output_dev_4d.push_back(&tdout);
-
-
-    Context<BM> ctx_dev(0, 1, 1);
-    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
-
-    LOG(INFO) << "eltwise compute output shape";
-    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-    // Verify output shape
-    Shape sh = output_dev_4d[0]->valid_shape();
-    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
-        ", " << sh[2] << ", " << sh[3];
-    Shape shout{num_in, ch_in, h_in, w_in};
-    CHECK_EQ(shout == sh, true) << "compute shape error";
-
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
-
-    LOG(INFO) << "eltwise initialization";
-    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
-
-
-    LOG(INFO) << "eltwise compute";
-    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-
-    print_tensor_device(*output_dev_4d[0]);
-}
-
-
-TEST(TestSaberFuncBM, test_func_sum) {
-
-    Env<BM>::env_init();
-
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    EltwiseType elt_type = Eltwise_sum;
-
-    int w_in = 10;
-    int h_in = 2;
-    int ch_in = 2;
-    int num_in = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_out = shape_in;
-
-    // Host Tensor
-    TensorHf4 thin1(shape_in);
-    TensorHf4 thin2(shape_in);
-
-    for (int i = 0; i < thin1.size(); ++i) {
-        thin1.mutable_data()[i] = 1.0;
-    }
-
-    for (int i = 0; i < thin2.size(); ++i) {
-        thin2.mutable_data()[i] = 2.0;
-    }
-
-    // Device Tensor
-    TensorDf4 tdin0, tdin1, tdout;
-    tdin0.re_alloc(shape_in);
-    tdin1.re_alloc(shape_in);
-    tdin0.copy_from(thin1);
-    tdin1.copy_from(thin2);
-    tdout.re_alloc(shape_out);
-
-    // Device Vector of Tensor
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-    input_dev_4d.push_back(&tdin0);
-    input_dev_4d.push_back(&tdin1);
-    input_dev_4d.push_back(&tdin1);
-    output_dev_4d.push_back(&tdout);
-
-    EltwiseParam<TensorDf4> param(elt_type);
-
-    Context<BM> ctx_dev(0, 1, 1);
-    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
-
-    LOG(INFO) << "eltwise compute output shape";
-    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-
-    // Verify output shape
-    Shape sh = output_dev_4d[0]->valid_shape();
-    LOG(INFO) << "output shape: " << sh[0] << ", " << sh[1] << \
-              ", " << sh[2] << ", " << sh[3];
-    Shape shout{num_in, ch_in, h_in, w_in};
-    CHECK_EQ(shout == sh, true) << "compute shape error";
-
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
-
-    LOG(INFO) << "eltwise initialization";
-    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
-
-    LOG(INFO) << "eltwise compute";
-    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-    print_tensor_device(*output_dev_4d[0]);
-}
-
-TEST(TestSaberFuncBM, test_func_max) {
-
-    Env<BM>::env_init();
-
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    EltwiseType elt_type = Eltwise_max;
-
-    EltwiseParam<TensorDf4> param(elt_type);
-
-    int w_in = 10;
-    int h_in = 2;
-    int ch_in = 2;
-    int num_in = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_out = shape_in;
-
-    // Host Tensor
-    Tensor<X86, AK_FLOAT, NCHW> thin0(shape_in);
-    Tensor<X86, AK_FLOAT, NCHW> thin1(shape_in);
-    Tensor<X86, AK_FLOAT, NCHW> thin2(shape_in);
-    for (int i = 0; i < thin0.size(); ++i) {
-        thin0.mutable_data()[i] = i;
-    }
-    for (int i = 0; i < thin1.size(); ++i) {
-        thin1.mutable_data()[i] = i + 2;
-    }
-    for (int i = 0; i < thin2.size(); ++i) {
-        thin2.mutable_data()[i] = i + 1;
-    }
-
-    // Device Tensor
-    TensorDf4 tdin0, tdin1, tdin2, tdout;
-    tdin0.re_alloc(shape_in);
-    tdin1.re_alloc(shape_in);
-    tdin2.re_alloc(shape_in);
-    tdin0.copy_from(thin0);
-    tdin1.copy_from(thin1);
-    tdin2.copy_from(thin2);
-    tdout.re_alloc(shape_out);
-
-    // Device Vector of Tensor
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-    input_dev_4d.push_back(&tdin0);
-    input_dev_4d.push_back(&tdin1);
-    input_dev_4d.push_back(&tdin2);
-    output_dev_4d.push_back(&tdout);
-
-    Context<BM> ctx_dev(0, 1, 1);
-    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
-
-    LOG(INFO) << "eltwise compute output shape";
-    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-    // Verify output shape
-    Shape sh = output_dev_4d[0]->valid_shape();
-    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
-        ", " << sh[2] << ", " << sh[3];
-    Shape shout{num_in, ch_in, h_in, w_in};
-    CHECK_EQ(shout == sh, true) << "compute shape error";
-
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
-
-    LOG(INFO) << "eltwise initialization";
-    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
-
-    LOG(INFO) << "eltwise compute";
-    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-    print_tensor_device(*output_dev_4d[0]);
-
-}
-
-/*   0   1   2   3   4
- *  10  11  12  13  14   (tdin_roi1, c=0)
- *   (tdin_roi0, c=0)   25  26  27  28  29
- *                      35  36  37  38  39
- * =======================================
- *  40  41  42  43  44
- *  50  51  52  53  54   (tdin_roi1, c=1)
- *   (tdin_roi0, c=1)   65  66  67  68  69
- *                      75  76  77  78  79
- */
-/*
-TEST(TestSaberFuncBM, test_func_prod_roi) {
-
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    EltwiseType elt_type = Eltwise_prod;
-
-    EltwiseParam<TensorDf4> param(elt_type);
-
-    int w_in = 10;
-    int h_in = 4;
-    int ch_in = 2;
-    int num_in = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
-    Shape off0{0, 0, 0, 0};
-    Shape off1{0, 0, 2, 5};
-    Shape shape_out = shape_in_roi;
-
-    // Host Tensor
-    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
-    for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = i;
-    }
-
-    // Device Tensor
-    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
-    tdin.re_alloc(shape_in);
-    tdin.copy_from(thin);
-    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
-    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
-    tdout.re_alloc(shape_out);
-
-
-    // Device Vector of Tensor
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-    input_dev_4d.push_back(&tdin_roi0);
-    input_dev_4d.push_back(&tdin_roi1);
-    input_dev_4d.push_back(&tdin_roi1);
-    output_dev_4d.push_back(&tdout);
-
-
-    Context<BM> ctx_dev(0, 1, 1);
-    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
-
-    LOG(INFO) << "eltwise compute output shape";
-    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-
-    // Verify output shape
-    Shape sh = output_dev_4d[0]->valid_shape();
-    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
-        ", " << sh[2] << ", " << sh[3];
-    Shape shout(shape_in_roi);
-    CHECK_EQ(shout == sh, true) << "compute shape error";
-
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
-
-    LOG(INFO) << "eltwise initialization";
-    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
-
-
-    LOG(INFO) << "eltwise compute";
-    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-
-
-    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-    output_dev_4d[0]->sync();
-    print_tensor_device(*output_dev_4d[0]);
-    cudaDeviceSynchronize();
-
-
-    TensorHf4 th_for_print;
-    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
-    th_for_print.copy_from(*output_dev_4d[0]);
-    print_tensor_host(th_for_print);
-
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-*/
-
-/*   0   1   2   3   4
- *  10  11  12  13  14   (tdin_roi1, c=0)
- *   (tdin_roi0, c=0)   25  26  27  28  29
- *                      35  36  37  38  39
- * =======================================
- *  40  41  42  43  44
- *  50  51  52  53  54   (tdin_roi1, c=1)
- *   (tdin_roi0, c=1)   65  66  67  68  69
- *                      75  76  77  78  79
- */
-/*
-TEST(TestSaberFuncBM, test_func_sum_roi_new) {
-
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    EltwiseType elt_type = Eltwise_sum;
-
-    int w_in = 10;
-    int h_in = 4;
-    int ch_in = 2;
-    int num_in = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
-
-    Shape off0{0, 0, 0, 0};
-    Shape off1{0, 0, 2, 5};
-    Shape shape_out = shape_in_roi;
-
-    // Host Tensor
-    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
-    for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = i;
-    }
-
-    // Device Tensor
-    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
-    tdin.re_alloc(shape_in);
-    tdin.copy_from(thin);
-    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
-    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
-    tdout.re_alloc(shape_out);
-
-    // Device Vector of Tensor
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-    input_dev_4d.push_back(&tdin_roi0);
-    input_dev_4d.push_back(&tdin_roi1);
-//    input_dev_4d.push_back(&tdin_roi1);
-//    input_dev_4d.push_back(&tdin_roi1);
-    output_dev_4d.push_back(&tdout);
-
-//    Shape shape_coeff(1, 1, 1, input_dev_4d.size());
-//    TensorHf4 thcoeff(shape_coeff);
-//    for (int i = 0; i < thcoeff.size(); ++i) {
-//        thcoeff.mutable_data()[i] = 1;
-//    }
-
-    EltwiseParam<TensorDf4> param(elt_type);
-
-    Context<BM> ctx_dev(0, 1, 1);
-    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
-
-    LOG(INFO) << "eltwise compute output shape";
-    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-    // Verify output shape
-    Shape sh = output_dev_4d[0]->valid_shape();
-    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
-        ", " << sh[2] << ", " << sh[3];
-    Shape shout(shape_in_roi);
-    CHECK_EQ(shout == sh, true) << "compute shape error";
-
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
-
-    LOG(INFO) << "eltwise initialization";
-    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
-
-    print_tensor_device(*input_dev_4d[0]);
-    print_tensor_device(*input_dev_4d[1]);
-    cudaDeviceSynchronize();
-    LOG(INFO) << "eltwise compute";
-    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-
-    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-    output_dev_4d[0]->sync();
-    print_tensor_device(*output_dev_4d[0]);
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-*/
-/*
-TEST(TestSaberFuncBM, test_func_sum_roi) {
-
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    EltwiseType elt_type = Eltwise_sum;
-
-    int w_in = 10;
-    int h_in = 4;
-    int ch_in = 2;
-    int num_in = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
-    Shape off0{0, 0, 0, 0};
-    Shape off1{0, 0, 2, 5};
-    Shape shape_out = shape_in_roi;
-
-    // Host Tensor
-    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
-    for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = i;
-    }
-
-    // Device Tensor
-    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
-    tdin.re_alloc(shape_in);
-    tdin.copy_from(thin);
-    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
-    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
-    tdout.re_alloc(shape_out);
-
-    // Device Vector of Tensor
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-    input_dev_4d.push_back(&tdin_roi0);
-    input_dev_4d.push_back(&tdin_roi1);
-    output_dev_4d.push_back(&tdout);
-
-    //Shape shape_coeff(1, 1, 1, 3);
-    Shape shape_coeff(1, 1, 1, input_dev_4d.size());
-    TensorHf4 thcoeff(shape_coeff);
-
-    for (int i = 0; i < thcoeff.size(); ++i) {
-        thcoeff.mutable_data()[i] = i;
-    }
-    TensorDf4 tdcoeff;
-    tdcoeff.re_alloc(shape_coeff);
-    tdcoeff.copy_from(thcoeff);
-
-    EltwiseParam<TensorDf4> param(elt_type, &tdcoeff);
-
-    Context<BM> ctx_dev(0, 1, 1);
-    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
-
-    LOG(INFO) << "eltwise compute output shape";
-    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-    // Verify output shape
-    Shape sh = output_dev_4d[0]->valid_shape();
-    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
-        ", " << sh[2] << ", " << sh[3];
-    Shape shout(shape_in_roi);
-    CHECK_EQ(shout == sh, true) << "compute shape error";
-
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
-
-    LOG(INFO) << "eltwise initialization";
-    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
-
-    LOG(INFO) << "eltwise compute";
-    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-
-    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-    output_dev_4d[0]->sync();
-    print_tensor_device(*output_dev_4d[0]);
-    cudaDeviceSynchronize();
-
-    TensorHf4 th_for_print;
-    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
-    th_for_print.copy_from(*output_dev_4d[0]);
-    print_tensor_host(th_for_print);
-
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-*/
-
-/*
-TEST(TestSaberFuncBM, test_func_max_roi) {
-
-    Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    EltwiseType elt_type = Eltwise_max;
-
-    int w_in = 10;
-    int h_in = 4;
-    int ch_in = 2;
-    int num_in = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2};
-    Shape off0{0, 0, 0, 0};
-    Shape off1{0, 0, 2, 5};
-    Shape shape_out = shape_in_roi;
-
-    // Host Tensor
-    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
-    for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = i;
-    }
-
-    // Device Tensor
-    TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout;
-    tdin.re_alloc(shape_in);
-    tdin.copy_from(thin);
-    tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0);
-    tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1);
-    tdout.re_alloc(shape_out);
-
-    // Device Vector of Tensor
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-    input_dev_4d.push_back(&tdin_roi0);
-    input_dev_4d.push_back(&tdin_roi1);
-    output_dev_4d.push_back(&tdout);
-
-    EltwiseParam<TensorDf4> param(elt_type);
-
-    Context<BM> ctx_dev(0, 1, 1);
-    Eltwise<BM, AK_BM, AK_BM, AK_BM, NCHW> eltwise_dev;
-
-    LOG(INFO) << "eltwise compute output shape";
-    eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-    // Verify output shape
-    Shape sh = output_dev_4d[0]->valid_shape();
-    LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \
-        ", " << sh[2] << ", " << sh[3];
-    Shape shout(shape_in_roi);
-    CHECK_EQ(shout == sh, true) << "compute shape error";
-
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
-
-    LOG(INFO) << "eltwise initialization";
-    eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
-
-    LOG(INFO) << "eltwise compute";
-    eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-
-    output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-    output_dev_4d[0]->sync();
-    print_tensor_device(*output_dev_4d[0]);
-    cudaDeviceSynchronize();
-
-    TensorHf4 th_for_print;
-    th_for_print.re_alloc(output_dev_4d[0]->valid_shape());
-    th_for_print.copy_from(*output_dev_4d[0]);
-    print_tensor_host(th_for_print);
-
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-*/
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp
deleted file mode 100644
index 7b56033e6..000000000
--- a/test/saber/bm/test_saber_func_fc_BM.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-#include "core/context.h"
-#include "funcs/fc.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-typedef TargetWrapper<BM> API;
-typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef TensorDf4::Dtype ftype;
-
-void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \
-                const TensorHf4& bias, TensorHf4& tout) {
-
-    int m = tin.num();
-    int k = tin.valid_size() / m;
-    int n = weight.valid_size() / k;
-    bool bias_term = bias.valid_size() > 0;
-
-    const float* din = tin.data();
-    const float* w = weight.data();
-    float* dout = tout.mutable_data();
-
-    for (int i = 0; i < m; ++i) {
-        float* pdout = dout + i * n;
-        const float* pdin = din + i * k;
-
-        for (int j = 0; j < n; ++j) {
-            if (bias_term) {
-                pdout[j] = bias.data()[j];
-            } else {
-                pdout[j] = 0;
-            }
-
-            for (int l = 0; l < k; ++l) {
-                pdout[j] += pdin[l] * w[l * n + j];
-            }
-        }
-    }
-}
-
-TEST(TestSaberFuncBM, test_func_fc) {
-
-    int test_iter = 10;
-    int w_in = 7;
-    int h_in = 7;
-    int ch_in = 1024;
-    int num_in = 4;
-
-    int num_out = 4096;
-    int axis = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_out = {num_in, num_out, 1, 1};
-
-    Shape sh_w{1, 1, w_in* h_in * ch_in, num_out};
-    TensorDf4 weight(sh_w);
-    Shape sh_b{1, 1, 1, num_out};
-    TensorDf4 bias(sh_b);
-    fill_tensor_device_const(weight, 1.f);
-    fill_tensor_device_const(bias, 1.f);
-
-    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
-              ch_in << ", height=" << h_in << ", width=" << w_in;
-
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-
-    TensorDf4 tdin;
-    TensorDf4 tdout;
-    tdin.re_alloc(shape_in);
-    fill_tensor_device_const(tdin, 1.f);
-    input_dev_4d.push_back(&tdin);
-    output_dev_4d.push_back(&tdout);
-
-    // start Reshape & doInfer
-    Context<BM> ctx_dev(0, 1, 1);
-
-    FcParam<TensorDf4> param(&weight, &bias, num_out, axis);
-
-    Fc<BM, AK_BM, AK_BM, AK_BM, NCHW> fc;
-
-    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
-              shape_out[2] << ", " << shape_out[3];
-
-    SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param));
-
-    LOG(INFO) << "re-alloc tensor buffer";
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape());
-    Shape va_sh = tdout.valid_shape();
-    LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \
-              va_sh[2] << ", " << va_sh[3];
-    CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error";
-
-    LOG(INFO) << "FC initialization";
-    SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev));
-
-    LOG(INFO) << "FC compute";
-    SaberTimer<BM> t1;
-    t1.clear();
-    t1.start(ctx_dev);
-
-    for (int i = 0; i < test_iter; ++i) {
-        SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev));
-        bm_flush(get_bm_handle());
-        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        //output_dev_4d[0]->sync();
-        //cudaDeviceSynchronize();
-    }
-
-    t1.end(ctx_dev);
-    float ts = t1.get_average_ms();
-    LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter;
-    //print_tensor_device(*output_dev_4d[0]);
-
-    //! check result
-    TensorHf4 thin(shape_in);
-    TensorHf4 thout(shape_out);
-    TensorHf4 thw(sh_w);
-    TensorHf4 thb(sh_b);
-    thin.copy_from(tdin);
-    thw.copy_from(weight);
-    thb.copy_from(bias);
-    fc_compute(thin, thw, thb, thout);
-    //print_tensor_host(thout);
-
-    TensorHf4 thout_d(shape_out);
-    thout_d.copy_from(tdout);
-    double max_ratio = 0;
-    double max_diff = 0;
-    tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff);
-    LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;
-    CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result";
-
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    Env<BM>::env_init();
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp
deleted file mode 100644
index 943ed130b..000000000
--- a/test/saber/bm/test_saber_func_pooling_BM.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-#include "core/context.h"
-#include "funcs/pooling.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include "funcs/timer.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-TEST(TestSaberFuncBM, test_func_pooling) {
-    typedef TargetWrapper<BM> API;
-
-    typedef TargetWrapper<X86> X86_API;
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 4;
-    int img_h = 800;
-    int img_w = 1440;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorHf4 output_host;
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-
-    LOG(INFO) << "init env...";
-    Env<BM>::env_init();
-    Context<BM> ctx1(0, 1, 1);
-    int window_h = 2;
-    int window_w = 2;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-
-    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
-                                  , stride_h, stride_w, Pooling_max);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling;
-    pooling.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-    output_host.re_alloc(output[0]->shape());
-
-    // init assume output tensor has been reshpaed by user.
-    pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-    pooling(input, output, param, ctx1);
-
-    SaberTimer<BM> t1;
-    int ts = 10;
-
-    for (int i = 0; i < ts; ++i) {
-        t1.start(ctx1);
-        pooling(input, output, param, ctx1);
-        output[0]->sync();
-        t1.end(ctx1);
-    }
-
-    output_dev.sync();
-    LOG(INFO) << " average time: " << t1.get_average_ms() << " ms";
-    LOG(INFO) << " tile 10% time: " << t1.get_tile_time(10) << " ms";
-    LOG(INFO) << " tile 50% time: " << t1.get_tile_time(50) << " ms";
-    LOG(INFO) << " tile 90% time: " << t1.get_tile_time(90) << " ms";
-    LOG(INFO) << " tile 95% time: " << t1.get_tile_time(95) << " ms";
-    LOG(INFO) << " tile 99% time: " << t1.get_tile_time(99) << " ms";
-}
-
-TEST(TestSaberFuncBM, test_pooling_result) {
-
-    typedef TargetWrapper<BM> API;
-
-    typedef TargetWrapper<X86> X86_API;
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 1;
-    int img_h = 8;
-    int img_w = 8;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = rand() % 20;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorDf4 output_dev;
-
-    // start Reshape & doInfer
-
-    Context<BM> ctx1(0, 1, 1);
-    int window_h = 2;
-    int window_w = 2;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-
-    LOG(INFO) << " img_num: " << img_num;
-    LOG(INFO) << " in_channels: " << in_channels;
-    LOG(INFO) << " img_h: " << img_h;
-    LOG(INFO) << " img_w: " << img_w;
-    LOG(INFO) << " window_h: " << window_h;
-    LOG(INFO) << " window_w: " << window_w;
-    LOG(INFO) << " pad_h: " << pad_h;
-    LOG(INFO) << " pad_w: " << pad_w;
-    LOG(INFO) << " stride_h: " << stride_h;
-    LOG(INFO) << " stride_w: " << stride_w;
-
-    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w,
-                                  stride_h, stride_w, Pooling_average_include_padding);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling;
-    pooling.compute_output_shape(input, output, param);
-
-    output_dev.re_alloc(output[0]->shape());
-
-    // init assume output tensor has been reshpaed by user.
-    pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-    pooling(input, output, param, ctx1);
-
-    output_dev.sync();
-    LOG(INFO) << "tensor data before pooling: ";
-    print_tensor_device(img_dev);
-    LOG(INFO) << "tensor data after pooling: ";
-    print_tensor_device(output_dev);
-}
-
-TEST(TestSaberFuncBM, test_pooling_shared_buffer) {
-
-    typedef TargetWrapper<BM> API;
-
-    typedef TargetWrapper<X86> X86_API;
-    typedef TargetWrapper<BM> BM_API;
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    int img_num = 1;
-    int in_channels = 2;
-    int img_h = 8;
-    int img_w = 8;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-
-    for (int i = 0; i < img_host.size(); ++i) {
-        img_host.mutable_data()[i] = 0x7f & i;
-    }
-
-    img_dev.copy_from(img_host);
-
-    TensorDf4 t0;
-    TensorDf4 t1;
-    Shape img_s_sub(img_num, in_channels, img_h / 2, img_w / 2);
-
-    t0.share_sub_buffer(img_dev, img_s_sub, {0, 0, 0, 0});
-    t1.share_sub_buffer(img_dev, img_s_sub, {0, 0, 4, 4});
-
-    TensorDf4 output_dev;
-
-    TensorDf4 out0;
-    TensorDf4 out1;
-
-    // start Reshape & doInfer
-
-    Context<BM> ctx1(0, 1, 1);
-    int window_h = 2;
-    int window_w = 2;
-    int pad_h = 1;
-    int pad_w = 1;
-    int stride_h = 1;
-    int stride_w = 1;
-
-    LOG(INFO) << " img_num: " << img_num;
-    LOG(INFO) << " in_channels: " << in_channels;
-    LOG(INFO) << " img_h: " << img_h;
-    LOG(INFO) << " img_w: " << img_w;
-    LOG(INFO) << " window_h: " << window_h;
-    LOG(INFO) << " window_w: " << window_w;
-    LOG(INFO) << " pad_h: " << pad_h;
-    LOG(INFO) << " pad_w: " << pad_w;
-    LOG(INFO) << " stride_h: " << stride_h;
-    LOG(INFO) << " stride_w: " << stride_w;
-
-    PoolingParam<TensorDf4> param(window_h, window_w, pad_h, pad_w
-                                  , stride_h, stride_w, Pooling_max);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling;
-    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling0;
-    Pooling<BM, AK_BM, AK_BM, AK_BM, NCHW> pooling1;
-
-    pooling.compute_output_shape(input,output,  param);
-
-    Shape total_shape = output[0]->shape();
-
-    output_dev.re_alloc(total_shape);
-    Shape out_sub_shape = {total_shape[0], total_shape[1], total_shape[2] / 2, total_shape[3] / 2};
-
-    out0.share_sub_buffer(output_dev, out_sub_shape, {0, 0, 0, 0});
-    out1.share_sub_buffer(output_dev, out_sub_shape, {0, 0, out_sub_shape[2], out_sub_shape[3]});
-
-    std::vector<TensorDf4*> input0, input1;
-    std::vector<TensorDf4*> output0, output1;
-
-    input0.push_back(&t0);
-    input1.push_back(&t1);
-    output0.push_back(&out0);
-    output1.push_back(&out1);
-
-    // init assume output tensor has been reshpaed by user.
-    pooling0.init(input0, output0, param, SPECIFY, VENDER_IMPL, ctx1);
-    pooling0(input0, output0, param, ctx1);
-
-    pooling1.init(input1, output1, param, SPECIFY, VENDER_IMPL, ctx1);
-    pooling1(input1, output1, param, ctx1);
-
-    out0.sync();
-    out1.sync();
-
-    /* print_tensor_device(output_dev); */
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    Env<BM>::env_init();
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp
deleted file mode 100644
index a20b61cbb..000000000
--- a/test/saber/bm/test_saber_func_scale_BM.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "core/context.h"
-#include "funcs/scale.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-template <typename Tensor>
-void print_tensor_shape(std::string name, Tensor& t0) {
-
-    LOG(INFO) << name << " valid shape is ["
-              << t0.valid_shape()[0] << ", "
-              << t0.valid_shape()[1] << ", "
-              << t0.valid_shape()[2] << ", "
-              << t0.valid_shape()[3] << "].";
-
-    LOG(INFO) << name << " real shape is ["
-              << t0.shape()[0] << ", "
-              << t0.shape()[1] << ", "
-              << t0.shape()[2] << ", "
-              << t0.shape()[3] << "].";
-
-    LOG(INFO) << name << " offset is ["
-              << t0.offset()[0] << ", "
-              << t0.offset()[1] << ", "
-              << t0.offset()[2] << ", "
-              << t0.offset()[3] << "].";
-}
-void fill_vector_rand(std::vector<float>& vec) {
-    for (int i = 0; i < vec.size(); i++) {
-        vec[i] = rand() *1.0f/RAND_MAX - 0.5;
-    }
-}
-void fill_vector_const(std::vector<float>& vec, float num) {
-    for (int i = 0; i < vec.size(); i++) {
-        vec[i] = num;
-    }
-}
-void print_vector_data(std::vector<float>& vec) {
-    for (int i = 0; i < vec.size(); i++) {
-        printf("%d, %f\n", i, vec[i]);
-    }
-}
-
-void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_term, int scale_dim) {
-
-    typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    int img_num = n;
-    int in_channels = c;
-    int img_h = h;
-    int img_w = w;
-
-    Shape img_s(img_num, in_channels, img_h, img_w);
-
-    TensorHf4 img_host;
-    TensorDf4 img_dev;
-
-    img_host.re_alloc(img_s);
-    img_dev.re_alloc(img_s);
-    fill_tensor_host_const(img_host, 1);
-    img_dev.copy_from(img_host);
-
-    TensorDf4 output_dev;
-
-    Context<BM> ctx1(0, 1, 1);
-    std::vector<float> scale_w;
-    std::vector<float> scale_b;
-    scale_w.resize(scale_dim);
-    fill_vector_const(scale_w, 2);
-    if (bias_term) {
-        scale_b.resize(scale_dim);
-        fill_vector_const(scale_b, 3);
-    }
-
-    ScaleParam<TensorDf4> param(scale_w,
-                                scale_b,
-                                bias_term, axis, num_axes);
-
-    std::vector<TensorDf4*> input;
-    std::vector<TensorDf4*> output;
-
-    input.push_back(&img_dev);
-    output.push_back(&output_dev);
-
-    Scale<BM, AK_BM, AK_BM, AK_BM, NCHW> scale;
-    scale.compute_output_shape(input, output, param);
-    output_dev.re_alloc(output[0]->valid_shape());
-
-    // init assume output tensor has been reshpaed by user.
-    scale.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-    scale(input, output, param, ctx1);
-
-    output_dev.sync();
-    LOG(INFO) << "input data: ";
-    print_tensor_device(img_dev);
-    LOG(INFO) << "output data: ";
-    print_tensor_device(output_dev);
-    LOG(INFO) << "scale_w data: ";
-    print_vector_data(scale_w);
-    if (bias_term) {
-        LOG(INFO) << "scale_b data: ";
-        print_vector_data(scale_b);
-    }
-}
-
-TEST(TestSaberFuncBM, test_func_constructor_elt) {
-    test_scale(1, 2, 1, 2, 1, 1, false, 2);
-    test_scale(1, 2, 1, 2, 1, 1, true, 2);
-    /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
-    /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */
-    /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */
-    /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */
-}
-
-
-int main(int argc, const char** argv) {
-    Env<BM>::env_init();
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp
deleted file mode 100644
index 645d081f1..000000000
--- a/test/saber/bm/test_saber_func_softmax_BM.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-#include "core/context.h"
-#include "funcs/softmax.h"
-#include "test_saber_func_BM.h"
-#include "tensor_op.h"
-#include "saber_types.h"
-#include <vector>
-
-using namespace anakin::saber;
-
-
-TEST(TestSaberFuncBM, test_func_softmax_BM) {
-
-    //Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    typedef TensorDf4::Dtype dtype;
-
-    int test_iter = 10;
-
-    int softmax_axis = 3; // channel
-    int w_in = 3;
-    int h_in = 225;
-    int ch_in = 40;
-    int num_in = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_out = shape_in;
-
-    SoftmaxParam<TensorDf4> param(softmax_axis);
-
-    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
-              ch_in << ", height=" << h_in << ", width=" << w_in;
-
-    LOG(INFO) << "softmax axis= " << param.axis;
-
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-
-    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
-
-    for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = i % 4;
-    }
-
-    TensorDf4 tdin, tdout;
-    tdin.re_alloc(shape_in);
-    tdin.copy_from(thin);
-    input_dev_4d.push_back(&tdin);
-
-    // start Reshape & doInfer
-    Context<BM> ctx_dev(0, 1, 1);
-
-    Softmax<BM, AK_BM, AK_BM, AK_BM, NCHW> softmax_dev;
-
-    typedef std::vector<Shape> Shape_v;
-
-    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
-              shape_out[2] << ", " << shape_out[3];
-
-    output_dev_4d.push_back(&tdout);
-    softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-    LOG(INFO) << "re-alloc tensor buffer";
-    output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape());
-
-    LOG(INFO) << "softmax initialized to cudnn impl";
-    softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev);
-
-    LOG(INFO) << "cudnn softmax compute";
-    SaberTimer<BM> t1;
-    t1.clear();
-    t1.start(ctx_dev);
-
-    for (int i = 0; i < test_iter; ++i) {
-        softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        //output_dev_4d[0]->sync();
-    }
-
-    t1.end(ctx_dev);
-    float ts = t1.get_average_ms();
-    printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter);
-
-    LOG(INFO) << "softmax initialized to saber impl";
-    softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev);
-
-    LOG(INFO) << "saber softmax compute";
-    t1.clear();
-    t1.start(ctx_dev);
-
-    for (int i = 0; i < test_iter; ++i) {
-        softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        //output_dev_4d[0]->sync();
-    }
-
-    t1.end(ctx_dev);
-    ts = t1.get_average_ms();
-    printf("saber softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter);
-    //print_tensor_device(*output_dev_4d[0]);
-}
-
-TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) {
-
-    //Env<BM>::env_init();
-    typedef TargetWrapper<BM> API;
-
-    typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-
-    typedef TensorDf4::Dtype dtype;
-
-    int test_iter = 1;
-
-    int softmax_axis = 3; // channel
-    int w_in = 3;
-    int h_in = 10;
-    int ch_in = 10;
-    int num_in = 1;
-
-    Shape shape_in(num_in, ch_in, h_in, w_in);
-    Shape shape_in_roi{num_in, ch_in / 2, h_in / 2, w_in};
-    Shape shape_out = shape_in_roi;
-
-    SoftmaxParam<TensorDf4> param(softmax_axis);
-
-    LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \
-              ch_in << ", height=" << h_in << ", width=" << w_in;
-
-    LOG(INFO) << "softmax axis= " << param.axis;
-
-    std::vector<TensorDf4*> input_dev_4d;
-    std::vector<TensorDf4*> output_dev_4d;
-
-    Tensor<X86, AK_FLOAT, NCHW> thin(shape_in);
-
-    for (int i = 0; i < thin.size(); ++i) {
-        thin.mutable_data()[i] = (i % 3);
-    }
-
-    TensorDf4 tdin, tdin_roi, tdout, tdout_roi;
-    tdin.re_alloc(shape_in);
-    tdout.re_alloc(shape_in);
-    tdin.copy_from(thin);
-    tdin_roi.share_sub_buffer(tdin, shape_in_roi, Shape(0, 0, 0, 0));
-    input_dev_4d.push_back(&tdin_roi);
-    output_dev_4d.push_back(&tdout_roi);
-
-    // start Reshape & doInfer
-    Context<BM> ctx_dev(0, 1, 1);
-
-    Softmax<BM, AK_BM, AK_BM, AK_BM, NCHW> softmax_dev;
-
-    LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \
-              shape_out[2] << ", " << shape_out[3];
-
-    softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param);
-
-    LOG(INFO) << "re-alloc tensor buffer";
-    output_dev_4d[0]->share_sub_buffer(tdout, shape_in_roi, Shape(0, 0, 0, 0));
-    //output_dev_4d[0]->reshape(output_dev_4d[0]->valid_shape());
-
-    LOG(INFO) << "softmax initialization";
-    softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev);
-
-    LOG(INFO) << "softmax compute";
-    SaberTimer<BM> t1;
-    t1.clear();
-    t1.start(ctx_dev);
-
-    for (int i = 0; i < test_iter; ++i) {
-        softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev);
-        //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream());
-        //output_dev_4d[0]->sync();
-    }
-
-    t1.end(ctx_dev);
-    float ts = t1.get_average_ms();
-    printf("total time : %.4f, avg time : %.4f\n", ts, ts / test_iter);
-    print_tensor_device(*output_dev_4d[0]);
-
-    TensorDf4 troi(output_dev_4d[0]->valid_shape());
-    troi.copy_from(*output_dev_4d[0]);
-    //print_tensor_device(troi);
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    //logger::init(argv[0]);
-    Env<BM>::env_init();
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp
deleted file mode 100644
index 18479cd18..000000000
--- a/test/saber/bm/test_saber_shape_BM.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "test_saber_shape_BM.h"
-#include "shape.h"
-#include "anakin_config.h"
-
-#ifdef USE_OPENMP
-#include <omp.h>
-#include <core/shape.h>
-#endif
-
-using namespace anakin;
-using namespace saber;
-
-
-TEST(TestSaberShapeBM, test_saber_shape) {
-
-    int dim = 4;
-    Shape sh4d0{0, 0, 0, 0};
-    CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error";
-
-    for (int i = 0; i < dim; ++i) {
-        CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error";
-    }
-
-    CHECK_EQ(sh4d0.count(), 0) << "check shape count error";
-
-    int N = 1;
-    int C = 3;
-    int H = 11;
-    int W = 11;
-    std::vector<int> sh_size = {N, C, H, W};
-    //Shape sh4d1(sh_size);
-    Shape sh4d1(N, C, H, W);
-    LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size();
-    CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!";
-    //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!";
-
-    CHECK_EQ(sh4d1[0], N) << "get shape size error";
-    CHECK_EQ(sh4d1[1], C) << "get shape size error";
-    CHECK_EQ(sh4d1[2], H) << "get shape size error";
-    CHECK_EQ(sh4d1[3], W) << "get shape size error";
-
-    //CHECK_EQ(sh4d2[0], N) << "get shape size error";
-    //CHECK_EQ(sh4d2[1], C) << "get shape size error";
-    //CHECK_EQ(sh4d2[2], H) << "get shape size error";
-    //CHECK_EQ(sh4d2[3], W) << "get shape size error";
-
-    CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed";
-
-    C = 10;
-    sh4d1[1] = C;
-    CHECK_EQ(sh4d1[1], C) << "set shape size error";
-
-    bool is_equal = (sh4d0 == sh4d1);
-    CHECK_EQ(is_equal, false) << "check shape is_equal failed";
-
-    sh4d0 = sh4d1;
-    CHECK_EQ(sh4d1[0], N) << "constructor failed";
-    CHECK_EQ(sh4d1[1], C) << "get shape size error";
-    CHECK_EQ(sh4d1[2], H) << "get shape size error";
-    CHECK_EQ(sh4d1[3], W) << "get shape size error";
-
-    Shape sh4d3 = sh4d1;
-    CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error";
-
-    Shape sh4d4(sh4d1);
-    CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error";
-
-    Shape sh1d0{0};
-    //std::vector<int> sh1d_size = {W};
-
-    //Shape sh1d1(sh1d_size);
-    //Shape sh1d0{W};
-    Shape sh1d1(W);
-
-    Shape sh1d3 = sh1d1;
-    Shape sh1d4(sh1d1);
-
-    CHECK_EQ(sh1d0.dims(), 1) << "shape dim error";
-
-    CHECK_EQ(sh1d0.count(), 0) << "shape size error";
-
-    CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error";
-
-    CHECK_EQ(sh1d1[0], W) << "get shape size error";
-
-    //CHECK_EQ(sh1d2.count(0), W) << "shape dim error";
-
-    CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error";
-
-    CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error";
-
-    CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error";
-
-    Shape sh0{2, 2, 3, 4};
-    Shape sh1{2, 1, 1, 24};
-    Shape sh2{2, 2, 3, 4};
-    Shape sh3{1, 1, 2, 3};
-
-    CHECK_EQ(sh0 == sh2, true) << "error ==";
-    CHECK_EQ(sh3 < sh0, true) << "error <";
-    CHECK_EQ(sh3 >= sh0, false) << "error >=";
-    CHECK_EQ(sh3 > sh0, false) << "error >";
-    CHECK_EQ(sh0 > sh3, true) << "error >";
-    CHECK_EQ(sh0 < sh1, false) << "error <";
-    CHECK_EQ(sh0 <= sh2, true) << "error <=";
-    CHECK_EQ(sh0 >= sh2, true) << "error >=";
-
-    Shape sh001 = Shape::zero(2);
-    Shape sh002 = Shape::zero(3);
-
-    if (sh001 > sh002) {
-        LOG(ERROR) << "error <";
-    }
-
-}
-
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
-
diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h
deleted file mode 100644
index a2ca02c9b..000000000
--- a/test/saber/bm/test_saber_shape_BM.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "saber/core/shape.h"
-
-using namespace anakin::test;
-
-class TestSaberShapeBM : public Test {
-public:
-    TestSaberShapeBM() {}
-    ~TestSaberShapeBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-protected:
-    std::string name;
-    std::string _test;
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H
-
diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp
deleted file mode 100644
index 2400e73c3..000000000
--- a/test/saber/bm/test_saber_tensor_BM.cpp
+++ /dev/null
@@ -1,664 +0,0 @@
-#include "test_saber_tensor_BM.h"
-#include "tensor_op.h"
-#include <vector>
-using namespace anakin::saber;
-
-typedef TargetWrapper<X86> X86_API;
-typedef TargetWrapper<BM> BM_API;
-typedef Tensor<X86, AK_FLOAT, NCHW> TensorHf4;
-typedef Tensor<BM, AK_BM, NCHW> TensorDf4;
-typedef TensorHf4::Dtype dtype;
-typedef TensorDf4::Dtype dtype2;
-
-
-static bm_handle_t handle;
-TEST(TestSaberTensorBM, test_tensor_constructor) {
-    bmdnn_init(&handle);
-
-    //! test empty constructor
-    LOG(INFO) << "test default (empty) constructor";
-    TensorHf4 thost0;
-    TensorDf4 tdev0;
-
-    //! test tensor re_alloc function empty constructor
-    Shape sh0(2, 2, 8, 8);
-    LOG(INFO) << "|--test tensor re_alloc function on empty tensor";
-    thost0.re_alloc(sh0);
-    tdev0.re_alloc(sh0);
-    LOG(INFO) << "|--tensor size of host: " << thost0.size();
-    LOG(INFO) << "|--tensor size of device: " << tdev0.size();
-    CHECK_EQ(thost0.size(), 256) << "error with tensor size";
-    CHECK_EQ(tdev0.size(), 256) << "error with tensor size";
-
-    //! test tensor re_alloc function on tensor with data
-    LOG(INFO) << "|--test tensor re_alloc function on tensor with data";
-    Shape sh1(1, 4, 4, 4);
-    thost0.re_alloc(sh1);
-    tdev0.re_alloc(sh1);
-    LOG(INFO) << "|--tensor size of host: " << thost0.size();
-    LOG(INFO) << "|--tensor size of device: " << tdev0.size();
-    CHECK_EQ(thost0.size(), 64) << "error with tensor size";
-    CHECK_EQ(tdev0.size(), 64) << "error with tensor size";
-
-    //! test tensor shape() function
-    LOG(INFO) << "|--test tensor shape() function";
-    Shape sho = thost0.shape();
-    LOG(INFO) << "|--shape of tensor: " << sho[0] << ", " << sho[1] << "," << sho[2] << "," << sho[3];
-    LOG(INFO) << "|--test get tensor n, c, h, w function, num = " \
-              << thost0.num() << ", channel = " << thost0.channel() << ", height = " \
-              << thost0.height() << ", width = " << thost0.width();
-
-    //! test tensor mutable_data() function
-    LOG(INFO) << "|--xxxxxxxxtest tensor mutable_data() function, write tensor data buffer with 2.f";
-    fill_tensor_host_const(thost0, 2.f);
-    LOG(INFO) << "|--test tensor data() function, show the const data, 2.f";
-    print_tensor_host(thost0);
-
-    //! test tensor constructor with shape
-    LOG(INFO) << "test tensor constructor with shape";
-    TensorHf4 thost1(sh1);
-    TensorDf4 tdev1(sh1);
-
-    //! test tensor copy_from() function
-    LOG(INFO) << "test copy_from() function, input tensor could be any target";
-
-    // host to host
-    thost1.copy_from(thost0);
-    print_tensor_host(thost1);
-
-    // host to device
-    tdev1.copy_from(thost0);
-    print_tensor_device(tdev1);
-
-    // device to host
-    thost1.copy_from(tdev1);
-    print_tensor_host(thost1);
-
-    LOG(INFO) << "test copy_from() function device to device";
-
-    tdev1.copy_from(tdev0);
-    print_tensor_device(tdev1);
-
-    
-    //! test tensor constructor with shape and real_shape
-    LOG(INFO) << "test tensor constructor with shape and real_shape";
-    //! constructor with 3 shapes is removed
-    TensorHf4 thost2(sh0);
-    TensorDf4 tdev2(sh0);
-
-    //! test tensor constructor with data, if target is different, create buffer, and copy the data
-    LOG(INFO) <<
-              "test tensor constructor with data, if target is different, create buffer, and copy the data";
-    dtype* host_data_ptr;
-    dtype2* dev_data_ptr;
-    void* tmp_pt_host;
-    void* tmp_pt_dev;
-    X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count());
-    host_data_ptr = static_cast<dtype*>(tmp_pt_host);
-
-    for (int i = 0; i < sh1.count(); ++i) {
-        host_data_ptr[i] = i;
-    }
-
-    BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype2) * sh1.count());
-    dev_data_ptr = static_cast<dtype2*>(tmp_pt_dev);
-//---    cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice);
-    BM_API::sync_memcpy(dev_data_ptr,0,host_data_ptr,0,0,__HtoD());
-    LOG(INFO) << "|--construct host tensor from host data ptr";
-    TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
-    LOG(INFO) << "|--constructor device tensor from host data ptr";
-
-//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
-
-    TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
-
-
-    print_tensor_host(thost3);
-
-    print_tensor_device(tdev3);
-
-//    TensorHf4 thost_lian(sh1);
-//    thost_lian.copy_from(tdev3);
-//    print_tensor_host(thost_lian);
-//
-//    thost_lian.copy_from(thost3);
-//    print_tensor_host(thost_lian);
-
-    //cudaDeviceSynchronize();
-    //
-
-    LOG(INFO) << "|--construct host tensor from device data ptr";
-    TensorHf4 thost4(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
-
-    TensorDf4 tdev4(host_data_ptr, X86(), X86_API::get_device_id(), sh1);
-
-//    TensorDf4 tdev3(&bm_mem_from_system(const_cast<float *>(host_data_ptr)), X86(), X86_API::get_device_id(), sh1);
-
-//    TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
-//    LOG(INFO) << "|--constructor device tensor from device data ptr";
-//    TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1);
-//    print_tensor_host(thost4);
-//    print_tensor_device(tdev4);
-
-
-    //BM_API::stream_t dev_stream0;
-    //BM_API::create_stream_with_flag(dev_stream0, 1);
-    //cudaDeviceSynchronize();
-
-    //! test tensor copy constructor
-    LOG(INFO) << "test tensor copy constructor";
-    LOG(INFO) << "|--normal copy constructor";
-    TensorHf4 thost5(thost4);
-    TensorDf4 tdev5(tdev4);
-
-    LOG(INFO) << "|--push back to vector";
-    std::vector<TensorHf4> vthost;
-    std::vector<TensorDf4> vtdev;
-    vthost.push_back(thost0);
-    vthost.push_back(thost1);
-    vthost.push_back(thost2);
-    vthost.push_back(thost3);
-    vthost.push_back(thost4);
-    vthost.push_back(thost5);
-    vtdev.push_back(tdev0);
-    vtdev.push_back(tdev1);
-    vtdev.push_back(tdev2);
-    vtdev.push_back(tdev3);
-    vtdev.push_back(tdev4);
-    vtdev.push_back(tdev5);
-    print_tensor_host(vthost[5]);
-    print_tensor_device(vtdev[5]);
-    //cudaDeviceSynchronize();
-
-    //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied
-    LOG(INFO) << "test share_from function";
-    TensorHf4 thost6, thost7;
-    TensorDf4 tdev6, tdev7;
-    thost6.set_shape(thost4.shape());
-    thost7.set_shape(thost4.shape());
-    tdev6.set_shape(thost4.shape());
-    tdev7.set_shape(thost4.shape());
-    Shape sh2(1, 2, 2, 2);
-    Shape offset(0, 0, 1, 1);
-    LOG(INFO) << "|--shared host";
-
-    thost6.share_sub_buffer(thost4, sh2, offset);
-
-    LOG(INFO) << "|--copied host";
-    tdev6.share_from(thost4);
-    LOG(INFO) << "|--copied device";
-    thost7.share_from(tdev4);
-    LOG(INFO) << "|--shared device";
-    tdev7.share_from(tdev4);
-
-
-    LOG(INFO) << "|--change data in shared tensor";
-
-    //Shape sh_real = thost6.shape();
-    //Shape sh_act = thost6.valid_shape();
-    //Shape offset_act = thost6.offset();
-
-    //int start_w = offset_act[3];
-    //int start_h = offset_act[2];
-    //int start_c = offset_act[1];
-    //int start_n = offset_act[0];
-    //int stride_h = sh_real.count(3);
-    //int stride_c = sh_real.count(2);
-    //int stride_n = sh_real.count(1);
-    //int stride_n = sh_real.count(0);
-    Shape stride = thost6.get_stride();
-    int w = thost6.width();
-    int h = thost6.height();
-    int c = thost6.channel();
-    int n = thost6.num();
-
-    dtype* ptr_host = thost6.mutable_data();
-
-    for (int in = 0; in < n; ++in) {
-        dtype* ptr_batch = ptr_host + in * stride[0];
-
-        for (int ic = 0; ic < c; ++ic) {
-            dtype* ptr_channel = ptr_batch + ic * stride[1];
-
-            for (int ih = 0; ih < h; ++ih) {
-                dtype* ptr_row = ptr_channel + ih * stride[2];
-
-                for (int iw = 0; iw < w; ++iw) {
-                    ptr_row[iw] = 1.f;
-                }
-            }
-        }
-    }
-
-    LOG(INFO) << "|--show root tensor while data is changed by shared tensor";
-    print_tensor_host(thost4);
-    bmdnn_deinit(handle);
-}
-
-/*
-TEST(TestSaberTensorBM, test_tensor_deepcopy) {
-    //! tensor constructor with alloc data, if target is different, create buffer, and copy the data
-    LOG(INFO) << "test tensor deep copy";
-    Shape sh0(2, 2, 4, 4);
-    Shape va_sh0(2, 2, 2, 2);
-    Shape off_sh0(0, 0, 1, 1);
-
-    Shape sh1(2, 2, 4, 4);
-    Shape va_sh1(va_sh0);
-    Shape off_sh1(0, 0, 1, 0);
-
-    Shape sh2(2, 32);
-    Shape va_sh2(2, 8);
-    Shape off_sh2(0, 8);
-
-    X86_API::stream_t x86_stream;
-    BM_API::stream_t nv_stream;
-    X86_API::create_stream(x86_stream);
-    BM_API::create_stream(nv_stream);
-
-    //! create source tensor, th0, td0, th01, td01, th1, td1;
-    TensorHf4 th0(sh0);
-
-    for (int i = 0; i < sh0.count(); ++i) {
-        th0.mutable_data()[i] = i;
-    }
-
-    TensorHf4 th1(va_sh0);
-
-    for (int i = 0; i < va_sh0.count(); ++i) {
-        th1.mutable_data()[i] = i;
-    }
-
-    TensorHf4 th01;
-    th01.share_sub_buffer(th0, va_sh0, off_sh0);
-
-    TensorDf4 td0, td1, td01;
-    td0.set_shape(th0.shape());
-    td1.set_shape(th1.shape());
-    td0.share_from(th0);
-    td1.share_from(th1);
-    TensorDf4 dev_tmp0;
-    dev_tmp0.set_shape(th0.shape());
-    dev_tmp0.share_from(th0);
-    td01.share_sub_buffer(dev_tmp0, va_sh0, off_sh0);
-
-    print_tensor_host(th0);
-    print_tensor_host(th1);
-    print_tensor_device(td0);
-    print_tensor_device(td1);
-
-    //! create th2, th3, th21, td2, td3, td21 as dst tensor
-    TensorHf2 th2(sh2);
-    fill_tensor_host_const(th2, 0.f);
-    TensorHf2 th21;
-    th21.share_sub_buffer(th2, va_sh2, off_sh2);
-    TensorHf2 th3(va_sh2);
-
-    TensorDf2 td2(sh2);
-    fill_tensor_device_const(td2, 0.f);
-    //cudaDeviceSynchronize();
-    TensorDf2 td21;
-    td21.share_sub_buffer(td2, va_sh2, off_sh2);
-    TensorDf2 td3(va_sh2);
-
-    double max_diff;
-    double  max_ratio;
-    //! test tensor deep copy, entire buffer copy
-    LOG(INFO) << "test tensor deep copy, entire buffer copy, H2H";
-    th3.copy_from(th1);
-    print_tensor_host(th3);
-    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
-    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, H2H";
-    fill_tensor_host_const(th3, 0.f);
-    th3.async_copy_from(th1, x86_stream);
-    th3.record_event(x86_stream);
-    th3.sync();
-    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
-    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, H2H";
-
-    LOG(INFO) << "test tensor deep copy, entire buffer copy, D2H";
-    th3.copy_from(td1);
-    print_tensor_host(th3);
-    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
-    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H";
-    fill_tensor_host_const(th3, 0.f);
-    th3.async_copy_from(td1, nv_stream);
-    th3.record_event(x86_stream);
-    th3.sync();
-    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
-    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2H";
-
-    LOG(INFO) << "test tensor deep copy, entire buffer copy, H2D";
-    td3.copy_from(th1);
-    print_tensor_device(td3);
-    //cudaDeviceSynchronize();
-    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
-    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H";
-    fill_tensor_device_const(td3, 0.f);
-    //cudaDeviceSynchronize();
-    td3.async_copy_from(th1, nv_stream);
-    td3.record_event(nv_stream);
-    td3.sync();
-    tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff);
-    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2H";
-
-    LOG(INFO) << "test tensor deep copy, entire buffer copy, D2D";
-    td3.copy_from(td1);
-    print_tensor_device(td3);
-    //cudaDeviceSynchronize();
-    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2D";
-    fill_tensor_device_const(td3, 0.f);
-    //cudaDeviceSynchronize();
-    td3.async_copy_from(td1, nv_stream);
-    td3.record_event(nv_stream);
-    td3.sync();
-    CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2D";
-
-
-    //! test tensor deep copy, src with roi
-    LOG(INFO) << "test tensor deep copy, src with roi, H2H";
-    th3.copy_from(th01);
-    print_tensor_host(th3);
-
-    LOG(INFO) << "test tensor deep copy, src with roi, D2H";
-    th3.copy_from(td01);
-    print_tensor_host(th3);
-
-    LOG(INFO) << "test tensor deep copy, src with roi, H2D";
-    td3.copy_from(th01);
-    print_tensor_device(td3);
-    //cudaDeviceSynchronize();
-
-    LOG(INFO) << "test tensor deep copy, src with roi, D2D";
-    td3.copy_from(td01);
-    print_tensor_device(td3);
-    //cudaDeviceSynchronize();
-
-
-    //! test tensor deep copy, dst with roi
-    LOG(INFO) << "test tensor deep copy, dst with roi, H2H";
-    print_tensor_host(th21);
-    print_tensor_host(th1);
-    th21.copy_from(th1);
-    print_tensor_host(th21);
-
-    LOG(INFO) << "test tensor deep copy, dst with roi, D2H";
-    th21.copy_from(td1);
-    print_tensor_host(th21);
-
-    LOG(INFO) << "test tensor deep copy, dst with roi, H2D";
-    td21.copy_from(th1);
-    print_tensor_device(td21);
-    //cudaDeviceSynchronize();
-
-    LOG(INFO) << "test tensor deep copy, dst with roi, D2D";
-    td21.copy_from(td1);
-    print_tensor_device(td21);
-    //cudaDeviceSynchronize();
-
-
-    //! test tensor deep copy, src and dst are with roi
-    LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2H";
-    th21.copy_from(th01);
-    print_tensor_host(th21);
-
-    LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2H";
-    th21.copy_from(td01);
-    print_tensor_host(th21);
-
-    LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2D";
-    td21.copy_from(th01);
-    print_tensor_device(td21);
-    //cudaDeviceSynchronize();
-
-    LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2D";
-    td21.copy_from(td01);
-    print_tensor_device(td21);
-    //cudaDeviceSynchronize();
-}*/
-
-TEST(TestSaberTensorBM, test_tensor_shape) {
-    typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4_0;
-    typedef Tensor<X86, AK_FLOAT, NHWC> Tensor4_1;
-    typedef Tensor<X86, AK_FLOAT, HW> Tensor2;
-
-    int nin = 2;
-    int cin = 2;
-    int hin = 4;
-    int win = 4;
-
-    LOG(INFO) << "test tensor interface";
-
-    Tensor4_0 t1(Shape(nin, cin, hin, win));
-    Tensor4_1 t2(Shape(nin, hin, win, cin));
-    Tensor2 t3(Shape(hin, win));
-
-    LOG(INFO) << "test tensor with layout of NCHW";
-    LOG(INFO) << "num: " << t1.num() << ", num idx: " << t1.num_index() << \
-              ", channel: " << t1.channel() << ", channel idx: " << t1.channel_index() << \
-              ", height: " << t1.height() << ", height idx: " << t1.height_index() << \
-              ", widhth: " << t1.width() << ", width idx: " << t1.width_index();
-
-    CHECK_EQ(t1.num(), nin) << "NCHW get num error";
-    CHECK_EQ(t1.channel(), cin) << "NCHW get channel error";
-    CHECK_EQ(t1.height(), hin) << "NCHW get height error";
-    CHECK_EQ(t1.width(), win) << "NCHW get width error";
-
-    CHECK_EQ(t1.num_index(), 0) << "NCHW get num index error";
-    CHECK_EQ(t1.channel_index(), 1) << "NCHW get channel index error";
-    CHECK_EQ(t1.height_index(), 2) << "NCHW get height index error";
-    CHECK_EQ(t1.width_index(), 3) << "NCHW get width index error";
-
-    LOG(INFO) << "test tensor with layout of NHWC";
-    LOG(INFO) << "num: " << t2.num() << ", num idx: " << t2.num_index() << \
-              ", channel: " << t2.channel() << ", channel idx: " << t2.channel_index() << \
-              ", height: " << t2.height() << ", height idx: " << t2.height_index() << \
-              ", widhth: " << t2.width() << ", width idx: " << t2.width_index();
-
-    CHECK_EQ(t2.num(), nin) << "NHWC get num error";
-    CHECK_EQ(t2.channel(), cin) << "NHWC get channel error";
-    CHECK_EQ(t2.height(), hin) << "NHWC get height error";
-    CHECK_EQ(t2.width(), win) << "NHWC get width error";
-
-    CHECK_EQ(t2.num_index(), 0) << "NHWC get num index error";
-    CHECK_EQ(t2.channel_index(), 3) << "NHWC get channel index error";
-    CHECK_EQ(t2.height_index(), 1) << "NHWC get height index error";
-    CHECK_EQ(t2.width_index(), 2) << "NHWC get width index error";
-
-    LOG(INFO) << "test tensor with layout of HW";
-    LOG(INFO) << "num: " << t3.num() << ", num idx: " << t3.num_index() << \
-              ", channel: " << t3.channel() << ", channel idx: " << t3.channel_index() << \
-              ", height: " << t3.height() << ", height idx: " << t3.height_index() << \
-              ", widhth: " << t3.width() << ", width idx: " << t3.width_index();
-
-    CHECK_EQ(t3.num(), 1) << "HW get num error";
-    CHECK_EQ(t3.channel(), 1) << "HW get channel error";
-    CHECK_EQ(t3.height(), hin) << "HW get height error";
-    CHECK_EQ(t3.width(), win) << "HW get width error";
-
-    CHECK_EQ(t3.num_index(), -1) << "HW get num index error";
-    CHECK_EQ(t3.channel_index(), -1) << "HW get channel index error";
-    CHECK_EQ(t3.height_index(), 0) << "HW get height index error";
-    CHECK_EQ(t3.width_index(), 1) << "HW get width index error";
-
-}
-
-TEST(TestSaberTensorBM, test_tensor_reshape_realloc) {
-
-    LOG(INFO) << "test tensor reshape and re_alloc funcs";
-
-    Shape sh0(2, 2, 2, 2);
-    Shape sh1(2, 2, 4, 4);
-    TensorHf4 th0(sh1);
-    TensorDf4 td0(sh1);
-    fill_tensor_host_const(th0, 1);
-    fill_tensor_device_const(td0, 1);
-    LOG(INFO) << "ori tensor with size: " << th0.valid_size();
-    print_tensor_host(th0);
-    print_tensor_device(td0);
-    //cudaDeviceSynchronize();
-
-    th0.reshape(sh0);
-    td0.reshape(sh0);
-    LOG(INFO) << "tensor after reshape(from big space to small) with size: " << th0.valid_size();
-    print_tensor_host(th0);
-    print_tensor_device(td0);
-    //cudaDeviceSynchronize();
-    fill_tensor_host_const(th0, 1);
-    fill_tensor_device_const(td0, 1);
-    //cudaDeviceSynchronize();
-
-    th0.reshape(sh1);
-    td0.reshape(sh1);
-    LOG(INFO) << "tensor after reshape(from small to big, not larger than ori) with size: " <<
-              th0.valid_size();
-    print_tensor_host(th0);
-    print_tensor_device(td0);
-    //cudaDeviceSynchronize();
-
-    th0.re_alloc(sh0);
-    td0.re_alloc(sh0);
-    LOG(INFO) << "tensor after re_alloc(from big space to small) with size: " << th0.valid_size();
-    print_tensor_host(th0);
-    print_tensor_device(td0);
-    //cudaDeviceSynchronize();
-
-    TensorHf4 th1(sh0);
-    TensorDf4 td1(sh0);
-    LOG(INFO) << "ori tensor with size: " << th1.valid_size();
-    fill_tensor_host_const(th1, 1);
-    fill_tensor_device_const(td1, 1);
-    //cudaDeviceSynchronize();
-    print_tensor_host(th1);
-    print_tensor_device(td1);
-    //cudaDeviceSynchronize();
-
-    th1.reshape(sh1);
-    td1.reshape(sh1);
-    LOG(INFO) << "tensor after reshape(from small space to big) with size: " << th1.valid_size();
-    //printf("real_shape: %d,%d, %d, %d, valid_shape: %d, %d, %d, %d\n", \
-    th1.shape()[0], th1.shape()[1], th1.shape()[2], th1.shape()[3], \
-    th1.valid_shape()[0], th1.valid_shape()[1], th1.valid_shape()[2], th1.valid_shape()[3]);
-    print_tensor_host(th1);
-    print_tensor_device(td1);
-    //cudaDeviceSynchronize();
-    fill_tensor_host_const(th1, 1);
-    fill_tensor_device_const(td1, 1);
-    //cudaDeviceSynchronize();
-
-    th1.reshape(sh0);
-    td1.reshape(sh0);
-
-    LOG(INFO) << "tensor after re_alloc(from small space to big) with size: " << th1.valid_size();
-    th1.re_alloc(sh1);
-    td1.re_alloc(sh1);
-    print_tensor_host(th1);
-    print_tensor_device(td1);
-    //cudaDeviceSynchronize();
-
-}
-
-TEST(TestSaberTensorBM, test_tensor_op) {
-    Shape sh{1, 2, 2, 10};
-    TensorDf4 td1(sh);
-    TensorHf4 th1(sh);
-    Tensor<BM, AK_BM, NCHW> td2(sh);
-    Tensor<X86, AK_FLOAT, NCHW> th2(sh);
-    LOG(INFO) << "testing host fill tensor with const 1.";
-    fill_tensor_host_const(th1, 1.f);
-    LOG(INFO) << "data type: float";
-    print_tensor_host(th1);
-    fill_tensor_host_const(th2, 1);
-    LOG(INFO) << "data type: int8";
-    print_tensor_host(th2);
-
-    LOG(INFO) << "testing device fill tensor with const 1.";
-    fill_tensor_device_const(td1, 1.f);
-    LOG(INFO) << "data type: float";
-    print_tensor_device(td1);
-    fill_tensor_device_const(td2, 1);
-    LOG(INFO) << "data type: int8";
-    print_tensor_device(td2);
-
-    LOG(INFO) << "testing host fill tensor with rand";
-    fill_tensor_host_rand(th1);
-    LOG(INFO) << "data type: float";
-    print_tensor_host(th1);
-    fill_tensor_host_rand(th2);
-    LOG(INFO) << "data type: int8";
-    print_tensor_host(th2);
-
-    LOG(INFO) << "testing device fill tensor with rand";
-    fill_tensor_device_rand(td1);
-    LOG(INFO) << "data type: float";
-    print_tensor_device(td1);
-    fill_tensor_device_rand(td2);
-    LOG(INFO) << "data type: int8";
-    print_tensor_device(td2);
-
-    LOG(INFO) << "testing host fill tensor with rand from 1 to 10";
-    fill_tensor_host_rand(th1, 1, 10);
-    LOG(INFO) << "data type: float";
-    print_tensor_host(th1);
-    fill_tensor_host_rand(th2, 1, 10);
-    LOG(INFO) << "data type: int8";
-    print_tensor_host(th2);
-
-    LOG(INFO) << "testing device fill tensor with rand from 1 to 10";
-    fill_tensor_device_rand(td1, 1, 10);
-    LOG(INFO) << "data type: float";
-    print_tensor_device(td1);
-    fill_tensor_device_rand(td2, 1, 10);
-    LOG(INFO) << "data type: int8";
-    print_tensor_device(td2);
-}
-
-TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) {
-    Shape sh{1, 1, 2, 10};
-    Tensor<BM, AK_BM, NCHW> td1(sh);
-    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
-    Tensor<BM, AK_BM, NCHW> td2;
-    Tensor<X86, AK_FLOAT, NCHW> th2;
-    td2.set_shape(sh);
-    th2.set_shape(sh);
-    LOG(INFO) << "testing host fill tensor with const 1.";
-    fill_tensor_host_const(th1, -1);
-    LOG(INFO) << "data type: float";
-    print_tensor_host(th1);
-    fill_tensor_device_const(td1, -1);
-    LOG(INFO) << "data type: int8";
-    print_tensor_device(td1);
-    //cudaDeviceSynchronize();
-
-    td2.share_from(td1);
-    th2.share_from(th1);
-
-    print_tensor_host(th2);
-    print_tensor_device(td2);
-    //cudaDeviceSynchronize();
-}
-
-TEST(TestSaberTensorBM, test_tensor_base_type) {
-    Shape sh(1, 3, 10, 10);
-    Tensor<BM, AK_BM, NCHW> td1(sh);
-    Tensor<X86, AK_FLOAT, NCHW> th1(sh);
-    fill_tensor_host_rand(th1, 0.f, 255.f);
-    td1.copy_from(th1);
-    TensorBase* tb1;
-    TensorBase* tb2;
-    tb1 = &th1;
-    Shape sh1(1, 1, 10, 10);
-    tb1->set_shape(sh1);
-    Shape sh11 = th1.valid_shape();
-    LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \
-              ", h=" << sh11[2] << ", w=" << sh11[3];
-}
-
-int main(int argc, const char** argv) {
-    // initial logger
-    logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
diff --git a/test/saber/bm/test_saber_tensor_BM.h b/test/saber/bm/test_saber_tensor_BM.h
deleted file mode 100644
index 32a402258..000000000
--- a/test/saber/bm/test_saber_tensor_BM.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
-#define ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H
-
-#include "utils/unit_test/aktest.h"
-#include "utils/logger/logger.h"
-#include "core/tensor.h"
-
-using namespace anakin::test;
-
-class TestSaberTensorBM : public Test {
-public:
-    TestSaberTensorBM() {}
-    ~TestSaberTensorBM() {}
-
-protected:
-    virtual void setup() {}
-    virtual void teardown() {}
-
-};
-
-#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H

From 4002edecf35ea44b517fe06632fd9a934c19db94 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 14:59:36 +0800
Subject: [PATCH 245/318] Specialize AK_BM for DataTrait

---
 saber/core/data_traits.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h
index 45fd9b25f..cfe7951b4 100644
--- a/saber/core/data_traits.h
+++ b/saber/core/data_traits.h
@@ -50,13 +50,6 @@ struct DataTraitBase<AMD>{
 };
 #endif
 
-#ifdef USE_BM
-template <>
-struct DataTrait<AK_BM> {
-    typedef bm_device_mem_t dtype;
-};
-#endif
-
 static size_t type_length(DataType type) {
     switch (type){
         case AK_INT8:
@@ -150,6 +143,14 @@ struct DataTrait<Ttype, AK_UINT32> {
     typedef unsigned int* PtrDtype;
 };
 
+#ifdef USE_BM
+template <>
+struct DataTrait<BM, AK_BM> {
+    typedef bm_device_mem_t Dtype;
+    typedef bm_device_mem_t* PtrDtype;
+};
+#endif
+
 #ifdef USE_OPENCL
 struct ClMem{
     ClMem(){

From 16383d8cd0bde251c40f6f7347d815e420638f94 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 15:02:26 +0800
Subject: [PATCH 246/318] Update copy_from template

---
 saber/core/tensor.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 017187a3a..148905841 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -758,14 +758,6 @@ class Tensor {
         return SaberSuccess;
     }
 
-#ifdef USE_BM
-    template <typename TargetType_t>
-    SaberStatus copy_from(const Tensor<TargetType_t>& tensor) {
-        LOG(WARNING) << "Invalid: copy_from is not allowed for current type.";
-        return SaberInvalidValue;
-    }
-#endif
-
     /**
      * \brief Asynchronously copy entire buffer from source tensor.
      */

From cb06b3ab466a9ef66d0e511e9747681efd85c265 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 15:06:29 +0800
Subject: [PATCH 247/318] cast data type

---
 saber/core/tensor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index 148905841..cdbfc07c0 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -1004,7 +1004,7 @@ SaberStatus Tensor<BM>::copy_from<X86>(const Tensor<X86>& tensor) {
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
     CHECK_EQ(tensor.get_dtype(), AK_FLOAT) << "host data type should be AK_FLOAT";
 
-    auto* device_data_ptr = mutable_data();
+    bm_device_mem_t* device_data_ptr = mutable_data();
     BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
     return SaberSuccess;
 }

From ccf5156f47156872afb4feea74ea0ef074fb642a Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 15:13:28 +0800
Subject: [PATCH 248/318] cast data type

---
 saber/core/impl/bm/temsor_op_bm.cpp | 8 ++++----
 saber/core/tensor.h                 | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/saber/core/impl/bm/temsor_op_bm.cpp b/saber/core/impl/bm/temsor_op_bm.cpp
index 5c1530339..95c12d837 100644
--- a/saber/core/impl/bm/temsor_op_bm.cpp
+++ b/saber/core/impl/bm/temsor_op_bm.cpp
@@ -17,7 +17,7 @@ void fill_tensor_rand<Tensor<BM>>(Tensor<BM>& tensor, \
         host_mem_input[i] = static_cast<float>(rand());
     }
 
-    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
     BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
     delete [] host_mem_input;
@@ -37,7 +37,7 @@ void fill_tensor_rand(Tensor<BM>& tensor, float vstart, \
         host_mem_input[i] = random_num;
     }
 
-    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
     BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
     delete [] host_mem_input;
@@ -52,7 +52,7 @@ void fill_tensor_const(Tensor<BM>& tensor, float value, \
         host_mem_input[i] = value;
     }
 
-    bm_device_mem_t* device_data_ptr = tensor.mutable_data();
+    bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
     BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
     delete [] host_mem_input;
@@ -79,7 +79,7 @@ void print_tensor<Tensor<BM>>(Tensor<BM>& tensor,  \
     }*/
 
     float *host_mem = new float[tensor.size()];
-    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
+    auto* device_data_ptr = const_cast<bm_device_mem_t *>((bm_device_mem_t*) tensor.data());
     bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
 
     for (int i = 0; i < tensor.size(); ++i) {
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index cdbfc07c0..feeb1688b 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -1004,8 +1004,8 @@ SaberStatus Tensor<BM>::copy_from<X86>(const Tensor<X86>& tensor) {
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
     CHECK_EQ(tensor.get_dtype(), AK_FLOAT) << "host data type should be AK_FLOAT";
 
-    bm_device_mem_t* device_data_ptr = mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
+    bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) mutable_data();
+    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>((float*) tensor.data()))));
     return SaberSuccess;
 }
 
@@ -1016,8 +1016,8 @@ SaberStatus Tensor<X86>::copy_from<BM>(const Tensor<BM>& tensor) {
     CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
     CHECK_EQ(_dtype, AK_FLOAT) << "host data type should be AK_FLOAT";
 
-    auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
-    BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
+    auto* device_data_ptr = const_cast<bm_device_mem_t *>((bm_device_mem_t*) tensor.data());
+    BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system((float*) mutable_data()), *device_data_ptr));
     return SaberSuccess;
 }
 #endif

From d5d7399b4b2f2d9aea53d4fbc5485171f61a8e24 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 15:24:47 +0800
Subject: [PATCH 249/318] set default value

---
 saber/core/impl/bm/temsor_op_bm.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/saber/core/impl/bm/temsor_op_bm.cpp b/saber/core/impl/bm/temsor_op_bm.cpp
index 95c12d837..f10a7accc 100644
--- a/saber/core/impl/bm/temsor_op_bm.cpp
+++ b/saber/core/impl/bm/temsor_op_bm.cpp
@@ -10,7 +10,7 @@ namespace saber{
 
 template<>
 void fill_tensor_rand<Tensor<BM>>(Tensor<BM>& tensor, \
-    typename Tensor<BM>::API::stream_t stream) {
+    typename Tensor<BM>::API::stream_t stream = NULL) {
 
     float *host_mem_input = new float[tensor.size()];
     for (int i = 0; i < tensor.size(); ++i) {
@@ -60,7 +60,7 @@ void fill_tensor_const(Tensor<BM>& tensor, float value, \
 
 template <>
 void print_tensor<Tensor<BM>>(Tensor<BM>& tensor,  \
-    typename Tensor<BM>::API::stream_t stream) {
+    typename Tensor<BM>::API::stream_t stream = NULL) {
 
     LOG(INFO) << "BM device tensor data:" << tensor.size();
 

From 3587f90a67a36964167398a2e18398a624d4be0a Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 15:38:26 +0800
Subject: [PATCH 250/318] Update restrictions

---
 saber/core/impl/bm/temsor_op_bm.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/saber/core/impl/bm/temsor_op_bm.cpp b/saber/core/impl/bm/temsor_op_bm.cpp
index f10a7accc..256cf977c 100644
--- a/saber/core/impl/bm/temsor_op_bm.cpp
+++ b/saber/core/impl/bm/temsor_op_bm.cpp
@@ -9,7 +9,7 @@ namespace anakin{
 namespace saber{
 
 template<>
-void fill_tensor_rand<Tensor<BM>>(Tensor<BM>& tensor, \
+void fill_tensor_rand<BM>(Tensor<BM>& tensor, \
     typename Tensor<BM>::API::stream_t stream = NULL) {
 
     float *host_mem_input = new float[tensor.size()];
@@ -24,7 +24,7 @@ void fill_tensor_rand<Tensor<BM>>(Tensor<BM>& tensor, \
 }
 
 template<>
-void fill_tensor_rand(Tensor<BM>& tensor, float vstart, \
+void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, \
     float vend, typename Tensor<BM>::API::stream_t stream = NULL){
 
     std::random_device rd;
@@ -44,7 +44,7 @@ void fill_tensor_rand(Tensor<BM>& tensor, float vstart, \
 }
 
 template<>
-void fill_tensor_const(Tensor<BM>& tensor, float value, \
+void fill_tensor_const<BM>(Tensor<BM>& tensor, float value, \
     typename Tensor<BM>::API::stream_t stream = NULL){
 
     float *host_mem_input = new float[tensor.size()];
@@ -59,7 +59,7 @@ void fill_tensor_const(Tensor<BM>& tensor, float value, \
 }
 
 template <>
-void print_tensor<Tensor<BM>>(Tensor<BM>& tensor,  \
+void print_tensor<BM>(Tensor<BM>& tensor,  \
     typename Tensor<BM>::API::stream_t stream = NULL) {
 
     LOG(INFO) << "BM device tensor data:" << tensor.size();

From 4cb9112f91220ff45eb98b174ebe69791a8a378f Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 15:58:50 +0800
Subject: [PATCH 251/318] Update BM conv impl

---
 saber/funcs/impl/bm/vender_conv.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index e1e6ec1ec..8f7782074 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -8,7 +8,7 @@ namespace saber
 
 // FP32 part
 template <>
-SaberStatus VenderConv2D<BM, AK_BM>::
+SaberStatus VenderConv2D<BM, AK_BM>::\
     init(const std::vector<Tensor<BM> *> &inputs,
          std::vector<Tensor<BM> *> &outputs,
          ConvParam<BM> &param, Context<BM> &ctx)
@@ -19,18 +19,18 @@ SaberStatus VenderConv2D<BM, AK_BM>::
 }
 
 template <>
-SaberStatus VenderConv2D<BM, AK_BM>::
-    create(const std::vector<Tensor<BM> *> &inputs,
-           std::vector<Tensor<BM> *> &outputs,
-           ConvParam<BM> &param, Context<BM> &ctx)
+SaberStatus VenderConv2D<BM, AK_BM>::\
+    create(const std::vector<Tensor<BM> *>& inputs,
+            std::vector<Tensor<BM> *>& outputs,
+            ConvParam<BM>& param, Context<BM>& ctx)
 {
 }
 
 template <>
 SaberStatus VenderConv2D<BM, AK_BM>::\
-    dispatch(const std::vector<Tensor<BM> *> &inputs,
-                             std::vector<Tensor<BM> *> &outputs,
-                             ConvParam<BM> &param)
+    dispatch(const std::vector<Tensor<BM>*>& inputs,
+                std::vector<Tensor<BM>*>& outputs,
+                ConvParam<BM>& param)
 {
 
     /*const bm_device_mem_t *in_data = (const bm_device_mem_t *)inputs[0]->data();

From 84d2021a5371b0328a4021f7a65e4a8a2bad1baf Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 16:01:26 +0800
Subject: [PATCH 252/318] Keep in correct order

---
 saber/funcs/impl/bm/vender_conv.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index 8f7782074..aa12462d3 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -7,6 +7,14 @@ namespace saber
 {
 
 // FP32 part
+template <>
+SaberStatus VenderConv2D<BM, AK_BM>::\
+    create(const std::vector<Tensor<BM> *>& inputs,
+            std::vector<Tensor<BM> *>& outputs,
+            ConvParam<BM>& param, Context<BM>& ctx)
+{
+}
+
 template <>
 SaberStatus VenderConv2D<BM, AK_BM>::\
     init(const std::vector<Tensor<BM> *> &inputs,
@@ -18,14 +26,6 @@ SaberStatus VenderConv2D<BM, AK_BM>::\
     return create(inputs, outputs, param, ctx);
 }
 
-template <>
-SaberStatus VenderConv2D<BM, AK_BM>::\
-    create(const std::vector<Tensor<BM> *>& inputs,
-            std::vector<Tensor<BM> *>& outputs,
-            ConvParam<BM>& param, Context<BM>& ctx)
-{
-}
-
 template <>
 SaberStatus VenderConv2D<BM, AK_BM>::\
     dispatch(const std::vector<Tensor<BM>*>& inputs,

From 03ae49b6e3f2aefca421638440a42c6f7b9dcdeb Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 16:05:03 +0800
Subject: [PATCH 253/318] comment out BM activation op

---
 saber/funcs/activation.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h
index 74941e639..012f38b96 100644
--- a/saber/funcs/activation.h
+++ b/saber/funcs/activation.h
@@ -38,7 +38,7 @@
 #endif
 
 #ifdef USE_BM
-#include "saber/funcs/impl/bm/vender_activation.h"
+//#include "saber/funcs/impl/bm/vender_activation.h"
 #endif
 
 namespace anakin {

From 4413241f4fdb610727c3895215b01ff3b1194aa5 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 16:18:57 +0800
Subject: [PATCH 254/318] Refactor

---
 .gitignore          |   5 +-
 .idea/workspace.xml | 453 --------------------------------------------
 2 files changed, 1 insertion(+), 457 deletions(-)
 delete mode 100644 .idea/workspace.xml

diff --git a/.gitignore b/.gitignore
index 036a1564d..1494599a0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,7 +38,4 @@ ios_build
 gpu_build
 output
 
-.idea
-.vs_code
-.idea/workspace.xml
-.vscode/settings.json
+.vscode
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
deleted file mode 100644
index a3065b63c..000000000
--- a/.idea/workspace.xml
+++ /dev/null
@@ -1,453 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="CMakeRunConfigurationManager" shouldGenerate="true" shouldDeleteObsolete="true" buildAllGenerated="false">
-    <generated />
-  </component>
-  <component name="CMakeSettings">
-    <configurations>
-      <configuration PROFILE_NAME="Debug" CONFIG_NAME="Debug" />
-    </configurations>
-  </component>
-  <component name="ChangeListManager">
-    <list default="true" id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="" />
-    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
-    <option name="TRACKING_ENABLED" value="true" />
-    <option name="SHOW_DIALOG" value="false" />
-    <option name="HIGHLIGHT_CONFLICTS" value="true" />
-    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
-    <option name="LAST_RESOLUTION" value="IGNORE" />
-  </component>
-  <component name="FileEditorManager">
-    <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
-      <file leaf-file-name="saber_funcs_param.h" pinned="false" current-in-tab="true">
-        <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="135">
-              <caret line="9" column="64" selection-start-line="9" selection-start-column="64" selection-end-line="9" selection-end-column="64" />
-              <folding>
-                <element signature="e#897#918#0" expanded="true" />
-                <element signature="e#948#969#0" expanded="true" />
-                <element signature="e#2779#2800#0" expanded="true" />
-                <element signature="e#6280#6301#0" expanded="true" />
-                <element signature="e#8615#8636#0" expanded="true" />
-                <element signature="e#9435#9456#0" expanded="true" />
-                <element signature="e#10395#10416#0" expanded="true" />
-                <element signature="e#12109#12130#0" expanded="true" />
-                <element signature="e#13826#13841#0" expanded="true" />
-              </folding>
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="target_wrapper.h" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="4423">
-              <caret line="523" column="37" selection-start-line="523" selection-start-column="37" selection-end-line="523" selection-end-column="37" />
-              <folding>
-                <element signature="e#14794#16896#0" expanded="true" />
-              </folding>
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="conv.h" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/saber/funcs/conv.h">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="225">
-              <caret line="17" column="29" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="activation.h" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/saber/funcs/activation.h">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="435">
-              <caret line="29" column="6" selection-start-line="29" selection-start-column="6" selection-end-line="29" selection-end-column="6" />
-              <folding>
-                <element signature="e#836#940#0" expanded="true" />
-                <element signature="e#970#1020#0" expanded="true" />
-                <element signature="e#1044#1094#0" expanded="true" />
-                <element signature="e#1124#1174#0" expanded="true" />
-                <element signature="e#1197#1247#0" expanded="true" />
-                <element signature="e#1302#1349#0" expanded="true" />
-              </folding>
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="tensor.h" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="-1028">
-              <caret line="700" column="15" selection-start-line="700" selection-start-column="15" selection-end-line="700" selection-end-column="15" />
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="tensor_op.cpp" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/saber/core/tensor_op.cpp">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="-4">
-              <caret line="113" column="5" selection-start-line="113" selection-start-column="5" selection-end-line="113" selection-end-column="5" />
-              <folding>
-                <element signature="e#90#106#0" expanded="true" />
-                <element signature="e#268#289#0" expanded="true" />
-                <element signature="e#1629#1645#0" expanded="true" />
-                <element signature="e#1814#1835#0" expanded="true" />
-                <element signature="e#2886#2902#0" expanded="true" />
-                <element signature="e#3276#3297#0" expanded="true" />
-                <element signature="e#5075#5091#0" expanded="true" />
-                <element signature="e#5365#5386#0" expanded="true" />
-                <element signature="e#6578#6599#0" expanded="true" />
-                <element signature="e#8139#8155#0" expanded="true" />
-                <element signature="e#8800#8816#0" expanded="true" />
-                <element signature="e#9017#9038#0" expanded="true" />
-                <element signature="e#10133#10154#0" expanded="true" />
-                <element signature="e#12605#12626#0" expanded="true" />
-                <element signature="e#12651#12675#0" expanded="true" />
-                <element signature="e#12705#12726#0" expanded="true" />
-                <element signature="e#12756#12776#0" expanded="true" />
-                <element signature="e#13129#15921#0" expanded="true" />
-              </folding>
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="resize.h" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/saber/funcs/resize.h">
-          <provider selected="true" editor-type-id="text-editor" />
-        </entry>
-      </file>
-      <file leaf-file-name=".gitignore" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/.gitignore">
-          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="315">
-              <caret line="21" selection-start-line="21" selection-end-line="21" />
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="CMakeLists.txt" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/CMakeLists.txt">
-          <provider selected="true" editor-type-id="text-editor" />
-        </entry>
-      </file>
-    </leaf>
-  </component>
-  <component name="FindInProjectRecents">
-    <findStrings>
-      <find>MvnParam</find>
-      <find>ConvParam</find>
-      <find>TargetType</find>
-      <find>mem_set</find>
-      <find>&amp;</find>
-      <find>print</find>
-      <find>AMD_API</find>
-      <find>BM</find>
-      <find>print_tensor_host</find>
-      <find>batch</find>
-    </findStrings>
-  </component>
-  <component name="Git.Settings">
-    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
-  </component>
-  <component name="IdeDocumentHistory">
-    <option name="CHANGED_PATHS">
-      <list>
-        <option value="$PROJECT_DIR$/saber/core/tensor.h" />
-        <option value="$PROJECT_DIR$/.gitignore" />
-        <option value="$PROJECT_DIR$/CMakeLists.txt" />
-        <option value="$PROJECT_DIR$/saber/core/tensor_op.cpp" />
-        <option value="$PROJECT_DIR$/saber/core/target_wrapper.h" />
-        <option value="$PROJECT_DIR$/saber/funcs/conv.h" />
-        <option value="$PROJECT_DIR$/saber/funcs/activation.h" />
-      </list>
-    </option>
-  </component>
-  <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
-  <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
-  <component name="JsGulpfileManager">
-    <detection-done>true</detection-done>
-    <sorting>DEFINITION_ORDER</sorting>
-  </component>
-  <component name="NodePackageJsonFileManager">
-    <packageJsonPaths />
-  </component>
-  <component name="ProjectFrameBounds">
-    <option name="y" value="23" />
-    <option name="width" value="2560" />
-    <option name="height" value="1353" />
-  </component>
-  <component name="ProjectLevelVcsManager">
-    <ConfirmationsSetting value="2" id="Add" />
-  </component>
-  <component name="ProjectView">
-    <navigator proportions="" version="1">
-      <foldersAlwaysOnTop value="true" />
-    </navigator>
-    <panes>
-      <pane id="ProjectPane">
-        <subPane>
-          <expand>
-            <path>
-              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
-              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-            </path>
-            <path>
-              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
-              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-              <item name="saber" type="462c0819:PsiDirectoryNode" />
-            </path>
-            <path>
-              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
-              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-              <item name="saber" type="462c0819:PsiDirectoryNode" />
-              <item name="core" type="462c0819:PsiDirectoryNode" />
-            </path>
-            <path>
-              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
-              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-              <item name="saber" type="462c0819:PsiDirectoryNode" />
-              <item name="funcs" type="462c0819:PsiDirectoryNode" />
-            </path>
-            <path>
-              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
-              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-              <item name="test" type="462c0819:PsiDirectoryNode" />
-            </path>
-            <path>
-              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
-              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-              <item name="test" type="462c0819:PsiDirectoryNode" />
-              <item name="framework" type="462c0819:PsiDirectoryNode" />
-            </path>
-            <path>
-              <item name="Anakin" type="b2602c69:ProjectViewProjectNode" />
-              <item name="Anakin" type="462c0819:PsiDirectoryNode" />
-              <item name="test" type="462c0819:PsiDirectoryNode" />
-              <item name="framework" type="462c0819:PsiDirectoryNode" />
-              <item name="core" type="462c0819:PsiDirectoryNode" />
-            </path>
-          </expand>
-          <select />
-        </subPane>
-      </pane>
-      <pane id="Scope" />
-    </panes>
-  </component>
-  <component name="PropertiesComponent">
-    <property name="WebServerToolWindowFactoryState" value="false" />
-  </component>
-  <component name="RunDashboard">
-    <option name="ruleStates">
-      <list>
-        <RuleState>
-          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
-        </RuleState>
-        <RuleState>
-          <option name="name" value="StatusDashboardGroupingRule" />
-        </RuleState>
-      </list>
-    </option>
-  </component>
-  <component name="SvnConfiguration">
-    <configuration />
-  </component>
-  <component name="TaskManager">
-    <task active="true" id="Default" summary="Default task">
-      <changelist id="05b5ffab-037b-4c62-b3a1-7655f7441f38" name="Default" comment="" />
-      <created>1533519941069</created>
-      <option name="number" value="Default" />
-      <option name="presentableId" value="Default" />
-      <updated>1533519941069</updated>
-      <workItem from="1533519943497" duration="1090000" />
-      <workItem from="1533533623166" duration="6760000" />
-    </task>
-    <servers />
-  </component>
-  <component name="TimeTrackingManager">
-    <option name="totallyTimeSpent" value="7850000" />
-  </component>
-  <component name="ToolWindowManager">
-    <frame x="0" y="23" width="2560" height="1353" extended-state="0" />
-    <editor active="true" />
-    <layout>
-      <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
-      <window_info anchor="bottom" id="TODO" order="6" />
-      <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
-      <window_info anchor="bottom" id="Version Control" order="7" weight="0.28850666" />
-      <window_info anchor="bottom" id="Run" order="2" />
-      <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
-      <window_info active="true" anchor="bottom" id="Terminal" order="7" visible="true" weight="0.19077404" />
-      <window_info id="Favorites" order="2" side_tool="true" />
-      <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
-      <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
-      <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
-      <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
-      <window_info anchor="bottom" id="Message" order="0" />
-      <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
-      <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
-      <window_info anchor="bottom" id="Find" order="1" />
-    </layout>
-  </component>
-  <component name="TypeScriptGeneratedFilesManager">
-    <option name="version" value="1" />
-  </component>
-  <component name="Vcs.Log.History.Properties">
-    <option name="COLUMN_ORDER">
-      <list>
-        <option value="0" />
-        <option value="2" />
-        <option value="3" />
-        <option value="1" />
-      </list>
-    </option>
-  </component>
-  <component name="Vcs.Log.Tabs.Properties">
-    <option name="TAB_STATES">
-      <map>
-        <entry key="MAIN">
-          <value>
-            <State>
-              <option name="RECENTLY_FILTERED_USER_GROUPS">
-                <collection />
-              </option>
-              <option name="RECENTLY_FILTERED_BRANCH_GROUPS">
-                <collection />
-              </option>
-              <option name="COLUMN_ORDER">
-                <list>
-                  <option value="0" />
-                  <option value="1" />
-                  <option value="2" />
-                  <option value="3" />
-                </list>
-              </option>
-            </State>
-          </value>
-        </entry>
-      </map>
-    </option>
-  </component>
-  <component name="VcsContentAnnotationSettings">
-    <option name="myLimit" value="2678400000" />
-  </component>
-  <component name="editorHistoryManager">
-    <entry file="file://$PROJECT_DIR$/examples/cuda/example_nv_cnn_net.cpp">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="255">
-          <caret line="17" column="26" lean-forward="true" selection-start-line="17" selection-start-column="26" selection-end-line="17" selection-end-column="26" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/examples/cuda/example_nv_cnn_net.cpp">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="255">
-          <caret line="17" column="26" selection-start-line="17" selection-start-column="26" selection-end-line="17" selection-end-column="26" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/.gitignore">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="315">
-          <caret line="21" selection-start-line="21" selection-end-line="21" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/CMakeLists.txt">
-      <provider selected="true" editor-type-id="text-editor" />
-    </entry>
-    <entry file="file://$PROJECT_DIR$/saber/core/tensor.h">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="-1028">
-          <caret line="700" column="15" selection-start-line="700" selection-start-column="15" selection-end-line="700" selection-end-column="15" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/saber/core/tensor_op.cpp">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="-4">
-          <caret line="113" column="5" selection-start-line="113" selection-start-column="5" selection-end-line="113" selection-end-column="5" />
-          <folding>
-            <element signature="e#90#106#0" expanded="true" />
-            <element signature="e#268#289#0" expanded="true" />
-            <element signature="e#1629#1645#0" expanded="true" />
-            <element signature="e#1814#1835#0" expanded="true" />
-            <element signature="e#2886#2902#0" expanded="true" />
-            <element signature="e#3276#3297#0" expanded="true" />
-            <element signature="e#5075#5091#0" expanded="true" />
-            <element signature="e#5365#5386#0" expanded="true" />
-            <element signature="e#6578#6599#0" expanded="true" />
-            <element signature="e#8139#8155#0" expanded="true" />
-            <element signature="e#8800#8816#0" expanded="true" />
-            <element signature="e#9017#9038#0" expanded="true" />
-            <element signature="e#10133#10154#0" expanded="true" />
-            <element signature="e#12605#12626#0" expanded="true" />
-            <element signature="e#12651#12675#0" expanded="true" />
-            <element signature="e#12705#12726#0" expanded="true" />
-            <element signature="e#12756#12776#0" expanded="true" />
-            <element signature="e#13129#15921#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/saber/funcs/resize.h">
-      <provider selected="true" editor-type-id="text-editor" />
-    </entry>
-    <entry file="file://$PROJECT_DIR$/saber/core/target_wrapper.h">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="4423">
-          <caret line="523" column="37" selection-start-line="523" selection-start-column="37" selection-end-line="523" selection-end-column="37" />
-          <folding>
-            <element signature="e#14794#16896#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/saber/funcs/softmax.h">
-      <provider selected="true" editor-type-id="text-editor" />
-    </entry>
-    <entry file="file://$PROJECT_DIR$/saber/funcs/conv.h">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="225">
-          <caret line="17" column="29" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/saber/funcs/activation.h">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="435">
-          <caret line="29" column="6" selection-start-line="29" selection-start-column="6" selection-end-line="29" selection-end-column="6" />
-          <folding>
-            <element signature="e#836#940#0" expanded="true" />
-            <element signature="e#970#1020#0" expanded="true" />
-            <element signature="e#1044#1094#0" expanded="true" />
-            <element signature="e#1124#1174#0" expanded="true" />
-            <element signature="e#1197#1247#0" expanded="true" />
-            <element signature="e#1302#1349#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/saber/saber_funcs_param.h">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="135">
-          <caret line="9" column="64" selection-start-line="9" selection-start-column="64" selection-end-line="9" selection-end-column="64" />
-          <folding>
-            <element signature="e#897#918#0" expanded="true" />
-            <element signature="e#948#969#0" expanded="true" />
-            <element signature="e#2779#2800#0" expanded="true" />
-            <element signature="e#6280#6301#0" expanded="true" />
-            <element signature="e#8615#8636#0" expanded="true" />
-            <element signature="e#9435#9456#0" expanded="true" />
-            <element signature="e#10395#10416#0" expanded="true" />
-            <element signature="e#12109#12130#0" expanded="true" />
-            <element signature="e#13826#13841#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-  </component>
-</project>
\ No newline at end of file

From 82e61f265f56711b31a6f683493833fd1a771956 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 16:20:55 +0800
Subject: [PATCH 255/318] Update gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 1494599a0..3d6100c30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,4 +38,5 @@ ios_build
 gpu_build
 output
 
+.idea
 .vscode

From e86bc25cfb92fc8494440ad9becd13945e1d0d87 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 17:14:28 +0800
Subject: [PATCH 256/318] cmake update for BM

---
 cmake/gather.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmake/gather.cmake b/cmake/gather.cmake
index ff1a45d4e..437318c79 100644
--- a/cmake/gather.cmake
+++ b/cmake/gather.cmake
@@ -41,6 +41,11 @@ if(USE_AMD)
     amd_build_cl_file("${CMAKE_SOURCE_DIR}/test/saber/amd" "${PROJECT_SOURCE_DIR}/output/unit_test")
 endif()
 
+if(USE_BM)
+    amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${CMAKE_BINARY_DIR}/cl/bm")
+    amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${PROJECT_SOURCE_DIR}/output/unit_test")
+endif()
+
 # find opencl
 if(USE_OPENCL)
     #anakin_generate_kernel(${ANAKIN_ROOT})

From 5c3b32be6e040e3a81729e11a1fcc03c6f4e23bd Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 17:19:34 +0800
Subject: [PATCH 257/318] cmake update for BM

---
 CMakeLists.txt     |  4 ++++
 cmake/bm.cmake     | 26 ++++++++++++++++++++++++++
 cmake/gather.cmake |  4 ++--
 3 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 cmake/bm.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc6d9d559..b403f4287 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,6 +208,10 @@ if(USE_AMD)
     include(cmake/amd.cmake)
 endif()
 
+if(USE_BM)
+    include(cmake/bm.cmake)
+endif()
+
 # gather all the config options to anakin
 include(cmake/gather.cmake)
 
diff --git a/cmake/bm.cmake b/cmake/bm.cmake
new file mode 100644
index 000000000..3b74d9b4b
--- /dev/null
+++ b/cmake/bm.cmake
@@ -0,0 +1,26 @@
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+################################################################################
+
+macro(bm_build_cl_binary_file file_path dest_path)
+    FILE(GLOB CL_FILES ${file_path}/*.so)
+    message(STATUS "found cl files: ${CL_FILES}")
+    foreach(src_file ${CL_FILES})
+        get_filename_component(src_file_name ${src_file} NAME)
+        message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}")
+        configure_file( ${absdir}/${src_file} ${dest_path}/${src_file_name} COPYONLY)
+    endforeach()
+endmacro()
+
diff --git a/cmake/gather.cmake b/cmake/gather.cmake
index 437318c79..b108a715d 100644
--- a/cmake/gather.cmake
+++ b/cmake/gather.cmake
@@ -42,8 +42,8 @@ if(USE_AMD)
 endif()
 
 if(USE_BM)
-    amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${CMAKE_BINARY_DIR}/cl/bm")
-    amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${PROJECT_SOURCE_DIR}/output/unit_test")
+    bm_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${CMAKE_BINARY_DIR}/cl/bm")
+    bm_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${PROJECT_SOURCE_DIR}/output/unit_test")
 endif()
 
 # find opencl

From 4e43d91dbbf93fe8ca544c53d0a50cfc6e0cde90 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 17:26:33 +0800
Subject: [PATCH 258/318] Revert "cmake update for BM"

This reverts commit 5c3b32be6e040e3a81729e11a1fcc03c6f4e23bd.
---
 CMakeLists.txt     |  4 ----
 cmake/bm.cmake     | 26 --------------------------
 cmake/gather.cmake |  4 ++--
 3 files changed, 2 insertions(+), 32 deletions(-)
 delete mode 100644 cmake/bm.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b403f4287..cc6d9d559 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,10 +208,6 @@ if(USE_AMD)
     include(cmake/amd.cmake)
 endif()
 
-if(USE_BM)
-    include(cmake/bm.cmake)
-endif()
-
 # gather all the config options to anakin
 include(cmake/gather.cmake)
 
diff --git a/cmake/bm.cmake b/cmake/bm.cmake
deleted file mode 100644
index 3b74d9b4b..000000000
--- a/cmake/bm.cmake
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-################################################################################
-
-macro(bm_build_cl_binary_file file_path dest_path)
-    FILE(GLOB CL_FILES ${file_path}/*.so)
-    message(STATUS "found cl files: ${CL_FILES}")
-    foreach(src_file ${CL_FILES})
-        get_filename_component(src_file_name ${src_file} NAME)
-        message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}")
-        configure_file( ${absdir}/${src_file} ${dest_path}/${src_file_name} COPYONLY)
-    endforeach()
-endmacro()
-
diff --git a/cmake/gather.cmake b/cmake/gather.cmake
index b108a715d..437318c79 100644
--- a/cmake/gather.cmake
+++ b/cmake/gather.cmake
@@ -42,8 +42,8 @@ if(USE_AMD)
 endif()
 
 if(USE_BM)
-    bm_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${CMAKE_BINARY_DIR}/cl/bm")
-    bm_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${PROJECT_SOURCE_DIR}/output/unit_test")
+    amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${CMAKE_BINARY_DIR}/cl/bm")
+    amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${PROJECT_SOURCE_DIR}/output/unit_test")
 endif()
 
 # find opencl

From f4e9d42bddd73f49a5aeb3858e41ce1d66617089 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 17:26:44 +0800
Subject: [PATCH 259/318] Revert "cmake update for BM"

This reverts commit e86bc25cfb92fc8494440ad9becd13945e1d0d87.
---
 cmake/gather.cmake | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/cmake/gather.cmake b/cmake/gather.cmake
index 437318c79..ff1a45d4e 100644
--- a/cmake/gather.cmake
+++ b/cmake/gather.cmake
@@ -41,11 +41,6 @@ if(USE_AMD)
     amd_build_cl_file("${CMAKE_SOURCE_DIR}/test/saber/amd" "${PROJECT_SOURCE_DIR}/output/unit_test")
 endif()
 
-if(USE_BM)
-    amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${CMAKE_BINARY_DIR}/cl/bm")
-    amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/lib/app" "${PROJECT_SOURCE_DIR}/output/unit_test")
-endif()
-
 # find opencl
 if(USE_OPENCL)
     #anakin_generate_kernel(${ANAKIN_ROOT})

From 72365386b75e65d240097248e549b1d14ed8b921 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 17:53:45 +0800
Subject: [PATCH 260/318] cleanup

---
 CMakeLists.txt               | 16 ----------------
 cmake/compiler_options.cmake | 16 +---------------
 cmake/find_modules.cmake     | 15 +++++++++++++++
 cmake/gather.cmake           |  4 +---
 4 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc6d9d559..d87b886dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,15 +79,6 @@ anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_CUDA)
 anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_CUDA)
 anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM)
 
-# compile options for BM place
-#anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU)
-#anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM)
-#anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM)
-#anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM)
-#anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM)
-#anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM)
-
-
 
 # build options for amd.
 anakin_option(USE_AMD "Use AMD ROCm OpenCL" YES if AMD_GPU)
@@ -97,9 +88,6 @@ cmake_minimum_required(VERSION ${MIN_CMAKE_V} FATAL_ERROR)
 if(USE_CUDA)
     # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well.
     set(SELECTED_SASS_TARGET_ARCH "61")
-elseif(USE_BM)
-    # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well.
-    #set(SELECTED_SASS_TARGET_ARCH "61")
 endif()
 if((NOT BUILD_FAT_BIN) AND (NOT BUILD_CROSS_PLANTFORM) AND USE_CUDA)
     # Select the only nvidia gpu arch you want to be built on
@@ -189,10 +177,6 @@ if(USE_CUDA)
     include(cmake/cuda.cmake)
 endif()
 
-if(USE_BM)
-    #include(cmake/cuda.cmake)
-endif()
-
 if(USE_X86_PLACE)
     set(ANAKIN_TEMP_THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/third-party)
     if(USE_MKLML)
diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake
index e10d32783..93f1594be 100644
--- a/cmake/compiler_options.cmake
+++ b/cmake/compiler_options.cmake
@@ -122,19 +122,5 @@ if(USE_CUDA)
 endif()
 
 if(USE_BM)
-	if(CMAKE_BUILD_TYPE MATCHES Debug)
-		anakin_add_compile_option("-Xcompiler -fPIC" NVCC)
-		anakin_add_compile_option(-G NVCC)
-		anakin_add_compile_option(-g NVCC)
-		anakin_add_compile_option(-std=c++11 NVCC)
-		anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) # suppress warning by architectures are deprecated (2.0,2.1)
-	else()
-		anakin_add_compile_option("-Xcompiler -fPIC" NVCC)
-		anakin_add_compile_option(-O3 NVCC)
-		anakin_add_compile_option(-std=c++11 NVCC)
-		anakin_add_compile_option("--default-stream per-thread" NVCC)
-		anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC)
-	endif()
-	# set default nvidia gpu arch
-	set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1")
+
 endif()
diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake
index 37f8d12e9..da6e9968a 100644
--- a/cmake/find_modules.cmake
+++ b/cmake/find_modules.cmake
@@ -324,3 +324,18 @@ macro(anakin_find_openmp)
 		message(FATAL_ERROR "Could not found openmp !")
 	endif()
 endmacro()
+
+macro(anakin_find_bm)
+	set(BM_INCLUDE_DIR ${ANAKIN_ROOT}/saber/funcs/impl/bm/base/lib)
+	find_library(BM_LIBRARY NAMES libbmdnn_device.so libbmlib_device.so libbmrt.so
+			PATHS ${BM_INCLUDE_DIR}/app
+			DOC "library path for bm.")
+	if(BM_INCLUDE_DIR AND BM_LIBRARY)
+		set(BM_FOUND TRUE)
+	endif()
+	if(BM_FOUND)
+		message(STATUS "Found bm in ${BM_INCLUDE_DIR}")
+		include_directories(${BM_INCLUDE_DIR})
+		list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARY})
+	endif()
+endmacro()
diff --git a/cmake/gather.cmake b/cmake/gather.cmake
index ff1a45d4e..f989804ac 100644
--- a/cmake/gather.cmake
+++ b/cmake/gather.cmake
@@ -25,9 +25,7 @@ if(USE_CUDA)
 endif()
 
 if(USE_BM)
-    #set other cuda path
-    #set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_PATH})
-    #anakin_find_cuda()
+    anakin_find_bm()
 endif()
 
 # set amd opencl path

From 7017d02d9d69d4f6984564bceb4a614dbbca7b82 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 18:05:39 +0800
Subject: [PATCH 261/318] TEST

---
 test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d698a0f3b..666f5081e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -50,7 +50,7 @@ foreach(SRC_NAME ${ANAKIN_TEST_CASE_SRC})
             if(BUILD_WITH_FRAMEWORK)
                 target_link_libraries(${TEST_CASE_NAME} ${anakin_lib_so})
             else()
-                target_link_libraries(${TEST_CASE_NAME} ${ANAKIN_SABER_LIB_TARGET})
+                target_link_libraries(${TEST_CASE_NAME} ${anakin_lib_so})
             endif()
 	else()
         if(BUILD_WITH_FRAMEWORK)

From f73a88ce83a782e88d0ac3e39e261570d48bda65 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 18:16:57 +0800
Subject: [PATCH 262/318] Revert "TEST"

This reverts commit 7017d02d9d69d4f6984564bceb4a614dbbca7b82.
---
 test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 666f5081e..d698a0f3b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -50,7 +50,7 @@ foreach(SRC_NAME ${ANAKIN_TEST_CASE_SRC})
             if(BUILD_WITH_FRAMEWORK)
                 target_link_libraries(${TEST_CASE_NAME} ${anakin_lib_so})
             else()
-                target_link_libraries(${TEST_CASE_NAME} ${anakin_lib_so})
+                target_link_libraries(${TEST_CASE_NAME} ${ANAKIN_SABER_LIB_TARGET})
             endif()
 	else()
         if(BUILD_WITH_FRAMEWORK)

From f7f4cc74976786178f90c78d20ac71a6ae967141 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 7 Aug 2018 18:21:42 +0800
Subject: [PATCH 263/318] cmake for test

---
 test/CMakeLists.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d698a0f3b..39eaa253c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -33,6 +33,15 @@ endif()
 
 anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber "cpp" ANAKIN_TEST_CASE_SRC)
 
+if(USE_BM)
+    find_library(BMRT bmrt HINTS ${BM_BASE_CODE_ROOT}/lib/app)
+    find_library(BMDNN_DEVICE bmdnn_device HINTS ${BM_BASE_CODE_ROOT}/lib/app)
+    find_library(BMLIB_DEVICE bmlib_device HINTS ${BM_BASE_CODE_ROOT}/lib/app)
+    target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${BMRT})
+    target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${BMDNN_DEVICE})
+    target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${BMLIB_DEVICE})
+endif()
+
 file(REMOVE ${PROJECT_SOURCE_DIR}/output/unit_test/*)
 install(FILES ${PROJECT_BINARY_DIR}/anakin_config.h
         DESTINATION ${PROJECT_SOURCE_DIR}/output/include)

From 95dd7e1a573b8703989c78d9c46d5292b80dad01 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 8 Aug 2018 09:44:45 +0800
Subject: [PATCH 264/318] Revert "cmake for test"

This reverts commit f7f4cc74976786178f90c78d20ac71a6ae967141.
---
 test/CMakeLists.txt | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 39eaa253c..d698a0f3b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -33,15 +33,6 @@ endif()
 
 anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber "cpp" ANAKIN_TEST_CASE_SRC)
 
-if(USE_BM)
-    find_library(BMRT bmrt HINTS ${BM_BASE_CODE_ROOT}/lib/app)
-    find_library(BMDNN_DEVICE bmdnn_device HINTS ${BM_BASE_CODE_ROOT}/lib/app)
-    find_library(BMLIB_DEVICE bmlib_device HINTS ${BM_BASE_CODE_ROOT}/lib/app)
-    target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${BMRT})
-    target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${BMDNN_DEVICE})
-    target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${BMLIB_DEVICE})
-endif()
-
 file(REMOVE ${PROJECT_SOURCE_DIR}/output/unit_test/*)
 install(FILES ${PROJECT_BINARY_DIR}/anakin_config.h
         DESTINATION ${PROJECT_SOURCE_DIR}/output/include)

From abf819d85237058d35405a49811090c62de1056b Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 8 Aug 2018 10:40:10 +0800
Subject: [PATCH 265/318] implementation according to data types

---
 saber/core/impl/bm/temsor_op_bm.cpp | 122 +++++++++++++++++-----------
 1 file changed, 75 insertions(+), 47 deletions(-)

diff --git a/saber/core/impl/bm/temsor_op_bm.cpp b/saber/core/impl/bm/temsor_op_bm.cpp
index 256cf977c..f921f0616 100644
--- a/saber/core/impl/bm/temsor_op_bm.cpp
+++ b/saber/core/impl/bm/temsor_op_bm.cpp
@@ -12,86 +12,114 @@ template<>
 void fill_tensor_rand<BM>(Tensor<BM>& tensor, \
     typename Tensor<BM>::API::stream_t stream = NULL) {
 
-    float *host_mem_input = new float[tensor.size()];
-    for (int i = 0; i < tensor.size(); ++i) {
-        host_mem_input[i] = static_cast<float>(rand());
-    }
+    DataType type = tensor.get_dtype();
+    switch (type){
+        case AK_BM: 
+            float *host_mem_input = new float[tensor.size()];
+            for (int i = 0; i < tensor.size(); ++i) {
+                host_mem_input[i] = static_cast<float>(rand());
+            }
+
+            bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
+            BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
-    bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+            delete [] host_mem_input;
+            break;
 
-    delete [] host_mem_input;
+        default: LOG(FATAL) << "data type: " << type << " is unsupported now";
+    }
 }
 
 template<>
 void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, \
     float vend, typename Tensor<BM>::API::stream_t stream = NULL){
 
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> dis(0, 1.f);
+    DataType type = tensor.get_dtype();
+    switch (type){
+        case AK_BM:
+            std::random_device rd;
+            std::mt19937 gen(rd());
+            std::uniform_real_distribution<float> dis(0, 1.f);
 
-    float *host_mem_input = new float[tensor.size()];
-    for (int i = 0; i < tensor.size(); ++i) {
-        float random_num = vstart + (vend - vstart) * dis(gen);
-        host_mem_input[i] = random_num;
-    }
+            float *host_mem_input = new float[tensor.size()];
+            for (int i = 0; i < tensor.size(); ++i) {
+                float random_num = vstart + (vend - vstart) * dis(gen);
+                host_mem_input[i] = random_num;
+            }
+
+            bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
+            BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
-    bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+            delete [] host_mem_input;
+            break;
 
-    delete [] host_mem_input;
+        default: LOG(FATAL) << "data type: " << type << " is unsupported now";
+    }
 }
 
 template<>
 void fill_tensor_const<BM>(Tensor<BM>& tensor, float value, \
     typename Tensor<BM>::API::stream_t stream = NULL){
 
-    float *host_mem_input = new float[tensor.size()];
-    for (int i = 0; i < tensor.size(); ++i) {
-        host_mem_input[i] = value;
-    }
+    DataType type = tensor.get_dtype();
+    switch (type){
+        case AK_BM:
+            float *host_mem_input = new float[tensor.size()];
+            for (int i = 0; i < tensor.size(); ++i) {
+                host_mem_input[i] = value;
+            }
+
+            bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
+            BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
-    bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+            delete [] host_mem_input;
+            break;
 
-    delete [] host_mem_input;
+        default: LOG(FATAL) << "data type: " << type << " is unsupported now";
+    }
 }
 
 template <>
 void print_tensor<BM>(Tensor<BM>& tensor,  \
     typename Tensor<BM>::API::stream_t stream = NULL) {
 
-    LOG(INFO) << "BM device tensor data:" << tensor.size();
+    DataType type = tensor.get_dtype();
+    switch (type){
+        case AK_BM:   
+            LOG(INFO) << "BM device tensor data:" << tensor.size();
 
-    /*
-    const bm_device_mem_t* device_data_ptr = tensor.data();
-    unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
-    bm_flush(get_bm_handle());
-    float* device_data = (float*)bm_get_global_addr(gaddr);
+            /*
+            const bm_device_mem_t* device_data_ptr = tensor.data();
+            unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
+            bm_flush(get_bm_handle());
+            float* device_data = (float*)bm_get_global_addr(gaddr);
 
-    for (int i = 0; i < tensor.size(); ++i) {
-        printf("%.2f ", device_data[i]);
+            for (int i = 0; i < tensor.size(); ++i) {
+                printf("%.2f ", device_data[i]);
 
-        if ((i + 1) % (4 * tensor.width()) == 0) {
-            printf("\n");
-        }
-    }*/
+                if ((i + 1) % (4 * tensor.width()) == 0) {
+                    printf("\n");
+                }
+            }*/
 
-    float *host_mem = new float[tensor.size()];
-    auto* device_data_ptr = const_cast<bm_device_mem_t *>((bm_device_mem_t*) tensor.data());
-    bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
+            float *host_mem = new float[tensor.size()];
+            auto* device_data_ptr = const_cast<bm_device_mem_t *>((bm_device_mem_t*) tensor.data());
+            bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
 
-    for (int i = 0; i < tensor.size(); ++i) {
-        printf("%.2f\t", host_mem[i]);
+            for (int i = 0; i < tensor.size(); ++i) {
+                printf("%.2f\t", host_mem[i]);
 
-        if ((i + 1) % tensor.width() == 0){
+                if ((i + 1) % tensor.width() == 0){
+                    printf("\n");
+                }
+            }
             printf("\n");
-        }
-    }
-    printf("\n");
 
-    delete [] host_mem;
+            delete [] host_mem;
+            break;
+            
+        default: LOG(FATAL) << "data type: " << type << " is unsupported now";
+    }
 }
 
 

From 24d3ef1e80236f437d0f83a572d916f52ebda338 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 8 Aug 2018 10:53:00 +0800
Subject: [PATCH 266/318] implementation according to data types

---
 saber/core/impl/bm/temsor_op_bm.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/saber/core/impl/bm/temsor_op_bm.cpp b/saber/core/impl/bm/temsor_op_bm.cpp
index f921f0616..9ddca605c 100644
--- a/saber/core/impl/bm/temsor_op_bm.cpp
+++ b/saber/core/impl/bm/temsor_op_bm.cpp
@@ -14,7 +14,7 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, \
 
     DataType type = tensor.get_dtype();
     switch (type){
-        case AK_BM: 
+        case AK_BM: {
             float *host_mem_input = new float[tensor.size()];
             for (int i = 0; i < tensor.size(); ++i) {
                 host_mem_input[i] = static_cast<float>(rand());
@@ -25,7 +25,7 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, \
 
             delete [] host_mem_input;
             break;
-
+        }
         default: LOG(FATAL) << "data type: " << type << " is unsupported now";
     }
 }
@@ -36,7 +36,7 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, \
 
     DataType type = tensor.get_dtype();
     switch (type){
-        case AK_BM:
+        case AK_BM: {
             std::random_device rd;
             std::mt19937 gen(rd());
             std::uniform_real_distribution<float> dis(0, 1.f);
@@ -52,7 +52,7 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, \
 
             delete [] host_mem_input;
             break;
-
+        }
         default: LOG(FATAL) << "data type: " << type << " is unsupported now";
     }
 }
@@ -63,7 +63,7 @@ void fill_tensor_const<BM>(Tensor<BM>& tensor, float value, \
 
     DataType type = tensor.get_dtype();
     switch (type){
-        case AK_BM:
+        case AK_BM: {
             float *host_mem_input = new float[tensor.size()];
             for (int i = 0; i < tensor.size(); ++i) {
                 host_mem_input[i] = value;
@@ -74,7 +74,7 @@ void fill_tensor_const<BM>(Tensor<BM>& tensor, float value, \
 
             delete [] host_mem_input;
             break;
-
+        }
         default: LOG(FATAL) << "data type: " << type << " is unsupported now";
     }
 }
@@ -85,7 +85,7 @@ void print_tensor<BM>(Tensor<BM>& tensor,  \
 
     DataType type = tensor.get_dtype();
     switch (type){
-        case AK_BM:   
+        case AK_BM: {
             LOG(INFO) << "BM device tensor data:" << tensor.size();
 
             /*
@@ -117,7 +117,7 @@ void print_tensor<BM>(Tensor<BM>& tensor,  \
 
             delete [] host_mem;
             break;
-            
+        }
         default: LOG(FATAL) << "data type: " << type << " is unsupported now";
     }
 }

From 9922c83f3626e9dcd5f3c9488a6cc3b524a4b8ac Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 8 Aug 2018 10:55:39 +0800
Subject: [PATCH 267/318] cmake updates

---
 cmake/gather.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/gather.cmake b/cmake/gather.cmake
index f989804ac..9418134e0 100644
--- a/cmake/gather.cmake
+++ b/cmake/gather.cmake
@@ -25,7 +25,7 @@ if(USE_CUDA)
 endif()
 
 if(USE_BM)
-    anakin_find_bm()
+    #anakin_find_bm()
 endif()
 
 # set amd opencl path

From ea1309753f114854fcc33679c0a04a17d4a81d24 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 13 Aug 2018 09:28:20 +0800
Subject: [PATCH 268/318] Remove AK_BM and use AK_FLOAT instead

---
 framework/core/data_types.h         | 2 --
 saber/core/data_traits.h            | 8 --------
 saber/core/impl/bm/temsor_op_bm.cpp | 8 ++++----
 saber/funcs/impl/bm/vender_conv.cpp | 8 ++++----
 saber/saber_types.h                 | 3 +--
 5 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/framework/core/data_types.h b/framework/core/data_types.h
index c0c550a12..a34e3745b 100644
--- a/framework/core/data_types.h
+++ b/framework/core/data_types.h
@@ -46,7 +46,6 @@ SABER_TO_BASE_TYPE(AK_UINT16, uint16_t);
 SABER_TO_BASE_TYPE(AK_UINT32, uint32_t);
 SABER_TO_BASE_TYPE(AK_BOOL, bool);
 SABER_TO_BASE_TYPE(AK_STRING, std::string);
-SABER_TO_BASE_TYPE(AK_BM, bm_device_mem_t);
 
 template<typename T>
 struct DataTypeRecover {
@@ -71,7 +70,6 @@ BASE_TYPE_TO_SABER(uint8_t, AK_UINT8);
 BASE_TYPE_TO_SABER(uint32_t, AK_UINT32);
 BASE_TYPE_TO_SABER(bool, AK_BOOL);
 BASE_TYPE_TO_SABER(std::string, AK_STRING);
-BASE_TYPE_TO_SABER(bm_device_mem_t, AK_BM);
 
 template<typename T>
 struct TypeWarpper {
diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h
index cfe7951b4..3dc30cb18 100644
--- a/saber/core/data_traits.h
+++ b/saber/core/data_traits.h
@@ -143,14 +143,6 @@ struct DataTrait<Ttype, AK_UINT32> {
     typedef unsigned int* PtrDtype;
 };
 
-#ifdef USE_BM
-template <>
-struct DataTrait<BM, AK_BM> {
-    typedef bm_device_mem_t Dtype;
-    typedef bm_device_mem_t* PtrDtype;
-};
-#endif
-
 #ifdef USE_OPENCL
 struct ClMem{
     ClMem(){
diff --git a/saber/core/impl/bm/temsor_op_bm.cpp b/saber/core/impl/bm/temsor_op_bm.cpp
index 9ddca605c..19cb8a81e 100644
--- a/saber/core/impl/bm/temsor_op_bm.cpp
+++ b/saber/core/impl/bm/temsor_op_bm.cpp
@@ -14,7 +14,7 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, \
 
     DataType type = tensor.get_dtype();
     switch (type){
-        case AK_BM: {
+        case AK_FLOAT: {
             float *host_mem_input = new float[tensor.size()];
             for (int i = 0; i < tensor.size(); ++i) {
                 host_mem_input[i] = static_cast<float>(rand());
@@ -36,7 +36,7 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, \
 
     DataType type = tensor.get_dtype();
     switch (type){
-        case AK_BM: {
+        case AK_FLOAT: {
             std::random_device rd;
             std::mt19937 gen(rd());
             std::uniform_real_distribution<float> dis(0, 1.f);
@@ -63,7 +63,7 @@ void fill_tensor_const<BM>(Tensor<BM>& tensor, float value, \
 
     DataType type = tensor.get_dtype();
     switch (type){
-        case AK_BM: {
+        case AK_FLOAT: {
             float *host_mem_input = new float[tensor.size()];
             for (int i = 0; i < tensor.size(); ++i) {
                 host_mem_input[i] = value;
@@ -85,7 +85,7 @@ void print_tensor<BM>(Tensor<BM>& tensor,  \
 
     DataType type = tensor.get_dtype();
     switch (type){
-        case AK_BM: {
+        case AK_FLOAT: {
             LOG(INFO) << "BM device tensor data:" << tensor.size();
 
             /*
diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index aa12462d3..55ae63afb 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -8,7 +8,7 @@ namespace saber
 
 // FP32 part
 template <>
-SaberStatus VenderConv2D<BM, AK_BM>::\
+SaberStatus VenderConv2D<BM, AK_FLOAT>::\
     create(const std::vector<Tensor<BM> *>& inputs,
             std::vector<Tensor<BM> *>& outputs,
             ConvParam<BM>& param, Context<BM>& ctx)
@@ -16,7 +16,7 @@ SaberStatus VenderConv2D<BM, AK_BM>::\
 }
 
 template <>
-SaberStatus VenderConv2D<BM, AK_BM>::\
+SaberStatus VenderConv2D<BM, AK_FLOAT>::\
     init(const std::vector<Tensor<BM> *> &inputs,
          std::vector<Tensor<BM> *> &outputs,
          ConvParam<BM> &param, Context<BM> &ctx)
@@ -27,7 +27,7 @@ SaberStatus VenderConv2D<BM, AK_BM>::\
 }
 
 template <>
-SaberStatus VenderConv2D<BM, AK_BM>::\
+SaberStatus VenderConv2D<BM, AK_FLOAT>::\
     dispatch(const std::vector<Tensor<BM>*>& inputs,
                 std::vector<Tensor<BM>*>& outputs,
                 ConvParam<BM>& param)
@@ -97,6 +97,6 @@ SaberStatus VenderConv2D<BM, AK_BM>::\
 // INT8 part
 // TODO:
 
-template class VenderConv2D<BM, AK_BM>;
+template class VenderConv2D<BM, AK_FLOAT>;
 } // namespace saber
 } // namespace anakin
\ No newline at end of file
diff --git a/saber/saber_types.h b/saber/saber_types.h
index 16bdf9e35..212238c9b 100644
--- a/saber/saber_types.h
+++ b/saber/saber_types.h
@@ -168,8 +168,7 @@ enum DataType {
     AK_STRING       =       10,
     AK_BOOL         =       11,
     AK_SHAPE        =       12,
-    AK_TENSOR       =       13,
-    AK_BM           =       14
+    AK_TENSOR       =       13
 };
 
 typedef enum {

From 76b35cc8dc4eb118c5636413953c563cd1b83f00 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 13 Aug 2018 16:35:47 +0800
Subject: [PATCH 269/318] Upgrade BM SDK; New way to get BM handle

---
 saber/core/impl/bm/bm_impl.cpp                | 20 ++++++++++----
 saber/core/impl/bm/temsor_op_bm.cpp           | 12 +++++----
 saber/core/target_wrapper.h                   |  2 +-
 saber/core/tensor.h                           |  4 +--
 .../impl/bm/base/include/bmdnn/bmdnn_api.h    | 16 ++++++++++++
 .../impl/bm/base/include/bmdnn/op_code.h      |  2 ++
 .../bm/base/include/bmlib/bmlib_runtime.h     | 26 ++++++++++++++-----
 .../bm/base/include/bmruntime/bmruntime.h     |  1 +
 .../base/include/bmruntime/bmruntime_common.h |  1 +
 saber/funcs/impl/bm/vender_conv.cpp           |  2 +-
 10 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 6ee9d6dcd..d2359868c 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -41,6 +41,16 @@ typedef TargetWrapper<BM, __device_target> BM_API;
 static bm_handle_t handle;
 static bm_status_t init_handle{bmdnn_init(&handle)};
 
+bm_handle_t BM_API::get_handle() {
+    /*bm_handle_t handle;
+    int ret = 0;
+
+    ret = bm_dev_request(&handle, 0, devid);
+    CHECK_NE(ret, 0) << "request BM device failed: " << devid;
+    */
+    return handle;
+};
+
 void BM_API::get_device_count(int &count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
 }
@@ -56,7 +66,7 @@ int BM_API::get_device_id(){
 }
         
 void BM_API::mem_alloc(void** ptr, size_t n){
-    handle = get_bm_handle();
+    //handle = BM_API::get_handle();
     /* bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr); */
     bm_device_mem_t *mem = new bm_device_mem_t();
     BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n));
@@ -65,7 +75,7 @@ void BM_API::mem_alloc(void** ptr, size_t n){
         
 void BM_API::mem_free(void* ptr){
     if(ptr != nullptr){
-        handle = get_bm_handle();
+        //handle = BM_API::get_handle();
         bm_free_device(handle, *(struct bm_mem_desc*)(ptr));
         delete ptr;
     }
@@ -81,7 +91,7 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
 void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
         const void* src, size_t src_offset, int src_id, \
         size_t count, __DtoD) {
-    handle = get_bm_handle(); 
+    //handle = BM_API::get_handle(); 
     //BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
     BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count));
     LOG(INFO) << "BM sync_memcpy: device to device, finished";
@@ -90,7 +100,7 @@ void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
 void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
         const void* src, size_t src_offset, int src_id, \
         size_t count, __HtoD) {
-    handle = get_bm_handle(); 
+    //handle = BM_API::get_handle(); 
     BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src)));
 
     #ifdef DEBUG
@@ -104,7 +114,7 @@ void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
 void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
         const void* src, size_t src_offset, int src_id, \
         size_t count, __DtoH) {
-    handle = get_bm_handle(); 
+    //handle = BM_API::get_handle(); 
     BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
 
     #ifdef DEBUG
diff --git a/saber/core/impl/bm/temsor_op_bm.cpp b/saber/core/impl/bm/temsor_op_bm.cpp
index 19cb8a81e..7f7dda655 100644
--- a/saber/core/impl/bm/temsor_op_bm.cpp
+++ b/saber/core/impl/bm/temsor_op_bm.cpp
@@ -8,9 +8,11 @@ namespace anakin{
 
 namespace saber{
 
+typedef Tensor<BM>::API API;
+
 template<>
 void fill_tensor_rand<BM>(Tensor<BM>& tensor, \
-    typename Tensor<BM>::API::stream_t stream = NULL) {
+    typename API::stream_t stream = NULL) {
 
     DataType type = tensor.get_dtype();
     switch (type){
@@ -21,7 +23,7 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, \
             }
 
             bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
-            BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+            BMDNN_CHECK(bm_memcpy_s2d(API::get_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
             delete [] host_mem_input;
             break;
@@ -48,7 +50,7 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, \
             }
 
             bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
-            BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+            BMDNN_CHECK(bm_memcpy_s2d(API::get_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
             delete [] host_mem_input;
             break;
@@ -70,7 +72,7 @@ void fill_tensor_const<BM>(Tensor<BM>& tensor, float value, \
             }
 
             bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
-            BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
+            BMDNN_CHECK(bm_memcpy_s2d(API::get_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
 
             delete [] host_mem_input;
             break;
@@ -104,7 +106,7 @@ void print_tensor<BM>(Tensor<BM>& tensor,  \
 
             float *host_mem = new float[tensor.size()];
             auto* device_data_ptr = const_cast<bm_device_mem_t *>((bm_device_mem_t*) tensor.data());
-            bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
+            bm_memcpy_d2s(API::get_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
 
             for (int i = 0; i < tensor.size(); ++i) {
                 printf("%.2f\t", host_mem[i]);
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index 6a7aafce9..bafbb55e8 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -561,7 +561,7 @@ struct TargetWrapper<BM, __device_target> {
      */
     static int get_device_id();
 
-    static bm_handle_t get_handler();
+    static bm_handle_t get_handle();
 };
 
 #endif //USE_BM
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index feeb1688b..ca3d8231c 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -1005,7 +1005,7 @@ SaberStatus Tensor<BM>::copy_from<X86>(const Tensor<X86>& tensor) {
     CHECK_EQ(tensor.get_dtype(), AK_FLOAT) << "host data type should be AK_FLOAT";
 
     bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>((float*) tensor.data()))));
+    BMDNN_CHECK(bm_memcpy_s2d(Tensor<BM>::API::get_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>((float*) tensor.data()))));
     return SaberSuccess;
 }
 
@@ -1017,7 +1017,7 @@ SaberStatus Tensor<X86>::copy_from<BM>(const Tensor<BM>& tensor) {
     CHECK_EQ(_dtype, AK_FLOAT) << "host data type should be AK_FLOAT";
 
     auto* device_data_ptr = const_cast<bm_device_mem_t *>((bm_device_mem_t*) tensor.data());
-    BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system((float*) mutable_data()), *device_data_ptr));
+    BMDNN_CHECK(bm_memcpy_d2s(Tensor<BM>::API::get_handle(), bm_mem_from_system((float*) mutable_data()), *device_data_ptr));
     return SaberSuccess;
 }
 #endif
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
index 97feb1972..05c970c92 100644
--- a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
+++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
@@ -380,6 +380,7 @@ bm_status_t bmdnn_interp_forward(
     int                 pad_end,
     int                 output_h,
     int                 output_w,
+    int                 platform_sp,
     //output
     bm_device_mem_t  output
     );
@@ -807,6 +808,21 @@ bm_status_t bmdnn_img_scale(
         bm_handle_t handle, bm_device_mem_t dst, bm_device_mem_t src, int n,
         int c, int dh, int sh, int dw, int sw);
 
+bm_status_t bmdnn_bn_forward_inference(
+    bm_handle_t      handle,
+    bm_device_mem_t  input,
+    bm_device_mem_t  output,
+    bm_device_mem_t  mean_ma,
+    bm_device_mem_t  variance_ma,
+    bm_device_mem_t  scale,
+    bm_device_mem_t  bias,
+    bm_device_mem_t  scale_ext,              
+    float            eps,
+    int              input_n,
+    int              input_c,
+    int              input_h,
+    int              input_w
+  );
 #if defined (__cplusplus)
 }
 #endif
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
index f85846a8a..fa9443116 100644
--- a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
+++ b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
@@ -49,6 +49,8 @@ typedef struct tensor_4d_t {
 #define SHARE_REG_MESSAGE_RP            1
 #define SHARE_REG_MESSAGE_IRQSTATUS     2
 #define SHARE_REG_CDMA_IRQSTATUS    3 
+#define SHARE_REG_MSGIRQ_NUM_LO     4
+#define SHARE_REG_MSGIRQ_NUM_HI     5
 
 #define SHAREMEM_MSG_FIXED_OFFSET  (8192)
 #define SHAREMEM_SIZE_BIT  8
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
index 7d537401c..eec7996c3 100644
--- a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
+++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
@@ -79,7 +79,11 @@ bm_status_t bm_malloc_device_dword(
     bm_handle_t      handle,
     bm_device_mem_t *pmem,
     int              count);
-
+bm_status_t bm_malloc_ctx_dword(
+    bm_handle_t      handle,
+    bm_device_mem_t *pmem,
+    int              count,
+    unsigned long long ctx_addr);
 /*
  * brief malloc host memory in size of byte
 */
@@ -101,7 +105,7 @@ bm_status_t bm_malloc_host(
     bm_host_mem_t   *pmem,
     unsigned int     size);
 
-void bm_free_host(
+bm_status_t bm_free_host(
     bm_handle_t      handle,
     bm_host_mem_t    mem);
 
@@ -148,9 +152,6 @@ bm_status_t bm_memset_device(
 bm_device_mem_t bm_mem_from_system(
     void *              system_addr);
 
-bm_device_mem_t bm_mem_from_device(
-    void *              device_addr);
-	
 /*
 *brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to
 device mem if need_copy is true
@@ -186,11 +187,16 @@ unsigned int       bm_mem_get_device_size(struct bm_mem_desc mem);
 void               bm_mem_set_device_size(struct bm_mem_desc & mem, unsigned int size);
 bm_mem_type_t      bm_mem_get_type(struct bm_mem_desc mem);
 
+unsigned long long bm_gmem_arm_reserved_request(bm_handle_t handle);
+void bm_gmem_arm_reserved_release(bm_handle_t handle);
+
 /* 
 * brief Get the handle of bmlib_runtime
 * return : If the handle has been inited, return the handle it self , else init one and return it
 */
-bm_handle_t get_bm_handle();
+
+bm_status_t bm_init(bm_handle_t *handle, bool bmkernel_used);
+void bm_deinit(bm_handle_t handle);
 
 /*
  * Helper functions
@@ -225,6 +231,14 @@ bm_status_t bm_dev_query(int devid);
 bm_status_t bm_dev_request(bm_handle_t *handle, bool bmkernel_used, int devid);
 void bm_dev_free(bm_handle_t handle);
 
+typedef struct bm_fw_desc {
+	unsigned int *itcm_fw;
+	int itcmfw_size;
+	unsigned int *ddr_fw;
+	int ddrfw_size;
+} bm_fw_desc, *pbm_fw_desc;
+bm_status_t bm_update_firmware(bm_handle_t handle, pbm_fw_desc pfw);
+
 #if defined (__cplusplus)
 }
 #endif
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
index daa101fce..ef438ce14 100644
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
@@ -146,6 +146,7 @@ class bmruntime {
     //previous value or state
     int pre_net_num;
     int pre_m_device_mem_info_vec_size;  
+    int pre_m_device_mem_vec_size;  
 
     //append mem offset when appending another framework's context.
     vector<u64> apd_ctx_mem_offset;
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
index 200656739..77ae9bd22 100644
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
+++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
@@ -40,6 +40,7 @@ typedef struct device_mem_info {
     int coeff_count;
     int groups;
     unsigned long long address;
+    unsigned long size;
 } DEVICE_MEM_INFO;
 
 //info for compute output tensor
diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index 55ae63afb..4b0556e89 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -22,7 +22,7 @@ SaberStatus VenderConv2D<BM, AK_FLOAT>::\
          ConvParam<BM> &param, Context<BM> &ctx)
 {
 
-    _handle = get_bm_handle();
+    _handle = ctx.get_handle();
     return create(inputs, outputs, param, ctx);
 }
 

From b47d5750394c5fee8acde22c3bda2dbdd02d7b99 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 13 Aug 2018 16:38:14 +0800
Subject: [PATCH 270/318] New way to get BM handle

---
 saber/core/context.h | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/saber/core/context.h b/saber/core/context.h
index 2147033f0..4be1d8693 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -42,10 +42,13 @@ class Context final{
      * @param compute_stream_id
      */
     Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){
+#ifdef USE_BM        
         if(std::is_same<TargetType, BM>::value){
             LOG(INFO) << "context init for BM";
+            _bm_handle = TargetWrapper<BM>::get_handle();
             return;
         }
+#endif
 
         CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!";
         if (device_id >= devs.size()){
@@ -70,11 +73,13 @@ class Context final{
     }
 
     Context(const Context<TargetType>& ctx){
+#ifdef USE_BM
         if(std::is_same<TargetType, BM>::value){
             LOG(INFO) << "context init for BM";
+            _bm_handle = ctx._bm_handle;
             return;
         }
-
+#endif
         _device_id = ctx._device_id;
         _data_stream_id = ctx._data_stream_id;
         _compute_stream_id = ctx._compute_stream_id;
@@ -84,6 +89,7 @@ class Context final{
         _act_ids = ctx._act_ids;
         _mode = ctx._mode;
 #endif
+
     }
 
     Context& operator=(const Context& ctx){
@@ -95,6 +101,9 @@ class Context final{
 #ifdef USE_ARM_PLACE
         this->_act_ids = ctx._act_ids;
         this->_mode = ctx._mode;
+#endif
+#ifdef USE_BM
+        this->_bm_handle = ctx._bm_handle;
 #endif
         return *this;
     }
@@ -104,6 +113,9 @@ class Context final{
         comp_eq = comp_eq && (_device_id == right._device_id);
         comp_eq = comp_eq && (_data_stream_id == right._data_stream_id);
         comp_eq = comp_eq && (_compute_stream_id == right._compute_stream_id);
+#ifdef USE_BM
+        comp_eq = comp_eq && (_bm_handle == right._bm_handle);
+#endif
         return comp_eq;
     }
 
@@ -143,6 +155,12 @@ class Context final{
     //std::vector<int> get_act_ids();
 #endif
 
+#ifdef USE_BM
+    bm_handle_t get_handle() {
+        return _bm_handle;
+    }
+#endif
+
 
 private:
     //! current stream to process
@@ -156,6 +174,9 @@ class Context final{
     PowerMode _mode{SABER_POWER_HIGH};
     std::vector<int> _act_ids{0};
 #endif
+#ifdef USE_BM
+    bm_handle_t _bm_handle;
+#endif
 };
 
 } //namespace saber

From f86b95e6d0f81413e7e421f9810ce5f9f2d759ac Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 13 Aug 2018 17:37:07 +0800
Subject: [PATCH 271/318] Update tests for BM

---
 test/saber/test_saber_buffer.cpp  | 8 ++++++++
 test/saber/test_saber_context.cpp | 7 +++++++
 2 files changed, 15 insertions(+)

diff --git a/test/saber/test_saber_buffer.cpp b/test/saber/test_saber_buffer.cpp
index 085629f1b..2c0816880 100644
--- a/test/saber/test_saber_buffer.cpp
+++ b/test/saber/test_saber_buffer.cpp
@@ -116,20 +116,28 @@ void test_buffer() {
 
 TEST(TestSaberFunc, test_saber_buffer) {
 #ifdef USE_CUDA
+    LOG(INFO) << "test NV";
     test_buffer<NV, NVHX86, AK_FLOAT>();
     test_buffer<NV, NVHX86, AK_INT8>();
 #endif
 
 #ifdef USE_X86_PLACE
+    LOG(INFO) << "test X86";
     test_buffer<X86, X86, AK_FLOAT>();
     test_buffer<X86, X86, AK_INT8>();
 #endif
 
 #ifdef USE_ARM_PLACE
+    LOG(INFO) << "test ARM";
     test_buffer<ARM, ARM, AK_FLOAT>();
     test_buffer<ARM, ARM, AK_INT8>();
 #endif
 
+#ifdef USE_BM
+    LOG(INFO) << "test BM";
+    test_buffer<BM, X86, AK_FLOAT>();
+#endif
+
 }
 
 int main(int argc, const char** argv) {
diff --git a/test/saber/test_saber_context.cpp b/test/saber/test_saber_context.cpp
index 442443c2c..edd9f8db0 100644
--- a/test/saber/test_saber_context.cpp
+++ b/test/saber/test_saber_context.cpp
@@ -57,6 +57,13 @@ TEST(TestSaberFuncTest, test_arm_context) {
 }
 #endif //USE_ARM_PLACE
 
+#ifdef USE_BM
+TEST(TestSaberFunc, test_BM_context) {
+    Context<BM> ctx;
+    CHECK_NOTNULL(ctx.get_handle()) << "Failed to get BM handle";
+}
+#endif //USE_BM
+
 int main(int argc, const char** argv) {
     // initial logger
     logger::init(argv[0]);

From 43bf9efb238499e0b13973b42eb63516c0daddeb Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 14 Aug 2018 18:29:17 +0800
Subject: [PATCH 272/318] BM tensor test

---
 saber/core/impl/bm/bm_impl.cpp   | 46 +++++++++++++++++---------
 saber/core/target_wrapper.h      | 17 ++++++++++
 saber/core/tensor.h              | 29 -----------------
 test/saber/test_saber_buffer.cpp | 11 ++++---
 test/saber/test_saber_tensor.cpp | 56 +++++++++++++++++++++++++-------
 5 files changed, 99 insertions(+), 60 deletions(-)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index d2359868c..1e5345b6d 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -1,10 +1,6 @@
 #include "core/tensor.h"
 #include "env.h"
 
-#include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmlib_utils.h"
-
 #ifdef USE_BM
 const char* bmdnn_get_errorstring(bm_status_t error) {
     switch (error) {
@@ -102,12 +98,6 @@ void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
         size_t count, __HtoD) {
     //handle = BM_API::get_handle(); 
     BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src)));
-
-    #ifdef DEBUG
-    for(int i=0; i<10; i++)
-	    LOG(INFO) << "HtoD src: " << *((float *)(src)+i);
-    #endif
-    
     LOG(INFO) << "BM sync_memcpy: host to device, finished";
 };
 
@@ -116,13 +106,28 @@ void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
         size_t count, __DtoH) {
     //handle = BM_API::get_handle(); 
     BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
+    LOG(INFO) << "BM sync_memcpy: device to host, finished";
+};
 
-    #ifdef DEBUG
-    for(int i=0; i<10; i++)
-        LOG(INFO) << "DtoH dst: " << *((float *)(dst)+i);
-    #endif
+void BM_API::async_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, stream_t stream, __HtoD) {
+    LOG(WARNING) << "BM async_memcpy: currently using sync method";
+    sync_memcpy(dst, dst_offset, dst_id, src, src_offset, src_id, count, __HtoD());
+};
 
-    LOG(INFO) << "BM sync_memcpy: device to host, finished";
+void BM_API::async_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, stream_t stream, __DtoH) {
+    LOG(WARNING) << "BM async_memcpy: currently using sync method";
+    sync_memcpy(dst, dst_offset, dst_id, src, src_offset, src_id, count, __DtoH());
+};
+
+void BM_API::async_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, stream_t stream, __DtoD) {
+    LOG(WARNING) << "BM async_memcpy: currently using sync method";
+    sync_memcpy(dst, dst_offset, dst_id, src, src_offset, src_id, count, __DtoD());
 };
 
 void BM_API::sync_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \
@@ -132,6 +137,17 @@ void BM_API::sync_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \
     LOG(ERROR) << "BM sync_memcpy_p2p: temporarily no used";
 };
 
+void BM_API::async_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, stream_t stream) {
+
+    LOG(ERROR) << "BM async_memcpy_p2p: temporarily no used";
+};
+
+void BM_API::device_sync() {
+    LOG(ERROR) << "BM device_sync: temporarily no used";
+};
+
 //! BM TargetWrapper
 template struct TargetWrapper<BM, __device_target>;
 
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index bafbb55e8..119b52bd0 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -555,11 +555,28 @@ struct TargetWrapper<BM, __device_target> {
         const void* src, size_t src_offset, int src_id, \
         size_t count);
 
+    static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, stream_t stream, __HtoD);
+
+    static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, stream_t stream, __DtoH);
+
+    static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, stream_t stream, __DtoD);
+
+    static void async_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \
+        const void* src, size_t src_offset, int src_id, \
+        size_t count, stream_t stream);
+
     /**
      * \brief device target return currently used device id
      * @return          currently activated device id
      */
     static int get_device_id();
+    static void device_sync();
 
     static bm_handle_t get_handle();
 };
diff --git a/saber/core/tensor.h b/saber/core/tensor.h
index ca3d8231c..f18884421 100644
--- a/saber/core/tensor.h
+++ b/saber/core/tensor.h
@@ -994,35 +994,6 @@ class Tensor {
     }
 };
 
-#ifdef USE_BM
-#ifndef BM_TENSOR_COPY
-#define BM_TENSOR_COPY
-template<>
-template<> inline
-SaberStatus Tensor<BM>::copy_from<X86>(const Tensor<X86>& tensor) {
-    LOG(INFO) << "BM copy_from X86";
-    CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
-    CHECK_EQ(tensor.get_dtype(), AK_FLOAT) << "host data type should be AK_FLOAT";
-
-    bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) mutable_data();
-    BMDNN_CHECK(bm_memcpy_s2d(Tensor<BM>::API::get_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>((float*) tensor.data()))));
-    return SaberSuccess;
-}
-
-template<>
-template<> inline
-SaberStatus Tensor<X86>::copy_from<BM>(const Tensor<BM>& tensor) {
-    LOG(INFO) << "X86 copy_from BM";
-    CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";
-    CHECK_EQ(_dtype, AK_FLOAT) << "host data type should be AK_FLOAT";
-
-    auto* device_data_ptr = const_cast<bm_device_mem_t *>((bm_device_mem_t*) tensor.data());
-    BMDNN_CHECK(bm_memcpy_d2s(Tensor<BM>::API::get_handle(), bm_mem_from_system((float*) mutable_data()), *device_data_ptr));
-    return SaberSuccess;
-}
-#endif
-#endif
-
 } //namespace saber
 
 } //namespace anakin
diff --git a/test/saber/test_saber_buffer.cpp b/test/saber/test_saber_buffer.cpp
index 2c0816880..c18c3d14d 100644
--- a/test/saber/test_saber_buffer.cpp
+++ b/test/saber/test_saber_buffer.cpp
@@ -116,25 +116,28 @@ void test_buffer() {
 
 TEST(TestSaberFunc, test_saber_buffer) {
 #ifdef USE_CUDA
-    LOG(INFO) << "test NV";
+    LOG(INFO) << "test NV FP32 buffer";
     test_buffer<NV, NVHX86, AK_FLOAT>();
+    LOG(INFO) << "test NV INT8 buffer";
     test_buffer<NV, NVHX86, AK_INT8>();
 #endif
 
 #ifdef USE_X86_PLACE
-    LOG(INFO) << "test X86";
+    LOG(INFO) << "test X86 FP32 buffer";
     test_buffer<X86, X86, AK_FLOAT>();
+    LOG(INFO) << "test X86 INT8 buffer";
     test_buffer<X86, X86, AK_INT8>();
 #endif
 
 #ifdef USE_ARM_PLACE
-    LOG(INFO) << "test ARM";
+    LOG(INFO) << "test ARM FP32 buffer";
     test_buffer<ARM, ARM, AK_FLOAT>();
+    LOG(INFO) << "test ARM INT8 buffer";
     test_buffer<ARM, ARM, AK_INT8>();
 #endif
 
 #ifdef USE_BM
-    LOG(INFO) << "test BM";
+    LOG(INFO) << "test BM FP32 buffer";
     test_buffer<BM, X86, AK_FLOAT>();
 #endif
 
diff --git a/test/saber/test_saber_tensor.cpp b/test/saber/test_saber_tensor.cpp
index 64c47068a..d03d5c270 100644
--- a/test/saber/test_saber_tensor.cpp
+++ b/test/saber/test_saber_tensor.cpp
@@ -10,11 +10,11 @@ void tensor_constructor() {
     typedef TargetWrapper<TargetH> HAPI;
     typedef TargetWrapper<TargetD> DAPI;
 
-    typedef typename TargetTypeTraits<TargetH>::target_type target_H;
-    typedef typename TargetTypeTraits<TargetD>::target_type target_D;
+    typedef typename TargetTypeTraits<TargetH>::target_category target_H;
+    typedef typename TargetTypeTraits<TargetD>::target_category target_D;
     typedef typename IF<std::is_same<target_D, target_H>::value, __HtoH, __DtoH>::Type then_type;
     typedef typename IF<std::is_same<target_D, target_H>::value, __DtoD, __HtoD>::Type else_type;
-    typedef typename IF<std::is_same<target_D, __host_target>::value, else_type, then_type>::Type flag_type;
+    typedef typename IF<std::is_same<target_D, __host_target>::value, then_type, else_type>::Type flag_type;
     typedef typename IF<std::is_same<target_D, __host_target>::value, HAPI, DAPI>::Type copy_API;
 
     typedef Tensor<TargetH> TensorH;
@@ -273,6 +273,12 @@ TEST(TestSaberFunc, test_tensor_constructor) {
     LOG(INFO) << "test ARM INT8 tensor";
     tensor_constructor<ARM, ARM, AK_INT8>();
 #endif
+
+#ifdef USE_BM
+    Env<BM>::env_init();
+    LOG(INFO) << "test BM FP32 tensor";
+    tensor_constructor<BM, X86, AK_FLOAT>();
+#endif
 }
 
 #if 1
@@ -282,11 +288,11 @@ void tensor_deepcopy() {
     typedef TargetWrapper<TargetH> HAPI;
     typedef TargetWrapper<TargetD> DAPI;
 
-    typedef typename TargetTypeTraits<TargetH>::target_type target_H;
-    typedef typename TargetTypeTraits<TargetD>::target_type target_D;
+    typedef typename TargetTypeTraits<TargetH>::target_category target_H;
+    typedef typename TargetTypeTraits<TargetD>::target_category target_D;
     typedef typename IF<std::is_same<target_D, target_H>::value, __HtoH, __DtoH>::Type then_type;
     typedef typename IF<std::is_same<target_D, target_H>::value, __DtoD, __HtoD>::Type else_type;
-    typedef typename IF<std::is_same<target_D, __host_target>::value, else_type, then_type>::Type flag_type;
+    typedef typename IF<std::is_same<target_D, __host_target>::value, then_type, else_type>::Type flag_type;
     typedef typename IF<std::is_same<target_D, __host_target>::value, HAPI, DAPI>::Type copy_API;
 
     typedef Tensor<TargetH> TensorH;
@@ -500,6 +506,12 @@ TEST(TestSaberFunc, test_tensor_deepcopy) {
     LOG(INFO) << "test ARM INT8 tensor deep copy";
     tensor_deepcopy<ARM, ARM, AK_INT8>();
 #endif //USE_ARM_PLACE
+
+#ifdef USE_BM
+    Env<BM>::env_init();
+    LOG(INFO) << "test BM FP32 tensor deep copy";
+    //tensor_deepcopy<BM, X86, AK_FLOAT>();
+#endif //USE_BM
 }
 #endif
 
@@ -600,6 +612,12 @@ TEST(TestSaberFunc, test_saber_tensor_shape) {
     LOG(INFO) << "test ARM tensor shape API";
     test_tensor_shape<ARM>();
 #endif //USE_ARM_PLACE
+
+#ifdef USE_BM
+    Env<BM>::env_init();
+    LOG(INFO) << "test BM tensor shape API";
+    test_tensor_shape<BM>();
+#endif //USE_BM
 }
 #endif
 
@@ -610,11 +628,11 @@ void tensor_reshape_realloc() {
     typedef TargetWrapper<TargetH> HAPI;
     typedef TargetWrapper<TargetD> DAPI;
 
-    typedef typename TargetTypeTraits<TargetH>::target_type target_H;
-    typedef typename TargetTypeTraits<TargetD>::target_type target_D;
+    typedef typename TargetTypeTraits<TargetH>::target_category target_H;
+    typedef typename TargetTypeTraits<TargetD>::target_category target_D;
     typedef typename IF<std::is_same<target_D, target_H>::value, __HtoH, __DtoH>::Type then_type;
     typedef typename IF<std::is_same<target_D, target_H>::value, __DtoD, __HtoD>::Type else_type;
-    typedef typename IF<std::is_same<target_D, __host_target>::value, else_type, then_type>::Type flag_type;
+    typedef typename IF<std::is_same<target_D, __host_target>::value, then_type, else_type>::Type flag_type;
     typedef typename IF<std::is_same<target_D, __host_target>::value, HAPI, DAPI>::Type copy_API;
 
     typedef Tensor<TargetH> TensorH;
@@ -714,6 +732,12 @@ TEST(TestSaberFunc, test_tensor_reshape_realloc) {
     LOG(INFO) << "test ARM INT8 tensor reshape realloc";
     tensor_reshape_realloc<ARM, ARM, AK_INT8>();
 #endif //USE_ARM_PLACE
+
+#ifdef USE_BM
+    Env<BM>::env_init();
+    LOG(INFO) << "test BM FP32 tensor reshape realloc";
+    tensor_reshape_realloc<BM, X86, AK_FLOAT>();
+#endif //USE_BM
 }
 #endif
 
@@ -723,11 +747,11 @@ void test_tensor_op() {
     typedef TargetWrapper<TargetH> HAPI;
     typedef TargetWrapper<TargetD> DAPI;
 
-    typedef typename TargetTypeTraits<TargetH>::target_type target_H;
-    typedef typename TargetTypeTraits<TargetD>::target_type target_D;
+    typedef typename TargetTypeTraits<TargetH>::target_category target_H;
+    typedef typename TargetTypeTraits<TargetD>::target_category target_D;
     typedef typename IF<std::is_same<target_D, target_H>::value, __HtoH, __DtoH>::Type then_type;
     typedef typename IF<std::is_same<target_D, target_H>::value, __DtoD, __HtoD>::Type else_type;
-    typedef typename IF<std::is_same<target_D, __host_target>::value, else_type, then_type>::Type flag_type;
+    typedef typename IF<std::is_same<target_D, __host_target>::value, then_type, else_type>::Type flag_type;
     typedef typename IF<std::is_same<target_D, __host_target>::value, HAPI, DAPI>::Type copy_API;
 
     typedef Tensor<TargetH> TensorH;
@@ -788,6 +812,12 @@ TEST(TestSaberFunc, test_tensor_ops) {
     LOG(INFO) << "test ARM INT8 tensor op";
     test_tensor_op<ARM, ARM, AK_INT8>();
 #endif //USE_ARM_PLACE
+
+#ifdef USE_BM
+    Env<BM>::env_init();
+    LOG(INFO) << "test BM FP32 tensor op";
+    test_tensor_op<BM, X86, AK_FLOAT>();
+#endif //USE_BM
 }
 #endif
 
@@ -834,6 +864,8 @@ TEST(TestSaberFunc, test_tensor_share_diff_dtype) {
     LOG(INFO) << "test ARM tensor share different data type";
     tensor_share_diff_dtype<ARM, ARM>();
 #endif //USE_ARM_PLACE
+
+//BM does not support this yet
 }
 #endif
 int main(int argc, const char** argv) {

From c75c35a8da051ff7c4bfeffc4b1bd958f8f587d4 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 15 Aug 2018 09:20:25 +0800
Subject: [PATCH 273/318] Remove all BM ops for now

---
 saber/funcs/conv.h                  |   2 +-
 saber/funcs/impl/bm/vender_conv.cpp | 102 ----------------------------
 saber/funcs/impl/bm/vender_conv.h   |  36 ----------
 3 files changed, 1 insertion(+), 139 deletions(-)
 delete mode 100644 saber/funcs/impl/bm/vender_conv.cpp
 delete mode 100644 saber/funcs/impl/bm/vender_conv.h

diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h
index 1626d38a9..d107befdd 100644
--- a/saber/funcs/conv.h
+++ b/saber/funcs/conv.h
@@ -30,7 +30,7 @@
 #endif
 
 #ifdef USE_BM
-#include "saber/funcs/impl/bm/vender_conv.h"
+//#include "saber/funcs/impl/bm/vender_conv.h"
 #endif
 
 #ifdef USE_ARM_PLACE
diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
deleted file mode 100644
index 4b0556e89..000000000
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-
-#include "saber/funcs/impl/bm/vender_conv.h"
-
-namespace anakin
-{
-namespace saber
-{
-
-// FP32 part
-template <>
-SaberStatus VenderConv2D<BM, AK_FLOAT>::\
-    create(const std::vector<Tensor<BM> *>& inputs,
-            std::vector<Tensor<BM> *>& outputs,
-            ConvParam<BM>& param, Context<BM>& ctx)
-{
-}
-
-template <>
-SaberStatus VenderConv2D<BM, AK_FLOAT>::\
-    init(const std::vector<Tensor<BM> *> &inputs,
-         std::vector<Tensor<BM> *> &outputs,
-         ConvParam<BM> &param, Context<BM> &ctx)
-{
-
-    _handle = ctx.get_handle();
-    return create(inputs, outputs, param, ctx);
-}
-
-template <>
-SaberStatus VenderConv2D<BM, AK_FLOAT>::\
-    dispatch(const std::vector<Tensor<BM>*>& inputs,
-                std::vector<Tensor<BM>*>& outputs,
-                ConvParam<BM>& param)
-{
-
-    /*const bm_device_mem_t *in_data = (const bm_device_mem_t *)inputs[0]->data();
-    const bm_device_mem_t *weight = (const bm_device_mem_t *)param.weight()->data();
-    bm_device_mem_t *out_data = (bm_device_mem_t *)outputs[0]->mutable_data();
-
-    int input_n = inputs[0]->num();
-    int input_c = inputs[0]->channel();
-    int input_h = inputs[0]->height();
-    int input_w = inputs[0]->width();
-
-    int output_n = outputs[0]->num();
-    int output_c = outputs[0]->channel();
-    int output_h = outputs[0]->height();
-    int output_w = outputs[0]->width();
-
-    int group = param.group;
-    int kh = param.weight()->height();
-    int kw = param.weight()->width();
-    int pad_h = param.pad_h;
-    int pad_w = param.pad_w;
-    int stride_h = param.stride_h;
-    int stride_w = param.stride_w;
-    int dilation_h = param.dilation_h;
-    int dilation_w = param.dilation_w;
-
-    bool with_bias = param.bias()->size() > 0;
-    const bm_device_mem_t *bias = with_bias ? (const bm_device_mem_t *)param.bias()->data() : &bm_mem_null();
-
-    bm_tensor_4d_t input_shape = {
-        input_n,
-        input_c,
-        input_h,
-        input_w};
-
-    bm_tensor_4d_t output_shape = {
-        output_n,
-        output_c,
-        output_h,
-        output_w};
-
-    bm_kernel_param_t kernel_param = {
-        group,
-        output_c,
-        input_c,
-        kh,
-        kw};
-
-    bm_conv_param_t conv_param = {
-        stride_h,
-        stride_w,
-        pad_h,
-        pad_w,
-        dilation_h,
-        dilation_w,
-        0};
-
-    BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape,
-                                   kernel_param, output_shape, conv_param, with_bias, *out_data));*/
-
-    return SaberSuccess;
-}
-
-// INT8 part
-// TODO:
-
-template class VenderConv2D<BM, AK_FLOAT>;
-} // namespace saber
-} // namespace anakin
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
deleted file mode 100644
index fbb56d359..000000000
--- a/saber/funcs/impl/bm/vender_conv.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
-#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
-
-#include "saber/funcs/impl/impl_conv.h"
-
-namespace anakin{
-
-namespace saber{
-
-template <DataType OpDtype>
-class VenderConv2D<BM, OpDtype> : public ImplBase<
-        BM, OpDtype, ConvParam<BM> > {
-            
-public:
-    VenderConv2D(): _handle(NULL) {}
-    ~VenderConv2D() {}
-
-    virtual SaberStatus init(const std::vector<Tensor<BM> *>& inputs,
-                             std::vector<Tensor<BM> *>& outputs,
-                             ConvParam<BM>& param, Context<BM>& ctx);
-
-    virtual SaberStatus create(const std::vector<Tensor<BM> *>& inputs,
-                               std::vector<Tensor<BM> *>& outputs,
-                               ConvParam<BM>& param, Context<BM>& ctx);
-
-    virtual SaberStatus dispatch(const std::vector<Tensor<BM>*>& inputs,
-                                 std::vector<Tensor<BM>*>& outputs,
-                                 ConvParam<BM>& param);
-
-private:
-    bm_handle_t _handle;
-};
-
-}
-}
-#endif //ANAKIN_SABER_FUNCS_BMDNN_CONV2D_H

From 224f0258f9fc344c3ea2e50828aa9b6dc04d64ea Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 16 Aug 2018 15:20:13 +0800
Subject: [PATCH 274/318] Updates according to dev_v2

---
 saber/core/impl/bm/bm_impl.cpp                |  23 +--
 saber/core/impl/bm/temsor_op_bm.cpp           | 133 ------------------
 saber/core/impl/bm/tensor_op_bm.cpp           |   2 +
 saber/core/target_wrapper.h                   |  87 +-----------
 saber/funcs/conv.h                            |   2 +-
 saber/funcs/impl/bm/vender_activation.h       |  76 ----------
 saber/funcs/impl/bm/vender_conv.h             | 114 ---------------
 test/saber/test_saber_buffer.cpp              |   4 +-
 test/saber/test_saber_tensor.cpp              |  14 +-
 .../bm_lib}/include/bmdnn/bmdnn_api.h         |   0
 .../bm_lib}/include/bmdnn/bmdnn_ext_api.h     |   0
 .../bm_lib}/include/bmdnn/bmdnn_runtime.h     |   0
 .../bm_lib}/include/bmdnn/op_code.h           |   0
 .../bm_lib}/include/bmlib/bmlib_runtime.h     |   0
 .../bm_lib}/include/bmlib/bmlib_utils.h       |   0
 .../bm_lib}/include/bmruntime/bmblob.h        |   0
 .../bm_lib}/include/bmruntime/bmruntime.h     |   0
 .../include/bmruntime/bmruntime_common.h      |   0
 .../include/bmruntime/bmruntime_interface.h   |   0
 19 files changed, 18 insertions(+), 437 deletions(-)
 delete mode 100644 saber/core/impl/bm/temsor_op_bm.cpp
 delete mode 100644 saber/funcs/impl/bm/vender_activation.h
 delete mode 100644 saber/funcs/impl/bm/vender_conv.h
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmdnn/bmdnn_api.h (100%)
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmdnn/bmdnn_ext_api.h (100%)
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmdnn/bmdnn_runtime.h (100%)
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmdnn/op_code.h (100%)
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmlib/bmlib_runtime.h (100%)
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmlib/bmlib_utils.h (100%)
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmruntime/bmblob.h (100%)
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmruntime/bmruntime.h (100%)
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmruntime/bmruntime_common.h (100%)
 rename {saber/funcs/impl/bm/base => third-party/bm_lib}/include/bmruntime/bmruntime_interface.h (100%)

diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index d08f40380..2efbf80a0 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -3,11 +3,6 @@
 #include "core/data_traits.h"
 #include "env.h"
 
-
-#include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmlib_utils.h"
-
 const char* bmdnn_get_errorstring(bm_status_t error) {
     switch (error) {
     case BM_SUCCESS:
@@ -45,7 +40,7 @@ namespace saber {
 
 
 
-typedef  TargetWrapper<BM, __device_target>BM_API;
+typedef TargetWrapper<BM, __device_target> BM_API;
 
 
 // Init handle only once in the lifetime
@@ -72,7 +67,6 @@ int BM_API::get_device_id() {
 }
 
 void BM_API::mem_alloc(TPtr* ptr, size_t n) {
-    handle = get_bm_handle();
     /* bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr); */
     //    bm_device_mem_t *mem = new bm_device_mem_t();
     bm_device_mem_t mem;
@@ -82,7 +76,6 @@ void BM_API::mem_alloc(TPtr* ptr, size_t n) {
 
 void BM_API::mem_free(TPtr ptr) {
     if ((ptr != BM_MEM_NULL)) {
-        handle = get_bm_handle();
         bm_free_device(handle, ptr);
         //        delete ptr;
     }
@@ -100,7 +93,6 @@ void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
         size_t count, __DtoD) {
     if(count==0)
         return;
-    handle = get_bm_handle();
     //BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
     BMDNN_CHECK(bm_memcpy_d2d(handle, dst, dst_offset, src, src_offset, count));
 };
@@ -110,7 +102,6 @@ void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
         size_t count, __HtoD) {
     if(count==0)
         return;
-    handle = get_bm_handle();
     BMDNN_CHECK(bm_memcpy_s2d(handle, dst+dst_offset, bm_mem_from_system(const_cast<void*>(src)+src_offset)));
 
 #ifdef DEBUG
@@ -127,7 +118,6 @@ void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
         size_t count, __DtoH) {
     if(count==0)
         return;
-    handle = get_bm_handle();
 //    LOG(INFO)<<"host ptr = "<<(dst)<<",dst_offset = "<<dst_offset<<", dev ptr = "<<(src)<<",dev offset = "<<src_offset;
     BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst+dst_offset), src+src_offset));
 
@@ -149,17 +139,6 @@ void BM_API::sync_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \
     LOG(FATAL) << "BM sync_memcpy_p2p: temporarily no used";
 };
 
-void BM_API::async_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \
-        const void* src, size_t src_offset, int src_id, \
-        size_t count, stream_t stream) {
-
-    LOG(FATAL) << "BM async_memcpy_p2p: temporarily no used";
-};
-
-void BM_API::device_sync() {
-    LOG(FATAL) << "BM device_sync: temporarily no used";
-};
-
 //! BM TargetWrapper
 template struct TargetWrapper<BM, __device_target>;
 
diff --git a/saber/core/impl/bm/temsor_op_bm.cpp b/saber/core/impl/bm/temsor_op_bm.cpp
deleted file mode 100644
index 7f7dda655..000000000
--- a/saber/core/impl/bm/temsor_op_bm.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-#include "saber/core/tensor_op.h"
-
-#ifdef USE_BM
-
-#include <random>
-
-namespace anakin{
-
-namespace saber{
-
-typedef Tensor<BM>::API API;
-
-template<>
-void fill_tensor_rand<BM>(Tensor<BM>& tensor, \
-    typename API::stream_t stream = NULL) {
-
-    DataType type = tensor.get_dtype();
-    switch (type){
-        case AK_FLOAT: {
-            float *host_mem_input = new float[tensor.size()];
-            for (int i = 0; i < tensor.size(); ++i) {
-                host_mem_input[i] = static_cast<float>(rand());
-            }
-
-            bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
-            BMDNN_CHECK(bm_memcpy_s2d(API::get_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
-
-            delete [] host_mem_input;
-            break;
-        }
-        default: LOG(FATAL) << "data type: " << type << " is unsupported now";
-    }
-}
-
-template<>
-void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, \
-    float vend, typename Tensor<BM>::API::stream_t stream = NULL){
-
-    DataType type = tensor.get_dtype();
-    switch (type){
-        case AK_FLOAT: {
-            std::random_device rd;
-            std::mt19937 gen(rd());
-            std::uniform_real_distribution<float> dis(0, 1.f);
-
-            float *host_mem_input = new float[tensor.size()];
-            for (int i = 0; i < tensor.size(); ++i) {
-                float random_num = vstart + (vend - vstart) * dis(gen);
-                host_mem_input[i] = random_num;
-            }
-
-            bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
-            BMDNN_CHECK(bm_memcpy_s2d(API::get_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
-
-            delete [] host_mem_input;
-            break;
-        }
-        default: LOG(FATAL) << "data type: " << type << " is unsupported now";
-    }
-}
-
-template<>
-void fill_tensor_const<BM>(Tensor<BM>& tensor, float value, \
-    typename Tensor<BM>::API::stream_t stream = NULL){
-
-    DataType type = tensor.get_dtype();
-    switch (type){
-        case AK_FLOAT: {
-            float *host_mem_input = new float[tensor.size()];
-            for (int i = 0; i < tensor.size(); ++i) {
-                host_mem_input[i] = value;
-            }
-
-            bm_device_mem_t* device_data_ptr = (bm_device_mem_t*) tensor.mutable_data();
-            BMDNN_CHECK(bm_memcpy_s2d(API::get_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input)));
-
-            delete [] host_mem_input;
-            break;
-        }
-        default: LOG(FATAL) << "data type: " << type << " is unsupported now";
-    }
-}
-
-template <>
-void print_tensor<BM>(Tensor<BM>& tensor,  \
-    typename Tensor<BM>::API::stream_t stream = NULL) {
-
-    DataType type = tensor.get_dtype();
-    switch (type){
-        case AK_FLOAT: {
-            LOG(INFO) << "BM device tensor data:" << tensor.size();
-
-            /*
-            const bm_device_mem_t* device_data_ptr = tensor.data();
-            unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
-            bm_flush(get_bm_handle());
-            float* device_data = (float*)bm_get_global_addr(gaddr);
-
-            for (int i = 0; i < tensor.size(); ++i) {
-                printf("%.2f ", device_data[i]);
-
-                if ((i + 1) % (4 * tensor.width()) == 0) {
-                    printf("\n");
-                }
-            }*/
-
-            float *host_mem = new float[tensor.size()];
-            auto* device_data_ptr = const_cast<bm_device_mem_t *>((bm_device_mem_t*) tensor.data());
-            bm_memcpy_d2s(API::get_handle(), bm_mem_from_system(host_mem), *device_data_ptr);
-
-            for (int i = 0; i < tensor.size(); ++i) {
-                printf("%.2f\t", host_mem[i]);
-
-                if ((i + 1) % tensor.width() == 0){
-                    printf("\n");
-                }
-            }
-            printf("\n");
-
-            delete [] host_mem;
-            break;
-        }
-        default: LOG(FATAL) << "data type: " << type << " is unsupported now";
-    }
-}
-
-
-
-} //namespace saber
-
-} //namespace anakin
-
-#endif //USE_BM
\ No newline at end of file
diff --git a/saber/core/impl/bm/tensor_op_bm.cpp b/saber/core/impl/bm/tensor_op_bm.cpp
index 6438e10e5..8380d145a 100644
--- a/saber/core/impl/bm/tensor_op_bm.cpp
+++ b/saber/core/impl/bm/tensor_op_bm.cpp
@@ -27,6 +27,7 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, float vend,
 
 template<>
 void print_tensor<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t stream = NULL) {
+    LOG(INFO) << "device tensor data";
     Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
     temp_tensor.copy_from(tensor);
     print_tensor(temp_tensor);
@@ -34,6 +35,7 @@ void print_tensor<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t str
 
 template<>
 void print_tensor_valid<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t stream = NULL) {
+    LOG(INFO) << "device tensor data";
     print_tensor(tensor);
 }
 
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index cd6a117d2..09c11821c 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -441,18 +441,16 @@ struct TargetWrapper<BM, __device_target> {
         const TPtr src, size_t src_offset, int src_id, \
         size_t count);
 
-
     /**
      * \brief device target return currently used device id
      * @return          currently activated device id
      */
     static int get_device_id();
     static void device_sync(){};
-//    static bm_handle_t get_handler();
+    static bm_handle_t get_handle();
 
-//    bm_handle_t handle;
 };
-#endif
+#endif //USE_BM
 
 #ifdef USE_AMD
 
@@ -572,87 +570,6 @@ struct TargetWrapper<AMD, __device_target> {
 
 #endif //USE_AMD
 
-#ifdef USE_BM
-        /**
- * \brief for Bitmain sophon device target only, device target is BM tpu
- * use bitmain api to manage memory
- * support device to device, device to host, host to device memcpy
-*/
-template <>
-struct TargetWrapper<BM, __device_target> {
-    typedef void* event_t;
-    typedef void* stream_t;
-
-    static void get_device_count(int& count);
-
-    static void set_device(int id);
-
-    //We should add strategy to avoid malloc directly
-    static void mem_alloc(void** ptr, size_t n);
-
-    //template <typename void>
-    static void mem_free(void * ptr);
-    
-    //template <typename void>
-    static void mem_set(void* ptr, int value, size_t n);
-
-    // brief create event, empty function for bitmain target
-    static void create_event(event_t event, bool flag = false) {}
-    static void destroy_event(event_t event) {}
-    static void create_stream(stream_t stream) {}
-    static void create_stream_with_flag(stream_t stream, unsigned int flag) {}
-    static void create_stream_with_priority(stream_t stream, unsigned int flag, int priority) {}
-    static void destroy_stream(stream_t stream) {}
-    static void record_event(event_t event, stream_t stream) {}
-    static void query_event(event_t event) {}
-    static void sync_event(event_t event) {}
-    static void sync_stream(event_t event, stream_t stream) {}
-    // brief create event, empty function for bitmain target
-
-    static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
-        const void* src, size_t src_offset, int src_id, \
-        size_t count, __DtoD);
-
-    static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
-        const void* src, size_t src_offset, int src_id, \
-        size_t count, __HtoD);
-
-    static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
-        const void* src, size_t src_offset, int src_id, \
-        size_t count, __DtoH);
-
-    static void sync_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \
-        const void* src, size_t src_offset, int src_id, \
-        size_t count);
-
-    static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \
-        const void* src, size_t src_offset, int src_id, \
-        size_t count, stream_t stream, __HtoD);
-
-    static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \
-        const void* src, size_t src_offset, int src_id, \
-        size_t count, stream_t stream, __DtoH);
-
-    static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \
-        const void* src, size_t src_offset, int src_id, \
-        size_t count, stream_t stream, __DtoD);
-
-    static void async_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \
-        const void* src, size_t src_offset, int src_id, \
-        size_t count, stream_t stream);
-
-    /**
-     * \brief device target return currently used device id
-     * @return          currently activated device id
-     */
-    static int get_device_id();
-    static void device_sync();
-
-    static bm_handle_t get_handle();
-};
-
-#endif //USE_BM
-
 } //namespace saber
 
 } //namespace anakin
diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h
index 1c42dd063..90a78da8f 100644
--- a/saber/funcs/conv.h
+++ b/saber/funcs/conv.h
@@ -34,7 +34,7 @@
 #endif
 
 #ifdef USE_BM
-#include "saber/funcs/impl/bm/vender_conv.h"
+//#include "saber/funcs/impl/bm/vender_conv.h"
 #endif
 namespace anakin {
 namespace saber {
diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h
deleted file mode 100644
index c873cee79..000000000
--- a/saber/funcs/impl/bm/vender_activation.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_BMDNN_ACT_H
-#define ANAKIN_SABER_FUNCS_BMDNN_ACT_H
-#include "saber/funcs/impl/impl_activation.h"
-namespace anakin {
-
-namespace saber {
-
-template <DataType OpDtype>
-class VenderActivation<BM, OpDtype> : \
-    public ImplBase <
-    BM,
-    OpDtype,
-    ActivationParam<BM > > {
-public:
-    typedef Tensor<BM> OpTensor;
-    typedef typename DataTraitBase<BM>::PtrDtype DataPtr;
-
-    VenderActivation(): _handle(NULL), _active_type(Active_relu) {}
-
-    ~VenderActivation() {}
-
-    virtual SaberStatus init(const std::vector<OpTensor*>& inputs,
-                             std::vector<OpTensor*>& outputs,
-                             ActivationParam<BM>& param, Context<BM>& ctx) {
-        // not sure
-        _handle = get_bm_handle();
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<OpTensor*>& inputs,
-                               std::vector<OpTensor*>& outputs,
-                               ActivationParam<BM>& param, Context<BM>& ctx) {
-        // not sure
-        return SaberSuccess;
-    }
-
-    //call bmdnn activation funcs here
-    virtual SaberStatus dispatch(const std::vector<OpTensor*>& inputs,
-                                 std::vector<OpTensor*>& outputs,
-                                 ActivationParam<BM>& param) {
-        const DataPtr in_data = (inputs[0]->data());
-        DataPtr out_data = (outputs[0]->mutable_data());
-        int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width();
-        int input_n = inputs[0]->num();
-
-        _active_type = param.active;
-
-        switch (_active_type) {
-        case Active_relu:
-            BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, 0.0, input_n, input_dim, out_data));
-            break;
-
-        case Active_sigmoid:
-            BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data));
-            break;
-
-        case Active_tanh:
-            BMDNN_CHECK(bmdnn_tanh_forward(_handle, in_data, input_n, input_dim, out_data));
-            break;
-        default:LOG(INFO)<<"type not support now";
-                return SaberUnImplError;
-        }
-
-        return SaberSuccess;
-    }
-
-private:
-    bm_handle_t _handle;
-    ActiveType _active_type;
-};
-
-template class VenderActivation<BM, AK_FLOAT>;
-} // namespace saber
-
-} // namespace anakin
-#endif //ANAKIN_SABER_FUNCS_BMDNN_ACT_H
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
deleted file mode 100644
index a0f7e34f0..000000000
--- a/saber/funcs/impl/bm/vender_conv.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
-#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H
-
-#include "saber/funcs/impl/impl_conv.h"
-
-namespace anakin {
-
-namespace saber {
-
-template <DataType OpDtype>
-class VenderConv2D<BM, OpDtype> : \
-    public ImplBase <
-    BM, OpDtype,
-    ConvParam<BM> > {
-public:
-    typedef Tensor<BM> OpTensor;
-    typedef typename DataTraitBase<BM>::PtrDtype DataPtr;
-
-    VenderConv2D(): _handle(NULL) {}
-    ~VenderConv2D() {}
-
-    virtual SaberStatus init(const std::vector<OpTensor*>& inputs,
-                             std::vector<OpTensor*>& outputs,
-                             ConvParam<BM>& param, Context<BM>& ctx) {
-
-        _handle = get_bm_handle();
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<OpTensor*>& inputs,
-                               std::vector<OpTensor*>& outputs,
-                               ConvParam<BM>& param, Context<BM>& ctx) {
-
-    }
-
-    virtual SaberStatus dispatch(const std::vector<OpTensor*>& inputs,
-                                 std::vector<OpTensor*>& outputs,
-                                 ConvParam<BM>& param) {
-        CHECK_GE(inputs[0]->get_dtype(),AK_FLOAT)<<"bm only support AK_FLOAT";
-        CHECK_GE(outputs[0]->get_dtype(),AK_FLOAT)<<"bm only support AK_FLOAT";
-
-        const DataPtr in_data =  inputs[0]->data();
-        const DataPtr weight =  param.weight()->data();
-        DataPtr out_data =  outputs[0]->mutable_data();
-
-        int input_n = inputs[0]->num();
-        int input_c = inputs[0]->channel();
-        int input_h = inputs[0]->height();
-        int input_w = inputs[0]->width();
-
-        int output_n = outputs[0]->num();
-        int output_c = outputs[0]->channel();
-        int output_h = outputs[0]->height();
-        int output_w = outputs[0]->width();
-
-        int group = param.group;
-        int kh = param.weight()->height();
-        int kw = param.weight()->width();
-        int pad_h = param.pad_h;
-        int pad_w = param.pad_w;
-        int stride_h = param.stride_h;
-        int stride_w = param.stride_w;
-        int dilation_h = param.dilation_h;
-        int dilation_w = param.dilation_w;
-
-        bool with_bias = param.bias()->size() > 0;
-        const DataPtr bias = with_bias == true ? param.bias()->data() : static_cast<DataPtr>
-                                (bm_mem_null());
-
-        bm_tensor_4d_t input_shape = {
-            input_n,
-            input_c,
-            input_h,
-            input_w
-        };
-
-        bm_tensor_4d_t output_shape = {
-            output_n,
-            output_c,
-            output_h,
-            output_w
-        };
-
-        bm_kernel_param_t kernel_param = {
-            group,
-            output_c,
-            input_c,
-            kh,
-            kw
-        };
-
-        bm_conv_param_t conv_param = {
-            stride_h,
-            stride_w,
-            pad_h,
-            pad_w,
-            dilation_h,
-            dilation_w,
-            0
-        };
-
-        BMDNN_CHECK(bmdnn_conv_forward(_handle, in_data, weight, bias, input_shape,
-                                       kernel_param, output_shape, conv_param, with_bias, out_data));
-
-        return SaberSuccess;
-    }
-
-private:
-    bm_handle_t _handle;
-};
-template class VenderConv2D<BM, AK_FLOAT>;
-}
-}
-#endif //ANAKIN_SABER_FUNCS_BMDNN_CONV2D_H
diff --git a/test/saber/test_saber_buffer.cpp b/test/saber/test_saber_buffer.cpp
index c18c3d14d..f1afc3a9b 100644
--- a/test/saber/test_saber_buffer.cpp
+++ b/test/saber/test_saber_buffer.cpp
@@ -14,6 +14,8 @@ void test_buffer() {
     typedef Buffer<Th> BufferH;
     typedef Buffer<Td> BufferD;
 
+    typedef typename DataTraitBase<Td>::PtrDtype TPtr;
+
     int n0 = 1024;
     int n1 = 2048;
 
@@ -138,7 +140,7 @@ TEST(TestSaberFunc, test_saber_buffer) {
 
 #ifdef USE_BM
     LOG(INFO) << "test BM FP32 buffer";
-    test_buffer<BM, X86, AK_FLOAT>();
+    //test_buffer<BM, X86, AK_FLOAT>();
 #endif
 
 }
diff --git a/test/saber/test_saber_tensor.cpp b/test/saber/test_saber_tensor.cpp
index c094104ef..83ca72d2e 100644
--- a/test/saber/test_saber_tensor.cpp
+++ b/test/saber/test_saber_tensor.cpp
@@ -279,6 +279,7 @@ TEST(TestSaberFunc, test_tensor_constructor) {
 
 #ifdef USE_BM
     Env<BM>::env_init();
+    Env<X86>::env_init();
     LOG(INFO) << "test BM FP32 tensor";
     tensor_constructor<BM, X86, AK_FLOAT>();
 #endif
@@ -514,9 +515,7 @@ TEST(TestSaberFunc, test_tensor_deepcopy) {
 #endif //USE_ARM_PLACE
 
 #ifdef USE_BM
-    Env<BM>::env_init();
-    LOG(INFO) << "test BM FP32 tensor deep copy";
-    //tensor_deepcopy<BM, X86, AK_FLOAT>();
+    //BM does not support this yet
 #endif //USE_BM
 }
 #endif
@@ -621,6 +620,7 @@ TEST(TestSaberFunc, test_saber_tensor_shape) {
 
 #ifdef USE_BM
     Env<BM>::env_init();
+    Env<X86>::env_init();
     LOG(INFO) << "test BM tensor shape API";
     test_tensor_shape<BM>();
 #endif //USE_BM
@@ -741,8 +741,9 @@ TEST(TestSaberFunc, test_tensor_reshape_realloc) {
 
 #ifdef USE_BM
     Env<BM>::env_init();
+    Env<X86>::env_init();
     LOG(INFO) << "test BM FP32 tensor reshape realloc";
-    tensor_reshape_realloc<BM, X86, AK_FLOAT>();
+    //tensor_reshape_realloc<BM, X86, AK_FLOAT>();
 #endif //USE_BM
 }
 #endif
@@ -821,6 +822,7 @@ TEST(TestSaberFunc, test_tensor_ops) {
 
 #ifdef USE_BM
     Env<BM>::env_init();
+    Env<X86>::env_init();
     LOG(INFO) << "test BM FP32 tensor op";
     test_tensor_op<BM, X86, AK_FLOAT>();
 #endif //USE_BM
@@ -871,7 +873,9 @@ TEST(TestSaberFunc, test_tensor_share_diff_dtype) {
     tensor_share_diff_dtype<ARM, ARM>();
 #endif //USE_ARM_PLACE
 
-//BM does not support this yet
+#ifdef USE_BM
+    //BM does not support this yet
+#endif //USE_BM
 }
 #endif
 int main(int argc, const char** argv) {
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h b/third-party/bm_lib/include/bmdnn/bmdnn_api.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
rename to third-party/bm_lib/include/bmdnn/bmdnn_api.h
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h b/third-party/bm_lib/include/bmdnn/bmdnn_ext_api.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
rename to third-party/bm_lib/include/bmdnn/bmdnn_ext_api.h
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h b/third-party/bm_lib/include/bmdnn/bmdnn_runtime.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
rename to third-party/bm_lib/include/bmdnn/bmdnn_runtime.h
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h b/third-party/bm_lib/include/bmdnn/op_code.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmdnn/op_code.h
rename to third-party/bm_lib/include/bmdnn/op_code.h
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h b/third-party/bm_lib/include/bmlib/bmlib_runtime.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
rename to third-party/bm_lib/include/bmlib/bmlib_runtime.h
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h b/third-party/bm_lib/include/bmlib/bmlib_utils.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
rename to third-party/bm_lib/include/bmlib/bmlib_utils.h
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h b/third-party/bm_lib/include/bmruntime/bmblob.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
rename to third-party/bm_lib/include/bmruntime/bmblob.h
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h b/third-party/bm_lib/include/bmruntime/bmruntime.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
rename to third-party/bm_lib/include/bmruntime/bmruntime.h
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h b/third-party/bm_lib/include/bmruntime/bmruntime_common.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
rename to third-party/bm_lib/include/bmruntime/bmruntime_common.h
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h b/third-party/bm_lib/include/bmruntime/bmruntime_interface.h
similarity index 100%
rename from saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
rename to third-party/bm_lib/include/bmruntime/bmruntime_interface.h

From e5c96fb32b2743836b4ed7c244e3f8c8c34dd216 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 16 Aug 2018 15:25:20 +0800
Subject: [PATCH 275/318] Cleanup

---
 saber/core/context.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/saber/core/context.h b/saber/core/context.h
index 4be1d8693..860c63ee4 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -18,13 +18,6 @@
 
 #include "core/env.h"
 #include "saber/saber_types.h"
-#include <type_traits>
-
-#ifdef USE_BM
-#include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmlib_utils.h"
-#endif
 
 namespace anakin{
 

From 934128ffb90e6bed1a62a7e6d0c8282614d17bea Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 16 Aug 2018 15:53:00 +0800
Subject: [PATCH 276/318] Update BM header file path

---
 cmake/find_modules.cmake                                        | 2 +-
 .../funcs/impl/bm/base}/include/bmdnn/bmdnn_api.h               | 0
 .../funcs/impl/bm/base}/include/bmdnn/bmdnn_ext_api.h           | 0
 .../funcs/impl/bm/base}/include/bmdnn/bmdnn_runtime.h           | 0
 .../bm_lib => saber/funcs/impl/bm/base}/include/bmdnn/op_code.h | 0
 .../funcs/impl/bm/base}/include/bmlib/bmlib_runtime.h           | 0
 .../funcs/impl/bm/base}/include/bmlib/bmlib_utils.h             | 0
 .../funcs/impl/bm/base}/include/bmruntime/bmblob.h              | 0
 .../funcs/impl/bm/base}/include/bmruntime/bmruntime.h           | 0
 .../funcs/impl/bm/base}/include/bmruntime/bmruntime_common.h    | 0
 .../funcs/impl/bm/base}/include/bmruntime/bmruntime_interface.h | 0
 11 files changed, 1 insertion(+), 1 deletion(-)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmdnn/bmdnn_api.h (100%)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmdnn/bmdnn_ext_api.h (100%)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmdnn/bmdnn_runtime.h (100%)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmdnn/op_code.h (100%)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmlib/bmlib_runtime.h (100%)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmlib/bmlib_utils.h (100%)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmruntime/bmblob.h (100%)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmruntime/bmruntime.h (100%)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmruntime/bmruntime_common.h (100%)
 rename {third-party/bm_lib => saber/funcs/impl/bm/base}/include/bmruntime/bmruntime_interface.h (100%)

diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake
index fb2e58b06..dca7a550e 100644
--- a/cmake/find_modules.cmake
+++ b/cmake/find_modules.cmake
@@ -327,7 +327,7 @@ endmacro()
 
 macro(anakin_find_bmlib)
 	if(USE_BM)
-		find_path(BM_ROOT include/bmdnn/bmdnn_api.h ${CMAKE_SOURCE_DIR}/third-party/bm_lib/ $ENV{BM_ROOT}/)
+		find_path(BM_ROOT include/bmdnn/bmdnn_api.h ${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/ $ENV{BM_ROOT}/)
 		find_path(BM_ROOT_INCLUDE_DNN bmdnn_api.h ${BM_ROOT}/include/bmdnn)
 		find_path(BM_ROOT_INCLUDE_RT bmruntime.h ${BM_ROOT}/include/bmruntime)
 		find_path(BM_ROOT_INCLUDE_LIB bmlib_runtime.h ${BM_ROOT}/include/bmlib)
diff --git a/third-party/bm_lib/include/bmdnn/bmdnn_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
similarity index 100%
rename from third-party/bm_lib/include/bmdnn/bmdnn_api.h
rename to saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
diff --git a/third-party/bm_lib/include/bmdnn/bmdnn_ext_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
similarity index 100%
rename from third-party/bm_lib/include/bmdnn/bmdnn_ext_api.h
rename to saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
diff --git a/third-party/bm_lib/include/bmdnn/bmdnn_runtime.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
similarity index 100%
rename from third-party/bm_lib/include/bmdnn/bmdnn_runtime.h
rename to saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
diff --git a/third-party/bm_lib/include/bmdnn/op_code.h b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
similarity index 100%
rename from third-party/bm_lib/include/bmdnn/op_code.h
rename to saber/funcs/impl/bm/base/include/bmdnn/op_code.h
diff --git a/third-party/bm_lib/include/bmlib/bmlib_runtime.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
similarity index 100%
rename from third-party/bm_lib/include/bmlib/bmlib_runtime.h
rename to saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
diff --git a/third-party/bm_lib/include/bmlib/bmlib_utils.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
similarity index 100%
rename from third-party/bm_lib/include/bmlib/bmlib_utils.h
rename to saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
diff --git a/third-party/bm_lib/include/bmruntime/bmblob.h b/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
similarity index 100%
rename from third-party/bm_lib/include/bmruntime/bmblob.h
rename to saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
diff --git a/third-party/bm_lib/include/bmruntime/bmruntime.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
similarity index 100%
rename from third-party/bm_lib/include/bmruntime/bmruntime.h
rename to saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
diff --git a/third-party/bm_lib/include/bmruntime/bmruntime_common.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
similarity index 100%
rename from third-party/bm_lib/include/bmruntime/bmruntime_common.h
rename to saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
diff --git a/third-party/bm_lib/include/bmruntime/bmruntime_interface.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
similarity index 100%
rename from third-party/bm_lib/include/bmruntime/bmruntime_interface.h
rename to saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h

From a66072ac3748dd3993326149358109d64c95455d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 16 Aug 2018 17:02:19 +0800
Subject: [PATCH 277/318] Check device count when init context

---
 saber/core/context.h           | 4 ++++
 saber/core/impl/bm/bm_impl.cpp | 1 -
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/saber/core/context.h b/saber/core/context.h
index 860c63ee4..9db50053e 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -38,6 +38,10 @@ class Context final{
 #ifdef USE_BM        
         if(std::is_same<TargetType, BM>::value){
             LOG(INFO) << "context init for BM";
+            int dev_count = 0;
+            TargetWrapper<BM>::get_device_count(dev_count);
+            CHECK_GE(dev_count, 1) << "Env is not initialized or current target is not exit!";
+
             _bm_handle = TargetWrapper<BM>::get_handle();
             return;
         }
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 2efbf80a0..1505acbf9 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -53,7 +53,6 @@ bm_handle_t BM_API::get_handle() {
 
 void BM_API::get_device_count(int& count) {
     BMDNN_CHECK(bm_dev_getcount(&count));
-    CHECK_GE(count,1)<<"can`t find device, please check driver and card";
 }
 
 void BM_API::set_device(int id) {

From b87eca83671103ed4d53fede987709908fb9425a Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 16 Aug 2018 17:21:00 +0800
Subject: [PATCH 278/318] Check device count when init context

---
 saber/core/context.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/saber/core/context.h b/saber/core/context.h
index 9db50053e..0cc032f2f 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -41,7 +41,6 @@ class Context final{
             int dev_count = 0;
             TargetWrapper<BM>::get_device_count(dev_count);
             CHECK_GE(dev_count, 1) << "Env is not initialized or current target is not exit!";
-
             _bm_handle = TargetWrapper<BM>::get_handle();
             return;
         }

From deb0bd6722038e01a52a1748db649d868482b6e1 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 16 Aug 2018 17:51:10 +0800
Subject: [PATCH 279/318] User system SDK

---
 cmake/find_modules.cmake                      |   2 +-
 .../impl/bm/base/include/bmdnn/bmdnn_api.h    | 830 ------------------
 .../bm/base/include/bmdnn/bmdnn_ext_api.h     | 438 ---------
 .../bm/base/include/bmdnn/bmdnn_runtime.h     |  20 -
 .../impl/bm/base/include/bmdnn/op_code.h      |  64 --
 .../bm/base/include/bmlib/bmlib_runtime.h     | 246 ------
 .../impl/bm/base/include/bmlib/bmlib_utils.h  |  72 --
 .../impl/bm/base/include/bmruntime/bmblob.h   |  97 --
 .../bm/base/include/bmruntime/bmruntime.h     | 155 ----
 .../base/include/bmruntime/bmruntime_common.h |  66 --
 .../include/bmruntime/bmruntime_interface.h   |  11 -
 11 files changed, 1 insertion(+), 2000 deletions(-)
 delete mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmdnn/op_code.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
 delete mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h

diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake
index dca7a550e..287fd69b9 100644
--- a/cmake/find_modules.cmake
+++ b/cmake/find_modules.cmake
@@ -327,7 +327,7 @@ endmacro()
 
 macro(anakin_find_bmlib)
 	if(USE_BM)
-		find_path(BM_ROOT include/bmdnn/bmdnn_api.h ${CMAKE_SOURCE_DIR}/saber/funcs/impl/bm/base/ $ENV{BM_ROOT}/)
+		find_path(BM_ROOT include/bmdnn/bmdnn_api.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
 		find_path(BM_ROOT_INCLUDE_DNN bmdnn_api.h ${BM_ROOT}/include/bmdnn)
 		find_path(BM_ROOT_INCLUDE_RT bmruntime.h ${BM_ROOT}/include/bmruntime)
 		find_path(BM_ROOT_INCLUDE_LIB bmlib_runtime.h ${BM_ROOT}/include/bmlib)
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
deleted file mode 100644
index 05c970c92..000000000
--- a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h
+++ /dev/null
@@ -1,830 +0,0 @@
-#ifndef BMDNN_API_H
-#define BMDNN_API_H
-
-#include "bmdnn_runtime.h"
-#include "op_code.h"
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-/*
- * All the name-style of input/output are in the viewpoint of forward operation
- */
-
-typedef struct kernel_param{
-    int g;
-    int oc;
-    int ic;
-    int h;
-    int w;
-}bm_kernel_param_t;
-
-typedef struct bm_conv_param{
-    int stride_h;
-    int stride_w;
-    int pad_h;
-    int pad_w;
-    int dilation_h;
-    int dilation_w;
-    bool result_add;
-}bm_conv_param_t;
-
-typedef struct bm_pool_param{
-  int stride_h;
-  int stride_w;
-  int pad_h;
-  int pad_w;
-  int kh;
-  int kw;
-  bool is_avg_pooling;
-}bm_pool_param_t;
-
-bm_status_t bmdnn_conv_relu_pool_forward(
-    bm_handle_t      handle,
-    bm_device_mem_t  input,
-    bm_device_mem_t  weight,
-    bm_device_mem_t  bias,
-    bm_tensor_4d_t      input_shape,
-    bm_kernel_param_t   kernel_param,
-    bm_pool_param_t     pool_param,
-    bm_conv_param_t     conv_param,
-    bool                with_bias,
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_conv_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  weight,
-    bm_device_mem_t  bias,
-    bm_tensor_4d_t      input_shape,
-    bm_kernel_param_t   kernel_param,
-    bm_tensor_4d_t      output_shape,
-    bm_conv_param_t     conv_param,
-    bool                with_bias,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_deconv_forward(
-    bm_handle_t      handle,
-    bm_device_mem_t  input,
-    bm_device_mem_t  weight,
-    bm_device_mem_t  bias,
-    bm_tensor_4d_t      input_shape,
-    bm_kernel_param_t   kernel_param,
-    bm_tensor_4d_t      output_shape,
-    bm_conv_param_t     conv_param,
-    bool                with_bias,
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_conv_backward_bias(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 groups,
-    int                 output_c,
-    int                 kh,
-    int                 kw,
-    int                 pad_h,
-    int                 pad_w,
-    int                 stride_h,
-    int                 stride_w,
-    int                 result_add,
-    //output
-    bm_device_mem_t  bias_diff);
-
-bm_status_t bmdnn_pooling_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 kh,
-    int                 kw,
-    int                 pad_h,
-    int                 pad_w,
-    int                 stride_h,
-    int                 stride_w,
-    int                 is_avg_pooling,
-    //output
-    bm_device_mem_t  output
-    );
-bm_status_t bmdnn_upsample_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 size,
-    //output
-    bm_device_mem_t  output
-    );
-bm_status_t bmdnn_roi_pooling_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  rois,
-    int              input_n,
-    int              input_c,
-    int              input_h,
-    int              input_w,
-    int              pooled_h,
-    int              pooled_w,
-    int              roi_num,
-    int              spatial_scale,
-    //output
-    bm_device_mem_t  output
-    );
-
-bm_status_t bmdnn_fc_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  weight,
-    bm_device_mem_t  bias,
-    int              batch_size,
-    int              num_output_neuron,
-    int              num_input_neuron,
-    int              transpose,
-    int              using_bias,
-    int              using_relu,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_fc_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  input,
-    bm_device_mem_t  weight,
-    int              num_output_neuron,
-    int              batch_size,
-    int              num_input_neuron,
-    int              using_bias,
-    int              propagate_down_bias_diff,
-    int              propagate_down_weight_diff,
-    int              propagate_down_bottom,
-    //output
-    bm_device_mem_t  weight_diff,
-    bm_device_mem_t  bias_diff,
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_dropout_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    float            dropout_ratio,
-    int              input_n,
-    int              input_dim,
-    //output
-    bm_device_mem_t  output,
-    bm_device_mem_t  mask);
-
-bm_status_t bmdnn_dropout_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    float               dropout_ratio,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_batchnorm_forward_inference(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  mean_ma,
-    bm_device_mem_t  variance_ma,
-    float               scale_ma,
-    bm_device_mem_t  variance,
-    float               eps,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_batchnorm_forward_train(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    float               ma_fraction,
-    float               eps,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    //output
-    bm_device_mem_t  output,
-    bm_device_mem_t  mean,
-    bm_device_mem_t  variance,
-    bm_device_mem_t  mean_ma,
-    bm_device_mem_t  variance_ma);
-
-bm_status_t bmdnn_batchnorm_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  output,
-    bm_device_mem_t  variance,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 using_global_stats,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_lrn_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 lrn_n,
-    float               alpha,
-    float               beta,
-    float               k,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_lrn_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  output,
-    bm_device_mem_t  input,
-    int                 lrn_n,
-    float               alpha,
-    float               beta,
-    float               k,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_relu_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    float               negative_slope,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_relu_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  input,
-    float               negative_slope,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_sigmoid_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_sigmoid_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  output,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_tanh_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_tanh_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  output,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_softmax_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_c,
-    int                 input_inner_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_softmax_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  output,
-    int                 input_n,
-    int                 input_c,
-    int                 input_inner_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_softmax_loss_forward(
-    bm_handle_t      handle,
-    bm_device_mem_t  input,
-    bm_device_mem_t  label,
-    float               normalizer,
-    int                 input_n,
-    int                 input_c,
-    int                 input_inner_dim,
-    bm_device_mem_t  output,
-    bm_device_mem_t  loss);
-bm_status_t bmdnn_interp_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 pad_bag,
-    int                 pad_end,
-    int                 output_h,
-    int                 output_w,
-    int                 platform_sp,
-    //output
-    bm_device_mem_t  output
-    );
-bm_status_t bmdnn_softmax_loss_backward(
-    bm_handle_t      handle,
-    bm_device_mem_t  output,
-    bm_device_mem_t  label,
-    bm_device_mem_t  loss,
-    float               normalizer,
-    int                 input_n,
-    int                 input_c,
-    int                 input_inner_dim,
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_softmax_loss_bidirection(
-    bm_handle_t      handle,
-    bm_device_mem_t  input,
-    bm_device_mem_t  label,
-    float               normalizer,
-    int                 input_n,
-    int                 input_c,
-    int                 input_inner_dim,
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  loss);
-
-bm_status_t bmdnn_multiregion_forward_parallel(
-    bm_handle_t         handle,
-    //input
-    bm_device_mem_t*     input,
-    int*                 input_n,
-    int*                 input_c,
-    int*                 input_h,
-    int*                 input_w,
-    int                  input_num,
-    int                 classes,
-    int                 coords,
-    int                 nums,
-    int*                 Activate_parm,
-    //output
-    bm_device_mem_t*  output
-);
-
-bm_status_t bmdnn_accuracy(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  label_idx,
-    bm_device_mem_t  input_mem_buffer,
-    int                 input_num,
-    int                 input_dim,
-    int                 top_k,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_coeff_update_sgd(
-    bm_handle_t      handle,
-    bm_device_mem_t  weight_diff,
-    bm_device_mem_t  weight,
-    bm_device_mem_t  history_weight,
-    int                 weight_count,
-    float               base_lr,
-    float               momentum,
-    float               weight_decay);
-
-bm_status_t bmdnn_fc_backward_sgd(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  input,
-    //input and output
-    bm_device_mem_t  weight,
-    bm_device_mem_t  weight_history,
-    int                 num_output_neuron,
-    int                 batch_size,
-    int                 num_input_neuron,
-    int                 using_bias,
-    int                 propagate_down_bias_diff,
-    int                 propagate_down_weight_diff,
-    int                 propagate_down_bottom,
-    float               base_lr,
-    float               momentum,
-    float               weight_decay,
-    //output
-    bm_device_mem_t  bias_diff,
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_permute(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_normalize_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  scale,
-    float               eps,
-    float               scale_val,
-    bool                across_spatial,
-    bool                channel_share,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    //output
-    bm_device_mem_t  output);
-
-/*
- * MD Operations for user
- */
-
-
-bm_status_t bmdnn_md_scalar(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  tensor_A,
-    bm_device_mem_t  tensor_B,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    ALIGN_TENSOR_OP             align_tensor_op,
-    int                 result_add,
-    int                 A_is_constant,
-    int                 B_is_constant,
-    float               A_const_val,
-    float               B_const_val,
-    int                 B_N_is_1,
-    int                 B_index_is_1,
-    //output
-    bm_device_mem_t  tensor_R);
-
-bm_status_t bmdnn_md_cmp(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  tensor_A,
-    bm_device_mem_t  tensor_B,
-    bm_device_mem_t  tensor_C,
-    bm_device_mem_t  tensor_D,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 A_is_constant,
-    int                 B_is_constant,
-    int                 C_is_constant,
-    int                 D_is_constant,
-    float               A_constant,
-    float               B_constant,
-    unsigned int        C_constant,
-    unsigned int        D_constant,
-    int                 result_skip,
-    //output
-    bm_device_mem_t  tensor_Y,
-    bm_device_mem_t  tensor_R);
-
-bm_status_t bmdnn_md_sfu(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  tensor_A,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    SFU_OP              sfu_op,
-    float               a,
-    int                 n,
-    //output
-    bm_device_mem_t  tensor_Y);
-
-bm_status_t bmdnn_md_sum(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  tensor_A,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 result_add,
-    //output
-    bm_device_mem_t  tensor_Y);
-
-
-bm_status_t bmdnn_md_linear(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  tensor_A,
-    bm_device_mem_t  tensor_B,
-    bm_device_mem_t  tensor_S,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    LINEAR_OP           linear_op,
-    int                 result_add,
-    int                 B_is_const,
-    int                 S_is_const,
-    float               B_const_val,
-    float               S_const_val,
-    //output
-    bm_device_mem_t  tensor_Y);
-
-bm_status_t bmdnn_img_sum(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  tensor_A,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 result_add,
-    //output
-    bm_device_mem_t  tensor_Y);
-
-/*
- * fullnet mode
- */
-bm_status_t bmdnn_fullnet(
-        bm_handle_t handle,
-        unsigned long long bdc_cmd_offset,
-        unsigned long long gdma_cmd_offset,
-        unsigned long long cdma_cmd_offset,
-        unsigned long long cmd_num_offset
-        );
-
-/*
- * multiple fullnet mode
- */
-bm_status_t bmdnn_multi_fullnet(
-        bm_handle_t handle,
-        int input_num,
-        unsigned long long* user_input_global_offset,
-        unsigned long long* cmd_input_global_offset,
-        int* input_tensor_size,
-        int output_num,
-        unsigned long long* user_output_global_offset,
-        unsigned long long* cmd_output_global_offset,
-        int* output_tensor_size,
-        unsigned long long bdc_cmd_offset,
-        unsigned long long gdma_cmd_offset,
-        unsigned long long cdma_cmd_offset,
-        int* bdc_cmd_num,
-        int* gdma_cmd_num,
-        int* cdma_cmd_num,
-        int cmdgroup_num
-        );
-
-/*
- * dynamic fullnet mode
- */
-bm_status_t bmdnn_dynamic_fullnet(
-        bm_handle_t handle,
-        unsigned long long compiled_ir_global_addr,
-        unsigned int compiled_ir_length,
-        unsigned int batch_num,
-        unsigned int input_num,
-        unsigned long long* input_global_offset,
-        unsigned int* input_height,
-        unsigned int* input_width,
-        unsigned int output_num,
-        unsigned long long* output_global_offset,
-        unsigned long long apd_ctx_mem_offset
-#if defined(USING_CMODEL) && !defined(USING_FULLNET)
-        ,float**    p_refer_result
-#endif
-        );
-
-/**
-  * Depthwise convolution.
-  */
-bm_status_t bmdnn_depthwise_forward(
-        bm_handle_t         handle,
-        bm_device_mem_t     input,
-        bm_device_mem_t     weight,
-        bm_device_mem_t     bias,
-        int                 input_n,
-        int                 input_c,
-        int                 input_h,
-        int                 input_w,
-        int                 kernel_h,
-        int                 kernel_w,
-        int                 dilation_h,
-        int                 dilation_w,
-        int                 pad_h,
-        int                 pad_w,
-        int                 stride_h,
-        int                 stride_w,
-        int                 using_bias,
-        bm_device_mem_t     output);
-
-bm_status_t bmdnn_lstm_forward(
-        bm_handle_t      handle,
-        //input
-        bm_device_mem_t  input,
-        bm_device_mem_t  cont,
-        bm_device_mem_t  input_static,
-        /*bm_device_mem_t  w_hc,
-        bm_device_mem_t  w_xc,*/
-        bm_device_mem_t  w_hxc,
-        bm_device_mem_t  w_xstatic,
-        bm_device_mem_t  b_c,
-        bm_device_mem_t  h_0,
-        bm_device_mem_t  c_0,
-        int                 input_n,
-        int                 seq_len,
-        int                 input_dim,
-        int                 input_static_dim,
-        int                 output_dim,
-        int                 with_input_static,
-        //output
-        bm_device_mem_t  c,
-        bm_device_mem_t  gate,
-        bm_device_mem_t  h_T,
-        bm_device_mem_t  c_T,
-        bm_device_mem_t  h);
-
-bm_status_t bmdnn_netease_ocr_forward(
-        bm_handle_t      handle,
-        //input
-        bm_device_mem_t  conv1_ifmap,
-        bm_device_mem_t  params,
-        bm_device_mem_t  result);
-
-typedef struct dim4_s {
-    int n, c, h, w;
-} dim4_t;
-enum
-{
-    CONV_DEPTHWISE,
-    CONV_3D
-};
-typedef struct mobilenet_conv_param_s
-{
-    /** convolution. */
-    int type;
-    bm_device_mem_t kernel;
-    bm_device_mem_t bias;
-    dim4_t          kernel_shape;
-    int             dilation_h, dilation_w;
-    int             pad_h, pad_w;
-    int             stride_h, stride_w;
-    bool            using_bias;
-    /** batchnorm. */
-    bm_device_mem_t mean;
-    bm_device_mem_t variance;
-    /** relu. */
-    float           slope;
-} mobilenet_conv_param_t;
-bm_status_t bmdnn_mobilenet_forward(
-        bm_handle_t handle,
-        const mobilenet_conv_param_t   *conv,
-        int                             num,
-        const dim4_t                   &input_shape,
-        const bm_device_mem_t          &input_global_mem,
-        dim4_t                         &output_shape,
-        bm_device_mem_t                &output_global_mem,
-        float                           parallel_performance_factor = 1.f);
-
-bm_status_t bmdnn_conv_forward_bank_conflict(
-    bm_handle_t         handle,
-    //input
-    bm_device_mem_t     input,
-    bm_device_mem_t     weight,
-    bm_device_mem_t     bias,
-    bm_tensor_4d_t      input_shape,
-    bm_kernel_param_t   kernel_param,
-    bm_tensor_4d_t      output_shape,
-    bm_conv_param_t     conv_param,
-    bool                with_bias,
-    //output
-    bm_device_mem_t     output);
-
-bm_status_t bmdnn_pooling_forward_bank_conflict(
-    bm_handle_t         handle,
-    //input
-    bm_device_mem_t     input,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 kh,
-    int                 kw,
-    int                 pad_h,
-    int                 pad_w,
-    int                 stride_h,
-    int                 stride_w,
-    int                 is_avg_pooling,
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_fc_forward_bank_conflict(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  weight,
-    bm_device_mem_t  bias,
-    int              batch_size,
-    int              num_output_neuron,
-    int              num_input_neuron,
-    int              transpose,
-    int              using_bias,
-    int              using_relu,
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_conv_forward_power_evaluation(
-    bm_handle_t         handle,
-    //input
-    bm_device_mem_t     input,
-    bm_device_mem_t     weight,
-    bm_device_mem_t     bias,
-    bm_tensor_4d_t      input_shape,
-    bm_kernel_param_t   kernel_param,
-    bm_tensor_4d_t      output_shape,
-    bm_conv_param_t     conv_param,
-    bool                with_bias,
-    //output
-    bm_device_mem_t     output);
-
-bm_status_t bmdnn_img_scale(
-        bm_handle_t handle, bm_device_mem_t dst, bm_device_mem_t src, int n,
-        int c, int dh, int sh, int dw, int sw);
-
-bm_status_t bmdnn_bn_forward_inference(
-    bm_handle_t      handle,
-    bm_device_mem_t  input,
-    bm_device_mem_t  output,
-    bm_device_mem_t  mean_ma,
-    bm_device_mem_t  variance_ma,
-    bm_device_mem_t  scale,
-    bm_device_mem_t  bias,
-    bm_device_mem_t  scale_ext,              
-    float            eps,
-    int              input_n,
-    int              input_c,
-    int              input_h,
-    int              input_w
-  );
-#if defined (__cplusplus)
-}
-#endif
-
-#endif /* BMDNN_API_H */
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
deleted file mode 100644
index 384cd4108..000000000
--- a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h
+++ /dev/null
@@ -1,438 +0,0 @@
-#ifndef BMDNN_EXT_API_H
-#define BMDNN_EXT_API_H
-
-#include "bmdnn_runtime.h"
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-bm_status_t bmdnn_threshold_forward(
-    bm_handle_t      handle,
-    float               threshold,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output
-    );
-
-bm_status_t bmdnn_exp_forward(
-    bm_handle_t      handle,
-    float               base,
-    float               input_scale,
-    float               input_shift,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output
-    );
-
-bm_status_t bmdnn_exp_backward(
-    bm_handle_t      handle,
-    float               base,
-    float               input_scale,
-    float               input_shift,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  output,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff
-    );
-
-bm_status_t bmdnn_power_forward(
-    bm_handle_t      handle,
-    float               power_,
-    float               scale_,
-    float               shift_,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output
-    );
-
-bm_status_t bmdnn_power_backward(
-    bm_handle_t      handle,
-    float               power_,
-    float               scale_,
-    float               shift_,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  output,
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff
-    );
-
-bm_status_t bmdnn_euclidean_loss_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  label,
-    bm_device_mem_t  temp_,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  diff,
-    bm_device_mem_t  loss);
-
-bm_status_t bmdnn_euclidean_loss_backward(
-    bm_handle_t      handle,
-    float               alpha,
-    //input
-    bm_device_mem_t  output,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_silence_backward(
-    bm_handle_t      handle,
-    //input
-    //bm_device_mem_t  output_data,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_lstm_unit_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  X_i,
-    bm_device_mem_t  X_f,
-    bm_device_mem_t  X_o,
-    bm_device_mem_t  X_g,
-    bm_device_mem_t  C_prev,
-    bm_device_mem_t  cont_expand,
-    int                 num,
-    int                 hidden_dim,
-    //output
-    bm_device_mem_t  C,
-    bm_device_mem_t  H);
-
-bm_status_t bmdnn_lstm_unit_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  C_diff,
-    bm_device_mem_t  H_diff,
-    bm_device_mem_t  X_i,
-    bm_device_mem_t  X_f,
-    bm_device_mem_t  X_o,
-    bm_device_mem_t  X_g,
-    bm_device_mem_t  C_prev,
-    bm_device_mem_t  C,
-    bm_device_mem_t  cont_expand,
-    int                 num,
-    int                 hidden_dim,
-    //output
-    bm_device_mem_t  C_prev_diff,
-    bm_device_mem_t  X_i_diff,
-    bm_device_mem_t  X_f_diff,
-    bm_device_mem_t  X_o_diff,
-    bm_device_mem_t  X_g_diff);
-
-bm_status_t bmdnn_eltwise_forward(
-    bm_handle_t      handle,
-    int                 op_,
-    int                 flag_first,
-    float               coeffs_,
-    int                 index,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  target,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  mask_data,
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_eltwise_backward(
-    bm_handle_t      handle,
-    int                 op_,
-    int                 flag_first,
-    float               coeffs_,
-    int                 index,
-    //input
-    bm_device_mem_t  output_data,
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  input_data,
-    bm_device_mem_t  mask_data,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_bias_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  bias,
-    int                 outer_dim,
-    int                 dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_bias_backward(
-    bm_handle_t      handle,
-    int                 flag,
-    //input
-    bm_device_mem_t  output_diff,
-    int                 outer_dim,
-    int                 bias_dim,
-    int                 inner_dim,
-    //output
-    bm_device_mem_t  input_diff,
-    bm_device_mem_t  bias_diff);
-
-bm_status_t bmdnn_log_forward(
-    bm_handle_t      handle,
-    float               scale,
-    float               shift,
-    float               base,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_log_backward(
-    bm_handle_t      handle,
-    float               scale,
-    float               shift,
-    float               base,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_absval_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_absval_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_sigmoid_cross_entropy_loss_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  target,
-    bm_device_mem_t  buffer,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output,
-    bm_device_mem_t  loss);
-
-bm_status_t bmdnn_sigmoid_cross_entropy_loss_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output,
-    bm_device_mem_t  target,
-    bm_device_mem_t  output_diff,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_contrastive_loss_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input_0,
-    bm_device_mem_t  input_1,
-    bm_device_mem_t  label,
-    bm_device_mem_t  buffer,
-    int                 input_n,
-    int                 input_c,
-    float               margin,
-    bool                legacy_version,
-    //output
-    bm_device_mem_t  diff,
-    bm_device_mem_t  dist_sq,
-    bm_device_mem_t  loss);
-
-bm_status_t bmdnn_contrastive_loss_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  label,
-    bm_device_mem_t  diff,
-    bm_device_mem_t  dist_sq,
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  buffer,
-    int                 input_n,
-    int                 input_dim,
-    float               margin,
-    bool                legacy_version,
-    int                 propagate_down_flag,
-    //output
-    bm_device_mem_t  input_diff_0,
-    bm_device_mem_t  input_diff_1);
-
-bm_status_t bmdnn_filter_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  filter,
-    int                 input_n,
-    int                 output_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_filter_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  filter,
-    int                 input_n,
-    int                 output_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_split_backward(
-    bm_handle_t      handle,
-    //input
-    int                 is_first,
-    bm_device_mem_t  output_diff,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_bnll_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_bnll_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  input,
-    float               threshold,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_prelu_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  slope,
-    float            slope0,
-    int                 channel_shared,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_prelu_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  input,
-    bm_device_mem_t  slope,
-    int                 propagate_down_flag,
-    int                 channel_shared,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    //output
-    bm_device_mem_t  slope_diff,
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_scale_forward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  input,
-    bm_device_mem_t  scale,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 scale_dim,
-    int                 inner_dim,
-    int                 scale_is_neuron,
-    //output
-    bm_device_mem_t  scale_extension,
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_scale_backward(
-    bm_handle_t      handle,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  input_data,
-    bm_device_mem_t  scale_extension,
-    int                 propagate_down_flag,
-    int                 input_n,
-    int                 input_c,
-    int                 input_h,
-    int                 input_w,
-    int                 scale_dim,
-    int                 inner_dim,
-    int                 scale_is_neuron,
-    //output
-    bm_device_mem_t  scale_diff,
-    bm_device_mem_t  input_diff);
-
-bm_status_t bmdnn_elu_forward(
-    bm_handle_t      handle,
-    float               alpha,
-    //input
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  output);
-
-bm_status_t bmdnn_elu_backward(
-    bm_handle_t      handle,
-    float               alpha,
-    //input
-    bm_device_mem_t  output_diff,
-    bm_device_mem_t  output,
-    bm_device_mem_t  input,
-    int                 input_n,
-    int                 input_dim,
-    //output
-    bm_device_mem_t  input_diff);
-
-#if defined (__cplusplus)
-}
-#endif
-
-#endif /* BMDNN_EXT_API_H */
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
deleted file mode 100644
index 6fede1338..000000000
--- a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef BMDNN_RUNTIME_H_
-#define BMDNN_RUNTIME_H_
-
-#include "bmlib_runtime.h"
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-bm_status_t bmdnn_init(
-    bm_handle_t     *handle);
-
-void bmdnn_deinit(
-    bm_handle_t      handle);
-
-#if defined (__cplusplus)
-}
-#endif
-
-#endif
diff --git a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
deleted file mode 100644
index fa9443116..000000000
--- a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef OP_CODE_H_
-#define OP_CODE_H_
-
-
-typedef enum align_tensor_op {
-    ALIGN_TENSOR_ADD,
-    ALIGN_TENSOR_SUB,
-    ALIGN_TENSOR_MUL,
-    ALIGN_TENSOR_DIV,
-    TENSOR_INVALID
-} ALIGN_TENSOR_OP;
-
-typedef enum linear_op {
-    LINEAR_MAC,
-    LINEAR_ADD_SQR,
-    LINEAR_SUB_SQR
-} LINEAR_OP;
-
-typedef enum sfu_op {
-    SFU_XN,
-    SFU_EX,
-    SFU_LNX,
-    SFU_RSQ,
-    SFU_INVALID
-} SFU_OP;
-typedef struct tensor_4d_t {
-    int n;
-    int c;
-    int h;
-    int w;
-}bm_tensor_4d_t;
-
-
-#define TENSOR_ADD 0
-#define TENSOR_SUB 1
-#define TENSOR_MUL 2
-//Note the div should be implmented by KAMAKE algorithm
-#define TENSOR_DIV 3
-#define TENSOR_MAX 4
-#define TENSOR_CPY 5
-#define TENSOR_MAC 6
-
-#define TENSOR_N_DIM 0
-#define TENSOR_C_DIM 1
-#define TENSOR_H_DIM 2
-#define TENSOR_W_DIM 3
-
-#define SHARE_REG_MESSAGE_WP            0
-#define SHARE_REG_MESSAGE_RP            1
-#define SHARE_REG_MESSAGE_IRQSTATUS     2
-#define SHARE_REG_CDMA_IRQSTATUS    3 
-#define SHARE_REG_MSGIRQ_NUM_LO     4
-#define SHARE_REG_MSGIRQ_NUM_HI     5
-
-#define SHAREMEM_MSG_FIXED_OFFSET  (8192)
-#define SHAREMEM_SIZE_BIT  8
-#define SHAREMEM_MASK      ((1<<SHAREMEM_SIZE_BIT) - 1)
-#define SHARE_REG_CNT      16
-
-#define IRQ_STATUS_CDMA_INT             0x1111
-#define IRQ_STATUS_MSG_DONE_INT         0x2222
-
- 
-#endif /* OP_CODE_H_ */
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
deleted file mode 100644
index eec7996c3..000000000
--- a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h
+++ /dev/null
@@ -1,246 +0,0 @@
-#ifndef BMLIB_RUNTIME_H_
-#define BMLIB_RUNTIME_H_
-#include <stdbool.h>
-#include <stddef.h>
-
-#if !defined(__x86_64__) && !defined(__aarch64__)
-#error "BM needs 64-bit to compile"
-#endif
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-typedef enum {
-  BM_SUCCESS                 = 0,
-  BM_ERR_DEVNOTREADY          = 1,   /* Device not ready yet */
-  BM_ERR_FAILURE             = 2,   /* General failure */
-  BM_ERR_TIMEOUT             = 3,   /* Timeout */
-  BM_ERR_PARAM               = 4,   /* Parameters invalid */
-  BM_ERR_NOMEM               = 5,   /* Not enough memory */
-  BM_ERR_DATA                = 6,   /* Data error */
-  BM_ERR_BUSY                = 7,   /* Busy */
-  BM_ERR_NOFEATURE           = 8,    /* Not supported yet */
-  BM_NOT_SUPPORTED           = 9
-} bm_status_t;
-
-typedef enum {
-  BM_MEM_TYPE_DEVICE  = 0,
-  BM_MEM_TYPE_HOST    = 1,
-  BM_MEM_TYPE_SYSTEM  = 2,
-  BM_MEM_TYPE_INT8_DEVICE  = 3,
-  BM_MEM_TYPE_INVALID = 4
-} bm_mem_type_t;
-
-#define BM_MEM_ADDR_NULL     (0xfffffffff)
-
-typedef struct bm_mem_desc {
-  unsigned char                 desc[16];
-} bm_mem_desc_t;
-
-struct bm_context;
-typedef struct bm_context *  bm_handle_t;
-typedef struct bm_mem_desc   bm_device_mem_t;
-typedef struct bm_mem_desc   bm_host_mem_t;
-typedef struct bm_mem_desc   bm_system_mem_t;
-
-#define BM_CHECK_RET(call)                         \
-    do {                                        \
-      bm_status_t ret = call;                \
-	  if ( ret != BM_SUCCESS ) {             \
-        printf("BM_CHECK_RET failed %d\n", ret);   \
-        ASSERT(0);                              \
-        exit(-ret);                             \
-      }                                         \
-    } while(0)
-
-/*
- * control 
- */
-void bm_flush(
-    bm_handle_t      handle);
-/*
- * brief malloc host memory according to a tensor shape(each neuron is 32 bits)
-*/
-
-bm_status_t bm_malloc_neuron_device(
-    bm_handle_t      handle,
-    bm_device_mem_t *pmem,
-    int              n,
-    int              c,
-    int              h,
-    int              w);
-
-/*
- * brief malloc host memory in size of dword(32 bits)
-*/
-
-bm_status_t bm_malloc_device_dword(
-    bm_handle_t      handle,
-    bm_device_mem_t *pmem,
-    int              count);
-bm_status_t bm_malloc_ctx_dword(
-    bm_handle_t      handle,
-    bm_device_mem_t *pmem,
-    int              count,
-    unsigned long long ctx_addr);
-/*
- * brief malloc host memory in size of byte
-*/
-
-bm_status_t bm_malloc_device_byte(
-    bm_handle_t      handle,
-    bm_device_mem_t *pmem,
-    unsigned int     size);
-
-void bm_free_device(
-    bm_handle_t      handle,
-    bm_device_mem_t  mem);
-
-/*
- * brief malloc host memory in size of byte
- */
-bm_status_t bm_malloc_host(
-    bm_handle_t      handle,
-    bm_host_mem_t   *pmem,
-    unsigned int     size);
-
-bm_status_t bm_free_host(
-    bm_handle_t      handle,
-    bm_host_mem_t    mem);
-
-void *bm_host_mem_get_pointer(
-    bm_host_mem_t    mem);
-
-/*
- * Memory copy and set
- */
-bm_status_t bm_memcpy_h2d(
-    bm_handle_t      handle,
-    bm_device_mem_t  dst,
-    bm_host_mem_t    src);
-
-bm_status_t bm_memcpy_d2h(
-    bm_handle_t      handle,
-    bm_host_mem_t    dst,
-    bm_device_mem_t  src);
-
-
-bm_status_t bm_memcpy_s2d(
-    bm_handle_t      handle,
-    bm_device_mem_t  dst,
-    bm_system_mem_t  src);
-
-bm_status_t bm_memcpy_d2s(
-    bm_handle_t      handle,
-    bm_system_mem_t  dst,
-    bm_device_mem_t  src);
-
-bm_status_t bm_memcpy_d2d(
-    bm_handle_t     handle,
-    bm_device_mem_t dst,
-    int             dst_offset,
-    bm_device_mem_t src,
-    int             src_offset,
-    int             len);
-
-bm_status_t bm_memset_device(
-    bm_handle_t      handle,
-    const int        value,
-    bm_device_mem_t  mem);
-
-bm_device_mem_t bm_mem_from_system(
-    void *              system_addr);
-
-/*
-*brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to
-device mem if need_copy is true
-*/
-
-bm_status_t bm_mem_convert_system_to_device_neuron(
-    bm_handle_t          handle,
-    struct bm_mem_desc  *dev_mem,
-    struct bm_mem_desc   sys_mem,
-    bool                 need_copy,
-    int                  n,
-    int                  c,
-    int                  h,
-    int                  w);
-
-/*
-*brief malloc one device memory with the size of coeff_count, copy the sys_mem to
-device mem if need_copy is true
-*/
-bm_status_t bm_mem_convert_system_to_device_coeff(
-    bm_handle_t          handle,
-    struct bm_mem_desc  *dev_mem,
-    struct bm_mem_desc   sys_mem,
-    bool                 need_copy,
-    int                  coeff_count);
-
-/*
- * memory info get and set
- */
-unsigned long long bm_mem_get_device_addr(struct bm_mem_desc mem);
-void               bm_mem_set_device_addr(struct bm_mem_desc & mem, unsigned long long addr);
-unsigned int       bm_mem_get_device_size(struct bm_mem_desc mem);
-void               bm_mem_set_device_size(struct bm_mem_desc & mem, unsigned int size);
-bm_mem_type_t      bm_mem_get_type(struct bm_mem_desc mem);
-
-unsigned long long bm_gmem_arm_reserved_request(bm_handle_t handle);
-void bm_gmem_arm_reserved_release(bm_handle_t handle);
-
-/* 
-* brief Get the handle of bmlib_runtime
-* return : If the handle has been inited, return the handle it self , else init one and return it
-*/
-
-bm_status_t bm_init(bm_handle_t *handle, bool bmkernel_used);
-void bm_deinit(bm_handle_t handle);
-
-/*
- * Helper functions
- */
-
-/**
-* \brief Get the number of nodechip (Constant 1 in bm1682)
-* \return
-* \ref NO
-*/
-int bm_get_nodechip_num(
-    bm_handle_t      handle);
-
-/**
-* \brief Get the number of nodechip (Constant 64 in bm1682)
-* \return
-* \ref NO
-*/
-int bm_get_npu_num(
-    bm_handle_t      handle);
-int bm_get_eu_num( bm_handle_t handle);
-/**
-* \brief Get the number of nodechip (Constant 64 in bm1682)
-* \return
-* \ref NO
-*/
-bm_device_mem_t bm_mem_null(void);
-#define BM_MEM_NULL  (bm_mem_null())
-
-bm_status_t bm_dev_getcount(int* count);
-bm_status_t bm_dev_query(int devid);
-bm_status_t bm_dev_request(bm_handle_t *handle, bool bmkernel_used, int devid);
-void bm_dev_free(bm_handle_t handle);
-
-typedef struct bm_fw_desc {
-	unsigned int *itcm_fw;
-	int itcmfw_size;
-	unsigned int *ddr_fw;
-	int ddrfw_size;
-} bm_fw_desc, *pbm_fw_desc;
-bm_status_t bm_update_firmware(bm_handle_t handle, pbm_fw_desc pfw);
-
-#if defined (__cplusplus)
-}
-#endif
-
-#endif /* BM_RUNTIME_H_ */
diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
deleted file mode 100644
index e878343ef..000000000
--- a/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef BMLIB_UTILS_H
-#define BMLIB_UTILS_H
-#include <stdlib.h>
-
-/*
- * Debug definitions for user app only
- * Copy from common.h
- * Don't include for internal usage
- */
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define UNUSED(x)               (void)(x)
-
-#define __ALIGN_MASK(x,mask)    (((x)+(mask))&~(mask))
-#define ALIGN(x,a)              __ALIGN_MASK(x,(__typeof__(x))(a)-1)
-
-int array_cmp(
-    float *p_exp,
-    float *p_got,
-    int len,
-    const char *info_label,
-    float delta);
-
-int tri_array_cmp(
-    float *p_exp,
-    float *p_got,
-    float *third_party,
-    int len,
-    const char *info_label,
-    float delta,
-    int* err_idx);
-
-int array_cmp_int(
-    int *p_exp,
-    int *p_got,
-    int len,
-    const char *info_label
-);
-
-void dump_hex(char *desc, void *addr, int len);
-void dump_data_float(char *desc, void *addr, int n, int c, int h, int w);
-void dump_data_int(char *desc, void *addr, int n, int c, int h, int w);
-void dump_matrix_float(char *desc, void *addr, int row, int col);
-void dump_array_file(char * file, int row_num, int col_num, int transpose, float * parr);
-
-/* dump to file */
-void dump_float_tensor(const char * filename,
-    int length, float * dump_data);
-
-#ifdef __cplusplus
-/* not available in C */
-void random_param(
-    int &n, int &c, int &h, int &w,
-    int &kh, int &kw, int &ph, int &pw, int &sh, int &sw,
-    int &oc);
-
-void random_conv_param(
-    int &n, int &ic, int &ih, int &iw, int &oc,
-    int &kh, int &kw, int &dh, int &dw,
-    int &ph, int &pw, int &sh, int &sw);
-#endif
-
-int conv_coeff_storage_convert(float * coeff_orig, float ** coeff_reformat, unsigned int oc, unsigned int ic, unsigned int kh, unsigned int kw, unsigned int npu_num);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* BMLIB_UTILS_H */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h b/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
deleted file mode 100644
index f3e086f91..000000000
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h
+++ /dev/null
@@ -1,97 +0,0 @@
-#ifndef __BM_BLOB_H__
-#define __BM_BLOB_H__
-
-struct bm_mem_desc;
-typedef struct bm_mem_desc bm_device_mem_t;
-namespace bmcnn {
-
-typedef struct { int n, c, h, w; } Shape;
-
-class BMBlob
-{
-public:
-    /**
-     * \brief Constructor of blob.
-     *
-     * \param shape - Shape of blob
-     */
-    explicit BMBlob(const Shape &shape, void *handle);
-    /**
-     * \brief Deconstructor of blob.
-     */
-    virtual ~BMBlob();
-    /**
-     * \brief Reshape blob.
-     * 
-     * \param n - Batch number of blob
-     * \param c - Channel number of blob
-     * \param h - Height of blob section
-     * \param w - Width of blob section
-     *
-     * \note
-     * (1) For now, number of channels is not allowed to be reshaped.\n
-     * (2) After reshaping, data in this blob will be set vanished.\n
-     */
-    void Reshape(int n, int c, int h, int w);
-    /**
-     * \brief Get shape.
-     */
-    inline Shape shape() const
-    { return shape_; }
-    /**
-     * \brief Get batch size.
-     */
-    inline int batch_num() const
-    { return shape_.n; }
-    /**
-     * \brief Get feature
-     *
-     * \return Channel number of the blob\n
-     */
-    inline int channels() const
-    { return shape_.c; }
-    /**
-     * \brief Get height of section
-     */
-    int height() const
-    { return shape_.h; }
-    /**
-     * \brief Get width of section.
-     */
-    int width() const
-    { return shape_.w; }
-    /**
-     * \brief Get read-only pointer to data in cpu.
-     */
-    const float *cpu_data(); 
-    /**
-     * \brief Get mutable pointer of data in cpu.
-     */    
-    float *mutable_cpu_data();
-    /**
-     * \brief Get mutable pointer of memory in device.
-     */    
-    bm_device_mem_t *mutable_dev_mem();
-    /**
-     * \brief Get read-only pointer of memory in device.
-     */    
-    const bm_device_mem_t *dev_mem();
-private:
-    BMBlob(const BMBlob &other);
-    BMBlob &operator=(const BMBlob &other);
-    
-    bm_device_mem_t *dev_mem_;
-    float *sys_data_;
-    Shape shape_;
-    int data_pos_;
-    int capacity_;
-    void *handle_;
-    
-    enum { AIR = 0x00, SYS = 0x01, DEV = 0x10 };
-    void sync_s2d();
-    void sync_d2s();
-};
-
-} /* namespace bmcnn */
-
-#endif /* __BM_BLOB_H__ */
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
deleted file mode 100644
index ef438ce14..000000000
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef  BMRUNTIME_H_
-#define  BMRUNTIME_H_
-#include <algorithm>
-#include <vector>
-#include "bmlib_runtime.h"
-#include "bmruntime_common.h"
-#include "stdio.h"
-#include <string>
-#include <map>
-#include <set>
-#include <iostream>
-
-using std::vector;
-using std::map;
-using std::set;
-using std::string;
-using std::pair;
-using std::make_pair;
-using std::cout;
-using std::endl;
-typedef unsigned int            u32;
-typedef unsigned long long      u64;
-
-typedef struct stage_param_with_idx{
-  int height_high;
-  int height_low;
-  int width_high;
-  int width_low;
-  int stage_index;
-}stage_param_with_idx_t;
-
-class bmruntime {
-  public:
-    bmruntime(bm_handle_t bm_handle);
-    ~bmruntime();
-
-    bool load_context(const string& ctx_dir);
-
-    const set<string>& get_input_tensor(int net_idx) const;
-    const set<string>& get_input_tensor(const string& net_name);
-
-    const set<string>& get_output_tensor(int net_idx) const;
-    const set<string>& get_output_tensor(const string& net_name);
-
-    const bm_device_mem_t* get_input_blob(const string& tensor_name, int net_idx);
-    const bm_device_mem_t* get_input_blob(const string& tensor_name, const string& net_name);
-
-    const bm_device_mem_t* get_output_blob(const string& tensor_name, int net_idx);
-    const bm_device_mem_t* get_output_blob(const string& tensor_name, const string& net_name);
-
-    bool launch(int net_idx);
-    bool launch(const string& net_name);
-
-    bool launch(int net_idx, const bm_device_mem_t* input_tensors, int input_num,
-            const bm_device_mem_t* output_tensors, int output_num);
-    bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num,
-            const bm_device_mem_t* output_tensors, int output_num);
-
-    bool launch(int net_idx, int n, int h , int w);
-    bool launch(const string& net_name, int n, int h, int w);
-    bool launch(int net_idx, const bm_device_mem_t* input_tensors, int input_num,
-            const bm_device_mem_t* output_tensors, int output_num, int n, int h, int w);
-    bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num,
-            const bm_device_mem_t* output_tensors, int output_num, int n , int h, int w);
-
-    void get_input_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w);
-    void get_input_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int * max_c, int * max_h, int * max_w);
-    void get_output_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w);
-    void get_output_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int *max_c, int * max_h, int * max_w);
-
-    int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int ih);
-    int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int ih);
-    int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int iw);
-    int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int iw);
-
-
-
-
-    bool can_batch_size_change(int net_idx);
-    bool can_batch_size_change(const string& net_name);
-    bool can_height_and_width_change(int net_idx);
-    bool can_height_and_width_change(const string& net_name);
-
-    void show_neuron_network();
-
-    int get_network_number() {return net_num;}
-
-    inline bm_handle_t get_bm_handle() {return m_handle;}
-
-  protected:
-    bool setup_mem_context(const string& ctx_dir);
-    bool setup_cmd_context(const string& ctx_dir);
-    bool set_using_cmd_file(const string& ctx_dir);
-    void load_cmd(u32* cmd, int engine_id, bool last_cmd, u64 start_address, u64 append_mem_offset);
-    bool setup_ir_context(const string& ctx_dir);
-
-    void wrong_net_idx_handle(int net_idx) const;
-
-    int get_net_idx(const string& net_name);
-    int get_stage_idx(int net_idx, int h, int w);
-    u64 get_stage_offset(int net_idx, int stage_idx);
-
-    int compute_output_height(int input_height, int global_kh, int global_stride_h, int global_pad_h, int global_pool_kh);
-    int compute_output_width(int input_width, int global_kw, int global_stride_w, int global_pad_w, int global_pool_kw);
-
-    bm_handle_t m_handle;
-    std::vector<DEVICE_MEM_INFO>            m_device_mem_info_vec;
-    std::vector<bm_device_mem_t>            m_device_mem_vec;
-
-    vector<int>                             m_gdma_total_id_v;
-    vector<int>                             m_cdma_total_id_v;
-    vector<int>                             m_bdc_total_id_v;
-    vector<vector<int> >                    m_gdma_group_id_v;
-    vector<vector<int> >                    m_cdma_group_id_v;
-    vector<vector<int> >                    m_bdc_group_id_v;
-    vector<int>                             m_cmdgroup_num;
-    vector<u64>                             m_gdma_cmd_start_address_v;
-    vector<u64>                             m_cdma_cmd_start_address_v;
-    vector<u64>                             m_bdc_cmd_start_address_v;
-    vector<map<string, bm_device_mem_t> >   input_tensor_mem_map_v;
-    vector<map<string, bm_device_mem_t> >   output_tensor_mem_map_v;
-    vector<set<string> >                    m_input_tensor_set_v;
-    vector<set<string> >                    m_output_tensor_set_v;
-    int                                     net_num;
-    map<string,int>                         net_name_to_idx;
-    vector<int>                             stage_num;
-
-    bool                                    have_ir_info;
-    vector<vector<unsigned int> >           m_ir_info_len;
-    vector<u64>                             m_ir_info_start_address_v;
-    vector<vector<stage_param_with_idx_t> > stage_param_with_idx_vv;
-
-    //io tensor param
-    vector<int>                             n_can_change_v;
-    vector<int>                             h_w_can_change_v;
-
-    vector<vector<map<string, tensor_max_shape_t> > >           input_tensor_max_shape_vv;
-    vector<vector<map<string, tensor_max_shape_t> > >           output_tensor_max_shape_vv;
-    vector<vector<map<string, global_output_tensor_param_t> > > global_output_tensor_param_vv;
-
-    bool m_using_cmd_file;
-    FILE * m_gdma_cmd_file;
-    FILE * m_cdma_cmd_file;
-    FILE * m_bdc_cmd_file;
-
-    //previous value or state
-    int pre_net_num;
-    int pre_m_device_mem_info_vec_size;  
-    int pre_m_device_mem_vec_size;  
-
-    //append mem offset when appending another framework's context.
-    vector<u64> apd_ctx_mem_offset;
-};
-
-#endif
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
deleted file mode 100644
index 77ae9bd22..000000000
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef BMRUNTIME_COMMON_H
-#define BMRUNTIME_COMMON_H
-
-#define BMRT_ASSERT(_cond)                       \
-  do {                                           \
-    if (!(_cond)) {                              \
-      printf("ASSERT %s: %s: %d: %s\n",          \
-          __FILE__, __func__, __LINE__, #_cond); \
-      exit(-1);                                  \
-    }                                            \
-  } while(0)
-
-typedef enum neuron_device_mem_type {
-    INPUT_NEURON_TENSOR = 0,
-    INTERMEDIATE_NEURON_TENSOR = 1,
-    OUTPUT_NEURON_TENSOR = 2,
-    CMD_BUF_TENSOR = 3,
-    CMD_NUM_TENSOR = 4
-} NEURON_DEVICE_MEM_TYPE;
-
-typedef enum device_mem_type {
-    NEURON = 0,
-    COEFF = 1,
-#ifdef INT8_COEFF_FUNC
-    COEFF_INT8 = 2,
-    COEFF_INT8SCALE = 3,
-    LOCAL = 4
-#else
-    LOCAL = 2
-#endif
-} DEVICE_MEM_TYPE;
-
-typedef struct device_mem_info {
-    DEVICE_MEM_TYPE device_mem_type;
-    NEURON_DEVICE_MEM_TYPE neuron_device_mem_type;
-    int n;
-    int c;
-    int h;
-    int w;
-    int coeff_count;
-    int groups;
-    unsigned long long address;
-    unsigned long size;
-} DEVICE_MEM_INFO;
-
-//info for compute output tensor
-typedef struct tensor_max_shape {
-  int max_n;
-  int channel;
-  int max_h;
-  int max_w;
-} tensor_max_shape_t;
-
-typedef struct global_output_tensor_param {
-  int input_idx;
-  int global_kh;
-  int global_kw;
-  int global_stride_h;
-  int global_stride_w;
-  int global_pad_h;
-  int global_pad_w;
-  int global_pool_kh;
-  int global_pool_kw;
-} global_output_tensor_param_t; 
-
-#endif
diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
deleted file mode 100644
index 4214674f3..000000000
--- a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef BMRUNTIME_INTERFACE_H_
-#define BMRUNTIME_INTERFACE_H_
-
-#include "bmruntime.h"
-#include "bmdnn_runtime.h"
-
-bmruntime* create_bmruntime(bm_handle_t* bm_handle);
-
-void destroy_bmruntime(bm_handle_t bm_handle, bmruntime* p_bmrt);
-
-#endif

From dc1bdfef8003f52701a6f416c7d48d7cf0356ddf Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 17 Aug 2018 17:18:04 +0800
Subject: [PATCH 280/318] Fix bug for tensor_reshape_realloc test. Init host
 tensor with proper shape

---
 saber/core/impl/bm/tensor_op_bm.cpp | 20 +++++++++++++-------
 test/saber/test_saber_tensor.cpp    |  3 +--
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/saber/core/impl/bm/tensor_op_bm.cpp b/saber/core/impl/bm/tensor_op_bm.cpp
index 8380d145a..7c14c21da 100644
--- a/saber/core/impl/bm/tensor_op_bm.cpp
+++ b/saber/core/impl/bm/tensor_op_bm.cpp
@@ -6,13 +6,15 @@ namespace saber {
 template<>
 void fill_tensor_const<BM>(Tensor<BM>& tensor, float value,
                            typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     fill_tensor_const(temp_tensor, value);
     tensor.copy_from(temp_tensor);
 }
 template<>
 void fill_tensor_rand<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     fill_tensor_rand(temp_tensor);
     tensor.copy_from(temp_tensor);
 }
@@ -20,15 +22,17 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t
 template<>
 void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, float vend,
                           typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     fill_tensor_rand(temp_tensor, vstart, vend);
     tensor.copy_from(temp_tensor);
 }
 
 template<>
 void print_tensor<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t stream = NULL) {
-    LOG(INFO) << "device tensor data";
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    LOG(INFO) << "BM device tensor data:";
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     temp_tensor.copy_from(tensor);
     print_tensor(temp_tensor);
 }
@@ -41,7 +45,8 @@ void print_tensor_valid<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream
 
 template<>
 double tensor_mean_value<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     temp_tensor.copy_from(tensor);
     return tensor_mean_value(temp_tensor);
 }
@@ -49,7 +54,8 @@ double tensor_mean_value<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::strea
 template<>
 double tensor_mean_value_valid<BM>(Tensor<BM>& tensor,
                                    typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     temp_tensor.copy_from(tensor);
     return tensor_mean_value(temp_tensor);
 }
diff --git a/test/saber/test_saber_tensor.cpp b/test/saber/test_saber_tensor.cpp
index 83ca72d2e..41b1e2168 100644
--- a/test/saber/test_saber_tensor.cpp
+++ b/test/saber/test_saber_tensor.cpp
@@ -620,7 +620,6 @@ TEST(TestSaberFunc, test_saber_tensor_shape) {
 
 #ifdef USE_BM
     Env<BM>::env_init();
-    Env<X86>::env_init();
     LOG(INFO) << "test BM tensor shape API";
     test_tensor_shape<BM>();
 #endif //USE_BM
@@ -743,7 +742,7 @@ TEST(TestSaberFunc, test_tensor_reshape_realloc) {
     Env<BM>::env_init();
     Env<X86>::env_init();
     LOG(INFO) << "test BM FP32 tensor reshape realloc";
-    //tensor_reshape_realloc<BM, X86, AK_FLOAT>();
+    tensor_reshape_realloc<BM, X86, AK_FLOAT>();
 #endif //USE_BM
 }
 #endif

From 2115134fee2f6999346e08ffbf62eade9f1c2f5e Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 21 Aug 2018 09:14:45 +0800
Subject: [PATCH 281/318] Use BM Kernel instead of BMDNN

---
 cmake/find_modules.cmake       | 25 +++++++++++++------------
 saber/core/common.h            |  3 +--
 saber/core/data_traits.h       |  1 -
 saber/core/impl/bm/bm_impl.cpp |  2 +-
 4 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake
index 287fd69b9..c766e9117 100644
--- a/cmake/find_modules.cmake
+++ b/cmake/find_modules.cmake
@@ -327,22 +327,23 @@ endmacro()
 
 macro(anakin_find_bmlib)
 	if(USE_BM)
-		find_path(BM_ROOT include/bmdnn/bmdnn_api.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
-		find_path(BM_ROOT_INCLUDE_DNN bmdnn_api.h ${BM_ROOT}/include/bmdnn)
-		find_path(BM_ROOT_INCLUDE_RT bmruntime.h ${BM_ROOT}/include/bmruntime)
-		find_path(BM_ROOT_INCLUDE_LIB bmlib_runtime.h ${BM_ROOT}/include/bmlib)
-		if(BM_ROOT_INCLUDE_DNN AND BM_ROOT_INCLUDE_RT AND BM_ROOT_INCLUDE_LIB)
+		find_path(BM_ROOT include/host/bmkernel_runtime.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
+		if(BM_ROOT)
 			set(BM_FOUND TRUE)
 		endif()
 		if(BM_FOUND)
-			message(STATUS " Found bm_lib in ${BM_ROOT}  ${BM_ROOT_INCLUDE_DNN} ${BM_ROOT_INCLUDE_RT} ${BM_ROOT_INCLUDE_LIB}")
-			include_directories(${BM_ROOT_INCLUDE_DNN})
-			include_directories(${BM_ROOT_INCLUDE_RT})
-			include_directories(${BM_ROOT_INCLUDE_LIB})
+			message(STATUS " Found bm_lib in ${BM_ROOT}")
+			anakin_fetch_include_recursively(${BM_ROOT}/include)
 			set(BM_LIBRARIES "")
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmdnn_device.so)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmlib_device.so)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmrt.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmkernel-host.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmlib-asic.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmlib-palladium.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmkernel-host-cmodel.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-core-asic.a)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-core-palladium.a)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-top-asic.a)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-top-palladium.a)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/bmlib.a)
 			list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES})
 		else()
 			message(FATAL_ERROR "Could not found bm_lib")
diff --git a/saber/core/common.h b/saber/core/common.h
index 1a4e29348..bb61922c7 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -179,9 +179,8 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 #ifdef USE_BM
 
 #include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmdnn_ext_api.h"
 #include "bmlib_utils.h"
+#include "bmkernel_runtime.h"
 
 #define BMDNN_CHECK(condition) \
   do { \
diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h
index 331481383..52a704ebb 100644
--- a/saber/core/data_traits.h
+++ b/saber/core/data_traits.h
@@ -20,7 +20,6 @@
 
 #ifdef USE_BM
 #include "bmlib_runtime.h"
-#include "bmdnn_api.h"
 #include "bmlib_utils.h"
 #endif
 namespace anakin {
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 1505acbf9..fb2b2f8ee 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -45,7 +45,7 @@ typedef TargetWrapper<BM, __device_target> BM_API;
 
 // Init handle only once in the lifetime
 static bm_handle_t handle;
-static bm_status_t init_handle{bmdnn_init(&handle)};
+static bm_status_t init_handle{bmkernel_init(&handle)};
 
 bm_handle_t BM_API::get_handle() {
     return handle;

From 7558ec01b36896f0d78733534502a38d5766bc41 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 28 Aug 2018 09:43:29 +0800
Subject: [PATCH 282/318] Revert "Use BM Kernel instead of BMDNN"

This reverts commit 2115134fee2f6999346e08ffbf62eade9f1c2f5e.
---
 cmake/find_modules.cmake       | 25 ++++++++++++-------------
 saber/core/common.h            |  3 ++-
 saber/core/data_traits.h       |  1 +
 saber/core/impl/bm/bm_impl.cpp |  2 +-
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake
index c766e9117..287fd69b9 100644
--- a/cmake/find_modules.cmake
+++ b/cmake/find_modules.cmake
@@ -327,23 +327,22 @@ endmacro()
 
 macro(anakin_find_bmlib)
 	if(USE_BM)
-		find_path(BM_ROOT include/host/bmkernel_runtime.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
-		if(BM_ROOT)
+		find_path(BM_ROOT include/bmdnn/bmdnn_api.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
+		find_path(BM_ROOT_INCLUDE_DNN bmdnn_api.h ${BM_ROOT}/include/bmdnn)
+		find_path(BM_ROOT_INCLUDE_RT bmruntime.h ${BM_ROOT}/include/bmruntime)
+		find_path(BM_ROOT_INCLUDE_LIB bmlib_runtime.h ${BM_ROOT}/include/bmlib)
+		if(BM_ROOT_INCLUDE_DNN AND BM_ROOT_INCLUDE_RT AND BM_ROOT_INCLUDE_LIB)
 			set(BM_FOUND TRUE)
 		endif()
 		if(BM_FOUND)
-			message(STATUS " Found bm_lib in ${BM_ROOT}")
-			anakin_fetch_include_recursively(${BM_ROOT}/include)
+			message(STATUS " Found bm_lib in ${BM_ROOT}  ${BM_ROOT_INCLUDE_DNN} ${BM_ROOT_INCLUDE_RT} ${BM_ROOT_INCLUDE_LIB}")
+			include_directories(${BM_ROOT_INCLUDE_DNN})
+			include_directories(${BM_ROOT_INCLUDE_RT})
+			include_directories(${BM_ROOT_INCLUDE_LIB})
 			set(BM_LIBRARIES "")
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmkernel-host.so)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmlib-asic.so)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmlib-palladium.so)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmkernel-host-cmodel.so)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-core-asic.a)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-core-palladium.a)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-top-asic.a)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-top-palladium.a)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/bmlib.a)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmdnn_device.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmlib_device.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmrt.so)
 			list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES})
 		else()
 			message(FATAL_ERROR "Could not found bm_lib")
diff --git a/saber/core/common.h b/saber/core/common.h
index bb61922c7..1a4e29348 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -179,8 +179,9 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 #ifdef USE_BM
 
 #include "bmlib_runtime.h"
+#include "bmdnn_api.h"
+#include "bmdnn_ext_api.h"
 #include "bmlib_utils.h"
-#include "bmkernel_runtime.h"
 
 #define BMDNN_CHECK(condition) \
   do { \
diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h
index 52a704ebb..331481383 100644
--- a/saber/core/data_traits.h
+++ b/saber/core/data_traits.h
@@ -20,6 +20,7 @@
 
 #ifdef USE_BM
 #include "bmlib_runtime.h"
+#include "bmdnn_api.h"
 #include "bmlib_utils.h"
 #endif
 namespace anakin {
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index fb2b2f8ee..1505acbf9 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -45,7 +45,7 @@ typedef TargetWrapper<BM, __device_target> BM_API;
 
 // Init handle only once in the lifetime
 static bm_handle_t handle;
-static bm_status_t init_handle{bmkernel_init(&handle)};
+static bm_status_t init_handle{bmdnn_init(&handle)};
 
 bm_handle_t BM_API::get_handle() {
     return handle;

From 30e64c64bed4301ae817706ec139d1fd9cdbd86f Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 7 Sep 2018 11:49:23 +0800
Subject: [PATCH 283/318] Revert "Revert "Use BM Kernel instead of BMDNN""

This reverts commit 7558ec01b36896f0d78733534502a38d5766bc41.
---
 cmake/find_modules.cmake       | 25 +++++++++++++------------
 saber/core/common.h            |  3 +--
 saber/core/data_traits.h       |  1 -
 saber/core/impl/bm/bm_impl.cpp |  2 +-
 4 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake
index 287fd69b9..c766e9117 100644
--- a/cmake/find_modules.cmake
+++ b/cmake/find_modules.cmake
@@ -327,22 +327,23 @@ endmacro()
 
 macro(anakin_find_bmlib)
 	if(USE_BM)
-		find_path(BM_ROOT include/bmdnn/bmdnn_api.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
-		find_path(BM_ROOT_INCLUDE_DNN bmdnn_api.h ${BM_ROOT}/include/bmdnn)
-		find_path(BM_ROOT_INCLUDE_RT bmruntime.h ${BM_ROOT}/include/bmruntime)
-		find_path(BM_ROOT_INCLUDE_LIB bmlib_runtime.h ${BM_ROOT}/include/bmlib)
-		if(BM_ROOT_INCLUDE_DNN AND BM_ROOT_INCLUDE_RT AND BM_ROOT_INCLUDE_LIB)
+		find_path(BM_ROOT include/host/bmkernel_runtime.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
+		if(BM_ROOT)
 			set(BM_FOUND TRUE)
 		endif()
 		if(BM_FOUND)
-			message(STATUS " Found bm_lib in ${BM_ROOT}  ${BM_ROOT_INCLUDE_DNN} ${BM_ROOT_INCLUDE_RT} ${BM_ROOT_INCLUDE_LIB}")
-			include_directories(${BM_ROOT_INCLUDE_DNN})
-			include_directories(${BM_ROOT_INCLUDE_RT})
-			include_directories(${BM_ROOT_INCLUDE_LIB})
+			message(STATUS " Found bm_lib in ${BM_ROOT}")
+			anakin_fetch_include_recursively(${BM_ROOT}/include)
 			set(BM_LIBRARIES "")
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmdnn_device.so)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmlib_device.so)
-			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmrt.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmkernel-host.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmlib-asic.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmlib-palladium.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmkernel-host-cmodel.so)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-core-asic.a)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-core-palladium.a)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-top-asic.a)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-top-palladium.a)
+			list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/bmlib.a)
 			list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES})
 		else()
 			message(FATAL_ERROR "Could not found bm_lib")
diff --git a/saber/core/common.h b/saber/core/common.h
index 1a4e29348..bb61922c7 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -179,9 +179,8 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 #ifdef USE_BM
 
 #include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmdnn_ext_api.h"
 #include "bmlib_utils.h"
+#include "bmkernel_runtime.h"
 
 #define BMDNN_CHECK(condition) \
   do { \
diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h
index 331481383..52a704ebb 100644
--- a/saber/core/data_traits.h
+++ b/saber/core/data_traits.h
@@ -20,7 +20,6 @@
 
 #ifdef USE_BM
 #include "bmlib_runtime.h"
-#include "bmdnn_api.h"
 #include "bmlib_utils.h"
 #endif
 namespace anakin {
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 1505acbf9..fb2b2f8ee 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -45,7 +45,7 @@ typedef TargetWrapper<BM, __device_target> BM_API;
 
 // Init handle only once in the lifetime
 static bm_handle_t handle;
-static bm_status_t init_handle{bmdnn_init(&handle)};
+static bm_status_t init_handle{bmkernel_init(&handle)};
 
 bm_handle_t BM_API::get_handle() {
     return handle;

From de1fdbe134b16ff0bb0d2482d069c90da214eaec Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 7 Sep 2018 13:39:01 +0800
Subject: [PATCH 284/318] bm kernel implementation for saber op

---
 saber/CMakeLists.txt                       | 22 +++++++++++++++++++
 saber/core/common.h                        |  2 +-
 saber/core/impl/bm/bm_impl.cpp             | 18 ++++++++--------
 saber/funcs/impl/bm/device/bmkernel_base.c | 25 ++++++++++++++++++++++
 saber/funcs/impl/bm/device/bmkernel_base.h | 12 +++++++++++
 5 files changed, 69 insertions(+), 10 deletions(-)
 create mode 100644 saber/funcs/impl/bm/device/bmkernel_base.c
 create mode 100644 saber/funcs/impl/bm/device/bmkernel_base.h

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index bcbe1f961..62e7ea9d0 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -92,6 +92,28 @@ if(USE_CUDA)
 				      ${WHOLE_ARCHIVE_END})	
 endif()
 
+if(USE_BM)
+    set(BIN_NAME bmkernel_bin)
+    set(LINK_CONFIG link/bm1682_ddr.lds)
+    add_custom_command(OUTPUT bm_kernel_tmp
+        COMMAND arm-none-eabi-gcc ${ANAKIN_SABER}/funcs/impl/bm/device/bmkernel_base.c -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -std=gnu99 -O2 -Wall -Werror -ffunction-sections -fdata-sections -nostdlib -DENABLE_PRINT -I${BM_ROOT}/include/device -c -o ${BIN_NAME}.o
+        COMMAND arm-none-eabi-gcc -T ${BM_ROOT}/${LINK_CONFIG} -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -Wl,--check-sections -Wl,--gc-sections -Wl,--unresolved-symbols=report-all -Wl,--no-enum-size-warning -Wl,--start-group -lc -lm ${BIN_NAME}.o ${BM_ROOT}/lib/device/libbmkernel-top-asic.a ${BM_ROOT}/lib/device/libbmkernel-core-asic.a -Wl,--end-group -o ${BIN_NAME}.elf
+        COMMAND arm-none-eabi-objcopy -O binary -R *.slow* ${BIN_NAME}.elf ${BIN_NAME}_itcm.bin
+        COMMAND hexdump -v -e '1/4 \"%08x\\n\"' ${BIN_NAME}_itcm.bin > ${BIN_NAME}_itcm.hex.sim
+        COMMAND arm-none-eabi-objcopy -O binary -j *.slow* ${BIN_NAME}.elf ${BIN_NAME}_ddr.bin
+        COMMAND hexdump -v -e '1/4 \"%08x\\n\"' ${BIN_NAME}_ddr.bin > ${BIN_NAME}_ddr.hex.sim
+        COMMAND printf "%x" 0xAABBCCDD > ${BIN_NAME}.bin
+        COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
+        COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
+        COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
+        COMMAND printf "%x" 0x58926 >> ${BIN_NAME}.bin
+        COMMAND cat ${BIN_NAME}_itcm.hex.sim >> ${BIN_NAME}.bin
+        COMMAND cat ${BIN_NAME}_ddr.hex.sim >> ${BIN_NAME}.bin
+        COMMENT "custom compilation..."
+    )
+    add_custom_target(ANAKIN ALL DEPENDS bm_kernel_tmp)
+endif()
+
 # add saber library to static
 if(UNIX OR APPLE)
     if (USE_ARM_PLACE)
diff --git a/saber/core/common.h b/saber/core/common.h
index bb61922c7..9a6a3c72d 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -182,7 +182,7 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 #include "bmlib_utils.h"
 #include "bmkernel_runtime.h"
 
-#define BMDNN_CHECK(condition) \
+#define BM_CHECK(condition) \
   do { \
     bm_status_t error = condition; \
     CHECK_EQ(error, BM_SUCCESS) << " Failed with error code:" << error; \
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index fb2b2f8ee..70b693917 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -52,12 +52,12 @@ bm_handle_t BM_API::get_handle() {
 };
 
 void BM_API::get_device_count(int& count) {
-    BMDNN_CHECK(bm_dev_getcount(&count));
+    BM_CHECK(bm_dev_getcount(&count));
 }
 
 void BM_API::set_device(int id) {
     //(bm_handle_t &handle, bool bmkernel_used, int id){
-    //BMDNN_CHECK(bm_dev_request(&handle, 0, id));
+    //BM_CHECK(bm_dev_request(&handle, 0, id));
 }
 
 //TODO: Do we have this functionality?
@@ -69,7 +69,7 @@ void BM_API::mem_alloc(TPtr* ptr, size_t n) {
     /* bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr); */
     //    bm_device_mem_t *mem = new bm_device_mem_t();
     bm_device_mem_t mem;
-    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
+    BM_CHECK(bm_malloc_device_byte(handle, &mem, n));
     *ptr = TPtr(mem);
 }
 
@@ -82,9 +82,9 @@ void BM_API::mem_free(TPtr ptr) {
 
 void BM_API::mem_set(TPtr ptr, int value, size_t n) {
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
-    BMDNN_CHECK(bm_memset_device(handle, value, ptr));
+    BM_CHECK(bm_memset_device(handle, value, ptr));
     //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
-    //BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
+    //BM_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
 void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
@@ -92,8 +92,8 @@ void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
         size_t count, __DtoD) {
     if(count==0)
         return;
-    //BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
-    BMDNN_CHECK(bm_memcpy_d2d(handle, dst, dst_offset, src, src_offset, count));
+    //BM_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
+    BM_CHECK(bm_memcpy_d2d(handle, dst, dst_offset, src, src_offset, count));
 };
 
 void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
@@ -101,7 +101,7 @@ void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
         size_t count, __HtoD) {
     if(count==0)
         return;
-    BMDNN_CHECK(bm_memcpy_s2d(handle, dst+dst_offset, bm_mem_from_system(const_cast<void*>(src)+src_offset)));
+    BM_CHECK(bm_memcpy_s2d(handle, dst+dst_offset, bm_mem_from_system(const_cast<void*>(src)+src_offset)));
 
 #ifdef DEBUG
 
@@ -118,7 +118,7 @@ void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
     if(count==0)
         return;
 //    LOG(INFO)<<"host ptr = "<<(dst)<<",dst_offset = "<<dst_offset<<", dev ptr = "<<(src)<<",dev offset = "<<src_offset;
-    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst+dst_offset), src+src_offset));
+    BM_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst+dst_offset), src+src_offset));
 
 #ifdef DEBUG
 
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
new file mode 100644
index 000000000..51557917b
--- /dev/null
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -0,0 +1,25 @@
+#include "bmkernel_base.h"
+#include "bm_config.h"
+#include <stdio.h>
+/**
+ * bmkernel_func is the user entry to BMKERNEL just like "main" to some applications.
+ * 
+ * \param args - Pointer to arguments that user sends from host.
+ *               op - Flag to determine which op forward function 
+ *                    it should delegate to.             
+ */
+int bmkernel_func(void *args)
+{
+    bmkernel_api_base* param = (bmkernel_api_base *)args;
+    switch (param->op) {
+        case "activation":
+            // bm_activation_fwd(param)
+            break;
+        case "conv":
+            // bm_conv_fwd(param)
+            break;
+        default:
+            LOG(FATAL) << "op is not supported by BM yet.";
+    }
+    return 0;
+}
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.h b/saber/funcs/impl/bm/device/bmkernel_base.h
new file mode 100644
index 000000000..f7c8561da
--- /dev/null
+++ b/saber/funcs/impl/bm/device/bmkernel_base.h
@@ -0,0 +1,12 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H
+#define ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef struct {
+    char str[64];
+} __attribute__((packed)) bmkernel_api_base;
+#ifdef __cplusplus
+}
+#endif
+#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H */

From eaf730230d0400744d7481e3b16f2bec1aff9332 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 7 Sep 2018 13:49:49 +0800
Subject: [PATCH 285/318] Update bmkernel_api_base

---
 saber/funcs/impl/bm/device/bmkernel_base.c | 2 ++
 saber/funcs/impl/bm/device/bmkernel_base.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index 51557917b..e24f51299 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -1,5 +1,7 @@
 #include "bmkernel_base.h"
 #include "bm_config.h"
+#include "core/common.h"
+#include "core/tensor.h"
 #include <stdio.h>
 /**
  * bmkernel_func is the user entry to BMKERNEL just like "main" to some applications.
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.h b/saber/funcs/impl/bm/device/bmkernel_base.h
index f7c8561da..17c926010 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.h
+++ b/saber/funcs/impl/bm/device/bmkernel_base.h
@@ -4,7 +4,7 @@
 extern "C" {
 #endif
 typedef struct {
-    char str[64];
+    char op[64];
 } __attribute__((packed)) bmkernel_api_base;
 #ifdef __cplusplus
 }

From c1c061c31105bf0b863e0fc7366a392556426a29 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 7 Sep 2018 13:53:14 +0800
Subject: [PATCH 286/318] Add namespace

---
 saber/funcs/impl/bm/device/bmkernel_base.c | 7 +++++++
 saber/funcs/impl/bm/device/bmkernel_base.h | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index e24f51299..4695c36f9 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -10,6 +10,10 @@
  *               op - Flag to determine which op forward function 
  *                    it should delegate to.             
  */
+
+namespace anakin {
+namespace saber {
+
 int bmkernel_func(void *args)
 {
     bmkernel_api_base* param = (bmkernel_api_base *)args;
@@ -25,3 +29,6 @@ int bmkernel_func(void *args)
     }
     return 0;
 }
+
+}
+}
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.h b/saber/funcs/impl/bm/device/bmkernel_base.h
index 17c926010..fb22fc456 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.h
+++ b/saber/funcs/impl/bm/device/bmkernel_base.h
@@ -3,9 +3,16 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
+
+namespace anakin {
+namespace saber {
+
 typedef struct {
     char op[64];
 } __attribute__((packed)) bmkernel_api_base;
+
+}
+}
 #ifdef __cplusplus
 }
 #endif

From a8a83a6ef26a71035cf49fa90fa4b77cfcf314a5 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 7 Sep 2018 13:58:29 +0800
Subject: [PATCH 287/318] revert namespace first

---
 saber/funcs/impl/bm/device/bmkernel_base.c | 10 +---------
 saber/funcs/impl/bm/device/bmkernel_base.h |  5 -----
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index 4695c36f9..acdbff658 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -1,7 +1,5 @@
 #include "bmkernel_base.h"
 #include "bm_config.h"
-#include "core/common.h"
-#include "core/tensor.h"
 #include <stdio.h>
 /**
  * bmkernel_func is the user entry to BMKERNEL just like "main" to some applications.
@@ -11,9 +9,6 @@
  *                    it should delegate to.             
  */
 
-namespace anakin {
-namespace saber {
-
 int bmkernel_func(void *args)
 {
     bmkernel_api_base* param = (bmkernel_api_base *)args;
@@ -25,10 +20,7 @@ int bmkernel_func(void *args)
             // bm_conv_fwd(param)
             break;
         default:
-            LOG(FATAL) << "op is not supported by BM yet.";
+            printf("op %s is not supported by BM yet.\n", param->op);
     }
     return 0;
 }
-
-}
-}
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.h b/saber/funcs/impl/bm/device/bmkernel_base.h
index fb22fc456..a47a1e056 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.h
+++ b/saber/funcs/impl/bm/device/bmkernel_base.h
@@ -4,15 +4,10 @@
 extern "C" {
 #endif
 
-namespace anakin {
-namespace saber {
-
 typedef struct {
     char op[64];
 } __attribute__((packed)) bmkernel_api_base;
 
-}
-}
 #ifdef __cplusplus
 }
 #endif

From 156dfa8ba7d27bc4afaf4d1e900c15e25d86b887 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 7 Sep 2018 14:08:12 +0800
Subject: [PATCH 288/318] switch for bm kernel op

---
 saber/funcs/impl/bm/device/bmkernel_base.c | 4 ++--
 saber/funcs/impl/bm/device/bmkernel_base.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index acdbff658..be313bb1e 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -13,10 +13,10 @@ int bmkernel_func(void *args)
 {
     bmkernel_api_base* param = (bmkernel_api_base *)args;
     switch (param->op) {
-        case "activation":
+        case 0:
             // bm_activation_fwd(param)
             break;
-        case "conv":
+        case 1:
             // bm_conv_fwd(param)
             break;
         default:
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.h b/saber/funcs/impl/bm/device/bmkernel_base.h
index a47a1e056..029521f67 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.h
+++ b/saber/funcs/impl/bm/device/bmkernel_base.h
@@ -5,7 +5,7 @@ extern "C" {
 #endif
 
 typedef struct {
-    char op[64];
+    int op; //TODO: use ENUM
 } __attribute__((packed)) bmkernel_api_base;
 
 #ifdef __cplusplus

From 27dbddae552deb77e04677556a41c6731c1e67ea Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 7 Sep 2018 14:08:57 +0800
Subject: [PATCH 289/318] switch for bm kernel op

---
 saber/funcs/impl/bm/device/bmkernel_base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index be313bb1e..23a862b57 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -20,7 +20,7 @@ int bmkernel_func(void *args)
             // bm_conv_fwd(param)
             break;
         default:
-            printf("op %s is not supported by BM yet.\n", param->op);
+            printf("op %d is not supported by BM yet.\n", param->op);
     }
     return 0;
 }

From afc072a96e32a88746cd832fd2ce3243cb73894b Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 7 Sep 2018 15:47:37 +0800
Subject: [PATCH 290/318] Use enum for bm op type

---
 saber/funcs/impl/bm/device/bmkernel_base.c | 7 +++----
 saber/funcs/impl/bm/device/bmkernel_base.h | 7 ++++++-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index 23a862b57..a478eece3 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -5,18 +5,17 @@
  * bmkernel_func is the user entry to BMKERNEL just like "main" to some applications.
  * 
  * \param args - Pointer to arguments that user sends from host.
- *               op - Flag to determine which op forward function 
- *                    it should delegate to.             
+ *               op - Flag to determine the operation type.             
  */
 
 int bmkernel_func(void *args)
 {
     bmkernel_api_base* param = (bmkernel_api_base *)args;
     switch (param->op) {
-        case 0:
+        case ACTIVATION:
             // bm_activation_fwd(param)
             break;
-        case 1:
+        case CONV:
             // bm_conv_fwd(param)
             break;
         default:
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.h b/saber/funcs/impl/bm/device/bmkernel_base.h
index 029521f67..97f099300 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.h
+++ b/saber/funcs/impl/bm/device/bmkernel_base.h
@@ -4,8 +4,13 @@
 extern "C" {
 #endif
 
+enum BmOpType {
+    ACTIVATION, 
+    CONV
+};
+
 typedef struct {
-    int op; //TODO: use ENUM
+    enum BmOpType op; // Flag to determine the operation type.
 } __attribute__((packed)) bmkernel_api_base;
 
 #ifdef __cplusplus

From 6ce0724ac99744b7ad9645d64130cd7dfa8ac1b4 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 7 Sep 2018 15:55:34 +0800
Subject: [PATCH 291/318] Add BM conv implementation

---
 saber/funcs/conv.h                  |  2 +-
 saber/funcs/impl/bm/vender_conv.cpp | 58 +++++++++++++++++++++++++++++
 saber/funcs/impl/bm/vender_conv.h   | 36 ++++++++++++++++++
 3 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 saber/funcs/impl/bm/vender_conv.cpp
 create mode 100644 saber/funcs/impl/bm/vender_conv.h

diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h
index 0fdefb291..5c407e93f 100644
--- a/saber/funcs/conv.h
+++ b/saber/funcs/conv.h
@@ -34,7 +34,7 @@
 #endif
 
 #ifdef USE_BM
-//#include "saber/funcs/impl/bm/vender_conv.h"
+#include "saber/funcs/impl/bm/vender_conv.h"
 #endif
 namespace anakin {
 namespace saber {
diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
new file mode 100644
index 000000000..cf6a72aac
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -0,0 +1,58 @@
+
+#include "saber/funcs/impl/bm/vender_conv.h"
+#include "bmkernel_base.h"
+#include <string.h>
+#include <stdio.h>
+#include <iostream>
+
+namespace anakin
+{
+namespace saber
+{
+
+// FP32 part
+template <>
+SaberStatus VenderConv2D<BM, AK_FLOAT>::\
+    create(const std::vector<Tensor<BM> *>& inputs,
+            std::vector<Tensor<BM> *>& outputs,
+            ConvParam<BM>& param, Context<BM>& ctx)
+{
+}
+
+template <>
+SaberStatus VenderConv2D<BM, AK_FLOAT>::\
+    init(const std::vector<Tensor<BM> *> &inputs,
+         std::vector<Tensor<BM> *> &outputs,
+         ConvParam<BM> &param, Context<BM> &ctx)
+{
+
+    _handle = ctx.get_handle();
+    return create(inputs, outputs, param, ctx);
+}
+
+template <>
+SaberStatus VenderConv2D<BM, AK_FLOAT>::\
+    dispatch(const std::vector<Tensor<BM>*>& inputs,
+                std::vector<Tensor<BM>*>& outputs,
+                ConvParam<BM>& param)
+{
+    enum BmOpType op = CONV;
+    bmkernel_api_base api = { op };
+
+    //TODO: pass conv args into BM Kernel
+
+    bm_status_t bm_stat = bmkernel_launch(_handle, "bmkernel_bin.bin");
+    CHECK_EQ(BM_SUCCESS, bm_stat) << "bmkernel_launch failed.";
+    
+    /* Send arguments. */
+    BM_CHECK(bmkernel_send_args(_handle, reinterpret_cast<void *>(&api), sizeof(api)));
+
+    return SaberSuccess;
+}
+
+// INT8 part
+// Not supported yet
+
+template class VenderConv2D<BM, AK_FLOAT>;
+} // namespace saber
+} // namespace anakin
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
new file mode 100644
index 000000000..b37ed1f9b
--- /dev/null
+++ b/saber/funcs/impl/bm/vender_conv.h
@@ -0,0 +1,36 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_CONV2D_H
+#define ANAKIN_SABER_FUNCS_IMPL_BM_CONV2D_H
+
+#include "saber/funcs/impl/impl_conv.h"
+
+namespace anakin{
+
+namespace saber{
+
+template <DataType OpDtype>
+class VenderConv2D<BM, OpDtype> : public ImplBase<
+        BM, OpDtype, ConvParam<BM> > {
+            
+public:
+    VenderConv2D(): _handle(NULL) {}
+    ~VenderConv2D() {}
+
+    virtual SaberStatus init(const std::vector<Tensor<BM> *>& inputs,
+                             std::vector<Tensor<BM> *>& outputs,
+                             ConvParam<BM>& param, Context<BM>& ctx);
+
+    virtual SaberStatus create(const std::vector<Tensor<BM> *>& inputs,
+                               std::vector<Tensor<BM> *>& outputs,
+                               ConvParam<BM>& param, Context<BM>& ctx);
+
+    virtual SaberStatus dispatch(const std::vector<Tensor<BM>*>& inputs,
+                                 std::vector<Tensor<BM>*>& outputs,
+                                 ConvParam<BM>& param);
+
+private:
+    bm_handle_t _handle;
+};
+
+}
+}
+#endif //ANAKIN_SABER_FUNCS_IMPL_BM_CONV2D_H

From 10ec5244dc8507b0a9101a02f673282b946272ed Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 10 Sep 2018 14:47:01 +0800
Subject: [PATCH 292/318] update bm bin path

---
 saber/funcs/impl/bm/vender_conv.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index cf6a72aac..a8433b8c1 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -41,7 +41,7 @@ SaberStatus VenderConv2D<BM, AK_FLOAT>::\
 
     //TODO: pass conv args into BM Kernel
 
-    bm_status_t bm_stat = bmkernel_launch(_handle, "bmkernel_bin.bin");
+    bm_status_t bm_stat = bmkernel_launch(_handle, "../../build/saber/bmkernel_bin.bin");
     CHECK_EQ(BM_SUCCESS, bm_stat) << "bmkernel_launch failed.";
     
     /* Send arguments. */

From eff245e709e5b06893ee3caec1aed36c1293ad7f Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 10 Sep 2018 17:21:43 +0800
Subject: [PATCH 293/318] host bm kernel bin at bm root system directory

---
 saber/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 62e7ea9d0..03a9de20b 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -112,6 +112,7 @@ if(USE_BM)
         COMMENT "custom compilation..."
     )
     add_custom_target(ANAKIN ALL DEPENDS bm_kernel_tmp)
+    configure_file(${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/bmkernel_bin.bin COPYONLY)
 endif()
 
 # add saber library to static

From 62301c64d5b49a3367aa762e415cb10b565737cb Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 10 Sep 2018 17:23:01 +0800
Subject: [PATCH 294/318] Update bm kernel bin path

---
 saber/funcs/impl/bm/vender_conv.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index a8433b8c1..1da25809f 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -41,7 +41,7 @@ SaberStatus VenderConv2D<BM, AK_FLOAT>::\
 
     //TODO: pass conv args into BM Kernel
 
-    bm_status_t bm_stat = bmkernel_launch(_handle, "../../build/saber/bmkernel_bin.bin");
+    bm_status_t bm_stat = bmkernel_launch(_handle, "/usr/local/include/bm/bmkernel_bin.bin");
     CHECK_EQ(BM_SUCCESS, bm_stat) << "bmkernel_launch failed.";
     
     /* Send arguments. */

From b957a813f59d0abb0901b114ed284b40fecf5e86 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 11 Sep 2018 09:44:11 +0800
Subject: [PATCH 295/318] Cleanup after merge

---
 saber/CMakeLists.txt              |  2 +-
 saber/core/context.h              | 12 ++++++------
 saber/core/target_wrapper.h       |  2 +-
 test/saber/test_saber_buffer.cpp  |  2 +-
 test/saber/test_saber_context.cpp |  4 ++--
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 8a37e0cad..d822b7b76 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -92,7 +92,7 @@ if(USE_CUDA)
 				      ${WHOLE_ARCHIVE_END})	
 endif()
 
-if(USE_BM)
+if(USE_BM_PLACE)
     set(BIN_NAME bmkernel_bin)
     set(LINK_CONFIG link/bm1682_ddr.lds)
     add_custom_command(OUTPUT bm_kernel_tmp
diff --git a/saber/core/context.h b/saber/core/context.h
index 0cc032f2f..fc21bc755 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -35,7 +35,7 @@ class Context final{
      * @param compute_stream_id
      */
     Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){
-#ifdef USE_BM        
+#ifdef USE_BM_PLACE        
         if(std::is_same<TargetType, BM>::value){
             LOG(INFO) << "context init for BM";
             int dev_count = 0;
@@ -69,7 +69,7 @@ class Context final{
     }
 
     Context(const Context<TargetType>& ctx){
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
         if(std::is_same<TargetType, BM>::value){
             LOG(INFO) << "context init for BM";
             _bm_handle = ctx._bm_handle;
@@ -98,7 +98,7 @@ class Context final{
         this->_act_ids = ctx._act_ids;
         this->_mode = ctx._mode;
 #endif
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
         this->_bm_handle = ctx._bm_handle;
 #endif
         return *this;
@@ -109,7 +109,7 @@ class Context final{
         comp_eq = comp_eq && (_device_id == right._device_id);
         comp_eq = comp_eq && (_data_stream_id == right._data_stream_id);
         comp_eq = comp_eq && (_compute_stream_id == right._compute_stream_id);
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
         comp_eq = comp_eq && (_bm_handle == right._bm_handle);
 #endif
         return comp_eq;
@@ -151,7 +151,7 @@ class Context final{
     //std::vector<int> get_act_ids();
 #endif
 
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
     bm_handle_t get_handle() {
         return _bm_handle;
     }
@@ -170,7 +170,7 @@ class Context final{
     PowerMode _mode{SABER_POWER_HIGH};
     std::vector<int> _act_ids{0};
 #endif
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
     bm_handle_t _bm_handle;
 #endif
 };
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index b888a5e24..6991119c8 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -450,7 +450,7 @@ struct TargetWrapper<BM, __device_target> {
     static bm_handle_t get_handle();
 
 };
-#endif //USE_BM
+#endif //USE_BM_PLACE
 
 #ifdef AMD_GPU
 
diff --git a/test/saber/test_saber_buffer.cpp b/test/saber/test_saber_buffer.cpp
index f1afc3a9b..ac49f4f20 100644
--- a/test/saber/test_saber_buffer.cpp
+++ b/test/saber/test_saber_buffer.cpp
@@ -138,7 +138,7 @@ TEST(TestSaberFunc, test_saber_buffer) {
     test_buffer<ARM, ARM, AK_INT8>();
 #endif
 
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
     LOG(INFO) << "test BM FP32 buffer";
     //test_buffer<BM, X86, AK_FLOAT>();
 #endif
diff --git a/test/saber/test_saber_context.cpp b/test/saber/test_saber_context.cpp
index db2e7a6f9..b7c41b768 100644
--- a/test/saber/test_saber_context.cpp
+++ b/test/saber/test_saber_context.cpp
@@ -57,12 +57,12 @@ TEST(TestSaberFunc, test_arm_context) {
 }
 #endif //USE_ARM_PLACE
 
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
 TEST(TestSaberFunc, test_BM_context) {
     Context<BM> ctx;
     CHECK_NOTNULL(ctx.get_handle()) << "Failed to get BM handle";
 }
-#endif //USE_BM
+#endif //USE_BM_PLACE
 
 int main(int argc, const char** argv) {
     // initial logger

From 59476a1d4edb25e1c94db3d08217015c6fc1dc3a Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 11 Sep 2018 10:28:53 +0800
Subject: [PATCH 296/318] comment out configure_file first

---
 saber/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index d822b7b76..50735977c 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -112,7 +112,7 @@ if(USE_BM_PLACE)
         COMMENT "custom compilation..."
     )
     add_custom_target(ANAKIN ALL DEPENDS bm_kernel_tmp)
-    configure_file(${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/bmkernel_bin.bin COPYONLY)
+    #configure_file(${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/bmkernel_bin.bin COPYONLY)
 endif()
 
 # add saber library to static

From b07a5dc3aca3b633cefbc2dafb76f630a9ee15c1 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 11 Sep 2018 11:45:02 +0800
Subject: [PATCH 297/318] Comment out code with issue

---
 framework/core/net/worker.cpp | 4 ++--
 saber/CMakeLists.txt          | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/framework/core/net/worker.cpp b/framework/core/net/worker.cpp
index 0e8e436da..0af132fa2 100644
--- a/framework/core/net/worker.cpp
+++ b/framework/core/net/worker.cpp
@@ -107,7 +107,7 @@ Worker<Ttype, Ptype, RunType>::sync_prediction(std::vector<Tensor4d<typename tar
             d_tensor_in_p->copy_from(ins[i]);
             d_tensor_in_p->set_seq_offset(ins[i].get_seq_offset());
         } 
-        Context<NV> ctx(0, 0, 0); 
+        /*Context<NV> ctx(0, 0, 0); 
         saber::SaberTimer<NV> my_time; 
         my_time.start(ctx);
 #ifdef ENABLE_OP_TIMER
@@ -118,7 +118,7 @@ Worker<Ttype, Ptype, RunType>::sync_prediction(std::vector<Tensor4d<typename tar
         net.prediction(); 
 
         my_time.end(ctx); 
-        LOG(ERROR) << " exec  << time: " << my_time.get_average_ms() << " ms ";
+        LOG(ERROR) << " exec  << time: " << my_time.get_average_ms() << " ms ";*/
 
 #ifdef ENABLE_OP_TIMER
         my_time.end(ctx); 
diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 50735977c..c3b42c1bd 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -109,10 +109,10 @@ if(USE_BM_PLACE)
         COMMAND printf "%x" 0x58926 >> ${BIN_NAME}.bin
         COMMAND cat ${BIN_NAME}_itcm.hex.sim >> ${BIN_NAME}.bin
         COMMAND cat ${BIN_NAME}_ddr.hex.sim >> ${BIN_NAME}.bin
-        COMMENT "custom compilation..."
+        COMMAND cp ${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/bmkernel_bin.bin
+        COMMENT "BM Kernel compilation..."
     )
     add_custom_target(ANAKIN ALL DEPENDS bm_kernel_tmp)
-    #configure_file(${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/bmkernel_bin.bin COPYONLY)
 endif()
 
 # add saber library to static

From a0a9618c4c055cc6f8c4bae1b9ed28355e5a395d Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 11 Sep 2018 14:34:01 +0800
Subject: [PATCH 298/318] uncomment bm conv test

---
 test/saber/test_saber_conv.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/saber/test_saber_conv.cpp b/test/saber/test_saber_conv.cpp
index 52411fbb3..6fa1bede6 100644
--- a/test/saber/test_saber_conv.cpp
+++ b/test/saber/test_saber_conv.cpp
@@ -11,7 +11,6 @@ using namespace anakin::saber;
 #define CHECK_RESULT
 //#define CHECK_SPEED
 
-#if 0
 #ifdef USE_BM_PLACE
 TEST(TestSaberFunc, test_saber_conv_results_bm) {
     Env<BM>::env_init();
@@ -70,7 +69,6 @@ TEST(TestSaberFunc, test_saber_conv_results_bm) {
     }
 }
 #endif
-#endif
 
 TEST(TestSaberFunc, test_saber_conv_results) {
 #ifdef USE_CUDA

From 7beca133e959df62301d1890fa90355c29f00284 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 13 Sep 2018 12:06:46 +0800
Subject: [PATCH 299/318] test

---
 saber/funcs/impl/bm/vender_conv.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index 1da25809f..4e50032fc 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -47,6 +47,8 @@ SaberStatus VenderConv2D<BM, AK_FLOAT>::\
     /* Send arguments. */
     BM_CHECK(bmkernel_send_args(_handle, reinterpret_cast<void *>(&api), sizeof(api)));
 
+    LOG(INFO) << "BM conv done!";
+
     return SaberSuccess;
 }
 

From 0e45e329865a489f09412856b97f471e0cd155a6 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 13 Sep 2018 12:13:28 +0800
Subject: [PATCH 300/318] Revert "test"

This reverts commit 7beca133e959df62301d1890fa90355c29f00284.
---
 saber/funcs/impl/bm/vender_conv.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index 4e50032fc..1da25809f 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -47,8 +47,6 @@ SaberStatus VenderConv2D<BM, AK_FLOAT>::\
     /* Send arguments. */
     BM_CHECK(bmkernel_send_args(_handle, reinterpret_cast<void *>(&api), sizeof(api)));
 
-    LOG(INFO) << "BM conv done!";
-
     return SaberSuccess;
 }
 

From c3ac7c9ecb62aa01c9706d081d6397d5fc71140f Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 13 Sep 2018 16:03:21 +0800
Subject: [PATCH 301/318] Fix BM bin compilation issue

---
 saber/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index c3b42c1bd..47289de4b 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -96,8 +96,8 @@ if(USE_BM_PLACE)
     set(BIN_NAME bmkernel_bin)
     set(LINK_CONFIG link/bm1682_ddr.lds)
     add_custom_command(OUTPUT bm_kernel_tmp
-        COMMAND arm-none-eabi-gcc ${ANAKIN_SABER}/funcs/impl/bm/device/bmkernel_base.c -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -std=gnu99 -O2 -Wall -Werror -ffunction-sections -fdata-sections -nostdlib -DENABLE_PRINT -I${BM_ROOT}/include/device -c -o ${BIN_NAME}.o
-        COMMAND arm-none-eabi-gcc -T ${BM_ROOT}/${LINK_CONFIG} -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -Wl,--check-sections -Wl,--gc-sections -Wl,--unresolved-symbols=report-all -Wl,--no-enum-size-warning -Wl,--start-group -lc -lm ${BIN_NAME}.o ${BM_ROOT}/lib/device/libbmkernel-top-asic.a ${BM_ROOT}/lib/device/libbmkernel-core-asic.a -Wl,--end-group -o ${BIN_NAME}.elf
+        COMMAND arm-none-eabi-gcc ${ANAKIN_SABER}/funcs/impl/bm/device/bmkernel_base.c -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -std=gnu99 -O2 -Wall -Werror -ffunction-sections -fdata-sections -nostdlib -DENABLE_PRINT -I${BM_ROOT}/include/device/atomic -I${BM_ROOT}/include/device/utils -I${BM_ROOT}/include/device -c -o ${BIN_NAME}.o
+        COMMAND arm-none-eabi-gcc -T ${BM_ROOT}/${LINK_CONFIG} -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -Wl,--check-sections -Wl,--gc-sections -Wl,--unresolved-symbols=report-all -Wl,--no-enum-size-warning -o ${BIN_NAME}.elf -Wl,--start-group -lc -lm ${BIN_NAME}.o ${BM_ROOT}/lib/device/libbmkernel-top-asic.a ${BM_ROOT}/lib/device/libbmkernel-core-asic.a -Wl,--end-group
         COMMAND arm-none-eabi-objcopy -O binary -R *.slow* ${BIN_NAME}.elf ${BIN_NAME}_itcm.bin
         COMMAND hexdump -v -e '1/4 \"%08x\\n\"' ${BIN_NAME}_itcm.bin > ${BIN_NAME}_itcm.hex.sim
         COMMAND arm-none-eabi-objcopy -O binary -j *.slow* ${BIN_NAME}.elf ${BIN_NAME}_ddr.bin
@@ -106,7 +106,7 @@ if(USE_BM_PLACE)
         COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
         COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
         COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
-        COMMAND printf "%x" 0x58926 >> ${BIN_NAME}.bin
+        COMMAND printf "%x" 348444 >> ${BIN_NAME}.bin
         COMMAND cat ${BIN_NAME}_itcm.hex.sim >> ${BIN_NAME}.bin
         COMMAND cat ${BIN_NAME}_ddr.hex.sim >> ${BIN_NAME}.bin
         COMMAND cp ${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/bmkernel_bin.bin

From 3e7048df6608df602669d5dd13e673215d065d75 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 14 Sep 2018 13:13:04 +0800
Subject: [PATCH 302/318] Fix issue for BM bin

---
 saber/CMakeLists.txt              | 8 +++++---
 saber/funcs/impl/bm/bmk_script.sh | 4 ++++
 2 files changed, 9 insertions(+), 3 deletions(-)
 create mode 100644 saber/funcs/impl/bm/bmk_script.sh

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index c3b42c1bd..82019fe42 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -96,8 +96,8 @@ if(USE_BM_PLACE)
     set(BIN_NAME bmkernel_bin)
     set(LINK_CONFIG link/bm1682_ddr.lds)
     add_custom_command(OUTPUT bm_kernel_tmp
-        COMMAND arm-none-eabi-gcc ${ANAKIN_SABER}/funcs/impl/bm/device/bmkernel_base.c -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -std=gnu99 -O2 -Wall -Werror -ffunction-sections -fdata-sections -nostdlib -DENABLE_PRINT -I${BM_ROOT}/include/device -c -o ${BIN_NAME}.o
-        COMMAND arm-none-eabi-gcc -T ${BM_ROOT}/${LINK_CONFIG} -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -Wl,--check-sections -Wl,--gc-sections -Wl,--unresolved-symbols=report-all -Wl,--no-enum-size-warning -Wl,--start-group -lc -lm ${BIN_NAME}.o ${BM_ROOT}/lib/device/libbmkernel-top-asic.a ${BM_ROOT}/lib/device/libbmkernel-core-asic.a -Wl,--end-group -o ${BIN_NAME}.elf
+        COMMAND arm-none-eabi-gcc ${ANAKIN_SABER}/funcs/impl/bm/device/bmkernel_base.c -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -std=gnu99 -O2 -Wall -Werror -ffunction-sections -fdata-sections -nostdlib -DENABLE_PRINT -I${BM_ROOT}/include/device/atomic -I${BM_ROOT}/include/device/utils -I${BM_ROOT}/include/device -c -o ${BIN_NAME}.o
+        COMMAND arm-none-eabi-gcc -T ${BM_ROOT}/${LINK_CONFIG} -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -Wl,--check-sections -Wl,--gc-sections -Wl,--unresolved-symbols=report-all -Wl,--no-enum-size-warning -o ${BIN_NAME}.elf -Wl,--start-group -lc -lm ${BIN_NAME}.o ${BM_ROOT}/lib/device/libbmkernel-top-asic.a ${BM_ROOT}/lib/device/libbmkernel-core-asic.a -Wl,--end-group
         COMMAND arm-none-eabi-objcopy -O binary -R *.slow* ${BIN_NAME}.elf ${BIN_NAME}_itcm.bin
         COMMAND hexdump -v -e '1/4 \"%08x\\n\"' ${BIN_NAME}_itcm.bin > ${BIN_NAME}_itcm.hex.sim
         COMMAND arm-none-eabi-objcopy -O binary -j *.slow* ${BIN_NAME}.elf ${BIN_NAME}_ddr.bin
@@ -106,7 +106,9 @@ if(USE_BM_PLACE)
         COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
         COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
         COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
-        COMMAND printf "%x" 0x58926 >> ${BIN_NAME}.bin
+
+        COMMAND ${ANAKIN_SABER}/funcs/impl/bm/bmk_script.sh
+
         COMMAND cat ${BIN_NAME}_itcm.hex.sim >> ${BIN_NAME}.bin
         COMMAND cat ${BIN_NAME}_ddr.hex.sim >> ${BIN_NAME}.bin
         COMMAND cp ${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/bmkernel_bin.bin
diff --git a/saber/funcs/impl/bm/bmk_script.sh b/saber/funcs/impl/bm/bmk_script.sh
new file mode 100644
index 000000000..892bf3af1
--- /dev/null
+++ b/saber/funcs/impl/bm/bmk_script.sh
@@ -0,0 +1,4 @@
+ITCM_SZ_OCT=$(wc -c < bmkernel_bin_itcm.hex.sim)
+ITCM_SZ=$(echo "obase=16;$ITCM_SZ_OCT" | bc)
+echo 0x$ITCM_SZ
+printf "%x" 0x$ITCM_SZ >> bmkernel_bin.bin
\ No newline at end of file

From 179d6f4fa737af87094f07b26f954d2a5dfd8c94 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 14 Sep 2018 13:46:30 +0800
Subject: [PATCH 303/318] scripting permission

---
 saber/funcs/impl/bm/bmk_script.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 saber/funcs/impl/bm/bmk_script.sh

diff --git a/saber/funcs/impl/bm/bmk_script.sh b/saber/funcs/impl/bm/bmk_script.sh
old mode 100644
new mode 100755

From bf6e712dd155dc2fbb6bc4961272ae64917442ba Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 14 Sep 2018 14:55:08 +0800
Subject: [PATCH 304/318] Fix bm bin compilation issue

---
 saber/CMakeLists.txt              | 4 ++--
 saber/funcs/impl/bm/bmk_script.sh | 4 ----
 2 files changed, 2 insertions(+), 6 deletions(-)
 delete mode 100755 saber/funcs/impl/bm/bmk_script.sh

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 82019fe42..c27591c8f 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -107,11 +107,11 @@ if(USE_BM_PLACE)
         COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
         COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
 
-        COMMAND ${ANAKIN_SABER}/funcs/impl/bm/bmk_script.sh
+        COMMAND printf \"%x\" `wc -c < ${BIN_NAME}_itcm.hex.sim` >> ${BIN_NAME}.bin
 
         COMMAND cat ${BIN_NAME}_itcm.hex.sim >> ${BIN_NAME}.bin
         COMMAND cat ${BIN_NAME}_ddr.hex.sim >> ${BIN_NAME}.bin
-        COMMAND cp ${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/bmkernel_bin.bin
+        COMMAND cp ${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/${BIN_NAME}.bin
         COMMENT "BM Kernel compilation..."
     )
     add_custom_target(ANAKIN ALL DEPENDS bm_kernel_tmp)
diff --git a/saber/funcs/impl/bm/bmk_script.sh b/saber/funcs/impl/bm/bmk_script.sh
deleted file mode 100755
index 892bf3af1..000000000
--- a/saber/funcs/impl/bm/bmk_script.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-ITCM_SZ_OCT=$(wc -c < bmkernel_bin_itcm.hex.sim)
-ITCM_SZ=$(echo "obase=16;$ITCM_SZ_OCT" | bc)
-echo 0x$ITCM_SZ
-printf "%x" 0x$ITCM_SZ >> bmkernel_bin.bin
\ No newline at end of file

From 46aa1545a65d53e9d2055b63ee3802c5a57eb9df Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 18 Sep 2018 16:05:37 +0800
Subject: [PATCH 305/318] BM conv host implementation

---
 saber/funcs/impl/bm/bm_common.h            | 171 +++++++++++++++
 saber/funcs/impl/bm/device/bmk_conv.c      |   5 +
 saber/funcs/impl/bm/device/bmkernel_base.c |  10 +-
 saber/funcs/impl/bm/device/bmkernel_base.h |  28 +++
 saber/funcs/impl/bm/vender_conv.cpp        | 229 ++++++++++++++++++++-
 5 files changed, 437 insertions(+), 6 deletions(-)
 create mode 100644 saber/funcs/impl/bm/bm_common.h
 create mode 100644 saber/funcs/impl/bm/device/bmk_conv.c

diff --git a/saber/funcs/impl/bm/bm_common.h b/saber/funcs/impl/bm/bm_common.h
new file mode 100644
index 000000000..ab9716433
--- /dev/null
+++ b/saber/funcs/impl/bm/bm_common.h
@@ -0,0 +1,171 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_BM_COMMON_H
+#define ANAKIN_SABER_FUNCS_IMPL_BM_BM_COMMON_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <math.h>
+#include <unistd.h>
+#include <time.h>
+#include "bm_config.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_MESSAGE
+#ifdef DEBUG_MESSAGE
+#define MSG_DBG(fmt, ...)       printf("MSG: "fmt, ##__VA_ARGS__)
+#else
+#define MSG_DBG(fmt, ...)
+#endif
+
+
+#define INLINE                  inline
+
+#define UNUSED(x)               (void)(x)
+
+#define __ALIGN_MASK(x,mask)    (((x)+(mask))&~(mask))
+#define ALIGN(x,a)              __ALIGN_MASK(x,(__typeof__(x))(a)-1)
+
+#define ROUND_UP(A, B)  ((A)/(B) + ((A) % (B) == 0 ? 0 : 1))
+
+#define bm_min(x, y)               ((x) < (y) ? (x) : (y))
+#define bm_max(x, y)               ((x) > (y) ? (x) : (y))
+
+
+typedef unsigned char           u8;
+typedef unsigned short          u16;
+typedef unsigned int            u32;
+typedef unsigned long long      u64;
+
+typedef union {
+  int ival;
+  float fval;
+} IF_VAL;
+
+typedef u32 tuple4_u32[4];
+
+typedef struct tensor_info{
+    u32 n,c,h,w;
+    u32 w_stride, n_stride, c_stride, h_stride;
+    u32 address;
+    u32 data_format;
+    u32 neuron_matrix;		//0: neuron, 1: matrix
+    u32 matrix_col_magin;	//the magin is not 0, when column_num%w_param!=0
+}TENSOR_INFO;
+
+
+typedef struct shape{
+    u16 n, c, h, w;
+}local_shape_t;
+
+#define FLOAT_SIZE              4
+#define INT8_SIZE               1
+#define FLOAT_BITWIDTH          32
+#define GET_U64(U32_H, U32_L)   (((u64)(U32_H) << 32) | (u64)(U32_L))
+
+typedef enum {
+    CAFFE_SUPPORT             = 0,
+    TENSORFLOW_SUPPORT        = 1
+} PLATFORM_SUPPORT;
+
+typedef enum {
+    NODECHIP_REG    = 0,
+    HOST_REG        = 1
+} REG_TYPE;
+
+typedef enum {
+  ENGINE_BD                     = 0,
+  ENGINE_GDMA                   = 1,
+  ENGINE_CDMA                   = 2,
+  ENGINE_HDMA                   = 3,
+  ENGINE_END
+} ENGINE_ID;
+
+typedef struct tensor_4d_t {
+    int n;
+    int c;
+    int h;
+    int w;
+} bm_tensor_4d_t;
+
+typedef struct kernel_param{
+    int g;
+    int oc;
+    int ic;
+    int h;
+    int w;
+} bm_kernel_param_t;
+
+typedef struct bm_conv_param{
+    int stride_h;
+    int stride_w;
+    int pad_h;
+    int pad_w;
+    int dilation_h;
+    int dilation_w;
+    bool result_add;
+} bm_conv_param_t;
+
+typedef struct conv_secs_info{
+    int ocsecs;
+    int icsecs;
+    int nsecs;
+    int hsecs;
+} conv_secs_info_t;
+
+static INLINE int ceiling_func(int numerator, int denominator)
+{
+  return (numerator + denominator - 1) / denominator;
+}
+
+static INLINE int ceiling_func_shift(int numerator, int shift)
+{
+  return (numerator + (1 << shift) - 1) >> shift;
+}
+
+static int INLINE calc_offset(int *shape, int *offset)
+{
+  return ((offset[0] * shape[1] + offset[1]) * shape[2] + offset[2])
+      * shape[3] + offset[3];
+}
+
+//All the size are in the units of bytes
+static int INLINE get_index_csize_global(int h, int w, int index_bitwidth)
+{
+  int size = h * w * index_bitwidth;
+  //32 bit align
+  return (((size >> 5)) + ((size & 0x1f) != 0)) * FLOAT_SIZE;
+}
+
+static int INLINE get_index_cstride_global(int h, int w, int index_bitwidth)
+{
+  int size = h * w * index_bitwidth;
+  //32 bit align
+  return (((size >> 5)) +
+          ((size & 0x1f) != 0)) * FLOAT_BITWIDTH / index_bitwidth;
+}
+
+static int INLINE get_neuron_csize_local(int h, int w)
+{
+  int size = h * w;
+  //EU_NUM neurons align
+  return ALIGN(size,EU_NUM) * FLOAT_SIZE;
+}
+
+static int INLINE addr_EU_align(int addr){
+  addr = addr / FLOAT_SIZE;
+  return ALIGN( addr, EU_NUM ) * FLOAT_SIZE;
+}
+
+static INLINE int get_align_tensor_size(bm_tensor_4d_t shape){
+  int c_per_npu = ceiling_func_shift(shape.c, NPU_SHIFT);
+  return shape.n * c_per_npu * get_neuron_csize_local(shape.h, shape.w);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_BM_COMMON_H */
diff --git a/saber/funcs/impl/bm/device/bmk_conv.c b/saber/funcs/impl/bm/device/bmk_conv.c
new file mode 100644
index 000000000..9f85f1009
--- /dev/null
+++ b/saber/funcs/impl/bm/device/bmk_conv.c
@@ -0,0 +1,5 @@
+#include <stdio.h>
+
+void bm_conv_fwd(bm_api_conv_forward conv_param) {
+    
+}
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index a478eece3..42cb0e591 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -1,5 +1,6 @@
 #include "bmkernel_base.h"
 #include "bm_config.h"
+#include "bmk_conv.c"
 #include <stdio.h>
 /**
  * bmkernel_func is the user entry to BMKERNEL just like "main" to some applications.
@@ -12,12 +13,15 @@ int bmkernel_func(void *args)
 {
     bmkernel_api_base* param = (bmkernel_api_base *)args;
     switch (param->op) {
-        case ACTIVATION:
+        case ACTIVATION: {
             // bm_activation_fwd(param)
             break;
-        case CONV:
-            // bm_conv_fwd(param)
+        }
+        case CONV: {
+            bm_api_conv_forward* api = (bm_api_conv_forward *)param->opParam;
+            bm_conv_fwd(*api);
             break;
+        }
         default:
             printf("op %d is not supported by BM yet.\n", param->op);
     }
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.h b/saber/funcs/impl/bm/device/bmkernel_base.h
index 97f099300..5a192127c 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.h
+++ b/saber/funcs/impl/bm/device/bmkernel_base.h
@@ -9,8 +9,36 @@ enum BmOpType {
     CONV
 };
 
+typedef struct {
+    unsigned long long             ifmap_offset_global;
+    unsigned long long             ofmap_offset_global;
+    unsigned long long             weight_offset_global;
+    unsigned long long             bias_offset_global;
+    int                            input_n;   // note this is total input_n
+    int                            input_c;
+    int                            input_h;
+    int                            input_w;
+    int                            groups;
+    int                            output_c;
+    int                            kh;
+    int                            kw;
+    int                            dh;
+    int                            dw;
+    int                            pad_h;
+    int                            pad_w;
+    int                            stride_h;
+    int                            stride_w;
+    int                            using_bias;
+    int                            result_add;
+    int                            icsecs;
+    int                            ocsecs;
+    int                            nsecs;
+    int                            hsecs;
+} __attribute__((packed)) bm_api_conv_forward;
+
 typedef struct {
     enum BmOpType op; // Flag to determine the operation type.
+    void* opParam;
 } __attribute__((packed)) bmkernel_api_base;
 
 #ifdef __cplusplus
diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index 1da25809f..ae0be1fd8 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -1,6 +1,7 @@
 
 #include "saber/funcs/impl/bm/vender_conv.h"
 #include "bmkernel_base.h"
+#include "bm_common.h"
 #include <string.h>
 #include <stdio.h>
 #include <iostream>
@@ -10,6 +11,141 @@ namespace anakin
 namespace saber
 {
 
+void conv_splitc(bm_kernel_param_t kernel_param, conv_secs_info_t *secs_info){
+  int oc_per_NPU = ceiling_func_shift(kernel_param.oc, NPU_SHIFT);
+  int kernel_size = kernel_param.h * kernel_param.w * FLOAT_SIZE;
+  int weight_capacity = kernel_param.ic * oc_per_NPU * kernel_size;
+  secs_info->icsecs = 1;
+  secs_info->ocsecs = 1;
+  const int quart_local_size = (LOCAL_MEM_SIZE >> 2);
+  if( weight_capacity > (LOCAL_MEM_SIZE >> 1) ){
+    const int max_weight_size = quart_local_size;
+    secs_info->icsecs = weight_capacity / max_weight_size + 1;
+    if(secs_info->icsecs > kernel_param.ic){
+      secs_info->icsecs = kernel_param.ic;
+    }
+    int icslice = (kernel_param.ic + secs_info->icsecs - 1) / secs_info->icsecs;
+    weight_capacity = icslice * oc_per_NPU * kernel_size * FLOAT_SIZE;
+    weight_capacity = addr_EU_align( weight_capacity);
+    int max_ocsecs = oc_per_NPU;
+    while( weight_capacity > max_weight_size ){
+      if(secs_info->ocsecs == 1){
+        secs_info->ocsecs = weight_capacity / quart_local_size + 1;
+      }
+      if(secs_info->ocsecs > max_ocsecs){
+        secs_info->ocsecs = max_ocsecs;
+        break;
+      }else{
+        secs_info->ocsecs++;
+      }
+      int ocslice = (kernel_param.oc + secs_info->ocsecs - 1) / secs_info->ocsecs;
+      oc_per_NPU = ceiling_func_shift(ocslice, NPU_SHIFT);
+      weight_capacity = icslice * oc_per_NPU * kernel_size * FLOAT_SIZE;
+      weight_capacity = addr_EU_align(weight_capacity);
+    }
+  }
+}
+
+static bm_status_t conv_splith(bm_tensor_4d_t input_shape, bm_tensor_4d_t output_shape,
+    bm_conv_param_t conv_param, int local_mem_capacity, int kh, conv_secs_info_t *secs_info){
+  int io_need = get_align_tensor_size(input_shape) +
+      get_align_tensor_size(output_shape);
+  secs_info->hsecs = io_need / local_mem_capacity;
+  int output_h = output_shape.h;
+  output_shape.h = (output_h + secs_info->hsecs - 1) / secs_info->hsecs;
+  input_shape.h = output_shape.h * conv_param.stride_h + kh;
+  while(io_need > local_mem_capacity){
+    if(secs_info->hsecs == output_h){
+      return BM_NOT_SUPPORTED; 
+    }
+    secs_info->hsecs++;
+    output_shape.h = (output_h + secs_info->hsecs - 1) / secs_info->hsecs;
+    input_shape.h = output_shape.h * conv_param.stride_h + kh;
+    io_need = get_align_tensor_size(input_shape) +
+                       get_align_tensor_size(output_shape);
+  }
+  return BM_SUCCESS;
+}
+
+static bm_status_t get_conv_secs_info(
+    bm_tensor_4d_t    input_shape,
+    bm_kernel_param_t kernel_param,
+    bm_tensor_4d_t    output_shape,
+    bool              with_bias,
+    bm_conv_param_t   conv_param,
+    conv_secs_info_t *secs_info){
+  int ic = kernel_param.ic;
+  int oc = kernel_param.oc;
+  int oc_per_NPU = ceiling_func_shift(oc, NPU_SHIFT);
+  int bias_tensor_size = oc_per_NPU * FLOAT_SIZE;
+  if(!with_bias){
+    bias_tensor_size = 0;
+  }
+  int kernel_size = kernel_param.h * kernel_param.w;
+  int weight_tensor_size = ic * oc_per_NPU * kernel_size * FLOAT_SIZE;
+  int weight_capacity = addr_EU_align( weight_tensor_size  + bias_tensor_size);
+  int ifmap_total_tensor_size = get_align_tensor_size(input_shape);
+  int ofmap_total_tensor_size = get_align_tensor_size(output_shape);
+  int totalneed_local_size = ifmap_total_tensor_size +
+                          ofmap_total_tensor_size + weight_capacity;
+  secs_info->nsecs = 1; secs_info->hsecs = 1;
+  if(totalneed_local_size > LOCAL_MEM_SIZE){
+    //if weight_capacity > 2 * bank_size then split oc and ic
+    conv_splitc(kernel_param, secs_info);
+    int ocslice = (oc + secs_info->ocsecs - 1) / secs_info->ocsecs;
+    int icslice = (ic + secs_info->icsecs - 1) / secs_info->icsecs;
+    oc_per_NPU = ceiling_func_shift(ocslice, NPU_SHIFT);
+
+    weight_capacity = icslice * oc_per_NPU * kernel_size * FLOAT_SIZE;
+    weight_capacity = addr_EU_align( weight_capacity + bias_tensor_size );
+    int local_mem_capacity = LOCAL_MEM_SIZE - weight_capacity;
+    CHECK_GT(local_mem_capacity, 0) << "local memory capacity not enough";
+    input_shape.c = icslice;
+    output_shape.c = ocslice;
+    ifmap_total_tensor_size = get_align_tensor_size(input_shape);
+    ofmap_total_tensor_size = get_align_tensor_size(output_shape);
+    int totalneed_local_size = ifmap_total_tensor_size + ofmap_total_tensor_size;
+    if(totalneed_local_size > local_mem_capacity){
+      int kh_ext = conv_param.dilation_h * (kernel_param.h - 1) + 1;
+      if(input_shape.n > 1){
+        if( totalneed_local_size > local_mem_capacity * input_shape.n){
+          secs_info->nsecs = input_shape.n;
+          output_shape.n = input_shape.n = 1;
+          bm_status_t result = conv_splith(input_shape, output_shape,
+              conv_param, local_mem_capacity, kh_ext, secs_info);
+          if(result == BM_NOT_SUPPORTED){
+            return result;
+          }
+        }else{
+          int input_n = input_shape.n;
+          secs_info->nsecs = (totalneed_local_size + local_mem_capacity - 1) / local_mem_capacity;
+          input_shape.n = (input_n + secs_info->nsecs - 1) / secs_info->nsecs;
+          output_shape.n = input_shape.n;
+          totalneed_local_size = get_align_tensor_size(input_shape) +
+                       get_align_tensor_size(output_shape);
+          while(totalneed_local_size > local_mem_capacity){
+            secs_info->nsecs++;
+            input_shape.n = (input_n + secs_info->nsecs - 1) / secs_info->nsecs;
+            output_shape.n = input_shape.n;
+            totalneed_local_size = get_align_tensor_size(input_shape) +
+                       get_align_tensor_size(output_shape);
+          }
+        }
+      }else{
+        bm_status_t result = conv_splith(input_shape, output_shape,
+            conv_param, local_mem_capacity, kh_ext, secs_info);
+        if(result == BM_NOT_SUPPORTED){
+          return result;
+        }
+      }
+    }
+  }else{
+    secs_info->icsecs = 1;
+    secs_info->ocsecs = 1;
+  }
+  return BM_SUCCESS;
+}
+
 // FP32 part
 template <>
 SaberStatus VenderConv2D<BM, AK_FLOAT>::\
@@ -36,15 +172,102 @@ SaberStatus VenderConv2D<BM, AK_FLOAT>::\
                 std::vector<Tensor<BM>*>& outputs,
                 ConvParam<BM>& param)
 {
-    enum BmOpType op = CONV;
-    bmkernel_api_base api = { op };
+    const BM_mem_addr in_data = (const BM_mem_addr) inputs[0]->data();
+    BM_mem_addr out_data = (BM_mem_addr) outputs[0]->mutable_data();
+    const BM_mem_addr weight = (const BM_mem_addr) param.weight()->data();
+
+    int input_n = inputs[0]->num();
+    int input_c = inputs[0]->channel();
+    int input_h = inputs[0]->height();
+    int input_w = inputs[0]->width();
+
+    int output_n = outputs[0]->num();
+    int output_c = outputs[0]->channel();
+    int output_h = outputs[0]->height();
+    int output_w = outputs[0]->width();
 
-    //TODO: pass conv args into BM Kernel
+    int group = param.group;
+    int kh = param.weight()->height();
+    int kw = param.weight()->width();
+    int pad_h = param.pad_h;
+    int pad_w = param.pad_w;
+    int stride_h = param.stride_h;
+    int stride_w = param.stride_w;
+    int dilation_h = param.dilation_h;
+    int dilation_w = param.dilation_w;
+
+    bool with_bias = param.bias()->size() > 0;
+    const bm_mem_desc bias = with_bias ? (const bm_mem_desc) param.bias()->data() : BM_MEM_NULL;
+
+    bm_tensor_4d_t input_shape = {
+        input_n,
+        input_c,
+        input_h,
+        input_w};
+
+    bm_tensor_4d_t output_shape = {
+        output_n,
+        output_c,
+        output_h,
+        output_w};
+
+    bm_kernel_param_t kernel_param = {
+        group,
+        output_c,
+        input_c,
+        kh,
+        kw};
+
+    bm_conv_param_t conv_param = {
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        dilation_h,
+        dilation_w,
+        0};
+
+    bm_device_mem_t input_buf_mem = in_data;
+    // TODO: handle special case with pooling op
+
+    conv_secs_info_t secs_info;
+    bm_status_t result = get_conv_secs_info(input_shape, kernel_param,
+          output_shape, with_bias, conv_param, &secs_info);
+    CHECK_EQ(BM_SUCCESS, result) << "local memory is not enough in conv.";
+
+    bm_api_conv_forward bm_conv_param = {
+      bm_mem_get_device_addr(input_buf_mem),
+      bm_mem_get_device_addr(out_data),
+      bm_mem_get_device_addr(weight),
+      with_bias ? bm_mem_get_device_addr(bias) : BM_MEM_ADDR_NULL,
+      input_shape.n,
+      input_shape.c,
+      input_shape.h,
+      input_shape.w,
+      kernel_param.g,
+      output_shape.c,
+      kernel_param.h,
+      kernel_param.w,
+      conv_param.dilation_h,
+      conv_param.dilation_w,
+      conv_param.pad_h,
+      conv_param.pad_w,
+      conv_param.stride_h,
+      conv_param.stride_w,
+      with_bias,
+      conv_param.result_add,
+      secs_info.icsecs,
+      secs_info.ocsecs,
+      secs_info.nsecs,
+      secs_info.hsecs
+    };
 
     bm_status_t bm_stat = bmkernel_launch(_handle, "/usr/local/include/bm/bmkernel_bin.bin");
     CHECK_EQ(BM_SUCCESS, bm_stat) << "bmkernel_launch failed.";
     
     /* Send arguments. */
+    enum BmOpType op = CONV;
+    bmkernel_api_base api = { op, reinterpret_cast<void *>(&bm_conv_param) };
     BM_CHECK(bmkernel_send_args(_handle, reinterpret_cast<void *>(&api), sizeof(api)));
 
     return SaberSuccess;

From 60decef58ef65c5ab73d015e85fac58cba7a9608 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 19 Sep 2018 11:15:57 +0800
Subject: [PATCH 306/318] BM conv device implementation

---
 saber/funcs/impl/bm/{ => device}/bm_common.h |  15 +-
 saber/funcs/impl/bm/device/bm_memmap.h       |  61 ++++
 saber/funcs/impl/bm/device/bmk_conv.c        | 288 ++++++++++++++++++-
 saber/funcs/impl/bm/device/bmkernel_base.c   |   7 +-
 4 files changed, 362 insertions(+), 9 deletions(-)
 rename saber/funcs/impl/bm/{ => device}/bm_common.h (91%)
 create mode 100644 saber/funcs/impl/bm/device/bm_memmap.h

diff --git a/saber/funcs/impl/bm/bm_common.h b/saber/funcs/impl/bm/device/bm_common.h
similarity index 91%
rename from saber/funcs/impl/bm/bm_common.h
rename to saber/funcs/impl/bm/device/bm_common.h
index ab9716433..47220d468 100644
--- a/saber/funcs/impl/bm/bm_common.h
+++ b/saber/funcs/impl/bm/device/bm_common.h
@@ -1,5 +1,5 @@
-#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_BM_COMMON_H
-#define ANAKIN_SABER_FUNCS_IMPL_BM_BM_COMMON_H
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H
+#define ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -10,6 +10,8 @@
 #include <unistd.h>
 #include <time.h>
 #include "bm_config.h"
+#include "bm_atomic.h"
+#include "bm_memmap.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -165,7 +167,14 @@ static INLINE int get_align_tensor_size(bm_tensor_4d_t shape){
   return shape.n * c_per_npu * get_neuron_csize_local(shape.h, shape.w);
 }
 
+static int INLINE get_cstride_local(int h, int w)
+{
+  int size = h * w;
+  //EU_NUM neurons align
+  return ALIGN(size,EU_NUM);
+}
+
 #ifdef __cplusplus
 }
 #endif
-#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_BM_COMMON_H */
+#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H */
diff --git a/saber/funcs/impl/bm/device/bm_memmap.h b/saber/funcs/impl/bm/device/bm_memmap.h
new file mode 100644
index 000000000..18d8185e6
--- /dev/null
+++ b/saber/funcs/impl/bm/device/bm_memmap.h
@@ -0,0 +1,61 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_MEMMAP_H
+#define ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_MEMMAP_H
+
+#define ITCM_MEM_START_ADDR            0x00000000
+#define ITCM_MEM_SIZE                  0x00080000           // 512KB
+#define DTCM_MEM_START_ADDR            0x02000000
+#define DTCM_MEM_SIZE                  0x00010000           // 64KB
+#define SHARE_MEM_START_ADDR           (DTCM_MEM_START_ADDR + DTCM_MEM_SIZE)
+#define SHARE_MEM_SIZE                 0x00010000           // 64KB
+
+#define LOCAL_MEM_ADDRWIDTH            18
+#define LOCAL_MEM_START_ADDR           0x04000000
+//#define LOCAL_MEM_SIZE                 (1<<LOCAL_MEM_ADDRWIDTH)  // 256KB each
+
+#define BDC_RAM_SIZE                   0x00040000    // 512 bit * 4096 entries
+#define BDC_RAM_ADDR                   0x45100000
+
+#define SPI_CTLR_BASE_ADDR_REMAP       0x44000000
+
+#define DDR_CTLR_BASE_ADDR             0x50000000
+
+#define GDMA_ENIGNE_BASE_ADDR          0x60000000
+#define BD_ENIGNE_BASE_ADDR            0x60002000
+#define CDMA_ENIGNE_BASE_ADDR          0x60003000
+#define MINIMAC_BASE_ADDR              0x60005000
+
+#define COUNT_RESERVED_DDR_SWAP        0x1000000
+#define COUNT_RESERVED_DDR_INSTR       0x1000000
+#define COUNT_RESERVED_DDR_IMAGE_SCALE       0x2000000
+
+#define GLOBAL_MEM_START_ADDR_BDC      0x80000000 + COUNT_RESERVED_DDR_INSTR + COUNT_RESERVED_DDR_SWAP
+#define GLOBAL_MEM_START_ADDR_ARM      0x80000000
+
+#define GLOBAL_MEM_BOUNDARY            0x80000000
+#define PCIE_SOC_BASE_DISTANCE         0x100000000
+
+#ifdef SOC_MODE
+  #define GLOBAL_MEM_START_ADDR_CMD      0x0
+  #define GLOBAL_MEM_START_ADDR          0x200000000
+#else
+  #define GLOBAL_MEM_START_ADDR_CMD      0x0
+  #define GLOBAL_MEM_START_ADDR          0x100000000
+#endif
+
+#define GLOBAL_MEM_ALIGN_SIZE          (4)
+
+#define SPI_CTLR_BASE_ADDR             0xFFF00000
+
+#define ITCM_MAP_B_START_ADDR          0x6001A000
+
+#define SFU_TABLE_ADDR_OFFSET          GLOBAL_MEM_START_ADDR_BDC + 0
+
+#define SOFT_RESET_REG_ADDR            0x50008004
+#define GDMA_SOFT_RESET_BIT            5
+#define NPS_SOFT_RESET_BIT             6
+#define CHIPLINK_SOFT_RESET_BIT        7
+
+#define SHARE_REG_BASE_ADDR            (0x50008240)
+
+
+#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_MEMMAP_H */
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/device/bmk_conv.c b/saber/funcs/impl/bm/device/bmk_conv.c
index 9f85f1009..837c06a21 100644
--- a/saber/funcs/impl/bm/device/bmk_conv.c
+++ b/saber/funcs/impl/bm/device/bmk_conv.c
@@ -1,5 +1,289 @@
 #include <stdio.h>
+#include "bm_common.h"
 
-void bm_conv_fwd(bm_api_conv_forward conv_param) {
-    
+int bm_conv_fwd(bm_api_conv_forward conv_param)
+{
+    // Unpack parameters
+    u64 ifmap_offset_global = conv_param.ifmap_offset_global;
+    u64 ofmap_offset_global = conv_param.ofmap_offset_global;
+    u64 weight_offset_global = conv_param.weight_offset_global;
+    u64 bias_offset_global = conv_param.bias_offset_global;
+    int input_n = conv_param.input_n;
+    int input_c = conv_param.input_c;
+    int input_h = conv_param.input_h;
+    int input_w = conv_param.input_w;
+    int groups = conv_param.groups;
+    int output_c = conv_param.output_c;
+    int kh = conv_param.kh;
+    int kw = conv_param.kw;
+    int dh = conv_param.dh;
+    int dw = conv_param.dw;
+    int pad_h = conv_param.pad_h;
+    int pad_w = conv_param.pad_w;
+    int stride_h = conv_param.stride_h;
+    int stride_w = conv_param.stride_w;
+    int using_bias = conv_param.using_bias;
+    int result_add = conv_param.result_add;
+    int icsecs = conv_param.icsecs;
+    int ocsecs = conv_param.ocsecs;
+    int nsecs = conv_param.nsecs;
+    int hsecs = conv_param.hsecs;
+
+    BM_ATOMIC_RESULT bm_res = BM_ATOMIC_SUCCESS;
+    const int start_npu_idx = 0;
+
+    int kh_ext = dh * (kh - 1) + 1;
+    int kw_ext = dw * (kw - 1) + 1;
+    int output_h = (input_h + 2 * pad_h - kh_ext) / stride_h + 1;
+    int output_w = (input_w + 2 * pad_w - kw_ext) / stride_w + 1;
+
+    int ic = input_c / groups;
+    int oc = output_c / groups;
+    int ic_per_NPU = ceiling_func_shift(ic, NPU_SHIFT);
+    int oc_per_NPU = ceiling_func_shift(oc, NPU_SHIFT);
+    int bias_offset_local = 0;
+    int bias_tensor_size = oc_per_NPU * FLOAT_SIZE;
+    int weight_offset_local = bias_offset_local + bias_tensor_size;
+    int weight_group_offset = oc * ic * kh * kw;
+    int weight_tensor_size = ic * oc_per_NPU * kh * kw * FLOAT_SIZE;
+    int weight_capacity = addr_EU_align(weight_tensor_size + bias_tensor_size);
+    int ifmap_group_offset = ic * input_h * input_w;
+    int ofmap_group_offset = oc * output_h * output_w;
+    int global_ifmap_Nstride = ifmap_group_offset * groups;
+    int global_ofmap_Nstride = ofmap_group_offset * groups;
+    int nslice = input_n, ocslice = oc, icslice = ic, hslice = output_h;
+    nslice = input_n / nsecs;
+    int n_residual = input_n - nslice * nsecs;
+    hslice = output_h / hsecs;
+    int h_residual = output_h - hslice * hsecs;
+    icslice = ic / icsecs;
+    int ic_residual = ic - icslice * icsecs;
+    ocslice = oc / ocsecs;
+    int oc_residual = oc - ocslice * ocsecs;
+    int bias_group_offset = oc;
+    int max_icslice = icslice + (ic_residual > 0);
+    int max_ic_per_NPU = ceiling_func_shift(max_icslice, NPU_SHIFT);
+    int max_ocslice = ocslice + (oc_residual > 0);
+    int max_oc_per_NPU = ceiling_func_shift(max_ocslice, NPU_SHIFT);
+
+    for (int ig = 0; ig < groups; ig++){
+        int ocend = 0;
+        for (int ocidx = 0; ocidx < ocsecs; ocidx++){
+            int ocstart = ocend;
+            int cur_ocslice = ocslice + (oc_residual > ocidx);
+            ocend = ocstart + cur_ocslice;
+            oc_per_NPU = ceiling_func_shift(cur_ocslice, NPU_SHIFT);
+            if (using_bias){
+                bm_res = bm_atomic_tensor_compact_move(
+                    start_npu_idx,
+                    bias_offset_local,
+                    bias_offset_global + (ig * bias_group_offset + ocstart) * FLOAT_SIZE,
+                    1, // n
+                    cur_ocslice, // c 
+                    1, // h
+                    1, // w
+                    DMA_G2L, // direction
+                    false, // transpose
+                    false // add results
+                );
+                if (bm_res != BM_ATOMIC_SUCCESS) {
+                    printf("bm_atomic_tensor_compact_move failed.\n");
+                    return -1;
+                }
+            }
+            weight_capacity = max_icslice * oc_per_NPU * kh * kw * FLOAT_SIZE;
+            int ofmap_offset_local = addr_EU_align(weight_capacity + weight_offset_local);
+            int nend = 0;
+            for (int nidx = 0; nidx < nsecs; nidx++){
+                int nstart = nend;
+                int sec_len_n = nslice + (nidx < n_residual);
+                nend = nstart + sec_len_n;
+                int o_hb = 0;
+                for (int hidx = 0; hidx < hsecs; hidx++){
+                    int o_ht = o_hb;
+                    int o_h = hslice + (h_residual > hidx);
+                    o_hb = o_ht + o_h;
+                    int i_ht = bm_max(o_ht * stride_h - pad_h, 0);
+                    int pad_h_t = 0;
+                    if (i_ht == 0){
+                        pad_h_t = pad_h - o_ht * stride_h;
+                    }
+                    int i_hb = bm_min(o_hb * stride_h + kh_ext - 1 - pad_h, input_h);
+                    int pad_h_b = 0;
+                    if (i_hb == input_h){
+                        pad_h_b = o_hb * stride_h + kh_ext - 1 - pad_h - input_h;
+                    }
+                    int i_h = i_hb - i_ht;
+                    int ifmap_align_size = get_neuron_csize_local(i_h, input_w);
+                    int ifmap_tensor_size = sec_len_n * max_ic_per_NPU * ifmap_align_size;
+                    int ofmap_align_size = get_neuron_csize_local(o_h, output_w);
+                    int ofmap_tensor_size = sec_len_n * max_oc_per_NPU * ofmap_align_size;
+                    int ifmap_offset_local = ofmap_offset_local + ofmap_tensor_size;
+                    int offset_local_end = ifmap_offset_local + ifmap_tensor_size;
+                    if (offset_local_end > LOCAL_MEM_SIZE) {
+                        printf("local memory not enough.\n");
+                        return -1;
+                    }
+                    if (result_add){
+                        u64 shift = nstart * global_ofmap_Nstride + ig * ofmap_group_offset +
+                                    (ocstart * output_h + o_ht) * output_w;
+                        int local_cstride = get_cstride_local(o_h, output_w);
+                        bm_res = bm_atomic_tensor_stride_move(
+                            start_npu_idx,
+                            ofmap_offset_local, 
+                            ofmap_offset_global + shift * FLOAT_SIZE,
+                            sec_len_n, // n
+                            cur_ocslice, // c
+                            o_h, // h
+                            output_w, //w
+                            oc_per_NPU * local_cstride, // dst_stride_n
+                            local_cstride, // dst_stride_c
+                            output_w, // dst_stride_h
+                            global_ofmap_Nstride, // src_stride_n
+                            output_h * output_w, // src_stride_c
+                            output_w, // src_stride_h
+                            DMA_G2L, // direction
+                            DMA_F32, // format
+                            false // transpose
+                        );
+                        if (bm_res != BM_ATOMIC_SUCCESS) {
+                            printf("bm_atomic_tensor_stride_move failed.\n");
+                            return -1;
+                        }
+                    }
+                    int icend = 0;
+                    for (int icidx = 0; icidx < icsecs; icidx++){
+                        int icstart = icend;
+                        int cur_icslice = icslice + (ic_residual > icidx);
+                        icend = icstart + cur_icslice;
+                        ic_per_NPU = ceiling_func_shift(cur_icslice, NPU_SHIFT);
+                        u64 shift = (ocstart * ic + icstart) * kh * kw + ig * weight_group_offset;
+                        if ((icsecs != 1) || (nidx == 0 && hidx == 0)){
+                            bm_res = bm_atomic_tensor_stride_move(
+                                start_npu_idx,
+                                weight_offset_local, 
+                                weight_offset_global + shift * FLOAT_SIZE,
+                                1, // n
+                                cur_ocslice, // c
+                                cur_icslice, // h
+                                kh * kw, // w
+                                0, // dst_stride_n
+                                cur_icslice * kh * kw, // dst_stride_c
+                                kh * kw, // dst_stride_h
+                                0, // src_stride_n
+                                ic * kh * kw, // src_stride_c
+                                kh * kw, // src_stride_h
+                                DMA_G2L, // direction
+                                DMA_F32, // format
+                                false // transpose
+                            );
+                            if (bm_res != BM_ATOMIC_SUCCESS) {
+                                printf("bm_atomic_tensor_stride_move failed.\n");
+                                return -1;
+                            }
+                        }
+                        shift = nstart * global_ifmap_Nstride + ig * ifmap_group_offset +
+                                (icstart * input_h + i_ht) * input_w;
+                        int local_cstride = get_cstride_local(i_h, input_w);
+                        bm_res = bm_atomic_tensor_stride_move(
+                            start_npu_idx,
+                            ifmap_offset_local, 
+                            ifmap_offset_global + shift * FLOAT_SIZE,
+                            sec_len_n, // n
+                            cur_icslice, // c
+                            i_h, // h
+                            input_w, // w
+                            ic_per_NPU * local_cstride, // dst_stride_n
+                            local_cstride, // dst_stride_c
+                            input_w, // dst_stride_h
+                            global_ifmap_Nstride, // src_stride_n
+                            input_h * input_w, // src_stride_c
+                            input_w, // src_stride_h
+                            DMA_G2L, // direction
+                            DMA_F32, // format
+                            false // transpose
+                        );
+                        if (bm_res != BM_ATOMIC_SUCCESS) {
+                            printf("bm_atomic_tensor_stride_move failed.\n");
+                            return -1;
+                        }
+
+                        /*local_shape_t ifshape, ofshape;
+                        ifshape.n = sec_len_n;
+                        ifshape.c = cur_icslice;
+                        ifshape.h = i_h;
+                        ifshape.w = input_w;
+                        ofshape.c = cur_ocslice;
+                        ofshape.h = o_h;
+                        ofshape.w = output_w;*/
+                        
+                        bm_res = bm_atomic_conv_kernel_stride(
+                            start_npu_idx,
+                            LOCAL_MEM_START_ADDR | ofmap_offset_local, // ouput local offset
+                            LOCAL_MEM_START_ADDR | ifmap_offset_local, // input local offset
+                            LOCAL_MEM_START_ADDR | weight_offset_local, // weight
+                            LOCAL_MEM_START_ADDR | bias_offset_local, // bias
+                            sec_len_n, // output n
+                            cur_ocslice, // output c
+                            cur_icslice, // input c
+                            i_h, // input h
+                            input_w, // input w
+                            kh, // kernel h
+                            kw, // kernel w
+                            kh * kw, // kernel stride n
+                            cur_icslice * kh * kw, // kernel stride c
+                            kw, // kernel stride h
+                            dh, // dilation h
+                            dw, // dilation w
+                            pad_h_t, // pad top
+                            pad_h_b, // pad bottom
+                            pad_w, // pad left
+                            pad_w, // pad right
+                            stride_h, // stride h
+                            stride_w, // stride w
+                            icidx == icsecs - 1 ? using_bias : 0, // using bias
+                            result_add || icidx > 0 // add result
+                        );
+                        if (bm_res != BM_ATOMIC_SUCCESS) {
+                            printf("bm_atomic_conv_kernel_stride failed.\n");
+                            return -1;
+                        }
+                    }
+                    u64 shift = nstart * global_ofmap_Nstride + ig * ofmap_group_offset +
+                                (ocstart * output_h + o_ht) * output_w;
+                    int local_cstride = get_cstride_local(o_h, output_w);
+                    
+                    bm_res = bm_atomic_tensor_stride_move(
+                        start_npu_idx,
+                        ofmap_offset_local, 
+                        ofmap_offset_global + shift * FLOAT_SIZE,
+                        sec_len_n, // n
+                        cur_ocslice, // c
+                        o_h, // h
+                        output_w, // w
+                        global_ofmap_Nstride, // dst_stride_n
+                        output_h * output_w, // dst_stride_c
+                        output_w, // dst_stride_h
+                        oc_per_NPU * local_cstride, // src_stride_n
+                        local_cstride, // src_stride_c
+                        output_w, // src_stride_h
+                        DMA_L2G, // direction
+                        DMA_F32, // format
+                        false // transpose
+                    );
+                    if (bm_res != BM_ATOMIC_SUCCESS) {
+                        printf("bm_atomic_tensor_stride_move failed.\n");
+                        return -1;
+                    }
+                }
+            }
+        }
+    }
+
+    bm_res = bm_atomic_wait_all_task_complete();
+    if (bm_res != BM_ATOMIC_SUCCESS) {
+        printf("bm_atomic_wait_all_task_complete failed.\n");
+        return -1;
+    }
+    return 0;
 }
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index 42cb0e591..8dde54397 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -15,15 +15,14 @@ int bmkernel_func(void *args)
     switch (param->op) {
         case ACTIVATION: {
             // bm_activation_fwd(param)
-            break;
+            return 0;
         }
         case CONV: {
             bm_api_conv_forward* api = (bm_api_conv_forward *)param->opParam;
-            bm_conv_fwd(*api);
-            break;
+            return bm_conv_fwd(*api);
         }
         default:
             printf("op %d is not supported by BM yet.\n", param->op);
+            return -1;
     }
-    return 0;
 }

From 73a932607371749ef307417c1c6ec0d6e8ae95bb Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Wed, 19 Sep 2018 11:22:57 +0800
Subject: [PATCH 307/318] Refactor

---
 saber/funcs/impl/bm/device/bmkernel_base.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index 8dde54397..39a59f6f9 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -21,8 +21,9 @@ int bmkernel_func(void *args)
             bm_api_conv_forward* api = (bm_api_conv_forward *)param->opParam;
             return bm_conv_fwd(*api);
         }
-        default:
+        default: {
             printf("op %d is not supported by BM yet.\n", param->op);
             return -1;
+        }
     }
 }

From e72aa2642fda0b6d446c48eaab1e0de9e1416d68 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 1 Oct 2018 10:22:27 +0800
Subject: [PATCH 308/318] Update to new version to bm kernel APIs

---
 cmake/compiler_options.cmake           |  6 ++++++
 cmake/find_modules.cmake               | 17 +++++++----------
 saber/CMakeLists.txt                   |  4 ++--
 saber/core/common.h                    |  1 -
 saber/core/impl/bm/bm_impl.cpp         |  2 +-
 saber/funcs/impl/bm/device/bm_common.h |  9 +--------
 saber/funcs/impl/bm/vender_conv.cpp    |  6 +++---
 7 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake
index c7c55f402..21bd43c31 100644
--- a/cmake/compiler_options.cmake
+++ b/cmake/compiler_options.cmake
@@ -39,6 +39,12 @@ anakin_add_compile_option(-Wshadow)
 anakin_add_compile_option(-fpermissive)
 anakin_add_compile_option(-Wsign-promo)
 anakin_add_compile_option(-fdiagnostics-show-option)
+if(USE_BM_PLACE)
+    anakin_add_compile_option(-lbmlib-asic)
+    #anakin_add_compile_option(-lopencv_core)
+    #anakin_add_compile_option(-lopencv_imgproc)
+    #anakin_add_compile_option(-lopencv_highgui)
+endif()
 
 if(ENABLE_NOISY_WARNINGS)
 	anakin_add_compile_option(-Wcast-align)
diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake
index cf633e63f..a06ea879b 100644
--- a/cmake/find_modules.cmake
+++ b/cmake/find_modules.cmake
@@ -357,7 +357,7 @@ macro(anakin_find_openmp)
 endmacro()
 
 macro(anakin_find_bmlib)
-	find_path(BM_ROOT include/host/bmkernel_runtime.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
+	find_path(BM_ROOT include/bmlib/bmlib_runtime.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
 	if(BM_ROOT)
 		set(BM_FOUND TRUE)
 	endif()
@@ -365,15 +365,12 @@ macro(anakin_find_bmlib)
 		message(STATUS " Found bm_lib in ${BM_ROOT}")
 		anakin_fetch_include_recursively(${BM_ROOT}/include)
 		set(BM_LIBRARIES "")
-		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmkernel-host.so)
-		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmlib-asic.so)
-		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmlib-palladium.so)
-		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/host/libbmkernel-host-cmodel.so)
-		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-core-asic.a)
-		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-core-palladium.a)
-		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-top-asic.a)
-		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmkernel-top-palladium.a)
-		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/bmlib.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/bmlib.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/common-arm.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/common.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/fw-arm.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/fw-top.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmlib-asic.so)
 		list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES})
 	else()
 		message(FATAL_ERROR "Could not found bm_lib")
diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index c27591c8f..80e1fc9db 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -96,8 +96,8 @@ if(USE_BM_PLACE)
     set(BIN_NAME bmkernel_bin)
     set(LINK_CONFIG link/bm1682_ddr.lds)
     add_custom_command(OUTPUT bm_kernel_tmp
-        COMMAND arm-none-eabi-gcc ${ANAKIN_SABER}/funcs/impl/bm/device/bmkernel_base.c -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -std=gnu99 -O2 -Wall -Werror -ffunction-sections -fdata-sections -nostdlib -DENABLE_PRINT -I${BM_ROOT}/include/device/atomic -I${BM_ROOT}/include/device/utils -I${BM_ROOT}/include/device -c -o ${BIN_NAME}.o
-        COMMAND arm-none-eabi-gcc -T ${BM_ROOT}/${LINK_CONFIG} -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -Wl,--check-sections -Wl,--gc-sections -Wl,--unresolved-symbols=report-all -Wl,--no-enum-size-warning -o ${BIN_NAME}.elf -Wl,--start-group -lc -lm ${BIN_NAME}.o ${BM_ROOT}/lib/device/libbmkernel-top-asic.a ${BM_ROOT}/lib/device/libbmkernel-core-asic.a -Wl,--end-group
+        COMMAND arm-none-eabi-gcc ${ANAKIN_SABER}/funcs/impl/bm/device/bmkernel_base.c -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -std=gnu99 -O2 -Wall -Werror -ffunction-sections -fdata-sections -nostdlib -DENABLE_PRINT -I${BM_ROOT}/include/config -I${BM_ROOT}/include/common -I${BM_ROOT}/include/c_model -I${BM_ROOT}/include/firmware_core -I${BM_ROOT}/include/bmlib -c -o ${BIN_NAME}.o
+        COMMAND arm-none-eabi-gcc -T ${BM_ROOT}/${LINK_CONFIG} -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -Wl,--check-sections -Wl,--gc-sections -Wl,--unresolved-symbols=report-all -Wl,--no-enum-size-warning -o ${BIN_NAME}.elf -Wl,--start-group -lc -lm ${BIN_NAME}.o ${BM_ROOT}/lib/device/fw-top.a ${BM_ROOT}/lib/device/fw-arm.a -Wl,--end-group
         COMMAND arm-none-eabi-objcopy -O binary -R *.slow* ${BIN_NAME}.elf ${BIN_NAME}_itcm.bin
         COMMAND hexdump -v -e '1/4 \"%08x\\n\"' ${BIN_NAME}_itcm.bin > ${BIN_NAME}_itcm.hex.sim
         COMMAND arm-none-eabi-objcopy -O binary -j *.slow* ${BIN_NAME}.elf ${BIN_NAME}_ddr.bin
diff --git a/saber/core/common.h b/saber/core/common.h
index 6c1338859..3a64b0d5d 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -180,7 +180,6 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 
 #include "bmlib_runtime.h"
 #include "bmlib_utils.h"
-#include "bmkernel_runtime.h"
 
 #define BM_CHECK(condition) \
   do { \
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 70b693917..6144f9630 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -45,7 +45,7 @@ typedef TargetWrapper<BM, __device_target> BM_API;
 
 // Init handle only once in the lifetime
 static bm_handle_t handle;
-static bm_status_t init_handle{bmkernel_init(&handle)};
+static bm_status_t init_handle{bmlib_kernel_init(&handle)};
 
 bm_handle_t BM_API::get_handle() {
     return handle;
diff --git a/saber/funcs/impl/bm/device/bm_common.h b/saber/funcs/impl/bm/device/bm_common.h
index 47220d468..755e9c1ab 100644
--- a/saber/funcs/impl/bm/device/bm_common.h
+++ b/saber/funcs/impl/bm/device/bm_common.h
@@ -86,13 +86,6 @@ typedef enum {
   ENGINE_END
 } ENGINE_ID;
 
-typedef struct tensor_4d_t {
-    int n;
-    int c;
-    int h;
-    int w;
-} bm_tensor_4d_t;
-
 typedef struct kernel_param{
     int g;
     int oc;
@@ -162,7 +155,7 @@ static int INLINE addr_EU_align(int addr){
   return ALIGN( addr, EU_NUM ) * FLOAT_SIZE;
 }
 
-static INLINE int get_align_tensor_size(bm_tensor_4d_t shape){
+static int get_align_tensor_size(bm_tensor_4d_t shape){
   int c_per_npu = ceiling_func_shift(shape.c, NPU_SHIFT);
   return shape.n * c_per_npu * get_neuron_csize_local(shape.h, shape.w);
 }
diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index ae0be1fd8..388581117 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -262,13 +262,13 @@ SaberStatus VenderConv2D<BM, AK_FLOAT>::\
       secs_info.hsecs
     };
 
-    bm_status_t bm_stat = bmkernel_launch(_handle, "/usr/local/include/bm/bmkernel_bin.bin");
-    CHECK_EQ(BM_SUCCESS, bm_stat) << "bmkernel_launch failed.";
+    bm_status_t bm_stat = bmlib_kernel_launch(_handle, "/usr/local/include/bm/bmkernel_bin.bin");
+    CHECK_EQ(BM_SUCCESS, bm_stat) << "bmlib_kernel_launch failed.";
     
     /* Send arguments. */
     enum BmOpType op = CONV;
     bmkernel_api_base api = { op, reinterpret_cast<void *>(&bm_conv_param) };
-    BM_CHECK(bmkernel_send_args(_handle, reinterpret_cast<void *>(&api), sizeof(api)));
+    BM_CHECK(bmlib_kernel_send_args(_handle, reinterpret_cast<void *>(&api), sizeof(api)));
 
     return SaberSuccess;
 }

From 81467b224e918190ed2f7c85fbb31425267a507f Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 1 Oct 2018 10:28:36 +0800
Subject: [PATCH 309/318] Update BM Kernel dependencies

---
 saber/funcs/impl/bm/device/bm_common.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/saber/funcs/impl/bm/device/bm_common.h b/saber/funcs/impl/bm/device/bm_common.h
index 755e9c1ab..5e724caf2 100644
--- a/saber/funcs/impl/bm/device/bm_common.h
+++ b/saber/funcs/impl/bm/device/bm_common.h
@@ -10,8 +10,9 @@
 #include <unistd.h>
 #include <time.h>
 #include "bm_config.h"
-#include "bm_atomic.h"
+#include "op_code.h"
 #include "bm_memmap.h"
+#include "firmware_core_kernel.h"
 #ifdef __cplusplus
 extern "C" {
 #endif

From 015cd28bd433d77feb1142e287da130138d3e45e Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 1 Oct 2018 14:00:25 +0800
Subject: [PATCH 310/318] Implement conv with new version of BM Kernel

---
 saber/funcs/impl/bm/device/bmk_conv.c | 231 ++++++++++----------------
 1 file changed, 90 insertions(+), 141 deletions(-)

diff --git a/saber/funcs/impl/bm/device/bmk_conv.c b/saber/funcs/impl/bm/device/bmk_conv.c
index 837c06a21..97ce98c06 100644
--- a/saber/funcs/impl/bm/device/bmk_conv.c
+++ b/saber/funcs/impl/bm/device/bmk_conv.c
@@ -1,5 +1,7 @@
 #include <stdio.h>
 #include "bm_common.h"
+#include "atomic_dma_gen_cmd.h"
+#include "atomic_conv_gen_cmd.h"
 
 int bm_conv_fwd(bm_api_conv_forward conv_param)
 {
@@ -29,8 +31,9 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
     int nsecs = conv_param.nsecs;
     int hsecs = conv_param.hsecs;
 
-    BM_ATOMIC_RESULT bm_res = BM_ATOMIC_SUCCESS;
-    const int start_npu_idx = 0;
+    P_COMMAND dma_command;
+    CMD_ID_NODE id_node;
+    resync_cmd_id( &id_node );
 
     int kh_ext = dh * (kh - 1) + 1;
     int kw_ext = dw * (kw - 1) + 1;
@@ -74,22 +77,18 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
             ocend = ocstart + cur_ocslice;
             oc_per_NPU = ceiling_func_shift(cur_ocslice, NPU_SHIFT);
             if (using_bias){
-                bm_res = bm_atomic_tensor_compact_move(
-                    start_npu_idx,
-                    bias_offset_local,
-                    bias_offset_global + (ig * bias_group_offset + ocstart) * FLOAT_SIZE,
-                    1, // n
-                    cur_ocslice, // c 
-                    1, // h
-                    1, // w
-                    DMA_G2L, // direction
-                    false, // transpose
-                    false // add results
+                dma_command = get_command(ENGINE_GDMA);
+                tensor_compact_move_gen_cmd(
+                    bias_offset_local, // local_mem_start_addr
+                    bias_offset_global + (ig * bias_group_offset + ocstart) * FLOAT_SIZE, // p_coeff_addr
+                    1, cur_ocslice, 1, 1, // n, c, h, w
+                    0, // direction G2L
+                    0, // transpose
+                    (void *)dma_command,
+                    0, // local memory index
+                    &id_node
                 );
-                if (bm_res != BM_ATOMIC_SUCCESS) {
-                    printf("bm_atomic_tensor_compact_move failed.\n");
-                    return -1;
-                }
+                call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
             }
             weight_capacity = max_icslice * oc_per_NPU * kh * kw * FLOAT_SIZE;
             int ofmap_offset_local = addr_EU_align(weight_capacity + weight_offset_local);
@@ -125,31 +124,23 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
                         return -1;
                     }
                     if (result_add){
+                        dma_command = get_command(ENGINE_GDMA);
                         u64 shift = nstart * global_ofmap_Nstride + ig * ofmap_group_offset +
-                                    (ocstart * output_h + o_ht) * output_w;
+                            (ocstart * output_h + o_ht) * output_w;
                         int local_cstride = get_cstride_local(o_h, output_w);
-                        bm_res = bm_atomic_tensor_stride_move(
-                            start_npu_idx,
-                            ofmap_offset_local, 
-                            ofmap_offset_global + shift * FLOAT_SIZE,
-                            sec_len_n, // n
-                            cur_ocslice, // c
-                            o_h, // h
-                            output_w, //w
-                            oc_per_NPU * local_cstride, // dst_stride_n
-                            local_cstride, // dst_stride_c
-                            output_w, // dst_stride_h
-                            global_ofmap_Nstride, // src_stride_n
-                            output_h * output_w, // src_stride_c
-                            output_w, // src_stride_h
-                            DMA_G2L, // direction
-                            DMA_F32, // format
-                            false // transpose
+                        tensor_stride_move_gen_cmd(
+                            ofmap_offset_local, // local_mem_start_addr
+                            ofmap_offset_global + shift * FLOAT_SIZE, // p_tensor_addr
+                            sec_len_n, cur_ocslice, o_h, output_w, // n, c, h, w
+                            0, // local memory index
+                            0, // direction G2L
+                            global_ofmap_Nstride, output_h * output_w, output_w, // src stride n,c,h
+                            oc_per_NPU * local_cstride, local_cstride, output_w, // dst stride n,c,h
+                            GDMA_TYPE_f32, 
+                            0, // transpose 
+                            dma_command, &id_node
                         );
-                        if (bm_res != BM_ATOMIC_SUCCESS) {
-                            printf("bm_atomic_tensor_stride_move failed.\n");
-                            return -1;
-                        }
+                        call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
                     }
                     int icend = 0;
                     for (int icidx = 0; icidx < icsecs; icidx++){
@@ -159,131 +150,89 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
                         ic_per_NPU = ceiling_func_shift(cur_icslice, NPU_SHIFT);
                         u64 shift = (ocstart * ic + icstart) * kh * kw + ig * weight_group_offset;
                         if ((icsecs != 1) || (nidx == 0 && hidx == 0)){
-                            bm_res = bm_atomic_tensor_stride_move(
-                                start_npu_idx,
-                                weight_offset_local, 
-                                weight_offset_global + shift * FLOAT_SIZE,
-                                1, // n
-                                cur_ocslice, // c
-                                cur_icslice, // h
-                                kh * kw, // w
-                                0, // dst_stride_n
-                                cur_icslice * kh * kw, // dst_stride_c
-                                kh * kw, // dst_stride_h
-                                0, // src_stride_n
-                                ic * kh * kw, // src_stride_c
-                                kh * kw, // src_stride_h
-                                DMA_G2L, // direction
-                                DMA_F32, // format
-                                false // transpose
+                            dma_command = get_command(ENGINE_GDMA);
+                            tensor_stride_move_gen_cmd(
+                                weight_offset_local, // local_mem_start_addr
+                                weight_offset_global + shift * FLOAT_SIZE, // p_tensor_addr
+                                1, cur_ocslice, cur_icslice, kh * kw, // n, c, h, w
+                                0, // local memory index
+                                0, // direction G2L
+                                0, ic * kh * kw, kh * kw, // src stride n,c,h
+                                0, cur_icslice * kh * kw, kh * kw, // dst stride n,c,h
+                                GDMA_TYPE_f32, 
+                                0, // transpose 
+                                dma_command, &id_node
                             );
-                            if (bm_res != BM_ATOMIC_SUCCESS) {
-                                printf("bm_atomic_tensor_stride_move failed.\n");
-                                return -1;
-                            }
+                            call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
                         }
                         shift = nstart * global_ifmap_Nstride + ig * ifmap_group_offset +
                                 (icstart * input_h + i_ht) * input_w;
                         int local_cstride = get_cstride_local(i_h, input_w);
-                        bm_res = bm_atomic_tensor_stride_move(
-                            start_npu_idx,
-                            ifmap_offset_local, 
-                            ifmap_offset_global + shift * FLOAT_SIZE,
-                            sec_len_n, // n
-                            cur_icslice, // c
-                            i_h, // h
-                            input_w, // w
-                            ic_per_NPU * local_cstride, // dst_stride_n
-                            local_cstride, // dst_stride_c
-                            input_w, // dst_stride_h
-                            global_ifmap_Nstride, // src_stride_n
-                            input_h * input_w, // src_stride_c
-                            input_w, // src_stride_h
-                            DMA_G2L, // direction
-                            DMA_F32, // format
-                            false // transpose
+                        dma_command = (float*)get_command(ENGINE_GDMA);
+                        tensor_stride_move_gen_cmd(
+                            ifmap_offset_local, // local_mem_start_addr
+                            ifmap_offset_global + shift * FLOAT_SIZE, // p_tensor_addr
+                            sec_len_n, cur_icslice, i_h, input_w, // n, c, h, w
+                            0, // local memory index
+                            0, // direction G2L
+                            global_ifmap_Nstride, input_h * input_w, input_w, // src stride n,c,h
+                            ic_per_NPU * local_cstride, local_cstride, input_w, // dst stride n,c,h
+                            GDMA_TYPE_f32, 
+                            0, // transpose
+                            dma_command, &id_node
                         );
-                        if (bm_res != BM_ATOMIC_SUCCESS) {
-                            printf("bm_atomic_tensor_stride_move failed.\n");
-                            return -1;
-                        }
+                        call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
 
-                        /*local_shape_t ifshape, ofshape;
+                        local_shape_t ifshape, ofshape;
                         ifshape.n = sec_len_n;
                         ifshape.c = cur_icslice;
                         ifshape.h = i_h;
                         ifshape.w = input_w;
                         ofshape.c = cur_ocslice;
                         ofshape.h = o_h;
-                        ofshape.w = output_w;*/
-                        
-                        bm_res = bm_atomic_conv_kernel_stride(
-                            start_npu_idx,
-                            LOCAL_MEM_START_ADDR | ofmap_offset_local, // ouput local offset
-                            LOCAL_MEM_START_ADDR | ifmap_offset_local, // input local offset
-                            LOCAL_MEM_START_ADDR | weight_offset_local, // weight
-                            LOCAL_MEM_START_ADDR | bias_offset_local, // bias
-                            sec_len_n, // output n
-                            cur_ocslice, // output c
-                            cur_icslice, // input c
-                            i_h, // input h
-                            input_w, // input w
-                            kh, // kernel h
-                            kw, // kernel w
-                            kh * kw, // kernel stride n
-                            cur_icslice * kh * kw, // kernel stride c
-                            kw, // kernel stride h
-                            dh, // dilation h
-                            dw, // dilation w
-                            pad_h_t, // pad top
-                            pad_h_b, // pad bottom
-                            pad_w, // pad left
-                            pad_w, // pad right
-                            stride_h, // stride h
-                            stride_w, // stride w
-                            icidx == icsecs - 1 ? using_bias : 0, // using bias
-                            result_add || icidx > 0 // add result
+                        ofshape.w = output_w;
+                        P_COMMAND conv_command = get_command(ENGINE_BD);
+                        atomic_conv_kernel_stride_gen_cmd(
+                            conv_command,
+                            LOCAL_MEM_START_ADDR | ifmap_offset_local, // input address
+                            LOCAL_MEM_START_ADDR | ofmap_offset_local, // output address
+                            LOCAL_MEM_START_ADDR | weight_offset_local, // weight address
+                            LOCAL_MEM_START_ADDR | bias_offset_local, // bias address
+                            ifshape, // input shape
+                            ofshape, // output shape
+                            kh, kw, // kernel h, w
+                            dh, dw, // dilation h, w
+                            kh * kw, cur_icslice * kh * kw, kw, // kernel stride n,c,h
+                            pad_h_t, pad_h_b, pad_w, pad_w, // pad top, bottom, left, right
+                            stride_h, stride_w, // stride h, w
+                            icidx == icsecs - 1 ? using_bias: 0, // use bias
+                            result_add || icidx > 0, // add result
+                            &id_node
                         );
-                        if (bm_res != BM_ATOMIC_SUCCESS) {
-                            printf("bm_atomic_conv_kernel_stride failed.\n");
-                            return -1;
-                        }
+                        call_atomic(nodechip_idx, atomic_conv_neuron, conv_command, ENGINE_BD);
                     }
                     u64 shift = nstart * global_ofmap_Nstride + ig * ofmap_group_offset +
                                 (ocstart * output_h + o_ht) * output_w;
                     int local_cstride = get_cstride_local(o_h, output_w);
                     
-                    bm_res = bm_atomic_tensor_stride_move(
-                        start_npu_idx,
-                        ofmap_offset_local, 
-                        ofmap_offset_global + shift * FLOAT_SIZE,
-                        sec_len_n, // n
-                        cur_ocslice, // c
-                        o_h, // h
-                        output_w, // w
-                        global_ofmap_Nstride, // dst_stride_n
-                        output_h * output_w, // dst_stride_c
-                        output_w, // dst_stride_h
-                        oc_per_NPU * local_cstride, // src_stride_n
-                        local_cstride, // src_stride_c
-                        output_w, // src_stride_h
-                        DMA_L2G, // direction
-                        DMA_F32, // format
-                        false // transpose
+                    dma_command = get_command(ENGINE_GDMA);
+                    tensor_stride_move_gen_cmd(
+                        ofmap_offset_local, // local_mem_start_addr
+                        ofmap_offset_global + shift * FLOAT_SIZE, // p_tensor_addr
+                        sec_len_n, cur_ocslice, o_h, output_w, // n, c, h, w
+                        0, // local memory index
+                        1, // direction L2G
+                        oc_per_NPU * local_cstride, local_cstride, output_w, // src stride n,c,h
+                        global_ofmap_Nstride, output_h * output_w, output_w, // dst stride n,c,h
+                        GDMA_TYPE_f32, 
+                        0, // transpose
+                        dma_command, &id_node
                     );
-                    if (bm_res != BM_ATOMIC_SUCCESS) {
-                        printf("bm_atomic_tensor_stride_move failed.\n");
-                        return -1;
-                    }
+                    call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
                 }
             }
         }
     }
-
-    bm_res = bm_atomic_wait_all_task_complete();
-    if (bm_res != BM_ATOMIC_SUCCESS) {
-        printf("bm_atomic_wait_all_task_complete failed.\n");
-        return -1;
-    }
+    poll_all_engine_done(&id_node);
     return 0;
 }
\ No newline at end of file

From 6473856a27e4cbdaf3528553dadbc6f1234a3099 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 1 Oct 2018 14:06:24 +0800
Subject: [PATCH 311/318] Remove redundancy

---
 saber/funcs/impl/bm/device/bm_common.h | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/saber/funcs/impl/bm/device/bm_common.h b/saber/funcs/impl/bm/device/bm_common.h
index 5e724caf2..b34dd3cd4 100644
--- a/saber/funcs/impl/bm/device/bm_common.h
+++ b/saber/funcs/impl/bm/device/bm_common.h
@@ -59,11 +59,6 @@ typedef struct tensor_info{
     u32 matrix_col_magin;	//the magin is not 0, when column_num%w_param!=0
 }TENSOR_INFO;
 
-
-typedef struct shape{
-    u16 n, c, h, w;
-}local_shape_t;
-
 #define FLOAT_SIZE              4
 #define INT8_SIZE               1
 #define FLOAT_BITWIDTH          32
@@ -79,14 +74,6 @@ typedef enum {
     HOST_REG        = 1
 } REG_TYPE;
 
-typedef enum {
-  ENGINE_BD                     = 0,
-  ENGINE_GDMA                   = 1,
-  ENGINE_CDMA                   = 2,
-  ENGINE_HDMA                   = 3,
-  ENGINE_END
-} ENGINE_ID;
-
 typedef struct kernel_param{
     int g;
     int oc;

From e89f3d55b3c9f73ff26666491b99d86fb2507259 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 1 Oct 2018 14:09:37 +0800
Subject: [PATCH 312/318] Refactor

---
 saber/funcs/impl/bm/device/bm_common.h | 5 -----
 saber/funcs/impl/bm/vender_conv.cpp    | 5 +++++
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/saber/funcs/impl/bm/device/bm_common.h b/saber/funcs/impl/bm/device/bm_common.h
index b34dd3cd4..b10230dad 100644
--- a/saber/funcs/impl/bm/device/bm_common.h
+++ b/saber/funcs/impl/bm/device/bm_common.h
@@ -143,11 +143,6 @@ static int INLINE addr_EU_align(int addr){
   return ALIGN( addr, EU_NUM ) * FLOAT_SIZE;
 }
 
-static int get_align_tensor_size(bm_tensor_4d_t shape){
-  int c_per_npu = ceiling_func_shift(shape.c, NPU_SHIFT);
-  return shape.n * c_per_npu * get_neuron_csize_local(shape.h, shape.w);
-}
-
 static int INLINE get_cstride_local(int h, int w)
 {
   int size = h * w;
diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
index 388581117..eb3c38681 100644
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ b/saber/funcs/impl/bm/vender_conv.cpp
@@ -11,6 +11,11 @@ namespace anakin
 namespace saber
 {
 
+int get_align_tensor_size(bm_tensor_4d_t shape){
+  int c_per_npu = ceiling_func_shift(shape.c, NPU_SHIFT);
+  return shape.n * c_per_npu * get_neuron_csize_local(shape.h, shape.w);
+}
+
 void conv_splitc(bm_kernel_param_t kernel_param, conv_secs_info_t *secs_info){
   int oc_per_NPU = ceiling_func_shift(kernel_param.oc, NPU_SHIFT);
   int kernel_size = kernel_param.h * kernel_param.w * FLOAT_SIZE;

From 97afe14149d82fadfa7009c37e0ad22a6dd0cfd9 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Fri, 5 Oct 2018 16:50:11 +0800
Subject: [PATCH 313/318] refactor

---
 saber/funcs/impl/bm/device/bmk_conv.c | 30 +++++++++++++--------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/saber/funcs/impl/bm/device/bmk_conv.c b/saber/funcs/impl/bm/device/bmk_conv.c
index 97ce98c06..ce70b02b6 100644
--- a/saber/funcs/impl/bm/device/bmk_conv.c
+++ b/saber/funcs/impl/bm/device/bmk_conv.c
@@ -79,13 +79,13 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
             if (using_bias){
                 dma_command = get_command(ENGINE_GDMA);
                 tensor_compact_move_gen_cmd(
-                    bias_offset_local, // local_mem_start_addr
-                    bias_offset_global + (ig * bias_group_offset + ocstart) * FLOAT_SIZE, // p_coeff_addr
+                    bias_offset_local, // local mem start address
+                    bias_offset_global + (ig * bias_group_offset + ocstart) * FLOAT_SIZE, // global mem start address
                     1, cur_ocslice, 1, 1, // n, c, h, w
                     0, // direction G2L
                     0, // transpose
                     (void *)dma_command,
-                    0, // local memory index
+                    0, // local mem index
                     &id_node
                 );
                 call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
@@ -129,10 +129,10 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
                             (ocstart * output_h + o_ht) * output_w;
                         int local_cstride = get_cstride_local(o_h, output_w);
                         tensor_stride_move_gen_cmd(
-                            ofmap_offset_local, // local_mem_start_addr
-                            ofmap_offset_global + shift * FLOAT_SIZE, // p_tensor_addr
+                            ofmap_offset_local, // local mem start address
+                            ofmap_offset_global + shift * FLOAT_SIZE, // global mem start address
                             sec_len_n, cur_ocslice, o_h, output_w, // n, c, h, w
-                            0, // local memory index
+                            0, // local mem index
                             0, // direction G2L
                             global_ofmap_Nstride, output_h * output_w, output_w, // src stride n,c,h
                             oc_per_NPU * local_cstride, local_cstride, output_w, // dst stride n,c,h
@@ -152,10 +152,10 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
                         if ((icsecs != 1) || (nidx == 0 && hidx == 0)){
                             dma_command = get_command(ENGINE_GDMA);
                             tensor_stride_move_gen_cmd(
-                                weight_offset_local, // local_mem_start_addr
-                                weight_offset_global + shift * FLOAT_SIZE, // p_tensor_addr
+                                weight_offset_local, // local mem start address
+                                weight_offset_global + shift * FLOAT_SIZE, // global mem start address
                                 1, cur_ocslice, cur_icslice, kh * kw, // n, c, h, w
-                                0, // local memory index
+                                0, // local mem index
                                 0, // direction G2L
                                 0, ic * kh * kw, kh * kw, // src stride n,c,h
                                 0, cur_icslice * kh * kw, kh * kw, // dst stride n,c,h
@@ -170,10 +170,10 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
                         int local_cstride = get_cstride_local(i_h, input_w);
                         dma_command = (float*)get_command(ENGINE_GDMA);
                         tensor_stride_move_gen_cmd(
-                            ifmap_offset_local, // local_mem_start_addr
-                            ifmap_offset_global + shift * FLOAT_SIZE, // p_tensor_addr
+                            ifmap_offset_local, // local mem start address
+                            ifmap_offset_global + shift * FLOAT_SIZE, // global mem start address
                             sec_len_n, cur_icslice, i_h, input_w, // n, c, h, w
-                            0, // local memory index
+                            0, // local mem index
                             0, // direction G2L
                             global_ifmap_Nstride, input_h * input_w, input_w, // src stride n,c,h
                             ic_per_NPU * local_cstride, local_cstride, input_w, // dst stride n,c,h
@@ -217,10 +217,10 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
                     
                     dma_command = get_command(ENGINE_GDMA);
                     tensor_stride_move_gen_cmd(
-                        ofmap_offset_local, // local_mem_start_addr
-                        ofmap_offset_global + shift * FLOAT_SIZE, // p_tensor_addr
+                        ofmap_offset_local, // local mem start address
+                        ofmap_offset_global + shift * FLOAT_SIZE, // global mem start address
                         sec_len_n, cur_ocslice, o_h, output_w, // n, c, h, w
-                        0, // local memory index
+                        0, // local mem index
                         1, // direction L2G
                         oc_per_NPU * local_cstride, local_cstride, output_w, // src stride n,c,h
                         global_ofmap_Nstride, output_h * output_w, output_w, // dst stride n,c,h

From e9f6bb6bcbc144df2a44e3f4269c10f35f247989 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Mon, 8 Oct 2018 12:19:49 +0800
Subject: [PATCH 314/318] refactor

---
 saber/funcs/impl/bm/device/bmk_conv.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/saber/funcs/impl/bm/device/bmk_conv.c b/saber/funcs/impl/bm/device/bmk_conv.c
index ce70b02b6..92667a260 100644
--- a/saber/funcs/impl/bm/device/bmk_conv.c
+++ b/saber/funcs/impl/bm/device/bmk_conv.c
@@ -119,10 +119,7 @@ int bm_conv_fwd(bm_api_conv_forward conv_param)
                     int ofmap_tensor_size = sec_len_n * max_oc_per_NPU * ofmap_align_size;
                     int ifmap_offset_local = ofmap_offset_local + ofmap_tensor_size;
                     int offset_local_end = ifmap_offset_local + ifmap_tensor_size;
-                    if (offset_local_end > LOCAL_MEM_SIZE) {
-                        printf("local memory not enough.\n");
-                        return -1;
-                    }
+                    ASSERT(offset_local_end <= LOCAL_MEM_SIZE);
                     if (result_add){
                         dma_command = get_command(ENGINE_GDMA);
                         u64 shift = nstart * global_ofmap_Nstride + ig * ofmap_group_offset +

From 9b63a03066116d8cbcf94c725b4a71f22d5912e7 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 11 Oct 2018 12:45:13 +0800
Subject: [PATCH 315/318] Refactor according to new APIs

---
 cmake/compiler_options.cmake   | 3 ---
 saber/core/impl/bm/bm_impl.cpp | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake
index 21bd43c31..ef4a0dbcf 100644
--- a/cmake/compiler_options.cmake
+++ b/cmake/compiler_options.cmake
@@ -41,9 +41,6 @@ anakin_add_compile_option(-Wsign-promo)
 anakin_add_compile_option(-fdiagnostics-show-option)
 if(USE_BM_PLACE)
     anakin_add_compile_option(-lbmlib-asic)
-    #anakin_add_compile_option(-lopencv_core)
-    #anakin_add_compile_option(-lopencv_imgproc)
-    #anakin_add_compile_option(-lopencv_highgui)
 endif()
 
 if(ENABLE_NOISY_WARNINGS)
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 6144f9630..dd1a194c8 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -74,7 +74,7 @@ void BM_API::mem_alloc(TPtr* ptr, size_t n) {
 }
 
 void BM_API::mem_free(TPtr ptr) {
-    if ((ptr != BM_MEM_NULL)) {
+    if (bm_mem_get_type(ptr) == BM_MEM_TYPE_SYSTEM) {
         bm_free_device(handle, ptr);
         //        delete ptr;
     }

From a8cc6c53fa250905a5a4bed5241ed8ac91952682 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 11 Oct 2018 16:14:57 +0800
Subject: [PATCH 316/318] merge from upstream

---
 framework/core/net/worker.cpp | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/framework/core/net/worker.cpp b/framework/core/net/worker.cpp
index 0af132fa2..9d2b70b72 100644
--- a/framework/core/net/worker.cpp
+++ b/framework/core/net/worker.cpp
@@ -106,19 +106,20 @@ Worker<Ttype, Ptype, RunType>::sync_prediction(std::vector<Tensor4d<typename tar
             d_tensor_in_p->reshape(ins[i].valid_shape());
             d_tensor_in_p->copy_from(ins[i]);
             d_tensor_in_p->set_seq_offset(ins[i].get_seq_offset());
-        } 
-        /*Context<NV> ctx(0, 0, 0); 
+        }
+#ifdef NVIDIA_GPU 
+        Context<NV> ctx(0, 0, 0); 
         saber::SaberTimer<NV> my_time; 
         my_time.start(ctx);
 #ifdef ENABLE_OP_TIMER
         Context<NV> ctx(0, 0, 0); 
         saber::SaberTimer<NV> my_time;
         my_time.start(ctx);
-#endif
+#endif // ENABLE_OP_TIMER
         net.prediction(); 
 
         my_time.end(ctx); 
-        LOG(ERROR) << " exec  << time: " << my_time.get_average_ms() << " ms ";*/
+        LOG(ERROR) << " exec  << time: " << my_time.get_average_ms() << " ms ";
 
 #ifdef ENABLE_OP_TIMER
         my_time.end(ctx); 
@@ -127,7 +128,8 @@ Worker<Ttype, Ptype, RunType>::sync_prediction(std::vector<Tensor4d<typename tar
             _thead_id_to_prediction_times_vec_in_ms[std::this_thread::get_id()].push_back(my_time.get_average_ms());
             LOG(ERROR) << " exec  << time: " << my_time.get_average_ms() << " ms ";
         }
-#endif
+#endif // ENABLE_OP_TIMER
+#endif // NVIDIA_GPU
         // get outputs of graph
         std::vector<Tensor4d<typename target_host<Ttype>::type>> ret;
         ret.resize(_outputs_in_order.size());
@@ -226,6 +228,16 @@ template class Worker<NV, Precision::FP16, OpRunType::SYNC>;
 template class Worker<NV, Precision::INT8, OpRunType::SYNC>;
 #endif
 
+#ifdef AMD_GPU
+template class Worker<AMD, Precision::FP32, OpRunType::ASYNC>;
+template class Worker<AMD, Precision::FP16, OpRunType::ASYNC>;
+template class Worker<AMD, Precision::INT8, OpRunType::ASYNC>;
+
+template class Worker<AMD, Precision::FP32, OpRunType::SYNC>;
+template class Worker<AMD, Precision::FP16, OpRunType::SYNC>;
+template class Worker<AMD, Precision::INT8, OpRunType::SYNC>;
+#endif
+
 #ifdef USE_X86_PLACE
 template class Worker<X86, Precision::FP32, OpRunType::ASYNC>;
 template class Worker<X86, Precision::FP16, OpRunType::ASYNC>;
@@ -256,4 +268,3 @@ template class Worker<ARM, Precision::INT8, OpRunType::SYNC>;
 #endif
 
 } /* namespace */
-

From b42ca192c740c9cfc56197cd6329273688a37831 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Thu, 11 Oct 2018 16:17:43 +0800
Subject: [PATCH 317/318] revert bm conv

---
 saber/funcs/conv.h                         |   2 +-
 saber/funcs/impl/bm/device/bmk_conv.c      | 235 -----------------
 saber/funcs/impl/bm/device/bmkernel_base.c |   5 +-
 saber/funcs/impl/bm/vender_conv.cpp        | 286 ---------------------
 saber/funcs/impl/bm/vender_conv.h          |  36 ---
 5 files changed, 3 insertions(+), 561 deletions(-)
 delete mode 100644 saber/funcs/impl/bm/device/bmk_conv.c
 delete mode 100644 saber/funcs/impl/bm/vender_conv.cpp
 delete mode 100644 saber/funcs/impl/bm/vender_conv.h

diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h
index 939ce87d1..834335b0a 100644
--- a/saber/funcs/conv.h
+++ b/saber/funcs/conv.h
@@ -34,7 +34,7 @@
 #endif
 
 #ifdef USE_BM_PLACE
-#include "saber/funcs/impl/bm/vender_conv.h"
+//#include "saber/funcs/impl/bm/vender_conv.h"
 #endif
 namespace anakin {
 namespace saber {
diff --git a/saber/funcs/impl/bm/device/bmk_conv.c b/saber/funcs/impl/bm/device/bmk_conv.c
deleted file mode 100644
index 92667a260..000000000
--- a/saber/funcs/impl/bm/device/bmk_conv.c
+++ /dev/null
@@ -1,235 +0,0 @@
-#include <stdio.h>
-#include "bm_common.h"
-#include "atomic_dma_gen_cmd.h"
-#include "atomic_conv_gen_cmd.h"
-
-int bm_conv_fwd(bm_api_conv_forward conv_param)
-{
-    // Unpack parameters
-    u64 ifmap_offset_global = conv_param.ifmap_offset_global;
-    u64 ofmap_offset_global = conv_param.ofmap_offset_global;
-    u64 weight_offset_global = conv_param.weight_offset_global;
-    u64 bias_offset_global = conv_param.bias_offset_global;
-    int input_n = conv_param.input_n;
-    int input_c = conv_param.input_c;
-    int input_h = conv_param.input_h;
-    int input_w = conv_param.input_w;
-    int groups = conv_param.groups;
-    int output_c = conv_param.output_c;
-    int kh = conv_param.kh;
-    int kw = conv_param.kw;
-    int dh = conv_param.dh;
-    int dw = conv_param.dw;
-    int pad_h = conv_param.pad_h;
-    int pad_w = conv_param.pad_w;
-    int stride_h = conv_param.stride_h;
-    int stride_w = conv_param.stride_w;
-    int using_bias = conv_param.using_bias;
-    int result_add = conv_param.result_add;
-    int icsecs = conv_param.icsecs;
-    int ocsecs = conv_param.ocsecs;
-    int nsecs = conv_param.nsecs;
-    int hsecs = conv_param.hsecs;
-
-    P_COMMAND dma_command;
-    CMD_ID_NODE id_node;
-    resync_cmd_id( &id_node );
-
-    int kh_ext = dh * (kh - 1) + 1;
-    int kw_ext = dw * (kw - 1) + 1;
-    int output_h = (input_h + 2 * pad_h - kh_ext) / stride_h + 1;
-    int output_w = (input_w + 2 * pad_w - kw_ext) / stride_w + 1;
-
-    int ic = input_c / groups;
-    int oc = output_c / groups;
-    int ic_per_NPU = ceiling_func_shift(ic, NPU_SHIFT);
-    int oc_per_NPU = ceiling_func_shift(oc, NPU_SHIFT);
-    int bias_offset_local = 0;
-    int bias_tensor_size = oc_per_NPU * FLOAT_SIZE;
-    int weight_offset_local = bias_offset_local + bias_tensor_size;
-    int weight_group_offset = oc * ic * kh * kw;
-    int weight_tensor_size = ic * oc_per_NPU * kh * kw * FLOAT_SIZE;
-    int weight_capacity = addr_EU_align(weight_tensor_size + bias_tensor_size);
-    int ifmap_group_offset = ic * input_h * input_w;
-    int ofmap_group_offset = oc * output_h * output_w;
-    int global_ifmap_Nstride = ifmap_group_offset * groups;
-    int global_ofmap_Nstride = ofmap_group_offset * groups;
-    int nslice = input_n, ocslice = oc, icslice = ic, hslice = output_h;
-    nslice = input_n / nsecs;
-    int n_residual = input_n - nslice * nsecs;
-    hslice = output_h / hsecs;
-    int h_residual = output_h - hslice * hsecs;
-    icslice = ic / icsecs;
-    int ic_residual = ic - icslice * icsecs;
-    ocslice = oc / ocsecs;
-    int oc_residual = oc - ocslice * ocsecs;
-    int bias_group_offset = oc;
-    int max_icslice = icslice + (ic_residual > 0);
-    int max_ic_per_NPU = ceiling_func_shift(max_icslice, NPU_SHIFT);
-    int max_ocslice = ocslice + (oc_residual > 0);
-    int max_oc_per_NPU = ceiling_func_shift(max_ocslice, NPU_SHIFT);
-
-    for (int ig = 0; ig < groups; ig++){
-        int ocend = 0;
-        for (int ocidx = 0; ocidx < ocsecs; ocidx++){
-            int ocstart = ocend;
-            int cur_ocslice = ocslice + (oc_residual > ocidx);
-            ocend = ocstart + cur_ocslice;
-            oc_per_NPU = ceiling_func_shift(cur_ocslice, NPU_SHIFT);
-            if (using_bias){
-                dma_command = get_command(ENGINE_GDMA);
-                tensor_compact_move_gen_cmd(
-                    bias_offset_local, // local mem start address
-                    bias_offset_global + (ig * bias_group_offset + ocstart) * FLOAT_SIZE, // global mem start address
-                    1, cur_ocslice, 1, 1, // n, c, h, w
-                    0, // direction G2L
-                    0, // transpose
-                    (void *)dma_command,
-                    0, // local mem index
-                    &id_node
-                );
-                call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
-            }
-            weight_capacity = max_icslice * oc_per_NPU * kh * kw * FLOAT_SIZE;
-            int ofmap_offset_local = addr_EU_align(weight_capacity + weight_offset_local);
-            int nend = 0;
-            for (int nidx = 0; nidx < nsecs; nidx++){
-                int nstart = nend;
-                int sec_len_n = nslice + (nidx < n_residual);
-                nend = nstart + sec_len_n;
-                int o_hb = 0;
-                for (int hidx = 0; hidx < hsecs; hidx++){
-                    int o_ht = o_hb;
-                    int o_h = hslice + (h_residual > hidx);
-                    o_hb = o_ht + o_h;
-                    int i_ht = bm_max(o_ht * stride_h - pad_h, 0);
-                    int pad_h_t = 0;
-                    if (i_ht == 0){
-                        pad_h_t = pad_h - o_ht * stride_h;
-                    }
-                    int i_hb = bm_min(o_hb * stride_h + kh_ext - 1 - pad_h, input_h);
-                    int pad_h_b = 0;
-                    if (i_hb == input_h){
-                        pad_h_b = o_hb * stride_h + kh_ext - 1 - pad_h - input_h;
-                    }
-                    int i_h = i_hb - i_ht;
-                    int ifmap_align_size = get_neuron_csize_local(i_h, input_w);
-                    int ifmap_tensor_size = sec_len_n * max_ic_per_NPU * ifmap_align_size;
-                    int ofmap_align_size = get_neuron_csize_local(o_h, output_w);
-                    int ofmap_tensor_size = sec_len_n * max_oc_per_NPU * ofmap_align_size;
-                    int ifmap_offset_local = ofmap_offset_local + ofmap_tensor_size;
-                    int offset_local_end = ifmap_offset_local + ifmap_tensor_size;
-                    ASSERT(offset_local_end <= LOCAL_MEM_SIZE);
-                    if (result_add){
-                        dma_command = get_command(ENGINE_GDMA);
-                        u64 shift = nstart * global_ofmap_Nstride + ig * ofmap_group_offset +
-                            (ocstart * output_h + o_ht) * output_w;
-                        int local_cstride = get_cstride_local(o_h, output_w);
-                        tensor_stride_move_gen_cmd(
-                            ofmap_offset_local, // local mem start address
-                            ofmap_offset_global + shift * FLOAT_SIZE, // global mem start address
-                            sec_len_n, cur_ocslice, o_h, output_w, // n, c, h, w
-                            0, // local mem index
-                            0, // direction G2L
-                            global_ofmap_Nstride, output_h * output_w, output_w, // src stride n,c,h
-                            oc_per_NPU * local_cstride, local_cstride, output_w, // dst stride n,c,h
-                            GDMA_TYPE_f32, 
-                            0, // transpose 
-                            dma_command, &id_node
-                        );
-                        call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
-                    }
-                    int icend = 0;
-                    for (int icidx = 0; icidx < icsecs; icidx++){
-                        int icstart = icend;
-                        int cur_icslice = icslice + (ic_residual > icidx);
-                        icend = icstart + cur_icslice;
-                        ic_per_NPU = ceiling_func_shift(cur_icslice, NPU_SHIFT);
-                        u64 shift = (ocstart * ic + icstart) * kh * kw + ig * weight_group_offset;
-                        if ((icsecs != 1) || (nidx == 0 && hidx == 0)){
-                            dma_command = get_command(ENGINE_GDMA);
-                            tensor_stride_move_gen_cmd(
-                                weight_offset_local, // local mem start address
-                                weight_offset_global + shift * FLOAT_SIZE, // global mem start address
-                                1, cur_ocslice, cur_icslice, kh * kw, // n, c, h, w
-                                0, // local mem index
-                                0, // direction G2L
-                                0, ic * kh * kw, kh * kw, // src stride n,c,h
-                                0, cur_icslice * kh * kw, kh * kw, // dst stride n,c,h
-                                GDMA_TYPE_f32, 
-                                0, // transpose 
-                                dma_command, &id_node
-                            );
-                            call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
-                        }
-                        shift = nstart * global_ifmap_Nstride + ig * ifmap_group_offset +
-                                (icstart * input_h + i_ht) * input_w;
-                        int local_cstride = get_cstride_local(i_h, input_w);
-                        dma_command = (float*)get_command(ENGINE_GDMA);
-                        tensor_stride_move_gen_cmd(
-                            ifmap_offset_local, // local mem start address
-                            ifmap_offset_global + shift * FLOAT_SIZE, // global mem start address
-                            sec_len_n, cur_icslice, i_h, input_w, // n, c, h, w
-                            0, // local mem index
-                            0, // direction G2L
-                            global_ifmap_Nstride, input_h * input_w, input_w, // src stride n,c,h
-                            ic_per_NPU * local_cstride, local_cstride, input_w, // dst stride n,c,h
-                            GDMA_TYPE_f32, 
-                            0, // transpose
-                            dma_command, &id_node
-                        );
-                        call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
-
-                        local_shape_t ifshape, ofshape;
-                        ifshape.n = sec_len_n;
-                        ifshape.c = cur_icslice;
-                        ifshape.h = i_h;
-                        ifshape.w = input_w;
-                        ofshape.c = cur_ocslice;
-                        ofshape.h = o_h;
-                        ofshape.w = output_w;
-                        P_COMMAND conv_command = get_command(ENGINE_BD);
-                        atomic_conv_kernel_stride_gen_cmd(
-                            conv_command,
-                            LOCAL_MEM_START_ADDR | ifmap_offset_local, // input address
-                            LOCAL_MEM_START_ADDR | ofmap_offset_local, // output address
-                            LOCAL_MEM_START_ADDR | weight_offset_local, // weight address
-                            LOCAL_MEM_START_ADDR | bias_offset_local, // bias address
-                            ifshape, // input shape
-                            ofshape, // output shape
-                            kh, kw, // kernel h, w
-                            dh, dw, // dilation h, w
-                            kh * kw, cur_icslice * kh * kw, kw, // kernel stride n,c,h
-                            pad_h_t, pad_h_b, pad_w, pad_w, // pad top, bottom, left, right
-                            stride_h, stride_w, // stride h, w
-                            icidx == icsecs - 1 ? using_bias: 0, // use bias
-                            result_add || icidx > 0, // add result
-                            &id_node
-                        );
-                        call_atomic(nodechip_idx, atomic_conv_neuron, conv_command, ENGINE_BD);
-                    }
-                    u64 shift = nstart * global_ofmap_Nstride + ig * ofmap_group_offset +
-                                (ocstart * output_h + o_ht) * output_w;
-                    int local_cstride = get_cstride_local(o_h, output_w);
-                    
-                    dma_command = get_command(ENGINE_GDMA);
-                    tensor_stride_move_gen_cmd(
-                        ofmap_offset_local, // local mem start address
-                        ofmap_offset_global + shift * FLOAT_SIZE, // global mem start address
-                        sec_len_n, cur_ocslice, o_h, output_w, // n, c, h, w
-                        0, // local mem index
-                        1, // direction L2G
-                        oc_per_NPU * local_cstride, local_cstride, output_w, // src stride n,c,h
-                        global_ofmap_Nstride, output_h * output_w, output_w, // dst stride n,c,h
-                        GDMA_TYPE_f32, 
-                        0, // transpose
-                        dma_command, &id_node
-                    );
-                    call_atomic(nodechip_idx, atomic_global_dma, dma_command, ENGINE_GDMA);
-                }
-            }
-        }
-    }
-    poll_all_engine_done(&id_node);
-    return 0;
-}
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
index 39a59f6f9..4c890b20e 100644
--- a/saber/funcs/impl/bm/device/bmkernel_base.c
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -1,6 +1,5 @@
 #include "bmkernel_base.h"
 #include "bm_config.h"
-#include "bmk_conv.c"
 #include <stdio.h>
 /**
  * bmkernel_func is the user entry to BMKERNEL just like "main" to some applications.
@@ -18,8 +17,8 @@ int bmkernel_func(void *args)
             return 0;
         }
         case CONV: {
-            bm_api_conv_forward* api = (bm_api_conv_forward *)param->opParam;
-            return bm_conv_fwd(*api);
+            // bm_api_conv_forward* api = (bm_api_conv_forward *)param->opParam;
+            return 0;
         }
         default: {
             printf("op %d is not supported by BM yet.\n", param->op);
diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp
deleted file mode 100644
index eb3c38681..000000000
--- a/saber/funcs/impl/bm/vender_conv.cpp
+++ /dev/null
@@ -1,286 +0,0 @@
-
-#include "saber/funcs/impl/bm/vender_conv.h"
-#include "bmkernel_base.h"
-#include "bm_common.h"
-#include <string.h>
-#include <stdio.h>
-#include <iostream>
-
-namespace anakin
-{
-namespace saber
-{
-
-int get_align_tensor_size(bm_tensor_4d_t shape){
-  int c_per_npu = ceiling_func_shift(shape.c, NPU_SHIFT);
-  return shape.n * c_per_npu * get_neuron_csize_local(shape.h, shape.w);
-}
-
-void conv_splitc(bm_kernel_param_t kernel_param, conv_secs_info_t *secs_info){
-  int oc_per_NPU = ceiling_func_shift(kernel_param.oc, NPU_SHIFT);
-  int kernel_size = kernel_param.h * kernel_param.w * FLOAT_SIZE;
-  int weight_capacity = kernel_param.ic * oc_per_NPU * kernel_size;
-  secs_info->icsecs = 1;
-  secs_info->ocsecs = 1;
-  const int quart_local_size = (LOCAL_MEM_SIZE >> 2);
-  if( weight_capacity > (LOCAL_MEM_SIZE >> 1) ){
-    const int max_weight_size = quart_local_size;
-    secs_info->icsecs = weight_capacity / max_weight_size + 1;
-    if(secs_info->icsecs > kernel_param.ic){
-      secs_info->icsecs = kernel_param.ic;
-    }
-    int icslice = (kernel_param.ic + secs_info->icsecs - 1) / secs_info->icsecs;
-    weight_capacity = icslice * oc_per_NPU * kernel_size * FLOAT_SIZE;
-    weight_capacity = addr_EU_align( weight_capacity);
-    int max_ocsecs = oc_per_NPU;
-    while( weight_capacity > max_weight_size ){
-      if(secs_info->ocsecs == 1){
-        secs_info->ocsecs = weight_capacity / quart_local_size + 1;
-      }
-      if(secs_info->ocsecs > max_ocsecs){
-        secs_info->ocsecs = max_ocsecs;
-        break;
-      }else{
-        secs_info->ocsecs++;
-      }
-      int ocslice = (kernel_param.oc + secs_info->ocsecs - 1) / secs_info->ocsecs;
-      oc_per_NPU = ceiling_func_shift(ocslice, NPU_SHIFT);
-      weight_capacity = icslice * oc_per_NPU * kernel_size * FLOAT_SIZE;
-      weight_capacity = addr_EU_align(weight_capacity);
-    }
-  }
-}
-
-static bm_status_t conv_splith(bm_tensor_4d_t input_shape, bm_tensor_4d_t output_shape,
-    bm_conv_param_t conv_param, int local_mem_capacity, int kh, conv_secs_info_t *secs_info){
-  int io_need = get_align_tensor_size(input_shape) +
-      get_align_tensor_size(output_shape);
-  secs_info->hsecs = io_need / local_mem_capacity;
-  int output_h = output_shape.h;
-  output_shape.h = (output_h + secs_info->hsecs - 1) / secs_info->hsecs;
-  input_shape.h = output_shape.h * conv_param.stride_h + kh;
-  while(io_need > local_mem_capacity){
-    if(secs_info->hsecs == output_h){
-      return BM_NOT_SUPPORTED; 
-    }
-    secs_info->hsecs++;
-    output_shape.h = (output_h + secs_info->hsecs - 1) / secs_info->hsecs;
-    input_shape.h = output_shape.h * conv_param.stride_h + kh;
-    io_need = get_align_tensor_size(input_shape) +
-                       get_align_tensor_size(output_shape);
-  }
-  return BM_SUCCESS;
-}
-
-static bm_status_t get_conv_secs_info(
-    bm_tensor_4d_t    input_shape,
-    bm_kernel_param_t kernel_param,
-    bm_tensor_4d_t    output_shape,
-    bool              with_bias,
-    bm_conv_param_t   conv_param,
-    conv_secs_info_t *secs_info){
-  int ic = kernel_param.ic;
-  int oc = kernel_param.oc;
-  int oc_per_NPU = ceiling_func_shift(oc, NPU_SHIFT);
-  int bias_tensor_size = oc_per_NPU * FLOAT_SIZE;
-  if(!with_bias){
-    bias_tensor_size = 0;
-  }
-  int kernel_size = kernel_param.h * kernel_param.w;
-  int weight_tensor_size = ic * oc_per_NPU * kernel_size * FLOAT_SIZE;
-  int weight_capacity = addr_EU_align( weight_tensor_size  + bias_tensor_size);
-  int ifmap_total_tensor_size = get_align_tensor_size(input_shape);
-  int ofmap_total_tensor_size = get_align_tensor_size(output_shape);
-  int totalneed_local_size = ifmap_total_tensor_size +
-                          ofmap_total_tensor_size + weight_capacity;
-  secs_info->nsecs = 1; secs_info->hsecs = 1;
-  if(totalneed_local_size > LOCAL_MEM_SIZE){
-    //if weight_capacity > 2 * bank_size then split oc and ic
-    conv_splitc(kernel_param, secs_info);
-    int ocslice = (oc + secs_info->ocsecs - 1) / secs_info->ocsecs;
-    int icslice = (ic + secs_info->icsecs - 1) / secs_info->icsecs;
-    oc_per_NPU = ceiling_func_shift(ocslice, NPU_SHIFT);
-
-    weight_capacity = icslice * oc_per_NPU * kernel_size * FLOAT_SIZE;
-    weight_capacity = addr_EU_align( weight_capacity + bias_tensor_size );
-    int local_mem_capacity = LOCAL_MEM_SIZE - weight_capacity;
-    CHECK_GT(local_mem_capacity, 0) << "local memory capacity not enough";
-    input_shape.c = icslice;
-    output_shape.c = ocslice;
-    ifmap_total_tensor_size = get_align_tensor_size(input_shape);
-    ofmap_total_tensor_size = get_align_tensor_size(output_shape);
-    int totalneed_local_size = ifmap_total_tensor_size + ofmap_total_tensor_size;
-    if(totalneed_local_size > local_mem_capacity){
-      int kh_ext = conv_param.dilation_h * (kernel_param.h - 1) + 1;
-      if(input_shape.n > 1){
-        if( totalneed_local_size > local_mem_capacity * input_shape.n){
-          secs_info->nsecs = input_shape.n;
-          output_shape.n = input_shape.n = 1;
-          bm_status_t result = conv_splith(input_shape, output_shape,
-              conv_param, local_mem_capacity, kh_ext, secs_info);
-          if(result == BM_NOT_SUPPORTED){
-            return result;
-          }
-        }else{
-          int input_n = input_shape.n;
-          secs_info->nsecs = (totalneed_local_size + local_mem_capacity - 1) / local_mem_capacity;
-          input_shape.n = (input_n + secs_info->nsecs - 1) / secs_info->nsecs;
-          output_shape.n = input_shape.n;
-          totalneed_local_size = get_align_tensor_size(input_shape) +
-                       get_align_tensor_size(output_shape);
-          while(totalneed_local_size > local_mem_capacity){
-            secs_info->nsecs++;
-            input_shape.n = (input_n + secs_info->nsecs - 1) / secs_info->nsecs;
-            output_shape.n = input_shape.n;
-            totalneed_local_size = get_align_tensor_size(input_shape) +
-                       get_align_tensor_size(output_shape);
-          }
-        }
-      }else{
-        bm_status_t result = conv_splith(input_shape, output_shape,
-            conv_param, local_mem_capacity, kh_ext, secs_info);
-        if(result == BM_NOT_SUPPORTED){
-          return result;
-        }
-      }
-    }
-  }else{
-    secs_info->icsecs = 1;
-    secs_info->ocsecs = 1;
-  }
-  return BM_SUCCESS;
-}
-
-// FP32 part
-template <>
-SaberStatus VenderConv2D<BM, AK_FLOAT>::\
-    create(const std::vector<Tensor<BM> *>& inputs,
-            std::vector<Tensor<BM> *>& outputs,
-            ConvParam<BM>& param, Context<BM>& ctx)
-{
-}
-
-template <>
-SaberStatus VenderConv2D<BM, AK_FLOAT>::\
-    init(const std::vector<Tensor<BM> *> &inputs,
-         std::vector<Tensor<BM> *> &outputs,
-         ConvParam<BM> &param, Context<BM> &ctx)
-{
-
-    _handle = ctx.get_handle();
-    return create(inputs, outputs, param, ctx);
-}
-
-template <>
-SaberStatus VenderConv2D<BM, AK_FLOAT>::\
-    dispatch(const std::vector<Tensor<BM>*>& inputs,
-                std::vector<Tensor<BM>*>& outputs,
-                ConvParam<BM>& param)
-{
-    const BM_mem_addr in_data = (const BM_mem_addr) inputs[0]->data();
-    BM_mem_addr out_data = (BM_mem_addr) outputs[0]->mutable_data();
-    const BM_mem_addr weight = (const BM_mem_addr) param.weight()->data();
-
-    int input_n = inputs[0]->num();
-    int input_c = inputs[0]->channel();
-    int input_h = inputs[0]->height();
-    int input_w = inputs[0]->width();
-
-    int output_n = outputs[0]->num();
-    int output_c = outputs[0]->channel();
-    int output_h = outputs[0]->height();
-    int output_w = outputs[0]->width();
-
-    int group = param.group;
-    int kh = param.weight()->height();
-    int kw = param.weight()->width();
-    int pad_h = param.pad_h;
-    int pad_w = param.pad_w;
-    int stride_h = param.stride_h;
-    int stride_w = param.stride_w;
-    int dilation_h = param.dilation_h;
-    int dilation_w = param.dilation_w;
-
-    bool with_bias = param.bias()->size() > 0;
-    const bm_mem_desc bias = with_bias ? (const bm_mem_desc) param.bias()->data() : BM_MEM_NULL;
-
-    bm_tensor_4d_t input_shape = {
-        input_n,
-        input_c,
-        input_h,
-        input_w};
-
-    bm_tensor_4d_t output_shape = {
-        output_n,
-        output_c,
-        output_h,
-        output_w};
-
-    bm_kernel_param_t kernel_param = {
-        group,
-        output_c,
-        input_c,
-        kh,
-        kw};
-
-    bm_conv_param_t conv_param = {
-        stride_h,
-        stride_w,
-        pad_h,
-        pad_w,
-        dilation_h,
-        dilation_w,
-        0};
-
-    bm_device_mem_t input_buf_mem = in_data;
-    // TODO: handle special case with pooling op
-
-    conv_secs_info_t secs_info;
-    bm_status_t result = get_conv_secs_info(input_shape, kernel_param,
-          output_shape, with_bias, conv_param, &secs_info);
-    CHECK_EQ(BM_SUCCESS, result) << "local memory is not enough in conv.";
-
-    bm_api_conv_forward bm_conv_param = {
-      bm_mem_get_device_addr(input_buf_mem),
-      bm_mem_get_device_addr(out_data),
-      bm_mem_get_device_addr(weight),
-      with_bias ? bm_mem_get_device_addr(bias) : BM_MEM_ADDR_NULL,
-      input_shape.n,
-      input_shape.c,
-      input_shape.h,
-      input_shape.w,
-      kernel_param.g,
-      output_shape.c,
-      kernel_param.h,
-      kernel_param.w,
-      conv_param.dilation_h,
-      conv_param.dilation_w,
-      conv_param.pad_h,
-      conv_param.pad_w,
-      conv_param.stride_h,
-      conv_param.stride_w,
-      with_bias,
-      conv_param.result_add,
-      secs_info.icsecs,
-      secs_info.ocsecs,
-      secs_info.nsecs,
-      secs_info.hsecs
-    };
-
-    bm_status_t bm_stat = bmlib_kernel_launch(_handle, "/usr/local/include/bm/bmkernel_bin.bin");
-    CHECK_EQ(BM_SUCCESS, bm_stat) << "bmlib_kernel_launch failed.";
-    
-    /* Send arguments. */
-    enum BmOpType op = CONV;
-    bmkernel_api_base api = { op, reinterpret_cast<void *>(&bm_conv_param) };
-    BM_CHECK(bmlib_kernel_send_args(_handle, reinterpret_cast<void *>(&api), sizeof(api)));
-
-    return SaberSuccess;
-}
-
-// INT8 part
-// Not supported yet
-
-template class VenderConv2D<BM, AK_FLOAT>;
-} // namespace saber
-} // namespace anakin
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h
deleted file mode 100644
index b37ed1f9b..000000000
--- a/saber/funcs/impl/bm/vender_conv.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_CONV2D_H
-#define ANAKIN_SABER_FUNCS_IMPL_BM_CONV2D_H
-
-#include "saber/funcs/impl/impl_conv.h"
-
-namespace anakin{
-
-namespace saber{
-
-template <DataType OpDtype>
-class VenderConv2D<BM, OpDtype> : public ImplBase<
-        BM, OpDtype, ConvParam<BM> > {
-            
-public:
-    VenderConv2D(): _handle(NULL) {}
-    ~VenderConv2D() {}
-
-    virtual SaberStatus init(const std::vector<Tensor<BM> *>& inputs,
-                             std::vector<Tensor<BM> *>& outputs,
-                             ConvParam<BM>& param, Context<BM>& ctx);
-
-    virtual SaberStatus create(const std::vector<Tensor<BM> *>& inputs,
-                               std::vector<Tensor<BM> *>& outputs,
-                               ConvParam<BM>& param, Context<BM>& ctx);
-
-    virtual SaberStatus dispatch(const std::vector<Tensor<BM>*>& inputs,
-                                 std::vector<Tensor<BM>*>& outputs,
-                                 ConvParam<BM>& param);
-
-private:
-    bm_handle_t _handle;
-};
-
-}
-}
-#endif //ANAKIN_SABER_FUNCS_IMPL_BM_CONV2D_H

From 2a1c06adff15df1c6d12f98d8c3c6d9f537fbf36 Mon Sep 17 00:00:00 2001
From: guangzhixie <guangzhi.xie@berkeley.edu>
Date: Tue, 23 Oct 2018 16:33:10 +0800
Subject: [PATCH 318/318] resolve permission issue

---
 saber/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index 80e1fc9db..c31f7f941 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -111,7 +111,7 @@ if(USE_BM_PLACE)
 
         COMMAND cat ${BIN_NAME}_itcm.hex.sim >> ${BIN_NAME}.bin
         COMMAND cat ${BIN_NAME}_ddr.hex.sim >> ${BIN_NAME}.bin
-        COMMAND cp ${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin ${BM_ROOT}/${BIN_NAME}.bin
+        COMMAND cp ${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin /var/tmp/${BIN_NAME}.bin
         COMMENT "BM Kernel compilation..."
     )
     add_custom_target(ANAKIN ALL DEPENDS bm_kernel_tmp)