From 3b63b66129305c04bfb484d192cf794cb6537b02 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Mon, 18 Jun 2018 13:39:29 +0800 Subject: [PATCH 001/318] Initial checkin for BM device support --- CMakeLists.txt | 21 + cmake/compiler_options.cmake | 18 + cmake/config/anakin_config.h.in | 2 + cmake/gather.cmake | 6 + framework/core/data_types.h | 9 + saber/CMakeLists.txt | 38 +- saber/core/common.h | 14 + saber/core/impl/bm/bm_device.cpp | 24 + saber/core/impl/bm/bm_impl.cpp | 89 ++ saber/core/target_traits.h | 7 + saber/core/target_wrapper.h | 58 ++ saber/core/tensor_op.cpp | 94 ++ saber/funcs/CMakeLists.txt | 12 + saber/funcs/impl/bm/base/CMakeLists.txt | 20 + .../impl/bm/base/include/bmdnn/bmdnn_api.h | 814 ++++++++++++++++++ .../bm/base/include/bmdnn/bmdnn_ext_api.h | 438 ++++++++++ .../bm/base/include/bmdnn/bmdnn_runtime.h | 20 + .../impl/bm/base/include/bmdnn/op_code.h | 62 ++ .../bm/base/include/bmlib/bmlib_runtime.h | 229 +++++ .../impl/bm/base/include/bmlib/bmlib_utils.h | 72 ++ .../impl/bm/base/include/bmruntime/bmblob.h | 97 +++ .../impl/bm/base/include/bmruntime/bmcnnctx.h | 58 ++ .../impl/bm/base/include/bmruntime/bmnet.h | 78 ++ .../bm/base/include/bmruntime/bmruntime.h | 154 ++++ .../base/include/bmruntime/bmruntime_common.h | 65 ++ .../include/bmruntime/bmruntime_interface.h | 11 + saber/funcs/impl/bm/vender_activation.h | 96 +++ saber/funcs/impl/bm/vender_conv.h | 195 +++++ saber/funcs/impl/bm/vender_conv_act.h | 198 +++++ saber/funcs/impl/bm/vender_conv_act_pooling.h | 176 ++++ saber/funcs/impl/bm/vender_fc.h | 114 +++ saber/funcs/impl/bm/vender_pooling.h | 151 ++++ saber/saber_funcs_param.h | 85 ++ saber/saber_types.h | 31 +- test/CMakeLists.txt | 4 + test/saber/bm/test_TargetWrapper_BM.cpp | 16 + test/saber/bm/test_saber_buffer_BM.cpp | 116 +++ test/saber/bm/test_saber_buffer_BM.h | 20 + test/saber/bm/test_saber_context_BM.cpp | 31 + test/saber/bm/test_saber_context_BM.h | 21 + test/saber/bm/test_saber_device_BM.cpp | 20 + 
test/saber/bm/test_saber_device_BM.h | 21 + test/saber/bm/test_saber_func_BM.h | 38 + .../bm/test_saber_func_activation_BM.cpp | 183 ++++ test/saber/bm/test_saber_func_conv_BM.cpp | 725 ++++++++++++++++ test/saber/bm/test_saber_func_fc_BM.cpp | 148 ++++ test/saber/bm/test_saber_func_pooling_BM.cpp | 311 +++++++ test/saber/bm/test_saber_shape_BM.cpp | 126 +++ test/saber/bm/test_saber_shape_BM.h | 25 + test/saber/bm/test_saber_tensor_BM.cpp | 642 ++++++++++++++ test/saber/bm/test_saber_tensor_BM.h | 21 + 51 files changed, 6016 insertions(+), 8 deletions(-) create mode 100644 saber/core/impl/bm/bm_device.cpp create mode 100644 saber/core/impl/bm/bm_impl.cpp create mode 100644 saber/funcs/impl/bm/base/CMakeLists.txt create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/op_code.h create mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h create mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmblob.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmnet.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h create mode 100644 saber/funcs/impl/bm/vender_activation.h create mode 100644 saber/funcs/impl/bm/vender_conv.h create mode 100644 saber/funcs/impl/bm/vender_conv_act.h create mode 100644 saber/funcs/impl/bm/vender_conv_act_pooling.h create mode 100644 saber/funcs/impl/bm/vender_fc.h create mode 100644 saber/funcs/impl/bm/vender_pooling.h create mode 100644 
test/saber/bm/test_TargetWrapper_BM.cpp create mode 100644 test/saber/bm/test_saber_buffer_BM.cpp create mode 100644 test/saber/bm/test_saber_buffer_BM.h create mode 100644 test/saber/bm/test_saber_context_BM.cpp create mode 100644 test/saber/bm/test_saber_context_BM.h create mode 100644 test/saber/bm/test_saber_device_BM.cpp create mode 100644 test/saber/bm/test_saber_device_BM.h create mode 100644 test/saber/bm/test_saber_func_BM.h create mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp create mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp create mode 100644 test/saber/bm/test_saber_func_pooling_BM.cpp create mode 100644 test/saber/bm/test_saber_shape_BM.cpp create mode 100644 test/saber/bm/test_saber_shape_BM.h create mode 100644 test/saber/bm/test_saber_tensor_BM.cpp create mode 100644 test/saber/bm/test_saber_tensor_BM.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c4dbfc25d..0a81d7c02 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,9 +63,22 @@ anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_CUDA) anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_CUDA) anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_CUDA) anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM) + +# compile options for BM place +anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU) +anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM) +anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM) +anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM) +anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM) +anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM) + + if(USE_CUDA) # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well. 
set(SELECTED_SASS_TARGET_ARCH "61") +elseif(USE_BM) + # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well. + #set(SELECTED_SASS_TARGET_ARCH "61") endif() if((NOT BUILD_FAT_BIN) AND (NOT BUILD_CROSS_PLANTFORM) AND USE_CUDA) # Select the only nvidia gpu arch you want to be built on @@ -76,6 +89,10 @@ endif() anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_CUDA) anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_CUDA) +# build options for BM. +anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_BM) +anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_BM) + # common build options anakin_option(ENABLE_DEBUG "Enable DEBUG(default) mode." NO) @@ -140,6 +157,10 @@ if(USE_CUDA) include(cmake/cuda.cmake) endif() +if(USE_BM) + #include(cmake/cuda.cmake) +endif() + if(USE_X86_PLACE) set(ANAKIN_TEMP_THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/third-party) if(USE_MKLML) diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake index 169c042fc..49d133c7f 100644 --- a/cmake/compiler_options.cmake +++ b/cmake/compiler_options.cmake @@ -112,3 +112,21 @@ if(USE_CUDA) # set default nvidia gpu arch set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1") endif() + +if(USE_BM) + if(CMAKE_BUILD_TYPE MATCHES Debug) + anakin_add_compile_option("-Xcompiler -fPIC" NVCC) + anakin_add_compile_option(-G NVCC) + anakin_add_compile_option(-g NVCC) + anakin_add_compile_option(-std=c++11 NVCC) + anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) # suppress warning by architectures are deprecated (2.0,2.1) + else() + anakin_add_compile_option("-Xcompiler -fPIC" NVCC) + anakin_add_compile_option(-O3 NVCC) + anakin_add_compile_option(-std=c++11 NVCC) + anakin_add_compile_option("--default-stream per-thread" NVCC) + anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) + endif() + # set 
default nvidia gpu arch + set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1") +endif() diff --git a/cmake/config/anakin_config.h.in b/cmake/config/anakin_config.h.in index b75990953..0a8560593 100644 --- a/cmake/config/anakin_config.h.in +++ b/cmake/config/anakin_config.h.in @@ -35,6 +35,8 @@ #cmakedefine USE_CUDA +#cmakedefine USE_BM + #cmakedefine USE_CUDNN #cmakedefine USE_PYTHON diff --git a/cmake/gather.cmake b/cmake/gather.cmake index cc7b3cc27..5017efff7 100644 --- a/cmake/gather.cmake +++ b/cmake/gather.cmake @@ -17,6 +17,12 @@ if(USE_CUDA) anakin_find_cuda() endif() +if(USE_BM) + #set other cuda path + #set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_PATH}) + #anakin_find_cuda() +endif() + # find opencl if(USE_OPENCL) diff --git a/framework/core/data_types.h b/framework/core/data_types.h index f06db5bdc..16bfccd08 100644 --- a/framework/core/data_types.h +++ b/framework/core/data_types.h @@ -17,6 +17,7 @@ #define ANAKIN_DATA_TYPES_H #include "framework/core/parameter.h" +#include "bmlib_runtime.h" #include namespace anakin { @@ -45,6 +46,7 @@ SABER_TO_BASE_TYPE(AK_UINT16, uint16_t); SABER_TO_BASE_TYPE(AK_UINT32, uint32_t); SABER_TO_BASE_TYPE(AK_BOOL, bool); SABER_TO_BASE_TYPE(AK_STRING, std::string); +SABER_TO_BASE_TYPE(AK_BM, bm_device_mem_t); template struct DataTypeRecover { @@ -69,6 +71,7 @@ BASE_TYPE_TO_SABER(uint8_t, AK_UINT8); BASE_TYPE_TO_SABER(uint32_t, AK_UINT32); BASE_TYPE_TO_SABER(bool, AK_BOOL); BASE_TYPE_TO_SABER(std::string, AK_STRING); +BASE_TYPE_TO_SABER(bm_device_mem_t, AK_BM); template struct TypeWarpper { @@ -96,6 +99,7 @@ ANAKIN_TO_TYPE_ID(long long, anakin_int64) ANAKIN_TO_TYPE_ID(unsigned long long, anakin_uint64) ANAKIN_TO_TYPE_ID(bool, anakin_bool) ANAKIN_TO_TYPE_ID(std::string, anakin_string) +ANAKIN_TO_TYPE_ID(bm_device_mem_t, anakin_bm) /// unique type tensor /// ANAKIN_TO_TYPE_ID(tensor, anakin_tensor) @@ -133,6 +137,11 @@ ANAKIN_TO_TYPE_ID(Enum, anakin_tuple_enum) ANAKIN_PBLOCK_TO_TYPE_ID(float, ARM, anakin_block_float) #endif +#ifdef USE_BM + 
ANAKIN_PBLOCK_TO_TYPE_ID(bm_device_mem_t, BM, anakin_block_float) +#endif + + template struct type_id { typedef T type; diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt index 415497b0c..440d1de07 100644 --- a/saber/CMakeLists.txt +++ b/saber/CMakeLists.txt @@ -56,7 +56,7 @@ if(USE_CUDA) # set select arch for cuda add_subdirectory(${ANAKIN_SABER}/funcs/impl/cuda/base) - set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) + set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) set(CMAKE_CXX_FLAGS "") if(BUILD_SHARED) CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) @@ -66,15 +66,41 @@ if(USE_CUDA) endif() set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP}) - set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} - ${BEGIN_WHOLE_ARCHIVE} - ${ANAKIN_SABER_SASS_STATIC_LIB} - ${WHOLE_ARCHIVE_END}) + set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} + ${BEGIN_WHOLE_ARCHIVE} + ${ANAKIN_SABER_SASS_STATIC_LIB} + ${WHOLE_ARCHIVE_END}) endif() +if(USE_BM) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/bm "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/bm "cpp" ANAKIN_SABER_BASE_SRC) + + # set root + set(BM_BASE_CODE_ROOT ${ANAKIN_SABER}/funcs/impl/bm/base) + # set select arch for cuda + add_subdirectory(${ANAKIN_SABER}/funcs/impl/bm/base) + + set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) + set(CMAKE_CXX_FLAGS "") + if(BUILD_SHARED) + CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) + endif() + if(BUILD_STATIC) + CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) + endif() + set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP}) + + set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} + ${BEGIN_WHOLE_ARCHIVE} + ${ANAKIN_SABER_BM_STATIC_LIB} + ${WHOLE_ARCHIVE_END}) +endif() + + # add saber library to static if(UNIX OR APPLE) - ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} 
${ANAKIN_SABER_BASE_SRC}) + ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BM_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) #$) if(USE_X86_PLACE) message(STATUS ${ANAKIN_SABER_DEPENDENCIES}) diff --git a/saber/core/common.h b/saber/core/common.h index 9e1bdd89d..2e7cd2650 100644 --- a/saber/core/common.h +++ b/saber/core/common.h @@ -146,3 +146,17 @@ const char* cudnn_get_errorstring(cudnnStatus_t status); #endif //ANAKIN_SABER_CORE_COMMON_H +#ifdef USE_BM + +#include "bmlib_runtime.h" +#include "bmdnn_api.h" +#include "bmlib_utils.h" + +#define BMDNN_CHECK(condition) \ + do { \ + bm_status_t error = condition; \ + CHECK_EQ(error, BM_SUCCESS) << " Failed with error code:" << error; \ + } while (0) + +#endif // USE_BM + diff --git a/saber/core/impl/bm/bm_device.cpp b/saber/core/impl/bm/bm_device.cpp new file mode 100644 index 000000000..c89045dcf --- /dev/null +++ b/saber/core/impl/bm/bm_device.cpp @@ -0,0 +1,24 @@ +#include "core/device.h" +namespace anakin{ + +namespace saber{ + +template <> +void Device::create_stream() { + // todo + LOG(WARNING) << "BM create_stream is not implemented"; +} + +template <> +void Device::get_info() { + // todo + LOG(WARNING) << "BM get_info is not implemented"; +} + +template void Device::get_info(); +template void Device::create_stream(); + + +} //namespace saber + +} //namespace anakin diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp new file mode 100644 index 000000000..3ff30773a --- /dev/null +++ b/saber/core/impl/bm/bm_impl.cpp @@ -0,0 +1,89 @@ +#include "core/tensor.h" +#include "env.h" + +#include "bmlib_runtime.h" +#include "bmdnn_api.h" +#include "bmlib_utils.h" + +#ifdef USE_BM +const char* bmdnn_get_errorstring(bm_status_t error) { + switch (error) { + case BM_SUCCESS: + return "BM API call correct"; + case BM_ERR_FAILURE: + return "BM API fail to return"; + case BM_ERR_TIMEOUT: + return "BM API time out"; + case BM_ERR_PARAM: + return "BM API 
invalid parameter"; + case BM_ERR_NOMEM: + return "BM API insufficient memory"; + case BM_ERR_DATA: + return "BM API invalid data"; + case BM_ERR_BUSY: + return "BM device is busy"; + case BM_NOT_SUPPORTED: + return "BM unsupported operate"; + } + return "Unknown bmdnn status"; +} +#endif + +namespace anakin{ + +namespace saber{ + +#ifdef USE_BM + +typedef TargetWrapper BM_API; + +static bm_handle_t handle; + +void BM_API::get_device_count(int &count) { + BMDNN_CHECK(bm_dev_getcount(&count)); +} + +void BM_API::set_device(int id){ + //(bm_handle_t &handle, bool bmkernel_used, int id){ + BMDNN_CHECK(bm_dev_request(&handle, 0, id)); +} + +//TODO: Do we have this functionality? +int BM_API::get_device_id(){ + return 0; +} + +void BM_API::mem_alloc(void** ptr, size_t n){ + //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n) + bm_device_mem_t mem = bm_mem_from_system(ptr); + BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n)); +} + +void BM_API::mem_free(void* ptr){ + //(bm_handle_t handle, bm_device_mem_t mem){ + if(ptr != nullptr){ + bm_free_device(handle, bm_mem_from_system(ptr)); + } +} + +void BM_API::mem_set(void* ptr, int value, size_t n){ + //(bm_handle_t handle, const int value, bm_device_mem_t mem){ + BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr))); +} + +//! target wrapper +template struct TargetWrapper; + +//! BM Buffer +template class Buffer; + +//! 
BM Tensor +INSTANTIATE_TENSOR(BM, AK_BM, NCHW); + +template struct Env; + +#endif //USE_BM + +} //namespace saber + +} //namespace anakin diff --git a/saber/core/target_traits.h b/saber/core/target_traits.h index 9c1d06d95..b4eb38ff0 100644 --- a/saber/core/target_traits.h +++ b/saber/core/target_traits.h @@ -27,6 +27,7 @@ struct __cuda_device{}; struct __arm_device{}; struct __amd_device{}; struct __x86_device{}; +struct __bm_device{}; struct __HtoD{}; struct __HtoH{}; @@ -69,6 +70,12 @@ struct TargetTypeTraits { typedef __amd_device target_type; }; +template <> +struct TargetTypeTraits { + typedef __device_target target_category; + typedef __bm_device target_type; +}; + } //namespace saber } //namespace anakin diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 778491505..6d5d6a8d1 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -360,6 +360,64 @@ struct TargetWrapper { #endif //USE_CUDA +#ifdef USE_BM + /** + * \brief for Bitmain sophon device target only, device target is BM tpu + * use bitmain api to manage memory + * support device to device, device to host, host to device memcpy +*/ +template <> +struct TargetWrapper { + typedef void* event_t; + typedef void* stream_t; + + static void get_device_count(int& count); + + static void set_device(int id); + + //We should add strategy to avoid malloc directly + static void mem_alloc(void** ptr, size_t n); + + //template + static void mem_free(void * ptr); + + //template + static void mem_set(void* ptr, int value, size_t n); + + // brief create event, empty function for bitmain target + static void create_event(event_t& event, bool flag = false) {} + static void destroy_event(event_t& event) {} + static void create_stream(stream_t& stream) {} + static void create_stream_with_flag(stream_t& stream, unsigned int flag) {} + static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority) {} + static void destroy_stream(stream_t& stream) {} 
+ static void record_event(event_t& event, stream_t stream) {} + static void query_event(event_t& event) {} + static void sync_event(event_t& event) {} + static void sync_stream(event_t& event, stream_t& stream) {} + // brief create event, empty function for bitmain target + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoD); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __HtoD); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoH); + + static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ + int src_dev, size_t count); + + /** + * \brief device target return currently used device id + * @return currently activated device id + */ + static int get_device_id(); +}; + +#endif //USE_BM + } //namespace saber } //namespace anakin diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 2e64dcdec..046fef53c 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -362,6 +362,100 @@ Context ctx) { } #endif + +/*#ifdef USE_BM + +template<> +SaberStatus +DataTensorTransformHelper::convert_weights, + Tensor >(Tensor& out_tensor, + const Tensor& in_tensor, +Context ctx) { + int input_channel = in_tensor.channel(); + int output_channel = out_tensor.shape()[1]; + // LOG(INFO)<<"input_channel = "< +SaberStatus +DataTensorTransformHelper::convert_bias, + Tensor >(Tensor& out_tensor, + const Tensor& in_tensor, +Context ctx) { + unsigned long weight_size = _vector_weight_scale.size(); + unsigned long bias_size = in_tensor.size(); + CHECK_GT(_in_scale, 0); + CHECK_GT(weight_size, 0); + CHECK_EQ(bias_size, weight_size); + + const float* in_data = in_tensor.data(); + float* out_data = out_tensor.mutable_data(); + + for (int i = 0; i < bias_size; ++i) { + out_data[i] = in_data[i] / _in_scale / _vector_weight_scale[i]; + } + + return SaberSuccess; +} +#endif*/ + } //namespace saber } 
//namespace anakin diff --git a/saber/funcs/CMakeLists.txt b/saber/funcs/CMakeLists.txt index deaf76eab..bdd319f13 100644 --- a/saber/funcs/CMakeLists.txt +++ b/saber/funcs/CMakeLists.txt @@ -10,6 +10,10 @@ if(USE_CUDA) #FILE(GLOB CUDA_BASE_SRCS "cuda/*.cpp" "cuda/*.cu") aux_source_directory(impl/cuda CUDA_BASE_SRCS) endif() +if(USE_BM) + #FILE(GLOB BM_BASE_SRCS "cuda/*.cpp" "cuda/*.cu") + aux_source_directory(impl/bm BM_BASE_SRCS) +endif() if(USE_AMD) #FILE(GLOB CUDA_BASE_SRCS "cuda/*.cpp" "cuda/*.cu") aux_source_directory(impl/amd AMD_BASE_SRCS) @@ -48,6 +52,14 @@ foreach(SRC_NAME ${CUDA_BASE_SRCS}) list(APPEND DIR_SRCS_CUR "${CMAKE_CURRENT_SOURCE_DIR}/${FILE_NAME}") endforeach() +foreach(SRC_NAME ${BM_BASE_SRCS}) + #unpack the dir "/" + string(REPLACE "./" "" FILE_NAME ${SRC_NAME}) + string(REPLACE " " "" FILE_NAME ${FILE_NAME}) + #string(REPLACE ".cpp" ".cpp;" FILE_NAME ${FILE_NAME}) + list(APPEND DIR_SRCS_CUR "${CMAKE_CURRENT_SOURCE_DIR}/${FILE_NAME}") +endforeach() + foreach(SRC_NAME ${X86_BASE_SRCS}) #unpack the dir "/" string(REPLACE "./" "" FILE_NAME ${SRC_NAME}) diff --git a/saber/funcs/impl/bm/base/CMakeLists.txt b/saber/funcs/impl/bm/base/CMakeLists.txt new file mode 100644 index 000000000..fd4b3d680 --- /dev/null +++ b/saber/funcs/impl/bm/base/CMakeLists.txt @@ -0,0 +1,20 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016 Baidu.com, Inc. 
All Rights Reserved +# @file CMakeLists files in the saber subdirectory for nvidia gpu code +# @auther cuichaowen +# @date 2017-11-29 +# ---------------------------------------------------------------------------- + +if(USE_BM) + anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/include "h" ANAKIN_SABER_BM_C_SRC) + anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "o" ANAKIN_SABER_BM_STATIC_LIB) +endif() + +macro(anakin_set_upscope src) + set(${src} ${${src}} PARENT_SCOPE) +endmacro() + +if(USE_BM) + anakin_set_upscope(ANAKIN_SABER_BM_C_SRC) + anakin_set_upscope(ANAKIN_SABER_BM_STATIC_LIB) +endif() diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h new file mode 100644 index 000000000..97feb1972 --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h @@ -0,0 +1,814 @@ +#ifndef BMDNN_API_H +#define BMDNN_API_H + +#include "bmdnn_runtime.h" +#include "op_code.h" + +#if defined (__cplusplus) +extern "C" { +#endif + +/* + * All the name-style of input/output are in the viewpoint of forward operation + */ + +typedef struct kernel_param{ + int g; + int oc; + int ic; + int h; + int w; +}bm_kernel_param_t; + +typedef struct bm_conv_param{ + int stride_h; + int stride_w; + int pad_h; + int pad_w; + int dilation_h; + int dilation_w; + bool result_add; +}bm_conv_param_t; + +typedef struct bm_pool_param{ + int stride_h; + int stride_w; + int pad_h; + int pad_w; + int kh; + int kw; + bool is_avg_pooling; +}bm_pool_param_t; + +bm_status_t bmdnn_conv_relu_pool_forward( + bm_handle_t handle, + bm_device_mem_t input, + bm_device_mem_t weight, + bm_device_mem_t bias, + bm_tensor_4d_t input_shape, + bm_kernel_param_t kernel_param, + bm_pool_param_t pool_param, + bm_conv_param_t conv_param, + bool with_bias, + bm_device_mem_t output); + +bm_status_t bmdnn_conv_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t weight, + bm_device_mem_t bias, 
+ bm_tensor_4d_t input_shape, + bm_kernel_param_t kernel_param, + bm_tensor_4d_t output_shape, + bm_conv_param_t conv_param, + bool with_bias, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_deconv_forward( + bm_handle_t handle, + bm_device_mem_t input, + bm_device_mem_t weight, + bm_device_mem_t bias, + bm_tensor_4d_t input_shape, + bm_kernel_param_t kernel_param, + bm_tensor_4d_t output_shape, + bm_conv_param_t conv_param, + bool with_bias, + bm_device_mem_t output); + +bm_status_t bmdnn_conv_backward_bias( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + int input_n, + int input_c, + int input_h, + int input_w, + int groups, + int output_c, + int kh, + int kw, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int result_add, + //output + bm_device_mem_t bias_diff); + +bm_status_t bmdnn_pooling_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_c, + int input_h, + int input_w, + int kh, + int kw, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int is_avg_pooling, + //output + bm_device_mem_t output + ); +bm_status_t bmdnn_upsample_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_c, + int input_h, + int input_w, + int size, + //output + bm_device_mem_t output + ); +bm_status_t bmdnn_roi_pooling_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t rois, + int input_n, + int input_c, + int input_h, + int input_w, + int pooled_h, + int pooled_w, + int roi_num, + int spatial_scale, + //output + bm_device_mem_t output + ); + +bm_status_t bmdnn_fc_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t weight, + bm_device_mem_t bias, + int batch_size, + int num_output_neuron, + int num_input_neuron, + int transpose, + int using_bias, + int using_relu, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_fc_backward( + bm_handle_t handle, + //input + bm_device_mem_t 
output_diff, + bm_device_mem_t input, + bm_device_mem_t weight, + int num_output_neuron, + int batch_size, + int num_input_neuron, + int using_bias, + int propagate_down_bias_diff, + int propagate_down_weight_diff, + int propagate_down_bottom, + //output + bm_device_mem_t weight_diff, + bm_device_mem_t bias_diff, + bm_device_mem_t input_diff); + +bm_status_t bmdnn_dropout_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + float dropout_ratio, + int input_n, + int input_dim, + //output + bm_device_mem_t output, + bm_device_mem_t mask); + +bm_status_t bmdnn_dropout_backward( + bm_handle_t handle, + //input + bm_device_mem_t input, + float dropout_ratio, + int input_n, + int input_dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_batchnorm_forward_inference( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t mean_ma, + bm_device_mem_t variance_ma, + float scale_ma, + bm_device_mem_t variance, + float eps, + int input_n, + int input_c, + int input_h, + int input_w, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_batchnorm_forward_train( + bm_handle_t handle, + //input + bm_device_mem_t input, + float ma_fraction, + float eps, + int input_n, + int input_c, + int input_h, + int input_w, + //output + bm_device_mem_t output, + bm_device_mem_t mean, + bm_device_mem_t variance, + bm_device_mem_t mean_ma, + bm_device_mem_t variance_ma); + +bm_status_t bmdnn_batchnorm_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t output, + bm_device_mem_t variance, + int input_n, + int input_c, + int input_h, + int input_w, + int using_global_stats, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_lrn_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_c, + int input_h, + int input_w, + int lrn_n, + float alpha, + float beta, + float k, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_lrn_backward( + bm_handle_t handle, 
+ //input + bm_device_mem_t output_diff, + bm_device_mem_t output, + bm_device_mem_t input, + int lrn_n, + float alpha, + float beta, + float k, + int input_n, + int input_c, + int input_h, + int input_w, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_relu_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + float negative_slope, + int input_n, + int input_dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_relu_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t input, + float negative_slope, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_sigmoid_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_sigmoid_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t output, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_tanh_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_tanh_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t output, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_softmax_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_c, + int input_inner_dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_softmax_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t output, + int input_n, + int input_c, + int input_inner_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_softmax_loss_forward( + bm_handle_t handle, + bm_device_mem_t input, + bm_device_mem_t label, + float normalizer, + int input_n, + int input_c, + int input_inner_dim, + bm_device_mem_t 
output, + bm_device_mem_t loss); +bm_status_t bmdnn_interp_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_c, + int input_h, + int input_w, + int pad_bag, + int pad_end, + int output_h, + int output_w, + //output + bm_device_mem_t output + ); +bm_status_t bmdnn_softmax_loss_backward( + bm_handle_t handle, + bm_device_mem_t output, + bm_device_mem_t label, + bm_device_mem_t loss, + float normalizer, + int input_n, + int input_c, + int input_inner_dim, + bm_device_mem_t input_diff); + +bm_status_t bmdnn_softmax_loss_bidirection( + bm_handle_t handle, + bm_device_mem_t input, + bm_device_mem_t label, + float normalizer, + int input_n, + int input_c, + int input_inner_dim, + bm_device_mem_t output_diff, + bm_device_mem_t loss); + +bm_status_t bmdnn_multiregion_forward_parallel( + bm_handle_t handle, + //input + bm_device_mem_t* input, + int* input_n, + int* input_c, + int* input_h, + int* input_w, + int input_num, + int classes, + int coords, + int nums, + int* Activate_parm, + //output + bm_device_mem_t* output +); + +bm_status_t bmdnn_accuracy( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t label_idx, + bm_device_mem_t input_mem_buffer, + int input_num, + int input_dim, + int top_k, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_coeff_update_sgd( + bm_handle_t handle, + bm_device_mem_t weight_diff, + bm_device_mem_t weight, + bm_device_mem_t history_weight, + int weight_count, + float base_lr, + float momentum, + float weight_decay); + +bm_status_t bmdnn_fc_backward_sgd( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t input, + //input and output + bm_device_mem_t weight, + bm_device_mem_t weight_history, + int num_output_neuron, + int batch_size, + int num_input_neuron, + int using_bias, + int propagate_down_bias_diff, + int propagate_down_weight_diff, + int propagate_down_bottom, + float base_lr, + float momentum, + float weight_decay, + //output + 
bm_device_mem_t bias_diff, + bm_device_mem_t input_diff); + +bm_status_t bmdnn_permute( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_c, + int input_h, + int input_w, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_normalize_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t scale, + float eps, + float scale_val, + bool across_spatial, + bool channel_share, + int input_n, + int input_c, + int input_h, + int input_w, + //output + bm_device_mem_t output); + +/* + * MD Operations for user + */ + + +bm_status_t bmdnn_md_scalar( + bm_handle_t handle, + //input + bm_device_mem_t tensor_A, + bm_device_mem_t tensor_B, + int input_n, + int input_c, + int input_h, + int input_w, + ALIGN_TENSOR_OP align_tensor_op, + int result_add, + int A_is_constant, + int B_is_constant, + float A_const_val, + float B_const_val, + int B_N_is_1, + int B_index_is_1, + //output + bm_device_mem_t tensor_R); + +bm_status_t bmdnn_md_cmp( + bm_handle_t handle, + //input + bm_device_mem_t tensor_A, + bm_device_mem_t tensor_B, + bm_device_mem_t tensor_C, + bm_device_mem_t tensor_D, + int input_n, + int input_c, + int input_h, + int input_w, + int A_is_constant, + int B_is_constant, + int C_is_constant, + int D_is_constant, + float A_constant, + float B_constant, + unsigned int C_constant, + unsigned int D_constant, + int result_skip, + //output + bm_device_mem_t tensor_Y, + bm_device_mem_t tensor_R); + +bm_status_t bmdnn_md_sfu( + bm_handle_t handle, + //input + bm_device_mem_t tensor_A, + int input_n, + int input_c, + int input_h, + int input_w, + SFU_OP sfu_op, + float a, + int n, + //output + bm_device_mem_t tensor_Y); + +bm_status_t bmdnn_md_sum( + bm_handle_t handle, + //input + bm_device_mem_t tensor_A, + int input_n, + int input_c, + int input_h, + int input_w, + int result_add, + //output + bm_device_mem_t tensor_Y); + + +bm_status_t bmdnn_md_linear( + bm_handle_t handle, + //input + bm_device_mem_t tensor_A, + 
bm_device_mem_t tensor_B, + bm_device_mem_t tensor_S, + int input_n, + int input_c, + int input_h, + int input_w, + LINEAR_OP linear_op, + int result_add, + int B_is_const, + int S_is_const, + float B_const_val, + float S_const_val, + //output + bm_device_mem_t tensor_Y); + +bm_status_t bmdnn_img_sum( + bm_handle_t handle, + //input + bm_device_mem_t tensor_A, + int input_n, + int input_c, + int input_h, + int input_w, + int result_add, + //output + bm_device_mem_t tensor_Y); + +/* + * fullnet mode + */ +bm_status_t bmdnn_fullnet( + bm_handle_t handle, + unsigned long long bdc_cmd_offset, + unsigned long long gdma_cmd_offset, + unsigned long long cdma_cmd_offset, + unsigned long long cmd_num_offset + ); + +/* + * multiple fullnet mode + */ +bm_status_t bmdnn_multi_fullnet( + bm_handle_t handle, + int input_num, + unsigned long long* user_input_global_offset, + unsigned long long* cmd_input_global_offset, + int* input_tensor_size, + int output_num, + unsigned long long* user_output_global_offset, + unsigned long long* cmd_output_global_offset, + int* output_tensor_size, + unsigned long long bdc_cmd_offset, + unsigned long long gdma_cmd_offset, + unsigned long long cdma_cmd_offset, + int* bdc_cmd_num, + int* gdma_cmd_num, + int* cdma_cmd_num, + int cmdgroup_num + ); + +/* + * dynamic fullnet mode + */ +bm_status_t bmdnn_dynamic_fullnet( + bm_handle_t handle, + unsigned long long compiled_ir_global_addr, + unsigned int compiled_ir_length, + unsigned int batch_num, + unsigned int input_num, + unsigned long long* input_global_offset, + unsigned int* input_height, + unsigned int* input_width, + unsigned int output_num, + unsigned long long* output_global_offset, + unsigned long long apd_ctx_mem_offset +#if defined(USING_CMODEL) && !defined(USING_FULLNET) + ,float** p_refer_result +#endif + ); + +/** + * Depthwise convolution. 
+ */ +bm_status_t bmdnn_depthwise_forward( + bm_handle_t handle, + bm_device_mem_t input, + bm_device_mem_t weight, + bm_device_mem_t bias, + int input_n, + int input_c, + int input_h, + int input_w, + int kernel_h, + int kernel_w, + int dilation_h, + int dilation_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int using_bias, + bm_device_mem_t output); + +bm_status_t bmdnn_lstm_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t cont, + bm_device_mem_t input_static, + /*bm_device_mem_t w_hc, + bm_device_mem_t w_xc,*/ + bm_device_mem_t w_hxc, + bm_device_mem_t w_xstatic, + bm_device_mem_t b_c, + bm_device_mem_t h_0, + bm_device_mem_t c_0, + int input_n, + int seq_len, + int input_dim, + int input_static_dim, + int output_dim, + int with_input_static, + //output + bm_device_mem_t c, + bm_device_mem_t gate, + bm_device_mem_t h_T, + bm_device_mem_t c_T, + bm_device_mem_t h); + +bm_status_t bmdnn_netease_ocr_forward( + bm_handle_t handle, + //input + bm_device_mem_t conv1_ifmap, + bm_device_mem_t params, + bm_device_mem_t result); + +typedef struct dim4_s { + int n, c, h, w; +} dim4_t; +enum +{ + CONV_DEPTHWISE, + CONV_3D +}; +typedef struct mobilenet_conv_param_s +{ + /** convolution. */ + int type; + bm_device_mem_t kernel; + bm_device_mem_t bias; + dim4_t kernel_shape; + int dilation_h, dilation_w; + int pad_h, pad_w; + int stride_h, stride_w; + bool using_bias; + /** batchnorm. */ + bm_device_mem_t mean; + bm_device_mem_t variance; + /** relu. 
*/ + float slope; +} mobilenet_conv_param_t; +bm_status_t bmdnn_mobilenet_forward( + bm_handle_t handle, + const mobilenet_conv_param_t *conv, + int num, + const dim4_t &input_shape, + const bm_device_mem_t &input_global_mem, + dim4_t &output_shape, + bm_device_mem_t &output_global_mem, + float parallel_performance_factor = 1.f); + +bm_status_t bmdnn_conv_forward_bank_conflict( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t weight, + bm_device_mem_t bias, + bm_tensor_4d_t input_shape, + bm_kernel_param_t kernel_param, + bm_tensor_4d_t output_shape, + bm_conv_param_t conv_param, + bool with_bias, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_pooling_forward_bank_conflict( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_c, + int input_h, + int input_w, + int kh, + int kw, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int is_avg_pooling, + bm_device_mem_t output); + +bm_status_t bmdnn_fc_forward_bank_conflict( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t weight, + bm_device_mem_t bias, + int batch_size, + int num_output_neuron, + int num_input_neuron, + int transpose, + int using_bias, + int using_relu, + bm_device_mem_t output); + +bm_status_t bmdnn_conv_forward_power_evaluation( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t weight, + bm_device_mem_t bias, + bm_tensor_4d_t input_shape, + bm_kernel_param_t kernel_param, + bm_tensor_4d_t output_shape, + bm_conv_param_t conv_param, + bool with_bias, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_img_scale( + bm_handle_t handle, bm_device_mem_t dst, bm_device_mem_t src, int n, + int c, int dh, int sh, int dw, int sw); + +#if defined (__cplusplus) +} +#endif + +#endif /* BMDNN_API_H */ diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h new file mode 100644 index 000000000..384cd4108 --- 
/dev/null +++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h @@ -0,0 +1,438 @@ +#ifndef BMDNN_EXT_API_H +#define BMDNN_EXT_API_H + +#include "bmdnn_runtime.h" + +#if defined (__cplusplus) +extern "C" { +#endif + +bm_status_t bmdnn_threshold_forward( + bm_handle_t handle, + float threshold, + //input + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t output + ); + +bm_status_t bmdnn_exp_forward( + bm_handle_t handle, + float base, + float input_scale, + float input_shift, + //input + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t output + ); + +bm_status_t bmdnn_exp_backward( + bm_handle_t handle, + float base, + float input_scale, + float input_shift, + //input + bm_device_mem_t output_diff, + bm_device_mem_t output, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff + ); + +bm_status_t bmdnn_power_forward( + bm_handle_t handle, + float power_, + float scale_, + float shift_, + //input + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t output + ); + +bm_status_t bmdnn_power_backward( + bm_handle_t handle, + float power_, + float scale_, + float shift_, + //input + bm_device_mem_t output_diff, + bm_device_mem_t output, + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff + ); + +bm_status_t bmdnn_euclidean_loss_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t label, + bm_device_mem_t temp_, + int input_n, + int input_dim, + //output + bm_device_mem_t diff, + bm_device_mem_t loss); + +bm_status_t bmdnn_euclidean_loss_backward( + bm_handle_t handle, + float alpha, + //input + bm_device_mem_t output, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_silence_backward( + bm_handle_t handle, + //input + //bm_device_mem_t output_data, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + 
+bm_status_t bmdnn_lstm_unit_forward( + bm_handle_t handle, + //input + bm_device_mem_t X_i, + bm_device_mem_t X_f, + bm_device_mem_t X_o, + bm_device_mem_t X_g, + bm_device_mem_t C_prev, + bm_device_mem_t cont_expand, + int num, + int hidden_dim, + //output + bm_device_mem_t C, + bm_device_mem_t H); + +bm_status_t bmdnn_lstm_unit_backward( + bm_handle_t handle, + //input + bm_device_mem_t C_diff, + bm_device_mem_t H_diff, + bm_device_mem_t X_i, + bm_device_mem_t X_f, + bm_device_mem_t X_o, + bm_device_mem_t X_g, + bm_device_mem_t C_prev, + bm_device_mem_t C, + bm_device_mem_t cont_expand, + int num, + int hidden_dim, + //output + bm_device_mem_t C_prev_diff, + bm_device_mem_t X_i_diff, + bm_device_mem_t X_f_diff, + bm_device_mem_t X_o_diff, + bm_device_mem_t X_g_diff); + +bm_status_t bmdnn_eltwise_forward( + bm_handle_t handle, + int op_, + int flag_first, + float coeffs_, + int index, + //input + bm_device_mem_t input, + bm_device_mem_t target, + int input_n, + int input_dim, + //output + bm_device_mem_t mask_data, + bm_device_mem_t output); + +bm_status_t bmdnn_eltwise_backward( + bm_handle_t handle, + int op_, + int flag_first, + float coeffs_, + int index, + //input + bm_device_mem_t output_data, + bm_device_mem_t output_diff, + bm_device_mem_t input_data, + bm_device_mem_t mask_data, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_bias_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t bias, + int outer_dim, + int dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_bias_backward( + bm_handle_t handle, + int flag, + //input + bm_device_mem_t output_diff, + int outer_dim, + int bias_dim, + int inner_dim, + //output + bm_device_mem_t input_diff, + bm_device_mem_t bias_diff); + +bm_status_t bmdnn_log_forward( + bm_handle_t handle, + float scale, + float shift, + float base, + //input + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t 
output); + +bm_status_t bmdnn_log_backward( + bm_handle_t handle, + float scale, + float shift, + float base, + //input + bm_device_mem_t output_diff, + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_absval_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_absval_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_sigmoid_cross_entropy_loss_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t target, + bm_device_mem_t buffer, + int input_n, + int input_dim, + //output + bm_device_mem_t output, + bm_device_mem_t loss); + +bm_status_t bmdnn_sigmoid_cross_entropy_loss_backward( + bm_handle_t handle, + //input + bm_device_mem_t output, + bm_device_mem_t target, + bm_device_mem_t output_diff, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_contrastive_loss_forward( + bm_handle_t handle, + //input + bm_device_mem_t input_0, + bm_device_mem_t input_1, + bm_device_mem_t label, + bm_device_mem_t buffer, + int input_n, + int input_c, + float margin, + bool legacy_version, + //output + bm_device_mem_t diff, + bm_device_mem_t dist_sq, + bm_device_mem_t loss); + +bm_status_t bmdnn_contrastive_loss_backward( + bm_handle_t handle, + //input + bm_device_mem_t label, + bm_device_mem_t diff, + bm_device_mem_t dist_sq, + bm_device_mem_t output_diff, + bm_device_mem_t buffer, + int input_n, + int input_dim, + float margin, + bool legacy_version, + int propagate_down_flag, + //output + bm_device_mem_t input_diff_0, + bm_device_mem_t input_diff_1); + +bm_status_t bmdnn_filter_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t filter, + int input_n, + int output_n, 
+ int input_dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_filter_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t filter, + int input_n, + int output_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_split_backward( + bm_handle_t handle, + //input + int is_first, + bm_device_mem_t output_diff, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_bnll_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_bnll_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t input, + float threshold, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +bm_status_t bmdnn_prelu_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t slope, + float slope0, + int channel_shared, + int input_n, + int input_c, + int input_h, + int input_w, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_prelu_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t input, + bm_device_mem_t slope, + int propagate_down_flag, + int channel_shared, + int input_n, + int input_c, + int input_h, + int input_w, + //output + bm_device_mem_t slope_diff, + bm_device_mem_t input_diff); + +bm_status_t bmdnn_scale_forward( + bm_handle_t handle, + //input + bm_device_mem_t input, + bm_device_mem_t scale, + int input_n, + int input_c, + int input_h, + int input_w, + int scale_dim, + int inner_dim, + int scale_is_neuron, + //output + bm_device_mem_t scale_extension, + bm_device_mem_t output); + +bm_status_t bmdnn_scale_backward( + bm_handle_t handle, + //input + bm_device_mem_t output_diff, + bm_device_mem_t input_data, + bm_device_mem_t scale_extension, + int propagate_down_flag, + int input_n, + int input_c, + int input_h, + int input_w, + int 
scale_dim, + int inner_dim, + int scale_is_neuron, + //output + bm_device_mem_t scale_diff, + bm_device_mem_t input_diff); + +bm_status_t bmdnn_elu_forward( + bm_handle_t handle, + float alpha, + //input + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t output); + +bm_status_t bmdnn_elu_backward( + bm_handle_t handle, + float alpha, + //input + bm_device_mem_t output_diff, + bm_device_mem_t output, + bm_device_mem_t input, + int input_n, + int input_dim, + //output + bm_device_mem_t input_diff); + +#if defined (__cplusplus) +} +#endif + +#endif /* BMDNN_EXT_API_H */ diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h new file mode 100644 index 000000000..6fede1338 --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h @@ -0,0 +1,20 @@ +#ifndef BMDNN_RUNTIME_H_ +#define BMDNN_RUNTIME_H_ + +#include "bmlib_runtime.h" + +#if defined (__cplusplus) +extern "C" { +#endif + +bm_status_t bmdnn_init( + bm_handle_t *handle); + +void bmdnn_deinit( + bm_handle_t handle); + +#if defined (__cplusplus) +} +#endif + +#endif diff --git a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h new file mode 100644 index 000000000..f85846a8a --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h @@ -0,0 +1,62 @@ +#ifndef OP_CODE_H_ +#define OP_CODE_H_ + + +typedef enum align_tensor_op { + ALIGN_TENSOR_ADD, + ALIGN_TENSOR_SUB, + ALIGN_TENSOR_MUL, + ALIGN_TENSOR_DIV, + TENSOR_INVALID +} ALIGN_TENSOR_OP; + +typedef enum linear_op { + LINEAR_MAC, + LINEAR_ADD_SQR, + LINEAR_SUB_SQR +} LINEAR_OP; + +typedef enum sfu_op { + SFU_XN, + SFU_EX, + SFU_LNX, + SFU_RSQ, + SFU_INVALID +} SFU_OP; +typedef struct tensor_4d_t { + int n; + int c; + int h; + int w; +}bm_tensor_4d_t; + + +#define TENSOR_ADD 0 +#define TENSOR_SUB 1 +#define TENSOR_MUL 2 +//Note the div should be implmented by KAMAKE algorithm +#define 
TENSOR_DIV 3 +#define TENSOR_MAX 4 +#define TENSOR_CPY 5 +#define TENSOR_MAC 6 + +#define TENSOR_N_DIM 0 +#define TENSOR_C_DIM 1 +#define TENSOR_H_DIM 2 +#define TENSOR_W_DIM 3 + +#define SHARE_REG_MESSAGE_WP 0 +#define SHARE_REG_MESSAGE_RP 1 +#define SHARE_REG_MESSAGE_IRQSTATUS 2 +#define SHARE_REG_CDMA_IRQSTATUS 3 + +#define SHAREMEM_MSG_FIXED_OFFSET (8192) +#define SHAREMEM_SIZE_BIT 8 +#define SHAREMEM_MASK ((1< +#include + +#if !defined(__x86_64__) && !defined(__aarch64__) +#error "BM needs 64-bit to compile" +#endif + +#if defined (__cplusplus) +extern "C" { +#endif + +typedef enum { + BM_SUCCESS = 0, + BM_ERR_DEVNOTREADY = 1, /* Device not ready yet */ + BM_ERR_FAILURE = 2, /* General failure */ + BM_ERR_TIMEOUT = 3, /* Timeout */ + BM_ERR_PARAM = 4, /* Parameters invalid */ + BM_ERR_NOMEM = 5, /* Not enough memory */ + BM_ERR_DATA = 6, /* Data error */ + BM_ERR_BUSY = 7, /* Busy */ + BM_ERR_NOFEATURE = 8, /* Not supported yet */ + BM_NOT_SUPPORTED = 9 +} bm_status_t; + +typedef enum { + BM_MEM_TYPE_DEVICE = 0, + BM_MEM_TYPE_HOST = 1, + BM_MEM_TYPE_SYSTEM = 2, + BM_MEM_TYPE_INT8_DEVICE = 3, + BM_MEM_TYPE_INVALID = 4 +} bm_mem_type_t; + +#define BM_MEM_ADDR_NULL (0xfffffffff) + +typedef struct bm_mem_desc { + unsigned char desc[16]; +} bm_mem_desc_t; + +struct bm_context; +typedef struct bm_context * bm_handle_t; +typedef struct bm_mem_desc bm_device_mem_t; +typedef struct bm_mem_desc bm_host_mem_t; +typedef struct bm_mem_desc bm_system_mem_t; + +#define BM_CHECK_RET(call) \ + do { \ + bm_status_t ret = call; \ + if ( ret != BM_SUCCESS ) { \ + printf("BM_CHECK_RET failed %d\n", ret); \ + ASSERT(0); \ + exit(-ret); \ + } \ + } while(0) + +/* + * control + */ +void bm_flush( + bm_handle_t handle); +/* + * brief malloc device memory according to a tensor shape(each neuron is 32 bits) +*/ + +bm_status_t bm_malloc_neuron_device( + bm_handle_t handle, + bm_device_mem_t *pmem, + int n, + int c, + int h, + int w); + +/* + * brief malloc device memory in size of dword(32
bits) +*/ + +bm_status_t bm_malloc_device_dword( + bm_handle_t handle, + bm_device_mem_t *pmem, + int count); + +/* + * brief malloc device memory in size of byte +*/ + +bm_status_t bm_malloc_device_byte( + bm_handle_t handle, + bm_device_mem_t *pmem, + unsigned int size); + +void bm_free_device( + bm_handle_t handle, + bm_device_mem_t mem); + +/* + * brief malloc host memory in size of byte + */ +bm_status_t bm_malloc_host( + bm_handle_t handle, + bm_host_mem_t *pmem, + unsigned int size); + +void bm_free_host( + bm_handle_t handle, + bm_host_mem_t mem); + +void *bm_host_mem_get_pointer( + bm_host_mem_t mem); + +/* + * Memory copy and set + */ +bm_status_t bm_memcpy_h2d( + bm_handle_t handle, + bm_device_mem_t dst, + bm_host_mem_t src); + +bm_status_t bm_memcpy_d2h( + bm_handle_t handle, + bm_host_mem_t dst, + bm_device_mem_t src); + + +bm_status_t bm_memcpy_s2d( + bm_handle_t handle, + bm_device_mem_t dst, + bm_system_mem_t src); + +bm_status_t bm_memcpy_d2s( + bm_handle_t handle, + bm_system_mem_t dst, + bm_device_mem_t src); + +bm_status_t bm_memcpy_d2d( + bm_handle_t handle, + bm_device_mem_t dst, + int dst_offset, + bm_device_mem_t src, + int src_offset, + int len); + +bm_status_t bm_memset_device( + bm_handle_t handle, + const int value, + bm_device_mem_t mem); + +bm_device_mem_t bm_mem_from_system( + void * system_addr); + +/* +*brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to +device mem if need_copy is true +*/ + +bm_status_t bm_mem_convert_system_to_device_neuron( + bm_handle_t handle, + struct bm_mem_desc *dev_mem, + struct bm_mem_desc sys_mem, + bool need_copy, + int n, + int c, + int h, + int w); + +/* +*brief malloc one device memory with the size of coeff_count, copy the sys_mem to +device mem if need_copy is true +*/ +bm_status_t bm_mem_convert_system_to_device_coeff( + bm_handle_t handle, + struct bm_mem_desc *dev_mem, + struct bm_mem_desc sys_mem, + bool need_copy, + int coeff_count); + +/* + * memory info get and set
+ */ +unsigned long long bm_mem_get_device_addr(struct bm_mem_desc mem); +void bm_mem_set_device_addr(struct bm_mem_desc & mem, unsigned long long addr); +unsigned int bm_mem_get_device_size(struct bm_mem_desc mem); +void bm_mem_set_device_size(struct bm_mem_desc & mem, unsigned int size); +bm_mem_type_t bm_mem_get_type(struct bm_mem_desc mem); + +/* +* brief Get the handle of bmlib_runtime +* return : If the handle has already been initialized, return the handle itself; otherwise initialize one and return it +*/ +bm_handle_t get_bm_handle(); + +/* + * Helper functions + */ + +/** +* \brief Get the number of nodechip (Constant 1 in bm1682) +* \return +* \ref NO +*/ +int bm_get_nodechip_num( + bm_handle_t handle); + +/** +* \brief Get the number of NPUs (Constant 64 in bm1682) +* \return +* \ref NO +*/ +int bm_get_npu_num( + bm_handle_t handle); +int bm_get_eu_num( bm_handle_t handle); +/** +* \brief Get a null device memory descriptor (also available as BM_MEM_NULL) +* \return +* \ref NO +*/ +bm_device_mem_t bm_mem_null(void); +#define BM_MEM_NULL (bm_mem_null()) + +bm_status_t bm_dev_getcount(int* count); +bm_status_t bm_dev_query(int devid); +bm_status_t bm_dev_request(bm_handle_t *handle, bool bmkernel_used, int devid); +void bm_dev_free(bm_handle_t handle); + +#if defined (__cplusplus) +} +#endif + +#endif /* BM_RUNTIME_H_ */ diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h new file mode 100644 index 000000000..e878343ef --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h @@ -0,0 +1,72 @@ +#ifndef BMLIB_UTILS_H +#define BMLIB_UTILS_H +#include + +/* + * Debug definitions for user app only + * Copy from common.h + * Don't include for internal usage + */ +#ifdef __cplusplus +extern "C" { +#endif + +#define UNUSED(x) (void)(x) + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +int array_cmp( + float *p_exp, + float *p_got, + int len, + const
char *info_label, + float delta); + +int tri_array_cmp( + float *p_exp, + float *p_got, + float *third_party, + int len, + const char *info_label, + float delta, + int* err_idx); + +int array_cmp_int( + int *p_exp, + int *p_got, + int len, + const char *info_label +); + +void dump_hex(char *desc, void *addr, int len); +void dump_data_float(char *desc, void *addr, int n, int c, int h, int w); +void dump_data_int(char *desc, void *addr, int n, int c, int h, int w); +void dump_matrix_float(char *desc, void *addr, int row, int col); +void dump_array_file(char * file, int row_num, int col_num, int transpose, float * parr); + +/* dump to file */ +void dump_float_tensor(const char * filename, + int length, float * dump_data); + +#ifdef __cplusplus +/* not available in C */ +void random_param( + int &n, int &c, int &h, int &w, + int &kh, int &kw, int &ph, int &pw, int &sh, int &sw, + int &oc); + +void random_conv_param( + int &n, int &ic, int &ih, int &iw, int &oc, + int &kh, int &kw, int &dh, int &dw, + int &ph, int &pw, int &sh, int &sw); +#endif + +int conv_coeff_storage_convert(float * coeff_orig, float ** coeff_reformat, unsigned int oc, unsigned int ic, unsigned int kh, unsigned int kw, unsigned int npu_num); + + +#ifdef __cplusplus +} +#endif + +#endif /* BMLIB_UTILS_H */ diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h b/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h new file mode 100644 index 000000000..f3e086f91 --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h @@ -0,0 +1,97 @@ +#ifndef __BM_BLOB_H__ +#define __BM_BLOB_H__ + +struct bm_mem_desc; +typedef struct bm_mem_desc bm_device_mem_t; +namespace bmcnn { + +typedef struct { int n, c, h, w; } Shape; + +class BMBlob +{ +public: + /** + * \brief Constructor of blob. + * + * \param shape - Shape of blob + */ + explicit BMBlob(const Shape &shape, void *handle); + /** + * \brief Destructor of blob. + */ + virtual ~BMBlob(); + /** + * \brief Reshape blob.
+ * + * \param n - Batch number of blob + * \param c - Channel number of blob + * \param h - Height of blob section + * \param w - Width of blob section + * + * \note + * (1) For now, number of channels is not allowed to be reshaped.\n + * (2) After reshaping, data in this blob is no longer valid.\n + */ + void Reshape(int n, int c, int h, int w); + /** + * \brief Get shape. + */ + inline Shape shape() const + { return shape_; } + /** + * \brief Get batch size. + */ + inline int batch_num() const + { return shape_.n; } + /** + * \brief Get number of channels. + * + * \return Channel number of the blob\n + */ + inline int channels() const + { return shape_.c; } + /** + * \brief Get height of section. + */ + int height() const + { return shape_.h; } + /** + * \brief Get width of section. + */ + int width() const + { return shape_.w; } + /** + * \brief Get read-only pointer to data in cpu. + */ + const float *cpu_data(); + /** + * \brief Get mutable pointer of data in cpu. + */ + float *mutable_cpu_data(); + /** + * \brief Get mutable pointer of memory in device. + */ + bm_device_mem_t *mutable_dev_mem(); + /** + * \brief Get read-only pointer of memory in device.
+ */ + const bm_device_mem_t *dev_mem(); +private: + BMBlob(const BMBlob &other); + BMBlob &operator=(const BMBlob &other); + + bm_device_mem_t *dev_mem_; + float *sys_data_; + Shape shape_; + int data_pos_; + int capacity_; + void *handle_; + + enum { AIR = 0x00, SYS = 0x01, DEV = 0x10 }; + void sync_s2d(); + void sync_d2s(); +}; + +} /* namespace bmcnn */ + +#endif /* __BM_BLOB_H__ */ diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h b/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h new file mode 100644 index 000000000..6b0bfe857 --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h @@ -0,0 +1,58 @@ +#ifndef __BM_CNN_CONTEXT_H__ +#define __BM_CNN_CONTEXT_H__ + +#include +#include "bmruntime.h" + +namespace bmcnn { + +typedef void *bmcnn_ctx_t; +/** + * \brief Create context of BMCNN. + * + * \param ctx_dir - Directory of context files generated by BMNETC + * + * \note + * The context will be created in the device of ID 0.\n + * + * \return + * NULL - Creating failed.\n + * non-NULL - The handle of the context (creating succeeded).\n + */ +bmcnn_ctx_t bmcnn_ctx_create(const std::string &ctx_dir); +/** + * \brief Destroy context of BMCNN + * + * \param handle - Handle of the context to be destroyed + */ +void bmcnn_ctx_destroy(bmcnn_ctx_t handle); +/** + * \brief Create context of BMCNN in a specific device. + * + * \param ctx_dir - Directory of context files generated by BMNETC + * \param devid - ID of device where the context will be placed. + * + * \note + * Call \ref bm_dev_getcount to get total number of devices, e.g. N is returned, + * valid devid should be in range of 0 ~ (N-1).\n + * + * \return + * NULL - Creating failed that might be caused by incorrect parameter.\n + * non-NULL - The handle of the context (creating succeeded).\n + */ +bmcnn_ctx_t bmcnn_ctx_create_by_devid(const std::string &ctx_dir, int devid); +/** + * \brief Append context of BMCNN.
+ * + * \param ctx_dir - Directory of context files generated by BMNETC or BMNETD. + * \param bmrt - The created handle of context. + * + * \return + * false - Appending failed.\n + * true - Appending succeeded.\n + */ +bool bmcnn_ctx_append(const std::string &ctx_dir, bmruntime *bmrt); + +} /* namespace bmcnn */ + +#endif /* __BM_CNN_CONTEXT_H__ */ diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h b/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h new file mode 100644 index 000000000..88005e1b8 --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h @@ -0,0 +1,78 @@ +#ifndef __BM_NET_H__ +#define __BM_NET_H__ + +#include "bmblob.h" +#include "bmcnnctx.h" +#include +#include +#include + +#ifdef CROSS_COMPILE + #include +#else + #include +#endif + + +#ifdef CROSS_COMPILE +#define NAMESPACE_USED std +#else +#define NAMESPACE_USED boost +#endif + +namespace bmcnn { + +class BMNet +{ +public: + /** + * \brief Constructor of net. + * + * \param handle - Handle of BMCNN context (created by \ref bmcnn_ctx_create) + * \param name - Name of net + */ + explicit BMNet(bmcnn_ctx_t handle, const std::string &name); + /** + * \brief Destructor of net. + */ + virtual ~BMNet(); + /** + * \brief Reshape all layers from bottom to top. + */ + void Reshape(); + /** + * \brief Run forward. + * + * \param sync - Flag of synchronizing. + */ + void Forward(bool sync = false); + /** + * \brief Get blob by name. + * + * \param name - Name of blob + * \note + * (1) The name could only be of blob in input or output.\n + * (2) If the name is not found, null pointer will be returned.\n + */ + const NAMESPACE_USED::shared_ptr blob_by_name(const std::string &name) const; + /** + * \brief Get maximum shape allowed.
+ */ + inline const Shape &max_shape() const + { return max_shape_; } +private: + BMNet(const BMNet &other); + BMNet &operator=(const BMNet &other); + + bmcnn_ctx_t bmcc_ctx_; + std::vector > blobs_; + std::vector net_input_blobs_; + std::vector net_output_blobs_; + std::string name_; + std::map blob_name_index_; + Shape max_shape_; +}; + +} /* namespace bmcnn */ + +#endif /* __BM_NET_H__ */ diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h new file mode 100644 index 000000000..daa101fce --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h @@ -0,0 +1,154 @@ +#ifndef BMRUNTIME_H_ +#define BMRUNTIME_H_ +#include +#include +#include "bmlib_runtime.h" +#include "bmruntime_common.h" +#include "stdio.h" +#include +#include +#include +#include + +using std::vector; +using std::map; +using std::set; +using std::string; +using std::pair; +using std::make_pair; +using std::cout; +using std::endl; +typedef unsigned int u32; +typedef unsigned long long u64; + +typedef struct stage_param_with_idx{ + int height_high; + int height_low; + int width_high; + int width_low; + int stage_index; +}stage_param_with_idx_t; + +class bmruntime { + public: + bmruntime(bm_handle_t bm_handle); + ~bmruntime(); + + bool load_context(const string& ctx_dir); + + const set& get_input_tensor(int net_idx) const; + const set& get_input_tensor(const string& net_name); + + const set& get_output_tensor(int net_idx) const; + const set& get_output_tensor(const string& net_name); + + const bm_device_mem_t* get_input_blob(const string& tensor_name, int net_idx); + const bm_device_mem_t* get_input_blob(const string& tensor_name, const string& net_name); + + const bm_device_mem_t* get_output_blob(const string& tensor_name, int net_idx); + const bm_device_mem_t* get_output_blob(const string& tensor_name, const string& net_name); + + bool launch(int net_idx); + bool launch(const string& net_name); + + bool launch(int 
net_idx, const bm_device_mem_t* input_tensors, int input_num, + const bm_device_mem_t* output_tensors, int output_num); + bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num, + const bm_device_mem_t* output_tensors, int output_num); + + bool launch(int net_idx, int n, int h , int w); + bool launch(const string& net_name, int n, int h, int w); + bool launch(int net_idx, const bm_device_mem_t* input_tensors, int input_num, + const bm_device_mem_t* output_tensors, int output_num, int n, int h, int w); + bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num, + const bm_device_mem_t* output_tensors, int output_num, int n , int h, int w); + + void get_input_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w); + void get_input_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int * max_c, int * max_h, int * max_w); + void get_output_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w); + void get_output_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int *max_c, int * max_h, int * max_w); + + int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int ih); + int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int ih); + int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int iw); + int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int iw); + + + + + bool can_batch_size_change(int net_idx); + bool can_batch_size_change(const string& net_name); + bool can_height_and_width_change(int net_idx); + bool can_height_and_width_change(const string& net_name); + + void show_neuron_network(); + + int get_network_number() {return net_num;} + + inline 
bm_handle_t get_bm_handle() {return m_handle;} + + protected: + bool setup_mem_context(const string& ctx_dir); + bool setup_cmd_context(const string& ctx_dir); + bool set_using_cmd_file(const string& ctx_dir); + void load_cmd(u32* cmd, int engine_id, bool last_cmd, u64 start_address, u64 append_mem_offset); + bool setup_ir_context(const string& ctx_dir); + + void wrong_net_idx_handle(int net_idx) const; + + int get_net_idx(const string& net_name); + int get_stage_idx(int net_idx, int h, int w); + u64 get_stage_offset(int net_idx, int stage_idx); + + int compute_output_height(int input_height, int global_kh, int global_stride_h, int global_pad_h, int global_pool_kh); + int compute_output_width(int input_width, int global_kw, int global_stride_w, int global_pad_w, int global_pool_kw); + + bm_handle_t m_handle; + std::vector m_device_mem_info_vec; + std::vector m_device_mem_vec; + + vector m_gdma_total_id_v; + vector m_cdma_total_id_v; + vector m_bdc_total_id_v; + vector > m_gdma_group_id_v; + vector > m_cdma_group_id_v; + vector > m_bdc_group_id_v; + vector m_cmdgroup_num; + vector m_gdma_cmd_start_address_v; + vector m_cdma_cmd_start_address_v; + vector m_bdc_cmd_start_address_v; + vector > input_tensor_mem_map_v; + vector > output_tensor_mem_map_v; + vector > m_input_tensor_set_v; + vector > m_output_tensor_set_v; + int net_num; + map net_name_to_idx; + vector stage_num; + + bool have_ir_info; + vector > m_ir_info_len; + vector m_ir_info_start_address_v; + vector > stage_param_with_idx_vv; + + //io tensor param + vector n_can_change_v; + vector h_w_can_change_v; + + vector > > input_tensor_max_shape_vv; + vector > > output_tensor_max_shape_vv; + vector > > global_output_tensor_param_vv; + + bool m_using_cmd_file; + FILE * m_gdma_cmd_file; + FILE * m_cdma_cmd_file; + FILE * m_bdc_cmd_file; + + //previous value or state + int pre_net_num; + int pre_m_device_mem_info_vec_size; + + //append mem offset when appending another framework's context. 
+ vector apd_ctx_mem_offset; +}; + +#endif diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h new file mode 100644 index 000000000..200656739 --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h @@ -0,0 +1,65 @@ +#ifndef BMRUNTIME_COMMON_H +#define BMRUNTIME_COMMON_H + +#define BMRT_ASSERT(_cond) \ + do { \ + if (!(_cond)) { \ + printf("ASSERT %s: %s: %d: %s\n", \ + __FILE__, __func__, __LINE__, #_cond); \ + exit(-1); \ + } \ + } while(0) + +typedef enum neuron_device_mem_type { + INPUT_NEURON_TENSOR = 0, + INTERMEDIATE_NEURON_TENSOR = 1, + OUTPUT_NEURON_TENSOR = 2, + CMD_BUF_TENSOR = 3, + CMD_NUM_TENSOR = 4 +} NEURON_DEVICE_MEM_TYPE; + +typedef enum device_mem_type { + NEURON = 0, + COEFF = 1, +#ifdef INT8_COEFF_FUNC + COEFF_INT8 = 2, + COEFF_INT8SCALE = 3, + LOCAL = 4 +#else + LOCAL = 2 +#endif +} DEVICE_MEM_TYPE; + +typedef struct device_mem_info { + DEVICE_MEM_TYPE device_mem_type; + NEURON_DEVICE_MEM_TYPE neuron_device_mem_type; + int n; + int c; + int h; + int w; + int coeff_count; + int groups; + unsigned long long address; +} DEVICE_MEM_INFO; + +//info for compute output tensor +typedef struct tensor_max_shape { + int max_n; + int channel; + int max_h; + int max_w; +} tensor_max_shape_t; + +typedef struct global_output_tensor_param { + int input_idx; + int global_kh; + int global_kw; + int global_stride_h; + int global_stride_w; + int global_pad_h; + int global_pad_w; + int global_pool_kh; + int global_pool_kw; +} global_output_tensor_param_t; + +#endif diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h new file mode 100644 index 000000000..4214674f3 --- /dev/null +++ b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h @@ -0,0 +1,11 @@ +#ifndef BMRUNTIME_INTERFACE_H_ +#define BMRUNTIME_INTERFACE_H_ + +#include "bmruntime.h" +#include 
"bmdnn_runtime.h" + +bmruntime* create_bmruntime(bm_handle_t* bm_handle); + +void destroy_bmruntime(bm_handle_t bm_handle, bmruntime* p_bmrt); + +#endif diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h new file mode 100644 index 000000000..45541add9 --- /dev/null +++ b/saber/funcs/impl/bm/vender_activation.h @@ -0,0 +1,96 @@ +#ifndef ANAKIN_SABER_FUNCS_BMDNN_ACT_H +#define ANAKIN_SABER_FUNCS_BMDNN_ACT_H +#include "saber/funcs/impl/impl_activation.h" +namespace anakin { + +namespace saber { + +template +class VenderActivation : \ + public ImplBase< + Tensor, + Tensor, + Tensor, + ActivationParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + VenderActivation() + : _handle(NULL), _active_descs(NULL), _input_descs(NULL), _output_descs(NULL) {} + + ~VenderActivation() { + if (_input_descs) { + BMDNN_CHECK(bm_free_device(_input_descs)); + } + if (_output_descs) { + BMDNN_CHECK(bm_free_device(_output_descs)); + } + } + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + ActivationParam& param, Context& ctx) { + // not sure + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + ActivationParam& param, Context& ctx) { + // not sure + return SaberSuccess; + } + + //call bmdnn activation funcs here + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + ActivationParam& param) { + + const InDataType *in_data = (const InDataType *) inputs[0]->data(); + OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width(); + int input_n = inputs[0]->num(); + + switch (_active_type) { + case Active_sigmoid: 
+ BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, _input_descs, input_n, input_dim, _output_descs)); + break; + case Active_relu: + BMDNN_CHECK(bmdnn_relu_forward(_handle, _input_descs, input_n, input_dim, _output_descs)); + break; + case Active_tanh: + BMDNN_CHECK(bmdnn_tanh_forward(_handle, _input_descs, input_n, input_dim, _output_descs)); + break; + } + /* BMDNN_CHECK(cudnnActivationForward(_handle, _active_descs, */ + /* cudnn::cudnnTypeWrapper::kOne(), */ + /* _input_descs, in_data, */ + /* cudnn::cudnnTypeWrapper::kZero(), */ + /* _output_descs, out_data */ + /* )); */ + return SaberSuccess; + } + +private: + bm_handle_t _handle; + bm_device_mem_t _input_descs; + bm_device_mem_t _output_descs; + ActiveType _active_type; +}; +template class VenderActivation; +} +} + +#endif //ANAKIN_SABER_FUNCS_BMDNN_ACT_H diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h new file mode 100644 index 000000000..7efdfa611 --- /dev/null +++ b/saber/funcs/impl/bm/vender_conv.h @@ -0,0 +1,195 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H +#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H + +#include "saber/funcs/impl/impl_conv.h" +#include "saber/funcs/impl/bm/bmdnn_api.h" + +namespace anakin{ + +namespace saber{ + +template +class VenderConv2D : \ + public ImplBase< + Tensor, + Tensor, + Tensor, + ConvParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + VenderConv2D() + : _handle(NULL) + , _workspaceData(NULL) + , _workspace(NULL) + , _conv_descs(NULL) + , _input_descs(NULL) + , _output_descs(NULL) + , _filter_desc(NULL) + , _workspace_fwd_sizes(0) + , _workspaceSizeInBytes(0) + , _fwd_algo((cudnnConvolutionFwdAlgo_t)0) + , _input_nchw_descs(NULL) + , _output_nchw_descs(NULL) + , x8_data(NULL) + , y8_data(NULL) + , x8_data_size(0) + , y8_data_size(0) + {} + + ~VenderConv2D() { + + if (_conv_descs) { + CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs)); + } + if (_input_descs) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); + } + if (_output_descs) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); + } + if (_filter_desc) { + CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc)); + } + if (_handle != NULL) { + CUDNN_CHECK(cudnnDestroy(_handle)); + } + if (_workspaceData != NULL) { + cudaFree(_workspaceData); + } + if (_input_nchw_descs != NULL) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs)); + } + if (_output_nchw_descs != NULL) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs)); + } + if (x8_data != NULL) { + CUDA_CHECK(cudaFree(x8_data)); + } + if (y8_data != NULL) { + CUDA_CHECK(cudaFree(y8_data)); + } + } + + /** + * [Create description] Init all cudnn resource here + * @AuthorHTL + * @DateTime 2018-02-01T16:13:06+0800 + * @param inputs [description] + * @param 
outputs [description] + * @param param [conv parameters] + */ + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + ConvParam& param, Context& ctx) { + + // ---- init cudnn resources ---- + + _workspaceSizeInBytes = 0; + _workspaceData = NULL; + + _workspace_fwd_sizes = 0; + + this->_ctx = ctx; + // ---- get cuda resources ---- + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + _workspace = NULL; + + int in_channels = inputs[0]->channel(); + + // ---- create cudnn Descs ---- + cudnn::createFilterDesc(&_filter_desc); + + cudnn::createTensorDesc(&_input_descs); + cudnn::createTensorDesc(&_output_descs); + cudnn::createConvolutionDesc(&_conv_descs); + + if (param.bias()->size() > 0) { + cudnn::createTensorDesc(&_bias_desc); + } + + cudnnCreateTensorDescriptor(&_input_nchw_descs); + cudnnCreateTensorDescriptor(&_output_nchw_descs); + + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + ConvParam& param, Context& ctx); + + //call cudnnConvolutionForward here + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + ConvParam& param); + +private: + cudnnHandle_t _handle; + cudnnConvolutionFwdAlgo_t _fwd_algo; + + cudnnTensorDescriptor_t _input_descs; + cudnnTensorDescriptor_t _output_descs; + cudnnTensorDescriptor_t _bias_desc; + + cudnnFilterDescriptor_t _filter_desc; + + cudnnConvolutionDescriptor_t _conv_descs; + + size_t _workspace_fwd_sizes; + size_t _workspaceSizeInBytes; // size of underlying storage + + void *_workspaceData; // underlying storage + void *_workspace; // aliases into _workspaceData + + const bool _use_tensor_core = true; + const size_t _workspace_limit_bytes = 64 * 1024 * 1024; + const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; + + // create transform descriptor 
+ cudnnTensorDescriptor_t _input_nchw_descs; + cudnnTensorDescriptor_t _output_nchw_descs; + + void *x8_data; + void *y8_data; + + int x8_data_size; + int y8_data_size; +}; + + +} + +} +#endif //ANAKIN_SABER_FUNCS_BMDNN_CONV2D_H diff --git a/saber/funcs/impl/bm/vender_conv_act.h b/saber/funcs/impl/bm/vender_conv_act.h new file mode 100644 index 000000000..4d9c9f3bb --- /dev/null +++ b/saber/funcs/impl/bm/vender_conv_act.h @@ -0,0 +1,198 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H + +#include "saber/funcs/impl/impl_conv_act.h" +#include "saber/funcs/impl/cuda/cudnn_helper.h" +#include + +namespace anakin{ + +namespace saber{ + +template +class VenderConv2DAct : \ + public ImplBase< + Tensor, + Tensor, + Tensor, + ConvActiveParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + VenderConv2DAct() + : _handle(NULL) + , _workspaceData(NULL) + , _workspace(NULL) + , _conv_descs(NULL) + , _input_descs(NULL) + , _output_descs(NULL) + , _filter_desc(NULL) + , _workspace_fwd_sizes(0) + , _workspaceSizeInBytes(0) + , _fwd_algo((cudnnConvolutionFwdAlgo_t)0) + , _input_nchw_descs(NULL) + , _output_nchw_descs(NULL) + , x8_data(NULL) + , y8_data(NULL) + , x8_data_size(0) + , y8_data_size(0) + {} + + ~VenderConv2DAct() { + + if (_conv_descs) { + CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs)); + } + if (_input_descs) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); + } + if (_output_descs) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); + } + if (_filter_desc) { + CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc)); + } + if (_handle != NULL) { + CUDNN_CHECK(cudnnDestroy(_handle)); + } + if (_workspaceData != NULL) { + cudaFree(_workspaceData); + } + if (_input_nchw_descs != NULL) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs)); + } + if (_output_nchw_descs != NULL) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs)); + } + if (x8_data != NULL) { + CUDA_CHECK(cudaFree(x8_data)); + } + if (y8_data != NULL) { + CUDA_CHECK(cudaFree(y8_data)); + } + } + + /** + * [Create description] Init all cudnn resource here + * @AuthorHTL + * @DateTime 
2018-02-01T16:13:06+0800 + * @param inputs [description] + * @param outputs [description] + * @param conv_param [conv parameters] + */ + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + ConvActiveParam& param, Context& ctx) { + // ---- init cudnn resources ---- + + _workspaceSizeInBytes = 0; + _workspaceData = NULL; + + _workspace_fwd_sizes = 0; + + this->_ctx = ctx; + // ---- get cuda resources ---- + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + _workspace = NULL; + + int in_channels = inputs[0]->channel(); + + // ---- create cudnn Descs ---- + cudnn::createFilterDesc(&_filter_desc); + + cudnn::createTensorDesc(&_input_descs); + cudnn::createTensorDesc(&_output_descs); + cudnn::createConvolutionDesc(&_conv_descs); + cudnn::create_activation_des(&_active_descs); + + if (param.conv_param.bias()->size() > 0) { + cudnn::createTensorDesc(&_bias_desc); + } + + cudnnCreateTensorDescriptor(&_input_nchw_descs); + cudnnCreateTensorDescriptor(&_output_nchw_descs); + + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + ConvActiveParam& param, Context& ctx); + //call cudnnConvolutionForward here + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + ConvActiveParam& param); +private: + cudnnHandle_t _handle; + cudnnConvolutionFwdAlgo_t _fwd_algo; + + cudnnTensorDescriptor_t _input_descs; + cudnnTensorDescriptor_t _output_descs; + cudnnTensorDescriptor_t _bias_desc; + + cudnnFilterDescriptor_t _filter_desc; + + cudnnConvolutionDescriptor_t _conv_descs; + + size_t _workspace_fwd_sizes; + size_t _workspaceSizeInBytes; // size of underlying storage + + void *_workspaceData; // underlying storage + void *_workspace; // aliases into workspaceData + + const bool _use_tensor_core = true; + const size_t _workspace_limit_bytes = 
64 * 1024 * 1024; + const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; + + // activation descriptor + cudnnActivationDescriptor_t _active_descs; + + // create transform descriptor + cudnnTensorDescriptor_t _input_nchw_descs; + cudnnTensorDescriptor_t _output_nchw_descs; + + void *x8_data; + void *y8_data; + + int x8_data_size; + int y8_data_size; +}; + + +} + +} +#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H diff --git a/saber/funcs/impl/bm/vender_conv_act_pooling.h b/saber/funcs/impl/bm/vender_conv_act_pooling.h new file mode 100644 index 000000000..e602a693d --- /dev/null +++ b/saber/funcs/impl/bm/vender_conv_act_pooling.h @@ -0,0 +1,176 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H + +#include "saber/funcs/impl/impl_conv_act_pooling.h" +#include "saber/funcs/impl/cuda/cudnn_helper.h" +#include + +namespace anakin{ + +namespace saber{ + +template +class VenderConv2DActPooling : \ + public ImplBase< + Tensor, + Tensor, + Tensor, + ConvActivePoolingParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + VenderConv2DActPooling() + : _handle(NULL) + , _workspaceData(NULL) + , _workspace(NULL) + , _conv_descs(NULL) + , _input_descs(NULL) + , _output_descs(NULL) + , _filter_desc(NULL) + , _workspace_fwd_sizes(0) + , _workspaceSizeInBytes(0) + , _fwd_algo((cudnnConvolutionFwdAlgo_t)0) + {} + ~VenderConv2DActPooling() { + + if (_conv_descs) { + CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs)); + } + if (_input_descs) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); + } + if (_output_descs) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); + } + if (_filter_desc) { + CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc)); + } + if (_handle != NULL) { + CUDNN_CHECK(cudnnDestroy(_handle)); + } + if (_workspaceData != NULL) { + cudaFree(_workspaceData); + } + } + + /** + * [Create description] Init all cudnn resource here + * @AuthorHTL + * @DateTime 2018-02-01T16:13:06+0800 + * @param inputs [description] + * @param outputs [description] + * @param conv_param [conv parameters] + */ + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + ConvActivePoolingParam& param, Context& ctx) { + // ---- init cudnn resources ---- + + _workspaceSizeInBytes = 0; + _workspaceData = NULL; + + _workspace_fwd_sizes = 0; + + this->_ctx = ctx; + // ---- get cuda resources 
---- + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + _workspace = NULL; + + int in_channels = inputs[0]->channel(); + + // ---- create cudnn Descs ---- + cudnn::createFilterDesc(&_filter_desc); + + cudnn::createTensorDesc(&_input_descs); + cudnn::createTensorDesc(&_inner_descs); + cudnn::createTensorDesc(&_output_descs); + cudnn::createConvolutionDesc(&_conv_descs); + if (param.has_activation) { + cudnn::create_activation_des(&_active_descs); + } + if (param.has_pooling) { + cudnn::create_pooling_des(&_pooling_descs); + } + if (param.conv_param.bias()->size() > 0) { + cudnn::createTensorDesc(&_bias_desc); + } + + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + ConvActivePoolingParam& param, Context& ctx); + //call cudnnConvolutionForward here + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + ConvActivePoolingParam& param); +private: + cudnnHandle_t _handle; + cudnnConvolutionFwdAlgo_t _fwd_algo; + + cudnnTensorDescriptor_t _input_descs; + cudnnTensorDescriptor_t _output_descs; + cudnnTensorDescriptor_t _inner_descs; + cudnnTensorDescriptor_t _bias_desc; + + cudnnFilterDescriptor_t _filter_desc; + + cudnnConvolutionDescriptor_t _conv_descs; + cudnnPoolingDescriptor_t _pooling_descs; + + size_t _workspace_fwd_sizes; + size_t _workspaceSizeInBytes; // size of underlying storage + + void *_workspaceData; // underlying storage + void *_workspace; // aliases into workspaceData + + const bool _use_tensor_core = true; + const size_t _workspace_limit_bytes = 64 * 1024 * 1024; + const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; + + // activation descriptor + cudnnActivationDescriptor_t _active_descs; + + Shape _inner_shape; + DataTensor_out _inner_tensor; +}; + + +} + +} +#endif 
//ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h new file mode 100644 index 000000000..5c7c23e67 --- /dev/null +++ b/saber/funcs/impl/bm/vender_fc.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H +#define ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H + +#include "saber/funcs/impl/impl_fc.h" + +namespace anakin{ + +namespace saber{ + +template +class VenderFc: \ + public ImplBase< + Tensor, \ + Tensor, \ + Tensor, \ + FcParam>> { + +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + VenderFc() = default; + ~VenderFc() { + if (_handle != nullptr) { + CUBLAS_CHECK(cublasDestroy(_handle)); + } + } + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + FcParam& param, Context& ctx){ + // get context + this->_ctx = ctx; + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUBLAS_CHECK(cublasCreate(&_handle)); + CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + FcParam& param, Context& ctx){ + + if (!(ctx == this->_ctx)) { + 
if (_handle != NULL) { + CUBLAS_CHECK(cublasDestroy(_handle)); + } + this->_ctx = ctx; + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + CUBLAS_CHECK(cublasCreate(&_handle)); + CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); + } + + Shape shape_out = inputs[0]->valid_shape(); + _M = inputs[0]->count_valid(0, param.axis); + _K = inputs[0]->count_valid(param.axis, inputs[0]->dims()); + _N = param.num_output; + if (_N <= 0) { + int weight_size = param.weights->valid_size(); + _N = weight_size / _K; + } + //! weights dims must be in h and w + _flag_trans_weights = param.is_transpose_weights; + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + FcParam& param); + + +private: + bool _flag_trans_weights{false}; + int _M; + int _K; + int _N; + cublasHandle_t _handle; + bool _is_continue_buf{true}; +}; + +template class VenderFc; +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h new file mode 100644 index 000000000..4990a5357 --- /dev/null +++ b/saber/funcs/impl/bm/vender_pooling.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H + +#include "saber/funcs/impl/impl_pooling.h" +#include "saber/funcs/impl/cuda/cudnn_helper.h" + +namespace anakin{ + +namespace saber { + +template +class VenderPooling:\ + public ImplBase< + Tensor, + Tensor, + Tensor, + PoolingParam>> { +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + VenderPooling() : _handle(NULL) {} + + ~VenderPooling() {} + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + PoolingParam &pooling_param, Context &ctx) { + + this->_ctx = ctx; + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + cudnn::createTensorDesc(&_input_descs); + cudnn::createTensorDesc(&_output_descs); + + cudnn::create_pooling_des(&_pooling_descs); + + return create(inputs, outputs, pooling_param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + PoolingParam &pooling_param, Context &ctx) { + if (!(ctx == this->_ctx)) { + if (_handle != NULL) { + CUDNN_CHECK(cudnnDestroy(_handle)); + } + this->_ctx = ctx; + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + } + + int input_num = inputs[0]->num(); + int input_channel = inputs[0]->channel(); + int input_height = inputs[0]->height(); + int input_width = inputs[0]->width(); + int output_channel = outputs[0]->channel(); + int output_height = outputs[0]->height(); + int output_width = outputs[0]->width(); + + Shape stride_in = inputs[0]->get_stride(); + Shape stride_out = outputs[0]->get_stride(); + + int dim_a[] = 
{input_num, input_channel, + input_height, input_width}; + + int dim_b[] = {input_num, output_channel, + output_height, output_width}; + + cudnn::setTensorNdDesc(&_input_descs, + inputs[0]->dims(), dim_a, &stride_in[0]); + + cudnn::setTensorNdDesc(&_output_descs, + outputs[0]->dims(), dim_b, &stride_out[0]); + + int windowHeight[] = {pooling_param.window_h, pooling_param.window_w}; + int padding[] = {pooling_param.pad_h, pooling_param.pad_w}; + + int stride[] = {pooling_param.stride_h, pooling_param.stride_w}; + + cudnn::set_nd_pooling_des(&_pooling_descs, pooling_param.pooling_type, + inputs[0]->dims() - 2, windowHeight, + padding,stride); + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + PoolingParam ¶m) { + const InDataType *in_data = inputs[0]->data(); + OutDataType *out_data = outputs[0]->mutable_data(); + + CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs, + cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, + cudnn::cudnnTypeWrapper::kZero(), + _output_descs, out_data + )); + + return SaberSuccess; + } + +private: + cudnnHandle_t _handle; + cudnnTensorDescriptor_t _input_descs; + cudnnTensorDescriptor_t _output_descs; + cudnnPoolingDescriptor_t _pooling_descs; + +}; + +template class VenderPooling; + +} //namespace saber + +} // namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index 4b0f170d5..6a109540e 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -379,6 +379,7 @@ struct ConvParam { opTensor* bias_tensor; }; // specify for int8 +#ifdef USE_CUDA template <> struct ConvParam > { ConvParam() : group(-1), pad_h(-1), pad_w(-1), @@ -534,6 +535,90 @@ struct ConvParam > { Tensor* weight_tensor; Tensor* bias_tensor; }; +#endif //USE_CUDA + +#ifdef USE_BM +template <> +struct ConvParam > { + ConvParam() : group(-1), pad_h(-1), pad_w(-1), + stride_h(-1), stride_w(-1), + 
dilation_h(-1), dilation_w(-1), + weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){} + ConvParam(int group_in, int pad_h_in, int pad_w_in, + int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, + Tensor* weight, Tensor* bias, + float alpha_in = 1.0, float beta_in = 0.0) + : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) + , stride_h(stride_h_in), stride_w(stride_w_in) + , dilation_h(dilation_h_), dilation_w(dilation_w_) + , weight_tensor(weight), bias_tensor(bias) + , alpha(alpha_in), beta(beta_in) + {} + ConvParam(const ConvParam &right) + : group(right.group), pad_h(right.pad_h) + , pad_w(right.pad_w), stride_h(right.stride_h) + , stride_w(right.stride_w), dilation_h(right.dilation_h) + , dilation_w(right.dilation_w) + , weight_tensor(right.weight_tensor) + , bias_tensor(right.bias_tensor) + , alpha(right.alpha) + , beta(right.beta) {} + ConvParam &operator=(const ConvParam &right) { + group = right.group; + pad_h = right.pad_h; + pad_w = right.pad_w; + stride_h = right.stride_h; + stride_w = right.stride_w; + dilation_h = right.dilation_h; + dilation_w = right.dilation_w; + weight_tensor = right.weight_tensor; + bias_tensor = right.bias_tensor; + alpha = right.alpha; + beta = right.beta; + return *this; + } + bool operator==(const ConvParam &right) { + bool comp_eq = true; + comp_eq = comp_eq && (group == right.group); + comp_eq = comp_eq && (pad_h == right.pad_h); + comp_eq = comp_eq && (pad_w == right.pad_w); + comp_eq = comp_eq && (stride_h == right.stride_h); + comp_eq = comp_eq && (stride_w == right.stride_w); + comp_eq = comp_eq && (dilation_h == right.dilation_h); + comp_eq = comp_eq && (dilation_w == right.dilation_w); + comp_eq = comp_eq && (weight_tensor == right.weight_tensor); + comp_eq = comp_eq && (bias_tensor == right.bias_tensor); + comp_eq = comp_eq && (alpha == right.alpha); + comp_eq = comp_eq && (beta == right.beta); + return comp_eq; + } + inline const Tensor* weight() { + return weight_tensor; + } + inline 
const Tensor* bias() { + return bias_tensor; + } + inline Tensor* mutable_weight() { + return weight_tensor; + } + inline Tensor* mutable_bias() { + return bias_tensor; + } + int group; + int pad_h; + int pad_w; + int stride_h; + int stride_w; + int dilation_h; + int dilation_w; + float alpha; + float beta; +private: + Tensor* weight_tensor; + Tensor* bias_tensor; +}; +#endif //USE_BM + template struct PermuteParam { PermuteParam() {} diff --git a/saber/saber_types.h b/saber/saber_types.h index 8f9b86237..3dccb5f3f 100644 --- a/saber/saber_types.h +++ b/saber/saber_types.h @@ -31,7 +31,8 @@ enum TargetTypeEnum { eARM = 3, eX86 = 4, eNVHX86 = 5, - eNVHARM = 6 + eNVHARM = 6, + eBM = 7 }; template @@ -44,6 +45,8 @@ typedef TargetType X86; // NV device with pinned memory typedef TargetType NVHX86; //typedef TargetType NVHARM; +// Bitmain device support +typedef TargetType BM; // invalid target type, for target has only one memory block typedef TargetType INVLD; @@ -82,7 +85,8 @@ enum DataType { AK_STRING = 10, AK_BOOL = 11, AK_SHAPE = 12, - AK_TENSOR = 13 + AK_TENSOR = 13, + AK_BM = 14 }; typedef enum { @@ -148,6 +152,29 @@ enum CodeType { CORNER_SIZE = 3 }; +typedef enum { + ATRS_NormType_NONE = 0, + ATRS_NormType_WIDTH = 1, + ATRS_NormType_HEIGHT = 2, + ATRS_NormType_WIDTH_LOG = 3, + ATRS_NormType_HEIGHT_LOG = 4, +} ATRS_NormType; + +typedef enum { + DetectionOutputSSD_HEIGHT_AND_WIDTH = 0, + DetectionOutputSSD_HEIGHT_OR_WIDTH = 1 +} DetectionOutputSSD_MIN_SIZE_MODE; + +typedef enum { + ProposalImgScaleToCamCoords_NormType_HEIGHT = 0, + ProposalImgScaleToCamCoords_NormType_HEIGHT_LOG = 1 +} ProposalImgScaleToCamCoords_NormType; + +typedef enum { + ProposalImgScaleToCamCoords_OrienType_PI = 0, + ProposalImgScaleToCamCoords_OrienType_PI2 = 1 +} ProposalImgScaleToCamCoords_OrienType; + typedef enum { SABER_POWER_HIGH = 0, SABER_POWER_LOW = 1, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 140094f32..b013d2c4d 100644 --- a/test/CMakeLists.txt +++ 
b/test/CMakeLists.txt @@ -22,6 +22,10 @@ if(NVIDIA_GPU) anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/cuda "cpp" ANAKIN_TEST_CASE_SRC) endif() +if(USE_BM) +anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/bm "cpp" ANAKIN_TEST_CASE_SRC) +endif() + if(USE_X86_PLACE) anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/x86 "cpp" ANAKIN_TEST_CASE_SRC) endif() diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp new file mode 100644 index 000000000..c6ee0811b --- /dev/null +++ b/test/saber/bm/test_TargetWrapper_BM.cpp @@ -0,0 +1,16 @@ +#include "saber_types.h" +#include "target_wrapper.h" +#include + +#ifdef USE_BM +using namespace anakin::saber; +int main() { + typedef TargetWrapper API; + void *pmem; + int dev_count; + API::get_device_count(&dev_count); + API::mem_alloc(&pmem, 3*200*200); + API::mem_free(pmem); +} +#endif + diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp new file mode 100644 index 000000000..a204e7807 --- /dev/null +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -0,0 +1,116 @@ +#include "test_saber_buffer_bm.h" +#include "saber/core/buffer.h" +#include "saber/core/data_traits.h" + +using namespace anakin::saber; + +template +void test_buffer() { + + typedef TargetWrapper X86_API; + typedef TargetWrapper BM_API; + typedef typename DataTrait::dtype Dtype; + typedef Buffer BufferH; + typedef Buffer BufferD; + + int n0 = 1024; + int n1 = 2048; + + void* tmp_x86; + Dtype* x86_ptr; + X86_API::mem_alloc(&tmp_x86, sizeof(Dtype) * n0); + x86_ptr = static_cast(tmp_x86); + + for (int i = 0; i < n0; i++) { + x86_ptr[i] = static_cast(i); + } + + void* tmp_bm; + Dtype* bm_ptr; + BM_API::mem_alloc(&tmp_bm, sizeof(Dtype) * n0); + bm_ptr = static_cast(tmp_bm); + + LOG(INFO) << "Buffer: test default(empty) constructor"; + BufferH x86_buf0; + BufferD bm_buf0; + + LOG(INFO) << "Buffer: test constructor with data size"; + BufferH x86_buf1(n0 * sizeof(Dtype)); + 
BufferD bm_buf1(n0 * sizeof(Dtype)); + + LOG(INFO) << "Buffer: test constructor with data pointer, size and device id"; + BufferH x86_buf2(x86_ptr, n0 * sizeof(Dtype), X86_API::get_device_id()); + BufferD bm_buf2(bm_ptr, n0 * sizeof(Dtype), BM_API::get_device_id()); + + LOG(INFO) << "Buffer: test copy constructor"; + BufferH x86_buf3(x86_buf2); + LOG(INFO) << "BM Buffer copy constructor"; + LOG(INFO) << "bm target id: " << BM_API::get_device_id(); + LOG(INFO) << "bm buffer target id: " << bm_buf2.get_id(); + BufferD bm_buf3(bm_buf2); + CHECK_EQ(x86_buf3.get_count(), x86_buf2.get_count()) << \ + "shared buffer should have same data count"; + CHECK_EQ(bm_buf3.get_count(), bm_buf2.get_count()) << \ + "shared buffer should have same data count"; + + LOG(INFO) << "Buffer: test operator ="; + x86_buf0 = x86_buf2; + bm_buf0 = bm_buf2; + CHECK_EQ(x86_buf0.get_count(), x86_buf2.get_count()) << \ + "shared buffer should have same data count"; + CHECK_EQ(bm_buf0.get_count(), bm_buf2.get_count()) << \ + "shared buffer should have same data count"; + + LOG(INFO) << "Buffer: test re_alloc"; + x86_buf1.re_alloc(n1 * sizeof(Dtype)); + bm_buf1.re_alloc(n1 * sizeof(Dtype)); + CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error"; + CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error"; + CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error"; + CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error"; + x86_buf1.re_alloc(n0 * sizeof(Dtype)); + bm_buf1.re_alloc(n0 * sizeof(Dtype)); + CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error"; + CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error"; + CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error"; + CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error"; + + LOG(INFO) << "Buffer: test get_id()"; + LOG(INFO) << "X86 device id: " << x86_buf0.get_id() 
<< \ + ", bm device id: " << bm_buf0.get_id(); + CHECK_EQ(X86_API::get_device_id(), x86_buf0.get_id()) << "x86 device id error"; + CHECK_EQ(BM_API::get_device_id(), bm_buf0.get_id()) << "bm device id error"; + + LOG(INFO) << "Buffer: test deep_cpy()"; + x86_buf1.sync_copy_from(x86_buf2); + LOG(INFO) << "deep copy between two host buffer: "; + const Dtype* ptr1 = static_cast(x86_buf1.get_data()); + const Dtype* ptr2 = static_cast(x86_buf1.get_data()); + + for (int i = 0; i < 10; i++) { + std::cout << ptr1[i] << std::endl; + } + + CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect"; + LOG(INFO) << "deep copy from host buffer to device buffer"; + bm_buf1.sync_copy_from(x86_buf2); + x86_buf1.sync_copy_from(bm_buf1); + LOG(INFO) << "deep copy from device buffer to host buffer: "; + ptr1 = static_cast(x86_buf1.get_data()); + + for (int i = 0; i < 10; i++) { + std::cout << ptr1[i] << std::endl; + } +} + +TEST(TestSaberBufferBM, test_buffer_memcpy) { + test_buffer(); +} + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h new file mode 100644 index 000000000..8bbbe4511 --- /dev/null +++ b/test/saber/bm/test_saber_buffer_BM.h @@ -0,0 +1,20 @@ +#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H +#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" + +using namespace anakin::test; + +class TestSaberBufferBM : public Test { +public: + TestSaberBufferBM() {} + ~TestSaberBufferBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +}; + +#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp new file mode 100644 index 000000000..e221ba8f4 --- /dev/null +++ b/test/saber/bm/test_saber_context_BM.cpp @@ 
-0,0 +1,31 @@ +#include "test_saber_context_BM.h" + +#ifdef USE_BM + +using namespace anakin::saber; + +TEST(TestSaberContextBM, test_BM_context) { + Env::env_init(); + typedef TargetWrapper API; + typename API::event_t event; + API::create_event(event); + LOG(INFO) << "test context constructor"; + Context ctx0; + Context ctx1(0, 1, 1); + LOG(INFO) << "test record event to context data stream and compute stream"; + API::record_event(event, ctx0.get_data_stream()); + API::record_event(event, ctx0.get_compute_stream()); + API::record_event(event, ctx1.get_data_stream()); + API::record_event(event, ctx1.get_compute_stream()); +} + +#endif + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h new file mode 100644 index 000000000..653ee11fd --- /dev/null +++ b/test/saber/bm/test_saber_context_BM.h @@ -0,0 +1,21 @@ +#ifndef SABER_TEST_SABER_CONTEXT_BM_H +#define SABER_TEST_SABER_CONTEXT_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "core/context.h" + +using namespace anakin::test; + +class TestSaberContextBM : public Test { +public: + TestSaberContextBM() {} + ~TestSaberContextBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +}; + +#endif //SABER_TEST_SABER_CONTEXT_BM_H diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp new file mode 100644 index 000000000..1c7086cf1 --- /dev/null +++ b/test/saber/bm/test_saber_device_BM.cpp @@ -0,0 +1,20 @@ +#include "test_saber_device_BM.h" + +#ifdef USE_BM + +using namespace anakin::saber; + +TEST(TestSaberDeviceBM, test_BM_device) { + Device dev_BM; +} + +#endif + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git 
a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h new file mode 100644 index 000000000..3a6d61236 --- /dev/null +++ b/test/saber/bm/test_saber_device_BM.h @@ -0,0 +1,21 @@ +#ifndef SABER_TEST_SABER_DEVICE_BM_H +#define SABER_TEST_SABER_DEVICE_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "core/device.h" + +using namespace anakin::test; + +class TestSaberDeviceBM : public Test { +public: + TestSaberDeviceBM() {} + ~TestSaberDeviceBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +}; + +#endif //SABER_TEST_SABER_DEVICE_BM_H diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h new file mode 100644 index 000000000..61d27d6f9 --- /dev/null +++ b/test/saber/bm/test_saber_func_BM.h @@ -0,0 +1,38 @@ +#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H +#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "core/tensor.h" +#include +#include + +using namespace anakin::test; + +int read_file(std::vector &results, const char* file_name) { + + std::ifstream infile(file_name); + if (!infile.good()) { + std::cout << "Cannot open " << std::endl; + return false; + } + LOG(INFO)<<"found filename: "< + +using namespace anakin::saber; + +template +void print_tensor_shape(std::string name, Tensor& t0) { + + LOG(INFO) << name << " valid shape is [" + << t0.valid_shape()[0] << ", " + << t0.valid_shape()[1] << ", " + << t0.valid_shape()[2] << ", " + << t0.valid_shape()[3] << "]."; + + LOG(INFO) << name << " real shape is [" + << t0.shape()[0] << ", " + << t0.shape()[1] << ", " + << t0.shape()[2] << ", " + << t0.shape()[3] << "]."; + + LOG(INFO) << name << " offset is [" + << t0.offset()[0] << ", " + << t0.offset()[1] << ", " + << t0.offset()[2] << ", " + << t0.offset()[3] << "]."; +} + +TEST(TestSaberFuncBM, test_func_constructor) { + + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; 
+ + int img_num = 1; + int in_channels = 2; + int img_h = 8; + int img_w = 8; + + Shape img_s(img_num, in_channels, img_h, img_w); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + for (int i = 0; i < img_host.size(); ++i) { + img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1); + } + + img_dev.copy_from(img_host); + TensorDf4 output_dev; + + // start Reshape & doInfer + + Context ctx1(0, 1, 1); + + ActivationParam param(Active_elu, 0.1f, 0.1f); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Activation act; + act.compute_output_shape(input, output, param); + output_dev.re_alloc(output[0]->shape()); + + // init assume output tensor has been reshpaed by user. + act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); + act(input, output, param, ctx1); + + cudaStream_t cuda_stream = ctx1.get_compute_stream(); + output[0]->record_event(cuda_stream); + output_dev.sync(); + print_tensor_device(output_dev); + cudaDeviceSynchronize(); + CUDA_POST_KERNEL_CHECK; +} + +TEST(TestSaberFuncBM, test_func_sub_tensor) { + + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + int img_num = 1; + int in_channels = 2; + int img_h = 8; + int img_w = 8; + + Shape img_s(img_num, in_channels, img_h, img_w); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + for (int i = 0; i < img_host.size(); ++i) { + img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1); + } + + img_dev.copy_from(img_host); + Shape img_s_t0(img_num, in_channels, 4, 4); + + TensorDf4 t0; + TensorDf4 t1; + + t0.share_sub_buffer(img_dev, img_s_t0, {0, 0, 0, 0}); + t1.share_sub_buffer(img_dev, img_s_t0, {0, 0, 4, 4}); + + print_tensor_shape("t0", t0); + print_tensor_shape("t1", t1); + + TensorDf4 output_dev; + + TensorDf4 out0; + TensorDf4 out1; + + // start Reshape & doInfer + Context ctx1(0, 1, 1); + Context ctx2(0, 2, 2); + 
+ ActivationParam param1(Active_elu, 0.1f, 0.1f); + ActivationParam param2(Active_elu, 0.1f, 0.1f); + + std::vector input1, input2; + std::vector output1, output2; + + input1.push_back(&t0); + input2.push_back(&t1); + + output1.push_back(&out0); + output2.push_back(&out1); + + //FIXME where do I get img_s and all those shapes ???? + output_dev.re_alloc(img_s); + + out0.share_sub_buffer(output_dev, img_s_t0, {0, 0, 0, 0}); + out1.share_sub_buffer(output_dev, img_s_t0, {0, 0, 4, 4}); + + print_tensor_shape("output_dev", output_dev); + + Activation act1; + Activation act2; + + act1.compute_output_shape(output1, input1, param1); + act2.compute_output_shape(output2, input2, param2); + + print_tensor_shape("out0", out0); + print_tensor_shape("out1", out1); + + // init assume output tensor has been reshpaed by user. + act1.init(input1, output1, param1, SPECIFY, SABER_IMPL, ctx1); + act1(input1, output1, param1, ctx1); + cudaStream_t cuda_stream = ctx1.get_compute_stream(); + output1[0]->record_event(cuda_stream); + + act2.init(input2, output2, param2, SPECIFY, SABER_IMPL, ctx2); + act2(input2, output2, param2, ctx2); + cudaStream_t cuda_stream2 = ctx2.get_compute_stream(); + output2[0]->record_event(cuda_stream2); + + out0.sync(); + out1.sync(); + print_tensor_device(output_dev); + cudaDeviceSynchronize(); + CUDA_POST_KERNEL_CHECK; +} + +int main(int argc, const char** argv) { + Env::env_init(); + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp new file mode 100644 index 000000000..7881cdb97 --- /dev/null +++ b/test/saber/bm/test_saber_func_conv_BM.cpp @@ -0,0 +1,725 @@ +#include "core/context.h" +#include "funcs/conv.h" +#include "test_saber_func_BM.h" +#include "tensor_op.h" +#include "saber_types.h" +#include +//#include "cublas.h" + +using namespace anakin::saber; + +typedef Tensor TensorHf4; +typedef Tensor 
TensorDf4; + +template +void print_tensor_shape(std::string name, Tensor &t0) { + + LOG(INFO) << name << " valid shape is [" + << t0.valid_shape()[0] << ", " + << t0.valid_shape()[1] << ", " + << t0.valid_shape()[2] << ", " + << t0.valid_shape()[3] << "]."; + + LOG(INFO) << name << " real shape is [" + << t0.shape()[0] << ", " + << t0.shape()[1] << ", " + << t0.shape()[2] << ", " + << t0.shape()[3] << "]."; + + LOG(INFO) << name << " offset is [" + << t0.offset()[0] << ", " + << t0.offset()[1] << ", " + << t0.offset()[2] << ", " + << t0.offset()[3] << "]."; +} + + + +#if 1 +TEST(TestSaberFuncBM, test_depthwise_conv) { + + int group = 2; + int pad_h = 1; + int pad_w = 1; + int stride_h = 1; + int stride_w = 1; + int dilation_h = 1; + int dilation_w = 1; + + int kernel_h = 3; + int kernel_w = 3; + int out_channels = 2; + + int img_num = 1; + int in_channels = 2; + int img_h = 8; + int img_w = 8; + + bool bias_term = true; + + LOG(INFO) << "conv param: "; + LOG(INFO) << " img_num = " << img_num; + LOG(INFO) << " in_channels = " << in_channels; + LOG(INFO) << " img_h = " << img_h; + LOG(INFO) << " img_w = " << img_w; + LOG(INFO) << " group = " << group; + LOG(INFO) << " pad_h = " << pad_h; + LOG(INFO) << " pad_w = " << pad_w; + LOG(INFO) << " stride_h = " << stride_h; + LOG(INFO) << " stride_w = " << stride_w; + LOG(INFO) << " dilation_h = " << dilation_h; + LOG(INFO) << " dilation_w = " << dilation_w; + LOG(INFO) << " kernel_h = " << kernel_h; + LOG(INFO) << " kernel_w = " << kernel_w; + LOG(INFO) << " out_channels = " << out_channels; + + Shape img_s(img_num, in_channels, img_h, img_w); + Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); + Shape bias_s(1, out_channels, 1, 1); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + for (int i = 0; i < img_host.size(); ++i) { + img_host.mutable_data()[i] = 63 & i; + } + + img_dev.copy_from(img_host); + + TensorHf4 weights_host; + TensorDf4 weights_dev; + + 
weights_host.re_alloc(weights_s); + weights_dev.re_alloc(weights_s); + + fill_tensor_host_const(weights_host, 1.f); + weights_dev.copy_from(weights_host); + + TensorHf4 bias_host; + TensorDf4 bias_dev; + + if (bias_term) { + bias_host.re_alloc(bias_s); + bias_dev.re_alloc(bias_s); + + fill_tensor_host_const(bias_host, 1.f); + bias_dev.copy_from(bias_host); + } + + TensorHf4 output_host; + TensorDf4 output_dev; + + // start Reshape & doInfer + Context ctx1(0, 1, 1); + + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Conv conv; + conv.compute_output_shape(input, output, param); + + output_dev.re_alloc(output[0]->shape()); + output_host.re_alloc(output[0]->shape()); + + LOG(INFO) << "regular start with group = " << group; + // init assume output tensor has been reshpaed by user. + conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); + + conv(input, output, param, ctx1); + + cudaStream_t cuda_stream = ctx1.get_compute_stream(); + output[0]->record_event(cuda_stream); + + output_dev.sync(); + print_tensor_device(output_dev); + +// param.group = 1; +// param.pad_h = 1; +// param.pad_w = 1; +// +// LOG(INFO) << " param changed start with group = "< ctx1(0, 1, 1); + Context ctx2(0, 2, 2); + + TensorDf4 out0; + TensorDf4 out1; + + ConvParam param0(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + ConvParam param1(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + std::vector input0, input1; + std::vector output0, output1; + + input0.push_back(&t0); + input1.push_back(&t1); + + output0.push_back(&out0); + output1.push_back(&out1); + + // FIXME ? 
where do i get output shape + output_dev.re_alloc(img_s); + + Conv conv0; + Conv conv1; + + conv0.compute_output_shape(input0, output0, param0); + conv1.compute_output_shape(input1, output1, param1); + + out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0}); + out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4}); + + conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1); + conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2); + + conv0(input0, output0, param0, ctx1); + conv1(input1, output1, param1, ctx2); + + cudaStream_t cuda_stream1 = ctx1.get_compute_stream(); + output0[0]->record_event(cuda_stream1); + + cudaStream_t cuda_stream2 = ctx2.get_compute_stream(); + output1[0]->record_event(cuda_stream2); + + out0.sync(); + out1.sync(); + + print_tensor_device(output_dev); + +// print_tensor_device(output_dev); + + cudaDeviceSynchronize(); + CUDA_CHECK(cudaPeekAtLastError()); +} +#endif + +TEST(TestSaberFuncBM, test_conv_fp32_speed_test) { + + int group = 1; + int pad_h = 1; + int pad_w = 1; + int stride_h = 1; + int stride_w = 1; + int dilation_h = 1; + int dilation_w = 1; + + int kernel_h = 1; + int kernel_w = 1; + int out_channels = 128; + + int img_num = 7; + int in_channels = 13; + int img_h = 32; + int img_w = 32; + + bool bias_term = false; + + LOG(INFO) << "conv param: "; + LOG(INFO) << " img_num = " << img_num; + LOG(INFO) << " in_channels = " << in_channels; + LOG(INFO) << " img_h = " << img_h; + LOG(INFO) << " img_w = " << img_w; + LOG(INFO) << " group = " << group; + LOG(INFO) << " pad_h = " << pad_h; + LOG(INFO) << " pad_w = " << pad_w; + LOG(INFO) << " stride_h = " << stride_h; + LOG(INFO) << " stride_w = " << stride_w; + LOG(INFO) << " dilation_h = " << dilation_h; + LOG(INFO) << " dilation_w = " << dilation_w; + LOG(INFO) << " kernel_h = " << kernel_h; + LOG(INFO) << " kernel_w = " << kernel_w; + LOG(INFO) << " out_channels = " << out_channels; + Shape img_s(img_num, in_channels, img_h, img_w); 
+ Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); + Shape bias_s(1, out_channels, 1, 1); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + for (int i = 0; i < img_host.size(); ++i) { + img_host.mutable_data()[i] = 1; + } + + img_dev.copy_from(img_host); + + TensorHf4 weights_host; + TensorDf4 weights_dev; + + weights_host.re_alloc(weights_s); + weights_dev.re_alloc(weights_s); + + fill_tensor_host_const(weights_host, 1.f); + weights_dev.copy_from(weights_host); + + TensorHf4 bias_host; + TensorDf4 bias_dev; + + if (bias_term) { + bias_host.re_alloc(bias_s); + bias_dev.re_alloc(bias_s); + + fill_tensor_host_const(bias_host, 1.f); + bias_dev.copy_from(bias_host); + } + + TensorDf4 output_dev; + + // start Reshape & doInfer + Context ctx1(0, 1, 1); + + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Conv conv; + conv.compute_output_shape(input, output, param); + + output_dev.re_alloc(output[0]->shape()); + LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \ + << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]"; + //LOG(INFO) << " blocks = [ " << i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; + //选择k最小的那一组,如果一样,则选128*N,N最大的那一组 + int k0 = i_div_up(out_channels, 128) * 128 - out_channels; + int k1 = i_div_up(out_channels, 64) * 64 - out_channels; + int k2 = i_div_up(out_channels, 32) * 32 - out_channels; + int kk = std::min(std::min(k0,k1),k2); + LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk; + if (kk == k0) + LOG(INFO) << "thread = [256,1,1] 128*128" ; + if (kk == k1) + LOG(INFO) << "thread = [128,1,1] 128*64" ; + if (kk == k2) + LOG(INFO) << "thread = 
[128,1,1] 128*32" ; + + LOG(INFO) << "saber conv init"; + conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1); + + LOG(INFO) << "saber conv dispatch"; + conv(input, output, param, ctx1); + + cudaStream_t cuda_stream = ctx1.get_compute_stream(); + output[0]->record_event(cuda_stream); + + output_dev.sync(); + + SaberTimer t1; + int ts = 1; + + for (int i = 0; i < ts; ++i) { + t1.start(ctx1); + conv(input, output, param, ctx1); + output_dev.sync(); + t1.end(ctx1); + } + + LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms"; + + cudaDeviceSynchronize(); + CUDA_CHECK(cudaPeekAtLastError()); +} + +void test_conv_fp32_speed(std::vector &inputs, std::vector &outputs, + TensorDf4 &weights, int kernel_size, int stride, int pad, + int in_channel, int out_channel, TensorDf4 &bias, + anakin::saber::ImplEnum impl) { + + ConvParam conv_param(1, pad, pad, + stride, stride, + 1, 1, + &weights, &bias); + Conv conv; + conv.compute_output_shape(inputs, outputs, conv_param); + outputs[0]->re_alloc(outputs[0]->shape()); + Context ctx1(0, 1, 1); + + SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1)); + + conv(inputs, outputs, conv_param, ctx1); + outputs[0]->record_event(ctx1.get_compute_stream()); + outputs[0]->sync(); + + cudaDeviceSynchronize(); + + SaberTimer t1; + int ts = 100; + for (int i = 0; i < ts; ++i) { + t1.start(ctx1); + conv(inputs, outputs, conv_param, ctx1); + outputs[0]->record_event(ctx1.get_compute_stream()); + outputs[0]->sync(); + t1.end(ctx1); + } + LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; + + cudaDeviceSynchronize(); +} + + +cublasHandle_t cublas_handle; + +void caffe_gemm(const int M, const int N, const int K,\ + const float alpha, const float* A,\ + const float* B, const float beta, float* C) { + int lda = K; + int ldb = N; + CUBLAS_CHECK(cublasSgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + N, M, K, + &alpha, B, + ldb, A, + lda, &beta, + C, N)); +} + +TEST(TestSaberFuncBM, 
test_conv_fp32_1x1_speed) { + int img_num = 1; + int kernel = 1; + +// int out_channels = 32; +// int in_channels = 128; +// int img_h = 52; +// int img_w = 112; +// int out_channels = 64; +// int in_channels = 256; +// int img_h = 26; +// int img_w = 56; + int out_channels = 128; + int in_channels = 512; + int img_h = 13; + int img_w = 28; + +// int out_channels = 512; +// int in_channels = 128; +// int img_h = 13; +// int img_w = 28; + + int pad = 0; + int stride = 1; + Context ctx1(0, 1, 1); + + CUBLAS_CHECK(cublasCreate(&cublas_handle)); + CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream())); + + TensorDf4 weights; + weights.re_alloc({out_channels, in_channels, 1, 1}); + + TensorDf4 img; + img.re_alloc({1, in_channels, img_h, img_w}); + + TensorDf4 out; + out.re_alloc({1, out_channels, img_h, img_w}); + TensorDf4 out_gemm; + out_gemm.re_alloc({1, out_channels, img_h, img_w}); + + fill_tensor_device_rand(weights, -1.f, 1.f); + fill_tensor_device_rand(img, -1.f, 1.f); + + LOG(INFO) << "img_num: " << img_num; + LOG(INFO) << "kernel: " << kernel; + LOG(INFO) << "out_channels: " << out_channels; + LOG(INFO) << "in_channels: " << in_channels; + LOG(INFO) << "img_h: " << img_h; + LOG(INFO) << "img_w: " << img_w; + LOG(INFO) << "pad: " << pad; + LOG(INFO) << "stride: " << stride; + + TensorDf4 bias; + + std::vector input_v; + std::vector output_gemm_v, output_v; + + input_v.push_back(&img); + output_v.push_back(&out); + output_gemm_v.push_back(&out_gemm); + cudaDeviceSynchronize(); + test_conv_fp32_speed(input_v, output_v, + weights, kernel, stride, pad, + in_channels, out_channels, bias, + SABER_IMPL); + cudaDeviceSynchronize(); + caffe_gemm(out_channels, img_h * img_w, in_channels,\ + 1.f, weights.data(),\ + img.data(), 0.f, out_gemm.mutable_data()); + cudaDeviceSynchronize(); + SaberTimer t1; + int ts = 100; + + for (int i = 0; i < ts; ++i) { + t1.start(ctx1); + caffe_gemm(out_channels, img_h * img_w, in_channels,\ + 1.f, weights.data(),\ + 
img.data(), 0.f, out_gemm.mutable_data()); + out_gemm.record_event(ctx1.get_compute_stream()); + out_gemm.sync(); + t1.end(ctx1); + } + LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; + + cudaDeviceSynchronize(); +// print_tensor_device(out); +// print_tensor_device(out_gemm); + TensorHf4 out_host; + TensorHf4 out_gemm_host; + out_host.re_alloc(out.shape()); + out_host.copy_from(out); + + out_gemm_host.re_alloc(out_gemm.shape()); + out_gemm_host.copy_from(out_gemm); + double max_r, max_d; + tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d); + LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d; +} + +int main(int argc, const char** argv){ + anakin::saber::Env::env_init(); + + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp new file mode 100644 index 000000000..5101c75f8 --- /dev/null +++ b/test/saber/bm/test_saber_func_fc_BM.cpp @@ -0,0 +1,148 @@ +#include "core/context.h" +#include "funcs/fc.h" +#include "test_saber_func_fc_BM.h" +#include "tensor_op.h" +#include "saber_types.h" +#include + +using namespace anakin::saber; +typedef TargetWrapper API; +typedef Tensor TensorDf4; +typedef Tensor TensorHf4; +typedef TensorDf4::Dtype ftype; + +void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \ + const TensorHf4& bias, TensorHf4& tout) { + + int m = tin.num(); + int k = tin.valid_size() / m; + int n = weight.valid_size() / k; + bool bias_term = bias.valid_size() > 0; + + const float* din = tin.data(); + const float* w = weight.data(); + float* dout = tout.mutable_data(); + + for (int i = 0; i < m; ++i) { + float* pdout = dout + i * n; + const float* pdin = din + i * k; + + for (int j = 0; j < n; ++j) { + if (bias_term) { + pdout[j] = bias.data()[j]; + } else { + pdout[j] = 0; + } + + for (int l = 0; l < k; ++l) { + pdout[j] += pdin[l] * w[l * n 
+ j]; + } + } + } +} + +TEST(TestSaberFuncFcBM, test_func_fc) { + + int test_iter = 100; + int w_in = 7; + int h_in = 7; + int ch_in = 512; + int num_in = 1; + + int num_out = 4096; + int axis = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_out = {num_in, num_out, 1, 1}; + + Shape sh_w{1, 1, w_in* h_in * ch_in, num_out}; + TensorDf4 weight(sh_w); + Shape sh_b{1, 1, 1, num_out}; + TensorDf4 bias(sh_b); + fill_tensor_device_const(weight, 1.f); + fill_tensor_device_const(bias, 1.f); + + LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ + ch_in << ", height=" << h_in << ", width=" << w_in; + + std::vector input_dev_4d; + std::vector output_dev_4d; + + TensorDf4 tdin; + TensorDf4 tdout; + tdin.re_alloc(shape_in); + fill_tensor_device_const(tdin, 1.f); + input_dev_4d.push_back(&tdin); + output_dev_4d.push_back(&tdout); + + // start Reshape & doInfer + Context ctx_dev(0, 1, 1); + + FcParam param(&weight, &bias, num_out, axis); + + Fc fc; + + LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ + shape_out[2] << ", " << shape_out[3]; + + SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param)); + + LOG(INFO) << "re-alloc tensor buffer"; + output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape()); + Shape va_sh = tdout.valid_shape(); + LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \ + va_sh[2] << ", " << va_sh[3]; + CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error"; + + LOG(INFO) << "FC initialization"; + SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev)); + + LOG(INFO) << "FC compute"; + SaberTimer t1; + t1.clear(); + t1.start(ctx_dev); + + for (int i = 0; i < test_iter; ++i) { + SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev)); + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + //cudaDeviceSynchronize(); + } + + CUDA_POST_KERNEL_CHECK; + 
t1.end(ctx_dev); + float ts = t1.get_average_ms(); + LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter; + //print_tensor_device(*output_dev_4d[0]); + //cudaDeviceSynchronize(); + + //! check result + TensorHf4 thin(shape_in); + TensorHf4 thout(shape_out); + TensorHf4 thw(sh_w); + TensorHf4 thb(sh_b); + thin.copy_from(tdin); + thw.copy_from(weight); + thb.copy_from(bias); + fc_compute(thin, thw, thb, thout); + //print_tensor_host(thout); + + TensorHf4 thout_d(shape_out); + thout_d.copy_from(tdout); + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result"; + +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + Env::env_init(); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp new file mode 100644 index 000000000..04b963675 --- /dev/null +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -0,0 +1,311 @@ +#include "core/context.h" +#include "funcs/pooling.h" +#include "test_saber_func_BM.h" +#include "tensor_op.h" +#include "saber_types.h" +#include "funcs/timer.h" +#include + +using namespace anakin::saber; + +TEST(TestSaberFuncBM, test_func_pooling) { + + Env::env_init(); + typedef TargetWrapper API; + typename API::event_t event; + API::create_event(event); + + typedef TargetWrapper X86_API; + typedef TargetWrapper BM_API; + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + int img_num = 1; + int in_channels = 4; + int img_h = 800; + int img_w = 1440; + + Shape img_s(img_num, in_channels, img_h, img_w); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + for (int i = 0; i < img_host.size(); ++i) { + 
img_host.mutable_data()[i] = 0x7f & i; + } + + img_dev.copy_from(img_host); + + TensorHf4 output_host; + TensorDf4 output_dev; + + // start Reshape & doInfer + + Context ctx1(0, 1, 1); + int window_h = 2; + int window_w = 2; + int pad_h = 1; + int pad_w = 1; + int stride_h = 1; + int stride_w = 1; + LOG(INFO) << " img_num: " << img_num; + LOG(INFO) << " in_channels: " << in_channels; + LOG(INFO) << " img_h: " << img_h; + LOG(INFO) << " img_w: " << img_w; + LOG(INFO) << " window_h: " << window_h; + LOG(INFO) << " window_w: " << window_w; + LOG(INFO) << " pad_h: " << pad_h; + LOG(INFO) << " pad_w: " << pad_w; + LOG(INFO) << " stride_h: " << stride_h; + LOG(INFO) << " stride_w: " << stride_w; + + PoolingParam param(window_h, window_w, pad_h, pad_w + , stride_h, stride_w, Pooling_max); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Pooling pooling; + pooling.compute_output_shape(input, output, param); + + output_dev.re_alloc(output[0]->shape()); + output_host.re_alloc(output[0]->shape()); + + // init assume output tensor has been reshpaed by user. 
+ pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); + pooling(input, output, param, ctx1); + + SaberTimer t1; + int ts = 1000; + + for (int i = 0; i < ts; ++i) { + t1.start(ctx1); + pooling(input, output, param, ctx1); + output[0]->sync(); + t1.end(ctx1); + } + + output_dev.sync(); + cudaDeviceSynchronize(); + LOG(INFO) << " average time: " << t1.get_average_ms() << " ms"; + LOG(INFO) << " tile 10% time: " << t1.get_tile_time(10) << " ms"; + LOG(INFO) << " tile 50% time: " << t1.get_tile_time(50) << " ms"; + LOG(INFO) << " tile 90% time: " << t1.get_tile_time(90) << " ms"; + LOG(INFO) << " tile 95% time: " << t1.get_tile_time(95) << " ms"; + LOG(INFO) << " tile 99% time: " << t1.get_tile_time(99) << " ms"; + + CUDA_CHECK(cudaPeekAtLastError()); +} + +TEST(TestSaberFuncBM, test_pooling_result) { + + Env::env_init(); + typedef TargetWrapper API; + typename API::event_t event; + API::create_event(event); + + typedef TargetWrapper X86_API; + typedef TargetWrapper BM_API; + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + int img_num = 1; + int in_channels = 2; + int img_h = 8; + int img_w = 8; + + Shape img_s(img_num, in_channels, img_h, img_w); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + for (int i = 0; i < img_host.size(); ++i) { + img_host.mutable_data()[i] = 0x7f & i; + } + + img_dev.copy_from(img_host); + + TensorDf4 output_dev; + + // start Reshape & doInfer + + Context ctx1(0, 1, 1); + int window_h = 2; + int window_w = 2; + int pad_h = 1; + int pad_w = 1; + int stride_h = 1; + int stride_w = 1; + + LOG(INFO) << " img_num: " << img_num; + LOG(INFO) << " in_channels: " << in_channels; + LOG(INFO) << " img_h: " << img_h; + LOG(INFO) << " img_w: " << img_w; + LOG(INFO) << " window_h: " << window_h; + LOG(INFO) << " window_w: " << window_w; + LOG(INFO) << " pad_h: " << pad_h; + LOG(INFO) << " pad_w: " << pad_w; + LOG(INFO) << " stride_h: " << stride_h; + LOG(INFO) << " stride_w: " 
<< stride_w; + + PoolingParam param(window_h, window_w, pad_h, pad_w + , stride_h, stride_w, Pooling_max); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Pooling pooling; + pooling.compute_output_shape(input, output, param); + + output_dev.re_alloc(output[0]->shape()); + + // init assume output tensor has been reshpaed by user. + pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); + pooling(input, output, param, ctx1); + cudaStream_t cuda_stream = ctx1.get_compute_stream(); + output[0]->record_event(cuda_stream); + + output_dev.sync(); + print_tensor_device(output_dev); + + cudaDeviceSynchronize(); + CUDA_CHECK(cudaPeekAtLastError()); +} + +TEST(TestSaberFuncBM, test_pooling_shared_buffer) { + + Env::env_init(); + typedef TargetWrapper API; + typename API::event_t event; + API::create_event(event); + + typedef TargetWrapper X86_API; + typedef TargetWrapper BM_API; + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + int img_num = 1; + int in_channels = 2; + int img_h = 8; + int img_w = 8; + + Shape img_s(img_num, in_channels, img_h, img_w); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + for (int i = 0; i < img_host.size(); ++i) { + img_host.mutable_data()[i] = 0x7f & i; + } + + img_dev.copy_from(img_host); + + TensorDf4 t0; + TensorDf4 t1; + Shape img_s_sub(img_num, in_channels, img_h / 2, img_w / 2); + + t0.share_sub_buffer(img_dev, img_s_sub, {0, 0, 0, 0}); + t1.share_sub_buffer(img_dev, img_s_sub, {0, 0, 4, 4}); + + TensorDf4 output_dev; + + TensorDf4 out0; + TensorDf4 out1; + + // start Reshape & doInfer + + Context ctx1(0, 1, 1); + int window_h = 2; + int window_w = 2; + int pad_h = 1; + int pad_w = 1; + int stride_h = 1; + int stride_w = 1; + + LOG(INFO) << " img_num: " << img_num; + LOG(INFO) << " in_channels: " << in_channels; + LOG(INFO) << " img_h: " << img_h; + LOG(INFO) << " img_w: " << img_w; + LOG(INFO) << " 
window_h: " << window_h; + LOG(INFO) << " window_w: " << window_w; + LOG(INFO) << " pad_h: " << pad_h; + LOG(INFO) << " pad_w: " << pad_w; + LOG(INFO) << " stride_h: " << stride_h; + LOG(INFO) << " stride_w: " << stride_w; + + PoolingParam param(window_h, window_w, pad_h, pad_w + , stride_h, stride_w, Pooling_max); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Pooling pooling; + Pooling pooling0; + Pooling pooling1; + + pooling.compute_output_shape(input,output, param); + + Shape total_shape = output[0]->shape(); + + output_dev.re_alloc(total_shape); + Shape out_sub_shape = {total_shape[0], total_shape[1], total_shape[2] / 2, total_shape[3] / 2}; + + out0.share_sub_buffer(output_dev, out_sub_shape, {0, 0, 0, 0}); + out1.share_sub_buffer(output_dev, out_sub_shape, {0, 0, out_sub_shape[2], out_sub_shape[3]}); + + std::vector input0, input1; + std::vector output0, output1; + + input0.push_back(&t0); + input1.push_back(&t1); + output0.push_back(&out0); + output1.push_back(&out1); + + // init assume output tensor has been reshpaed by user. 
+ pooling0.init(input0, output0, param, SPECIFY, VENDER_IMPL, ctx1); + pooling0(input0, output0, param, ctx1); + + pooling1.init(input1, output1, param, SPECIFY, VENDER_IMPL, ctx1); + pooling1(input1, output1, param, ctx1); + + cudaStream_t cuda_stream = ctx1.get_compute_stream(); + out0.record_event(cuda_stream); + + cudaStream_t cuda_stream1 = ctx1.get_compute_stream(); + out1.record_event(cuda_stream1); + + out0.sync(); + out1.sync(); + + print_tensor_device(output_dev); + + cudaDeviceSynchronize(); + CUDA_CHECK(cudaPeekAtLastError()); +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp new file mode 100644 index 000000000..18479cd18 --- /dev/null +++ b/test/saber/bm/test_saber_shape_BM.cpp @@ -0,0 +1,126 @@ +#include "test_saber_shape_BM.h" +#include "shape.h" +#include "anakin_config.h" + +#ifdef USE_OPENMP +#include +#include +#endif + +using namespace anakin; +using namespace saber; + + +TEST(TestSaberShapeBM, test_saber_shape) { + + int dim = 4; + Shape sh4d0{0, 0, 0, 0}; + CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error"; + + for (int i = 0; i < dim; ++i) { + CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error"; + } + + CHECK_EQ(sh4d0.count(), 0) << "check shape count error"; + + int N = 1; + int C = 3; + int H = 11; + int W = 11; + std::vector sh_size = {N, C, H, W}; + //Shape sh4d1(sh_size); + Shape sh4d1(N, C, H, W); + LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size(); + CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!"; + //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!"; + + CHECK_EQ(sh4d1[0], N) << "get shape size error"; + CHECK_EQ(sh4d1[1], C) << "get shape size error"; + CHECK_EQ(sh4d1[2], H) << "get shape size error"; + CHECK_EQ(sh4d1[3], W) << "get shape size error"; 
+ + //CHECK_EQ(sh4d2[0], N) << "get shape size error"; + //CHECK_EQ(sh4d2[1], C) << "get shape size error"; + //CHECK_EQ(sh4d2[2], H) << "get shape size error"; + //CHECK_EQ(sh4d2[3], W) << "get shape size error"; + + CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed"; + + C = 10; + sh4d1[1] = C; + CHECK_EQ(sh4d1[1], C) << "set shape size error"; + + bool is_equal = (sh4d0 == sh4d1); + CHECK_EQ(is_equal, false) << "check shape is_equal failed"; + + sh4d0 = sh4d1; + CHECK_EQ(sh4d1[0], N) << "constructor failed"; + CHECK_EQ(sh4d1[1], C) << "get shape size error"; + CHECK_EQ(sh4d1[2], H) << "get shape size error"; + CHECK_EQ(sh4d1[3], W) << "get shape size error"; + + Shape sh4d3 = sh4d1; + CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error"; + + Shape sh4d4(sh4d1); + CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error"; + + Shape sh1d0{0}; + //std::vector sh1d_size = {W}; + + //Shape sh1d1(sh1d_size); + //Shape sh1d0{W}; + Shape sh1d1(W); + + Shape sh1d3 = sh1d1; + Shape sh1d4(sh1d1); + + CHECK_EQ(sh1d0.dims(), 1) << "shape dim error"; + + CHECK_EQ(sh1d0.count(), 0) << "shape size error"; + + CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error"; + + CHECK_EQ(sh1d1[0], W) << "get shape size error"; + + //CHECK_EQ(sh1d2.count(0), W) << "shape dim error"; + + CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error"; + + CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error"; + + CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error"; + + Shape sh0{2, 2, 3, 4}; + Shape sh1{2, 1, 1, 24}; + Shape sh2{2, 2, 3, 4}; + Shape sh3{1, 1, 2, 3}; + + CHECK_EQ(sh0 == sh2, true) << "error =="; + CHECK_EQ(sh3 < sh0, true) << "error <"; + CHECK_EQ(sh3 >= sh0, false) << "error >="; + CHECK_EQ(sh3 > sh0, false) << "error >"; + CHECK_EQ(sh0 > sh3, true) << "error >"; + CHECK_EQ(sh0 < sh1, false) << "error <"; + CHECK_EQ(sh0 <= sh2, true) << "error <="; + CHECK_EQ(sh0 >= sh2, true) << "error >="; + + Shape sh001 = Shape::zero(2); + Shape sh002 = 
Shape::zero(3); + + if (sh001 > sh002) { + LOG(ERROR) << "error <"; + } + +} + + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + + diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h new file mode 100644 index 000000000..a2ca02c9b --- /dev/null +++ b/test/saber/bm/test_saber_shape_BM.h @@ -0,0 +1,25 @@ +#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H +#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "saber/core/shape.h" + +using namespace anakin::test; + +class TestSaberShapeBM : public Test { +public: + TestSaberShapeBM() {} + ~TestSaberShapeBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +protected: + std::string name; + std::string _test; +}; + +#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H + diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp new file mode 100644 index 000000000..d9c65c7b4 --- /dev/null +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -0,0 +1,642 @@ +#include "test_saber_tensor_BM.h" +#include "tensor_op.h" +#include +using namespace anakin::saber; + +typedef TargetWrapper X86_API; +typedef TargetWrapper BM_API; +typedef Tensor TensorHf4; +typedef Tensor TensorDf4; +typedef TensorHf4::Dtype dtype; + +TEST(TestSaberTensorBM, test_tensor_constructor) { + + //! test empty constructor + LOG(INFO) << "test default (empty) constructor"; + TensorHf4 thost0; + TensorDf4 tdev0; + + //! 
test tensor re_alloc function empty constructor + Shape sh0(2, 2, 8, 8); + LOG(INFO) << "|--test tensor re_alloc function on empty tensor"; + thost0.re_alloc(sh0); + tdev0.re_alloc(sh0); + LOG(INFO) << "|--tensor size of host: " << thost0.size(); + LOG(INFO) << "|--tensor size of device: " << tdev0.size(); + CHECK_EQ(thost0.size(), 256) << "error with tensor size"; + CHECK_EQ(tdev0.size(), 256) << "error with tensor size"; +/* + //! test tensor re_alloc function on tensor with data + LOG(INFO) << "|--test tensor re_alloc function on tensor with data"; + Shape sh1(1, 2, 4, 4); + thost0.re_alloc(sh1); + tdev0.re_alloc(sh1); + LOG(INFO) << "|--tensor size of host: " << thost0.size(); + LOG(INFO) << "|--tensor size of device: " << tdev0.size(); + CHECK_EQ(thost0.size(), 32) << "error with tensor size"; + CHECK_EQ(tdev0.size(), 32) << "error with tensor size"; + + //! test tensor shape() function + LOG(INFO) << "|--test tensor shape() function"; + Shape sho = thost0.shape(); + LOG(INFO) << "|--shape of tensor: " << sho[0] << ", " << sho[1] << "," << sho[2] << "," << sho[3]; + LOG(INFO) << "|--test get tensor n, c, h, w function, num = " \ + << thost0.num() << ", channel = " << thost0.channel() << ", height = " \ + << thost0.height() << ", width = " << thost0.width(); + + //! test tensor mutable_data() function + LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 1.f"; + fill_tensor_host_const(thost0, 1.f); + LOG(INFO) << "|--test tensor data() function, show the const data, 1.f"; + print_tensor_host(thost0); + + //! test tensor constructor with shape + LOG(INFO) << "test tensor constructor with shape"; + TensorHf4 thost1(sh1); + TensorDf4 tdev1(sh1); + + //! 
test tensor copy_from() function + LOG(INFO) << "test copy_from() function, input tensor could be any target"; + thost1.copy_from(thost0); + tdev1.copy_from(thost0); + print_tensor_device(tdev1); + cudaDeviceSynchronize(); + thost1.copy_from(tdev1); + tdev1.copy_from(tdev0); + print_tensor_host(thost1); + + //! test tensor constructor with shape and real_shape + LOG(INFO) << "test tensor constructor with shape and real_shape"; + //! constructor with 3 shapes is removed + TensorHf4 thost2(sh0); + TensorDf4 tdev2(sh0); + + //! test tensor constructor with data, if target is different, create buffer, and copy the data + LOG(INFO) << + "test tensor constructor with data, if target is different, create buffer, and copy the data"; + dtype* host_data_ptr; + dtype* dev_data_ptr; + void* tmp_pt_host; + void* tmp_pt_dev; + X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count()); + host_data_ptr = static_cast(tmp_pt_host); + + for (int i = 0; i < sh1.count(); ++i) { + host_data_ptr[i] = i; + } + + NV_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count()); + dev_data_ptr = static_cast(tmp_pt_dev); + cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); + LOG(INFO) << "|--construct host tensor from host data ptr"; + TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); + LOG(INFO) << "|--constructor device tensor from host data ptr"; + TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); + print_tensor_host(thost3); + print_tensor_device(tdev3); + cudaDeviceSynchronize(); + + LOG(INFO) << "|--construct host tensor from device data ptr"; + TensorHf4 thost4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1); + LOG(INFO) << "|--constructor device tensor from device data ptr"; + TensorDf4 tdev4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1); + print_tensor_host(thost4); + print_tensor_device(tdev4); + NV_API::stream_t dev_stream0; + NV_API::create_stream_with_flag(dev_stream0, 1); + 
cudaDeviceSynchronize(); + + //! test tensor copy constructor + LOG(INFO) << "test tensor copy constructor"; + LOG(INFO) << "|--normal copy constructor"; + TensorHf4 thost5(thost4); + TensorDf4 tdev5(tdev4); + + LOG(INFO) << "|--push back to vector"; + std::vector vthost; + std::vector vtdev; + vthost.push_back(thost0); + vthost.push_back(thost1); + vthost.push_back(thost2); + vthost.push_back(thost3); + vthost.push_back(thost4); + vthost.push_back(thost5); + vtdev.push_back(tdev0); + vtdev.push_back(tdev1); + vtdev.push_back(tdev2); + vtdev.push_back(tdev3); + vtdev.push_back(tdev4); + vtdev.push_back(tdev5); + print_tensor_host(vthost[5]); + print_tensor_device(vtdev[5]); + cudaDeviceSynchronize(); + + //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied + LOG(INFO) << "test share_from function"; + TensorHf4 thost6, thost7; + TensorDf4 tdev6, tdev7; + thost6.set_shape(thost4.shape()); + thost7.set_shape(thost4.shape()); + tdev6.set_shape(thost4.shape()); + tdev7.set_shape(thost4.shape()); + Shape sh2(1, 2, 2, 2); + Shape offset(0, 0, 1, 1); + LOG(INFO) << "|--shared host"; + thost6.share_sub_buffer(thost4, sh2, offset); + LOG(INFO) << "|--copied host"; + tdev6.share_from(thost4); + LOG(INFO) << "|--copied device"; + thost7.share_from(tdev4); + LOG(INFO) << "|--shared device"; + tdev7.share_from(tdev4); + + LOG(INFO) << "|--change data in shared tensor"; + + //Shape sh_real = thost6.shape(); + //Shape sh_act = thost6.valid_shape(); + //Shape offset_act = thost6.offset(); + + //int start_w = offset_act[3]; + //int start_h = offset_act[2]; + //int start_c = offset_act[1]; + //int start_n = offset_act[0]; + //int stride_h = sh_real.count(3); + //int stride_c = sh_real.count(2); + //int stride_n = sh_real.count(1); + //int stride_n = sh_real.count(0); + Shape stride = thost6.get_stride(); + int w = thost6.width(); + int h = thost6.height(); + int c = thost6.channel(); + int n = thost6.num(); + + dtype* ptr_host = 
thost6.mutable_data(); + + for (int in = 0; in < n; ++in) { + dtype* ptr_batch = ptr_host + in * stride[0]; + + for (int ic = 0; ic < c; ++ic) { + dtype* ptr_channel = ptr_batch + ic * stride[1]; + + for (int ih = 0; ih < h; ++ih) { + dtype* ptr_row = ptr_channel + ih * stride[2]; + + for (int iw = 0; iw < w; ++iw) { + ptr_row[iw] = 1.f; + } + } + } + } + + LOG(INFO) << "|--show root tensor while data is changed by shared tensor"; + print_tensor_host(thost4); + + //! test record tensor event + LOG(INFO) << "test record tensor event"; + NV_API::stream_t dev_stream; + NV_API::stream_t dev_stream1; + NV_API::create_stream_with_flag(dev_stream, 1); + NV_API::create_stream_with_flag(dev_stream1, 1); + X86_API::stream_t host_stream; + X86_API::create_stream_with_flag(host_stream, 1); + LOG(INFO) << "|--test record event on host tensor"; + fill_tensor_host_const(thost4, 888.f); + thost4.record_event(host_stream); + thost4.sync(); + print_tensor_host(thost4); + LOG(INFO) << "|--test record event on device tensor"; + fill_tensor_device_const(tdev4, 666.f, dev_stream); + tdev4.record_event(dev_stream); + tdev4.sync(); + print_tensor_device(tdev4, dev_stream1); + tdev4.record_event(dev_stream1); + tdev4.sync(); +} + +TEST(TestSaberTensorNV, test_tensor_deepcopy) { + //! tensor constructor with alloc data, if target is different, create buffer, and copy the data + LOG(INFO) << "test tensor deep copy"; + Shape sh0(2, 2, 4, 4); + Shape va_sh0(2, 2, 2, 2); + Shape off_sh0(0, 0, 1, 1); + + Shape sh1(2, 2, 4, 4); + Shape va_sh1(va_sh0); + Shape off_sh1(0, 0, 1, 0); + + Shape sh2(2, 32); + Shape va_sh2(2, 8); + Shape off_sh2(0, 8); + + X86_API::stream_t x86_stream; + NV_API::stream_t nv_stream; + X86_API::create_stream(x86_stream); + NV_API::create_stream(nv_stream); + + //! 
create source tensor, th0, td0, th01, td01, th1, td1; + TensorHf4 th0(sh0); + + for (int i = 0; i < sh0.count(); ++i) { + th0.mutable_data()[i] = i; + } + + TensorHf4 th1(va_sh0); + + for (int i = 0; i < va_sh0.count(); ++i) { + th1.mutable_data()[i] = i; + } + + TensorHf4 th01; + th01.share_sub_buffer(th0, va_sh0, off_sh0); + + TensorDf4 td0, td1, td01; + td0.set_shape(th0.shape()); + td1.set_shape(th1.shape()); + td0.share_from(th0); + td1.share_from(th1); + TensorDf4 dev_tmp0; + dev_tmp0.set_shape(th0.shape()); + dev_tmp0.share_from(th0); + td01.share_sub_buffer(dev_tmp0, va_sh0, off_sh0); + + print_tensor_host(th0); + print_tensor_host(th1); + print_tensor_device(td0); + print_tensor_device(td1); + + //! create th2, th3, th21, td2, td3, td21 as dst tensor + TensorHf2 th2(sh2); + fill_tensor_host_const(th2, 0.f); + TensorHf2 th21; + th21.share_sub_buffer(th2, va_sh2, off_sh2); + TensorHf2 th3(va_sh2); + + TensorDf2 td2(sh2); + fill_tensor_device_const(td2, 0.f); + cudaDeviceSynchronize(); + TensorDf2 td21; + td21.share_sub_buffer(td2, va_sh2, off_sh2); + TensorDf2 td3(va_sh2); + + double max_diff; + double max_ratio; + //! 
test tensor deep copy, entire buffer copy + LOG(INFO) << "test tensor deep copy, entire buffer copy, H2H"; + th3.copy_from(th1); + print_tensor_host(th3); + tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff); + CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, H2H"; + fill_tensor_host_const(th3, 0.f); + th3.async_copy_from(th1, x86_stream); + th3.record_event(x86_stream); + th3.sync(); + tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff); + CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, H2H"; + + LOG(INFO) << "test tensor deep copy, entire buffer copy, D2H"; + th3.copy_from(td1); + print_tensor_host(th3); + tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff); + CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H"; + fill_tensor_host_const(th3, 0.f); + th3.async_copy_from(td1, nv_stream); + th3.record_event(x86_stream); + th3.sync(); + tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff); + CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2H"; + + LOG(INFO) << "test tensor deep copy, entire buffer copy, H2D"; + td3.copy_from(th1); + print_tensor_device(td3); + cudaDeviceSynchronize(); + tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff); + CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H"; + fill_tensor_device_const(td3, 0.f); + cudaDeviceSynchronize(); + td3.async_copy_from(th1, nv_stream); + td3.record_event(nv_stream); + td3.sync(); + tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff); + CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2H"; + + LOG(INFO) << "test tensor deep copy, entire buffer copy, D2D"; + td3.copy_from(td1); + print_tensor_device(td3); + cudaDeviceSynchronize(); + CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2D"; + 
fill_tensor_device_const(td3, 0.f); + cudaDeviceSynchronize(); + td3.async_copy_from(td1, nv_stream); + td3.record_event(nv_stream); + td3.sync(); + CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, async, D2D"; + + + //! test tensor deep copy, src with roi + LOG(INFO) << "test tensor deep copy, src with roi, H2H"; + th3.copy_from(th01); + print_tensor_host(th3); + + LOG(INFO) << "test tensor deep copy, src with roi, D2H"; + th3.copy_from(td01); + print_tensor_host(th3); + + LOG(INFO) << "test tensor deep copy, src with roi, H2D"; + td3.copy_from(th01); + print_tensor_device(td3); + cudaDeviceSynchronize(); + + LOG(INFO) << "test tensor deep copy, src with roi, D2D"; + td3.copy_from(td01); + print_tensor_device(td3); + cudaDeviceSynchronize(); + + + //! test tensor deep copy, dst with roi + LOG(INFO) << "test tensor deep copy, dst with roi, H2H"; + print_tensor_host(th21); + print_tensor_host(th1); + th21.copy_from(th1); + print_tensor_host(th21); + + LOG(INFO) << "test tensor deep copy, dst with roi, D2H"; + th21.copy_from(td1); + print_tensor_host(th21); + + LOG(INFO) << "test tensor deep copy, dst with roi, H2D"; + td21.copy_from(th1); + print_tensor_device(td21); + cudaDeviceSynchronize(); + + LOG(INFO) << "test tensor deep copy, dst with roi, D2D"; + td21.copy_from(td1); + print_tensor_device(td21); + cudaDeviceSynchronize(); + + + //! 
test tensor deep copy, src and dst are with roi + LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2H"; + th21.copy_from(th01); + print_tensor_host(th21); + + LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2H"; + th21.copy_from(td01); + print_tensor_host(th21); + + LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2D"; + td21.copy_from(th01); + print_tensor_device(td21); + cudaDeviceSynchronize(); + + LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2D"; + td21.copy_from(td01); + print_tensor_device(td21); + cudaDeviceSynchronize(); +} + +TEST(TestSaberTensorNV, test_tensor_shape) { + typedef Tensor Tensor4_0; + typedef Tensor Tensor4_1; + typedef Tensor Tensor2; + + int nin = 2; + int cin = 2; + int hin = 4; + int win = 4; + + LOG(INFO) << "test tensor interface"; + + Tensor4_0 t1(Shape(nin, cin, hin, win)); + Tensor4_1 t2(Shape(nin, hin, win, cin)); + Tensor2 t3(Shape(hin, win)); + + LOG(INFO) << "test tensor with layout of NCHW"; + LOG(INFO) << "num: " << t1.num() << ", num idx: " << t1.num_index() << \ + ", channel: " << t1.channel() << ", channel idx: " << t1.channel_index() << \ + ", height: " << t1.height() << ", height idx: " << t1.height_index() << \ + ", widhth: " << t1.width() << ", width idx: " << t1.width_index(); + + CHECK_EQ(t1.num(), nin) << "NCHW get num error"; + CHECK_EQ(t1.channel(), cin) << "NCHW get channel error"; + CHECK_EQ(t1.height(), hin) << "NCHW get height error"; + CHECK_EQ(t1.width(), win) << "NCHW get width error"; + + CHECK_EQ(t1.num_index(), 0) << "NCHW get num index error"; + CHECK_EQ(t1.channel_index(), 1) << "NCHW get channel index error"; + CHECK_EQ(t1.height_index(), 2) << "NCHW get height index error"; + CHECK_EQ(t1.width_index(), 3) << "NCHW get width index error"; + + LOG(INFO) << "test tensor with layout of NHWC"; + LOG(INFO) << "num: " << t2.num() << ", num idx: " << t2.num_index() << \ + ", channel: " << t2.channel() << ", channel idx: " << t2.channel_index() 
<< \ + ", height: " << t2.height() << ", height idx: " << t2.height_index() << \ + ", widhth: " << t2.width() << ", width idx: " << t2.width_index(); + + CHECK_EQ(t2.num(), nin) << "NHWC get num error"; + CHECK_EQ(t2.channel(), cin) << "NHWC get channel error"; + CHECK_EQ(t2.height(), hin) << "NHWC get height error"; + CHECK_EQ(t2.width(), win) << "NHWC get width error"; + + CHECK_EQ(t2.num_index(), 0) << "NHWC get num index error"; + CHECK_EQ(t2.channel_index(), 3) << "NHWC get channel index error"; + CHECK_EQ(t2.height_index(), 1) << "NHWC get height index error"; + CHECK_EQ(t2.width_index(), 2) << "NHWC get width index error"; + + LOG(INFO) << "test tensor with layout of HW"; + LOG(INFO) << "num: " << t3.num() << ", num idx: " << t3.num_index() << \ + ", channel: " << t3.channel() << ", channel idx: " << t3.channel_index() << \ + ", height: " << t3.height() << ", height idx: " << t3.height_index() << \ + ", widhth: " << t3.width() << ", width idx: " << t3.width_index(); + + CHECK_EQ(t3.num(), 1) << "HW get num error"; + CHECK_EQ(t3.channel(), 1) << "HW get channel error"; + CHECK_EQ(t3.height(), hin) << "HW get height error"; + CHECK_EQ(t3.width(), win) << "HW get width error"; + + CHECK_EQ(t3.num_index(), -1) << "HW get num index error"; + CHECK_EQ(t3.channel_index(), -1) << "HW get channel index error"; + CHECK_EQ(t3.height_index(), 0) << "HW get height index error"; + CHECK_EQ(t3.width_index(), 1) << "HW get width index error"; + +} + +TEST(TestSaberTensorNV, test_tensor_reshape_realloc) { + + LOG(INFO) << "test tensor reshape and re_alloc funcs"; + + Shape sh0(2, 2, 2, 2); + Shape sh1(2, 2, 4, 4); + TensorHf4 th0(sh1); + TensorDf4 td0(sh1); + fill_tensor_host_const(th0, 1); + fill_tensor_device_const(td0, 1); + LOG(INFO) << "ori tensor with size: " << th0.valid_size(); + print_tensor_host(th0); + print_tensor_device(td0); + cudaDeviceSynchronize(); + + th0.reshape(sh0); + td0.reshape(sh0); + LOG(INFO) << "tensor after reshape(from big space to small) with 
size: " << th0.valid_size(); + print_tensor_host(th0); + print_tensor_device(td0); + cudaDeviceSynchronize(); + fill_tensor_host_const(th0, 1); + fill_tensor_device_const(td0, 1); + cudaDeviceSynchronize(); + + th0.reshape(sh1); + td0.reshape(sh1); + LOG(INFO) << "tensor after reshape(from small to big, not larger than ori) with size: " << + th0.valid_size(); + print_tensor_host(th0); + print_tensor_device(td0); + cudaDeviceSynchronize(); + + th0.re_alloc(sh0); + td0.re_alloc(sh0); + LOG(INFO) << "tensor after re_alloc(from big space to small) with size: " << th0.valid_size(); + print_tensor_host(th0); + print_tensor_device(td0); + cudaDeviceSynchronize(); + + TensorHf4 th1(sh0); + TensorDf4 td1(sh0); + LOG(INFO) << "ori tensor with size: " << th1.valid_size(); + fill_tensor_host_const(th1, 1); + fill_tensor_device_const(td1, 1); + cudaDeviceSynchronize(); + print_tensor_host(th1); + print_tensor_device(td1); + cudaDeviceSynchronize(); + + th1.reshape(sh1); + td1.reshape(sh1); + LOG(INFO) << "tensor after reshape(from small space to big) with size: " << th1.valid_size(); + //printf("real_shape: %d,%d, %d, %d, valid_shape: %d, %d, %d, %d\n", \ + th1.shape()[0], th1.shape()[1], th1.shape()[2], th1.shape()[3], \ + th1.valid_shape()[0], th1.valid_shape()[1], th1.valid_shape()[2], th1.valid_shape()[3]); + print_tensor_host(th1); + print_tensor_device(td1); + cudaDeviceSynchronize(); + fill_tensor_host_const(th1, 1); + fill_tensor_device_const(td1, 1); + cudaDeviceSynchronize(); + + th1.reshape(sh0); + td1.reshape(sh0); + + LOG(INFO) << "tensor after re_alloc(from small space to big) with size: " << th1.valid_size(); + th1.re_alloc(sh1); + td1.re_alloc(sh1); + print_tensor_host(th1); + print_tensor_device(td1); + cudaDeviceSynchronize(); + +} + +TEST(TestSaberTensorNV, test_tensor_op) { + Shape sh{1, 2, 2, 10}; + TensorDf4 td1(sh); + TensorHf4 th1(sh); + Tensor td2(sh); + Tensor th2(sh); + LOG(INFO) << "testing host fill tensor with const 1."; + 
fill_tensor_host_const(th1, 1.f); + LOG(INFO) << "data type: float"; + print_tensor_host(th1); + fill_tensor_host_const(th2, 1); + LOG(INFO) << "data type: int8"; + print_tensor_host(th2); + + LOG(INFO) << "testing device fill tensor with const 1."; + fill_tensor_device_const(td1, 1.f); + LOG(INFO) << "data type: float"; + print_tensor_device(td1); + fill_tensor_device_const(td2, 1); + LOG(INFO) << "data type: int8"; + print_tensor_device(td2); + + LOG(INFO) << "testing host fill tensor with rand"; + fill_tensor_host_rand(th1); + LOG(INFO) << "data type: float"; + print_tensor_host(th1); + fill_tensor_host_rand(th2); + LOG(INFO) << "data type: int8"; + print_tensor_host(th2); + + LOG(INFO) << "testing device fill tensor with rand"; + fill_tensor_device_rand(td1); + LOG(INFO) << "data type: float"; + print_tensor_device(td1); + fill_tensor_device_rand(td2); + LOG(INFO) << "data type: int8"; + print_tensor_device(td2); + + LOG(INFO) << "testing host fill tensor with rand from 1 to 10"; + fill_tensor_host_rand(th1, 1, 10); + LOG(INFO) << "data type: float"; + print_tensor_host(th1); + fill_tensor_host_rand(th2, 1, 10); + LOG(INFO) << "data type: int8"; + print_tensor_host(th2); + + LOG(INFO) << "testing device fill tensor with rand from 1 to 10"; + fill_tensor_device_rand(td1, 1, 10); + LOG(INFO) << "data type: float"; + print_tensor_device(td1); + fill_tensor_device_rand(td2, 1, 10); + LOG(INFO) << "data type: int8"; + print_tensor_device(td2); +} + +TEST(TestSaberTensorNV, test_tensor_share_diff_dtype) { + Shape sh{1, 1, 2, 10}; + Tensor td1(sh); + Tensor th1(sh); + Tensor td2; + Tensor th2; + td2.set_shape(sh); + th2.set_shape(sh); + LOG(INFO) << "testing host fill tensor with const 1."; + fill_tensor_host_const(th1, -1); + LOG(INFO) << "data type: float"; + print_tensor_host(th1); + fill_tensor_device_const(td1, -1); + LOG(INFO) << "data type: int8"; + print_tensor_device(td1); + cudaDeviceSynchronize(); + + td2.share_from(td1); + th2.share_from(th1); + + 
print_tensor_host(th2); + print_tensor_device(td2); + cudaDeviceSynchronize(); +} + +TEST(TestSaberTensorNV, test_tensor_base_type) { + Shape sh(1, 3, 10, 10); + Tensor td1(sh); + Tensor th1(sh); + fill_tensor_host_rand(th1, 0.f, 255.f); + td1.copy_from(th1); + TensorBase* tb1; + TensorBase* tb2; + tb1 = &th1; + Shape sh1(1, 1, 10, 10); + tb1->set_shape(sh1); + Shape sh11 = th1.valid_shape(); + LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \ + ", h=" << sh11[2] << ", w=" << sh11[3]; +*/ +} + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_tensor_BM.h b/test/saber/bm/test_saber_tensor_BM.h new file mode 100644 index 000000000..32a402258 --- /dev/null +++ b/test/saber/bm/test_saber_tensor_BM.h @@ -0,0 +1,21 @@ +#ifndef ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H +#define ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "core/tensor.h" + +using namespace anakin::test; + +class TestSaberTensorBM : public Test { +public: + TestSaberTensorBM() {} + ~TestSaberTensorBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +}; + +#endif //ANAKIN_TEST_SABER_TEST_SABER_TENSOR_BM_H From 7f726a39e850635d056a43d9a1a751046a57d745 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Mon, 18 Jun 2018 14:14:23 +0800 Subject: [PATCH 002/318] Fix cmake issues --- CMakeLists.txt | 12 ++++++------ saber/CMakeLists.txt | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0a81d7c02..ccb37468f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,12 +65,12 @@ anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plan anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM) # compile options for 
BM place -anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU) -anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM) -anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM) -anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM) -anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM) -anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM) +#anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU) +#anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM) +#anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM) +#anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM) +#anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM) +#anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM) if(USE_CUDA) diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt index 440d1de07..90c7f5c19 100644 --- a/saber/CMakeLists.txt +++ b/saber/CMakeLists.txt @@ -84,10 +84,10 @@ if(USE_BM) set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) set(CMAKE_CXX_FLAGS "") if(BUILD_SHARED) - CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) + #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) endif() if(BUILD_STATIC) - CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) + #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) endif() set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP}) From 9221195043c6034f3dc8190601e6858c1d5d45ca Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Tue, 19 Jun 2018 11:04:03 +0800 Subject: [PATCH 003/318] Resolve BM library compilation issue --- saber/CMakeLists.txt | 10 +++++----- saber/funcs/impl/bm/base/CMakeLists.txt | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt index 90c7f5c19..ac0ebba2a 100644 --- 
a/saber/CMakeLists.txt +++ b/saber/CMakeLists.txt @@ -59,7 +59,7 @@ if(USE_CUDA) set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) set(CMAKE_CXX_FLAGS "") if(BUILD_SHARED) - CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) + CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) endif() if(BUILD_STATIC) CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS STATIC ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) @@ -83,12 +83,12 @@ if(USE_BM) set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) set(CMAKE_CXX_FLAGS "") - if(BUILD_SHARED) + #if(BUILD_SHARED) #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) - endif() - if(BUILD_STATIC) + #endif() + #if(BUILD_STATIC) #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) - endif() + #endif() set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP}) set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} diff --git a/saber/funcs/impl/bm/base/CMakeLists.txt b/saber/funcs/impl/bm/base/CMakeLists.txt index fd4b3d680..59b82abb5 100644 --- a/saber/funcs/impl/bm/base/CMakeLists.txt +++ b/saber/funcs/impl/bm/base/CMakeLists.txt @@ -7,7 +7,7 @@ if(USE_BM) anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/include "h" ANAKIN_SABER_BM_C_SRC) - anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "o" ANAKIN_SABER_BM_STATIC_LIB) + anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "so" ANAKIN_SABER_BM_STATIC_LIB) endif() macro(anakin_set_upscope src) From 0244de9ea7f10b8525eb4bbf4d9be98fb2347721 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Tue, 19 Jun 2018 11:30:42 +0800 Subject: [PATCH 004/318] Remove unnecessary files --- .../impl/bm/base/include/bmruntime/bmcnnctx.h | 58 -------------- .../impl/bm/base/include/bmruntime/bmnet.h | 78 ------------------- 2 files changed, 136 deletions(-) delete mode 100644 
saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h delete mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmnet.h diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h b/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h deleted file mode 100644 index 6b0bfe857..000000000 --- a/saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef __BM_CNN_CONTEXT_H__ -#define __BM_CNN_CONTEXT_H__ - -#include -#include "bmruntime.h" - -namespace bmcnn { - -typedef void *bmcnn_ctx_t; -/** - * \brief Create context of BMCNN. - * - * \param ctx_dir - Directory of context files generated by BMNETC - * - * \note - * The context will be created in the device of ID 0.\n - * - * \return - * NULL - Creating failed.\n - * non-NULL - The handle of the context (creating succeeded).\n - */ -bmcnn_ctx_t bmcnn_ctx_create(const std::string &ctx_dir); -/** - * \brief Destroy context of BMCNN - * - * \param handle - Handle of the context to be destroyed - */ -void bmcnn_ctx_destroy(bmcnn_ctx_t handle); -/** - * \brief Create context of BMCNN in specific devide. - * - * \param ctx_dir - Directory of context files generated by BMNETC - * \param devid - ID of device where the context will be placed. - * - * \note - * Call \ref bm_dev_getcount to get total number of devices, e.g. N is returned, - * valid devid should be in range of 0 ~ (N-1).\n - * - * \return - * NULL - Creating failed that might be caused by incorrect parameter.\n - * non-NULL - The handle of the context (creating succeeded).\n - */ -bmcnn_ctx_t bmcnn_ctx_create_by_devid(const std::string &ctx_dir, int devid); -/** - * \brief Append context of BMCNN. - * - * \param ctx_dir - Directory of context files generated by BMNETC or BMNETD. - * \param bmrt - The created handle of context. 
- * - * \return - * false - Appending failed.\n - * true - Appending succeeded.\n - */ -bool bmcnn_ctx_append(const std::string &ctx_dir, bmruntime *bmrt); - -} /* namespace bmcnn */ - -#endif /* __BM_CNN_CONTEXT_H__ */ diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h b/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h deleted file mode 100644 index 88005e1b8..000000000 --- a/saber/funcs/impl/bm/base/include/bmruntime/bmnet.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef __BM_NET_H__ -#define __BM_NET_H__ - -#include "bmblob.h" -#include "bmcnnctx.h" -#include -#include -#include - -#ifdef CROSS_COMPILE - #include -#else - #include -#endif - - -#ifdef CROSS_COMPILE -#define NAMESPACE_USED std -#else -#define NAMESPACE_USED boost -#endif - -namespace bmcnn { - -class BMNet -{ -public: - /** - * \brief Constructor of net. - * - * \param handle - Handler of BMCNN context (created by \ref bmcnn_ctx_create) - * \param name - Name of net - */ - explicit BMNet(bmcnn_ctx_t handle, const std::string &name); - /** - * \brief Deconstructor of blob. - */ - virtual ~BMNet(); - /** - * \brief Reshape all layers from bottom to top. - */ - void Reshape(); - /** - * \brief Run forward. - * - * \param sync - Flag of synchronizing. - */ - void Forward(bool sync = false); - /** - * \brief Get blob by name. - * - * \param name - Name of blob - * \note - * (1) The name could only be of blob in input or output.\n - * (2) If the name is not spotted, null pointer will be returned.\n - */ - const NAMESPACE_USED::shared_ptr blob_by_name(const std::string &name) const; - /** - * \brief Get maximum shape allowed. 
- */ - inline const Shape &max_shape() const - { return max_shape_; } -private: - BMNet(const BMNet &other); - BMNet &operator=(const BMNet &other); - - bmcnn_ctx_t bmcc_ctx_; - std::vector > blobs_; - std::vector net_input_blobs_; - std::vector net_output_blobs_; - std::string name_; - std::map blob_name_index_; - Shape max_shape_; -}; - -} /* namespace bmcnn */ - -#endif /* __BM_NET_H__ */ From 6f63d664878acd2f97e3d08d0cb30f46a4ecb619 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Tue, 19 Jun 2018 17:43:20 +0800 Subject: [PATCH 005/318] Put empty implementation for BM sync_mem for now --- saber/CMakeLists.txt | 2 +- saber/core/target_wrapper.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt index ac0ebba2a..82d9bcdab 100644 --- a/saber/CMakeLists.txt +++ b/saber/CMakeLists.txt @@ -100,7 +100,7 @@ endif() # add saber library to static if(UNIX OR APPLE) - ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BM_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) + ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) #$) if(USE_X86_PLACE) message(STATUS ${ANAKIN_SABER_DEPENDENCIES}) diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 6d5d6a8d1..7c6e2d2fb 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -398,16 +398,16 @@ struct TargetWrapper { // brief create event, empty function for bitmain target static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoD); + size_t count, __DtoD) {}; static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __HtoD); + size_t count, __HtoD) {}; static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoH); + size_t count, __DtoH) {}; static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ 
- int src_dev, size_t count); + int src_dev, size_t count) {}; /** * \brief device target return currently used device id From ea5a5be4c3dc55a598636f17eea2bb4502fe91f3 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Tue, 19 Jun 2018 18:03:13 +0800 Subject: [PATCH 006/318] Fix wrong input param --- test/saber/bm/test_TargetWrapper_BM.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp index c6ee0811b..c54b392d1 100644 --- a/test/saber/bm/test_TargetWrapper_BM.cpp +++ b/test/saber/bm/test_TargetWrapper_BM.cpp @@ -7,8 +7,8 @@ using namespace anakin::saber; int main() { typedef TargetWrapper API; void *pmem; - int dev_count; - API::get_device_count(&dev_count); + int dev_count = 0; + API::get_device_count(dev_count); API::mem_alloc(&pmem, 3*200*200); API::mem_free(pmem); } From cf0afb04b0302958c8d4e204b2e7f3c0a1966666 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 11:46:35 +0800 Subject: [PATCH 007/318] Fix param type issue --- saber/core/impl/bm/bm_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 3ff30773a..143fbec9a 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -55,7 +55,7 @@ int BM_API::get_device_id(){ void BM_API::mem_alloc(void** ptr, size_t n){ //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n) - bm_device_mem_t mem = bm_mem_from_system(ptr); + bm_device_mem_t mem = bm_mem_from_system(*ptr); BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n)); } From 5fed8fb96cbbb50d2c99c1cdc373b627d8021f46 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 14:17:06 +0800 Subject: [PATCH 008/318] Initialize BM handler --- saber/core/impl/bm/bm_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 
143fbec9a..bee5ddab6 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -37,7 +37,7 @@ namespace saber{ typedef TargetWrapper BM_API; -static bm_handle_t handle; +static bm_handle_t handle = get_bm_handle(); void BM_API::get_device_count(int &count) { BMDNN_CHECK(bm_dev_getcount(&count)); From a4dee6249dc498a79b3e13df03a5008e1abdd7c7 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 15:33:44 +0800 Subject: [PATCH 009/318] Add more unit test for tensor --- test/saber/bm/test_TargetWrapper_BM.cpp | 16 --- test/saber/bm/test_saber_tensor_BM.cpp | 130 ++++++++++-------------- 2 files changed, 55 insertions(+), 91 deletions(-) delete mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp deleted file mode 100644 index c54b392d1..000000000 --- a/test/saber/bm/test_TargetWrapper_BM.cpp +++ /dev/null @@ -1,16 +0,0 @@ -#include "saber_types.h" -#include "target_wrapper.h" -#include - -#ifdef USE_BM -using namespace anakin::saber; -int main() { - typedef TargetWrapper API; - void *pmem; - int dev_count = 0; - API::get_device_count(dev_count); - API::mem_alloc(&pmem, 3*200*200); - API::mem_free(pmem); -} -#endif - diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index d9c65c7b4..0634d0a2d 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -5,8 +5,8 @@ using namespace anakin::saber; typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; -typedef Tensor TensorHf4; -typedef Tensor TensorDf4; +typedef Tensor TensorHf4; +typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; TEST(TestSaberTensorBM, test_tensor_constructor) { @@ -25,7 +25,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "|--tensor size of device: " << tdev0.size(); CHECK_EQ(thost0.size(), 256) << "error with tensor size"; CHECK_EQ(tdev0.size(), 256) << 
"error with tensor size"; -/* + //! test tensor re_alloc function on tensor with data LOG(INFO) << "|--test tensor re_alloc function on tensor with data"; Shape sh1(1, 2, 4, 4); @@ -60,7 +60,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { thost1.copy_from(thost0); tdev1.copy_from(thost0); print_tensor_device(tdev1); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); thost1.copy_from(tdev1); tdev1.copy_from(tdev0); print_tensor_host(thost1); @@ -85,7 +85,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { host_data_ptr[i] = i; } - NV_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count()); + BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count()); dev_data_ptr = static_cast(tmp_pt_dev); cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); LOG(INFO) << "|--construct host tensor from host data ptr"; @@ -94,17 +94,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); print_tensor_host(thost3); print_tensor_device(tdev3); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); LOG(INFO) << "|--construct host tensor from device data ptr"; - TensorHf4 thost4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1); + TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); LOG(INFO) << "|--constructor device tensor from device data ptr"; - TensorDf4 tdev4(dev_data_ptr, NV(), NV_API::get_device_id(), sh1); + TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); print_tensor_host(thost4); print_tensor_device(tdev4); - NV_API::stream_t dev_stream0; - NV_API::create_stream_with_flag(dev_stream0, 1); - cudaDeviceSynchronize(); + + //BM_API::stream_t dev_stream0; + //BM_API::create_stream_with_flag(dev_stream0, 1); + //cudaDeviceSynchronize(); //! 
test tensor copy constructor LOG(INFO) << "test tensor copy constructor"; @@ -129,7 +130,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { vtdev.push_back(tdev5); print_tensor_host(vthost[5]); print_tensor_device(vtdev[5]); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); //! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied LOG(INFO) << "test share_from function"; @@ -190,30 +191,10 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "|--show root tensor while data is changed by shared tensor"; print_tensor_host(thost4); - - //! test record tensor event - LOG(INFO) << "test record tensor event"; - NV_API::stream_t dev_stream; - NV_API::stream_t dev_stream1; - NV_API::create_stream_with_flag(dev_stream, 1); - NV_API::create_stream_with_flag(dev_stream1, 1); - X86_API::stream_t host_stream; - X86_API::create_stream_with_flag(host_stream, 1); - LOG(INFO) << "|--test record event on host tensor"; - fill_tensor_host_const(thost4, 888.f); - thost4.record_event(host_stream); - thost4.sync(); - print_tensor_host(thost4); - LOG(INFO) << "|--test record event on device tensor"; - fill_tensor_device_const(tdev4, 666.f, dev_stream); - tdev4.record_event(dev_stream); - tdev4.sync(); - print_tensor_device(tdev4, dev_stream1); - tdev4.record_event(dev_stream1); - tdev4.sync(); } -TEST(TestSaberTensorNV, test_tensor_deepcopy) { +/* +TEST(TestSaberTensorBM, test_tensor_deepcopy) { //! tensor constructor with alloc data, if target is different, create buffer, and copy the data LOG(INFO) << "test tensor deep copy"; Shape sh0(2, 2, 4, 4); @@ -229,9 +210,9 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) { Shape off_sh2(0, 8); X86_API::stream_t x86_stream; - NV_API::stream_t nv_stream; + BM_API::stream_t nv_stream; X86_API::create_stream(x86_stream); - NV_API::create_stream(nv_stream); + BM_API::create_stream(nv_stream); //! 
create source tensor, th0, td0, th01, td01, th1, td1; TensorHf4 th0(sh0); @@ -273,7 +254,7 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) { TensorDf2 td2(sh2); fill_tensor_device_const(td2, 0.f); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); TensorDf2 td21; td21.share_sub_buffer(td2, va_sh2, off_sh2); TensorDf2 td3(va_sh2); @@ -308,11 +289,11 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) { LOG(INFO) << "test tensor deep copy, entire buffer copy, H2D"; td3.copy_from(th1); print_tensor_device(td3); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); tensor_cmp_host(th1.data(), th3.data(), th3.size(), max_ratio, max_diff); CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2H"; fill_tensor_device_const(td3, 0.f); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); td3.async_copy_from(th1, nv_stream); td3.record_event(nv_stream); td3.sync(); @@ -322,10 +303,10 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) { LOG(INFO) << "test tensor deep copy, entire buffer copy, D2D"; td3.copy_from(td1); print_tensor_device(td3); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); CHECK_LE(max_ratio, 1e-5f) << "error result of entire buffer copy, sync, D2D"; fill_tensor_device_const(td3, 0.f); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); td3.async_copy_from(td1, nv_stream); td3.record_event(nv_stream); td3.sync(); @@ -344,12 +325,12 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) { LOG(INFO) << "test tensor deep copy, src with roi, H2D"; td3.copy_from(th01); print_tensor_device(td3); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); LOG(INFO) << "test tensor deep copy, src with roi, D2D"; td3.copy_from(td01); print_tensor_device(td3); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); //! 
test tensor deep copy, dst with roi @@ -366,12 +347,12 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) { LOG(INFO) << "test tensor deep copy, dst with roi, H2D"; td21.copy_from(th1); print_tensor_device(td21); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); LOG(INFO) << "test tensor deep copy, dst with roi, D2D"; td21.copy_from(td1); print_tensor_device(td21); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); //! test tensor deep copy, src and dst are with roi @@ -386,18 +367,18 @@ TEST(TestSaberTensorNV, test_tensor_deepcopy) { LOG(INFO) << "test tensor deep copy, src and dst are with roi, H2D"; td21.copy_from(th01); print_tensor_device(td21); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); LOG(INFO) << "test tensor deep copy, src and dst are with roi, D2D"; td21.copy_from(td01); print_tensor_device(td21); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); } -TEST(TestSaberTensorNV, test_tensor_shape) { - typedef Tensor Tensor4_0; - typedef Tensor Tensor4_1; - typedef Tensor Tensor2; +TEST(TestSaberTensorBM, test_tensor_shape) { + typedef Tensor Tensor4_0; + typedef Tensor Tensor4_1; + typedef Tensor Tensor2; int nin = 2; int cin = 2; @@ -460,7 +441,7 @@ TEST(TestSaberTensorNV, test_tensor_shape) { } -TEST(TestSaberTensorNV, test_tensor_reshape_realloc) { +TEST(TestSaberTensorBM, test_tensor_reshape_realloc) { LOG(INFO) << "test tensor reshape and re_alloc funcs"; @@ -473,17 +454,17 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) { LOG(INFO) << "ori tensor with size: " << th0.valid_size(); print_tensor_host(th0); print_tensor_device(td0); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); th0.reshape(sh0); td0.reshape(sh0); LOG(INFO) << "tensor after reshape(from big space to small) with size: " << th0.valid_size(); print_tensor_host(th0); print_tensor_device(td0); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); fill_tensor_host_const(th0, 1); fill_tensor_device_const(td0, 1); - cudaDeviceSynchronize(); + 
//cudaDeviceSynchronize(); th0.reshape(sh1); td0.reshape(sh1); @@ -491,24 +472,24 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) { th0.valid_size(); print_tensor_host(th0); print_tensor_device(td0); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); th0.re_alloc(sh0); td0.re_alloc(sh0); LOG(INFO) << "tensor after re_alloc(from big space to small) with size: " << th0.valid_size(); print_tensor_host(th0); print_tensor_device(td0); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); TensorHf4 th1(sh0); TensorDf4 td1(sh0); LOG(INFO) << "ori tensor with size: " << th1.valid_size(); fill_tensor_host_const(th1, 1); fill_tensor_device_const(td1, 1); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); print_tensor_host(th1); print_tensor_device(td1); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); th1.reshape(sh1); td1.reshape(sh1); @@ -518,10 +499,10 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) { th1.valid_shape()[0], th1.valid_shape()[1], th1.valid_shape()[2], th1.valid_shape()[3]); print_tensor_host(th1); print_tensor_device(td1); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); fill_tensor_host_const(th1, 1); fill_tensor_device_const(td1, 1); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); th1.reshape(sh0); td1.reshape(sh0); @@ -531,15 +512,15 @@ TEST(TestSaberTensorNV, test_tensor_reshape_realloc) { td1.re_alloc(sh1); print_tensor_host(th1); print_tensor_device(td1); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); } -TEST(TestSaberTensorNV, test_tensor_op) { +TEST(TestSaberTensorBM, test_tensor_op) { Shape sh{1, 2, 2, 10}; TensorDf4 td1(sh); TensorHf4 th1(sh); - Tensor td2(sh); + Tensor td2(sh); Tensor th2(sh); LOG(INFO) << "testing host fill tensor with const 1."; fill_tensor_host_const(th1, 1.f); @@ -590,11 +571,11 @@ TEST(TestSaberTensorNV, test_tensor_op) { print_tensor_device(td2); } -TEST(TestSaberTensorNV, test_tensor_share_diff_dtype) { +TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) { Shape 
sh{1, 1, 2, 10}; - Tensor td1(sh); - Tensor th1(sh); - Tensor td2; + Tensor td1(sh); + Tensor th1(sh); + Tensor td2; Tensor th2; td2.set_shape(sh); th2.set_shape(sh); @@ -605,20 +586,20 @@ TEST(TestSaberTensorNV, test_tensor_share_diff_dtype) { fill_tensor_device_const(td1, -1); LOG(INFO) << "data type: int8"; print_tensor_device(td1); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); td2.share_from(td1); th2.share_from(th1); print_tensor_host(th2); print_tensor_device(td2); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); } -TEST(TestSaberTensorNV, test_tensor_base_type) { +TEST(TestSaberTensorBM, test_tensor_base_type) { Shape sh(1, 3, 10, 10); - Tensor td1(sh); - Tensor th1(sh); + Tensor td1(sh); + Tensor th1(sh); fill_tensor_host_rand(th1, 0.f, 255.f); td1.copy_from(th1); TensorBase* tb1; @@ -629,8 +610,7 @@ TEST(TestSaberTensorNV, test_tensor_base_type) { Shape sh11 = th1.valid_shape(); LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \ ", h=" << sh11[2] << ", w=" << sh11[3]; -*/ -} +}*/ int main(int argc, const char** argv) { // initial logger From bdd8588fbb8053722169fda8b3bb145b05d7c761 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 16:01:18 +0800 Subject: [PATCH 010/318] Update Dtype for host --- test/saber/bm/test_saber_tensor_BM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 0634d0a2d..14f86c8b5 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -5,7 +5,7 @@ using namespace anakin::saber; typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; -typedef Tensor TensorHf4; +typedef Tensor TensorHf4; typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; From 4ca42d6a43f8e7659abe45979efd453c48b3cf35 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 16:38:08 +0800 Subject: [PATCH 011/318] Conversion from void* 
to bm_device_mem_t* --- saber/core/impl/bm/bm_impl.cpp | 10 ++++++---- test/saber/bm/test_saber_tensor_BM.cpp | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index bee5ddab6..faca480f0 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -55,20 +55,22 @@ int BM_API::get_device_id(){ void BM_API::mem_alloc(void** ptr, size_t n){ //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n) - bm_device_mem_t mem = bm_mem_from_system(*ptr); - BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n)); + bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr) + BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n)); } void BM_API::mem_free(void* ptr){ //(bm_handle_t handle, bm_device_mem_t mem){ if(ptr != nullptr){ - bm_free_device(handle, bm_mem_from_system(ptr)); + bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr) + bm_free_device(handle, *pmem); } } void BM_API::mem_set(void* ptr, int value, size_t n){ //(bm_handle_t handle, const int value, bm_device_mem_t mem){ - BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr))); + bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr) + BMDNN_CHECK(bm_memset_device(handle, value, *pmem)); } //! target wrapper diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 14f86c8b5..af279797e 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -55,6 +55,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorHf4 thost1(sh1); TensorDf4 tdev1(sh1); + /* //! 
test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; thost1.copy_from(thost0); @@ -191,6 +192,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "|--show root tensor while data is changed by shared tensor"; print_tensor_host(thost4); + */ } /* From 78c978a95a5dd5afdec89c6cbbe49e826baa1f93 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 16:39:48 +0800 Subject: [PATCH 012/318] Convert from void* to bm_device_mem_t* --- saber/core/impl/bm/bm_impl.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index faca480f0..f2993426c 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -55,21 +55,21 @@ int BM_API::get_device_id(){ void BM_API::mem_alloc(void** ptr, size_t n){ //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n) - bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr) + bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr); BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n)); } void BM_API::mem_free(void* ptr){ //(bm_handle_t handle, bm_device_mem_t mem){ if(ptr != nullptr){ - bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr) + bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); bm_free_device(handle, *pmem); } } void BM_API::mem_set(void* ptr, int value, size_t n){ //(bm_handle_t handle, const int value, bm_device_mem_t mem){ - bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr) + bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); BMDNN_CHECK(bm_memset_device(handle, value, *pmem)); } From d99ce8a6bd3b0ea9d012f9aa9b844d7a7e6e1373 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 16:45:27 +0800 Subject: [PATCH 013/318] Revert back first --- saber/core/impl/bm/bm_impl.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 
f2993426c..f432cc863 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -55,22 +55,26 @@ int BM_API::get_device_id(){ void BM_API::mem_alloc(void** ptr, size_t n){ //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n) - bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr); - BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n)); + bm_device_mem_t mem = bm_mem_from_system(*ptr); + BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n)); + //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr); + //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n)); } void BM_API::mem_free(void* ptr){ //(bm_handle_t handle, bm_device_mem_t mem){ if(ptr != nullptr){ - bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); - bm_free_device(handle, *pmem); + bm_free_device(handle, bm_mem_from_system(ptr)); + //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); + //bm_free_device(handle, *pmem); } } void BM_API::mem_set(void* ptr, int value, size_t n){ //(bm_handle_t handle, const int value, bm_device_mem_t mem){ - bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); - BMDNN_CHECK(bm_memset_device(handle, value, *pmem)); + BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr))); + //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); + //BMDNN_CHECK(bm_memset_device(handle, value, *pmem)); } //! target wrapper From 5ea5263eaf7f103b975e710d74f38617227fd117 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 17:00:14 +0800 Subject: [PATCH 014/318] test --- saber/core/impl/bm/bm_impl.cpp | 2 +- saber/saber_funcs_param.h | 16 ++++++++-------- test/saber/bm/test_saber_tensor_BM.cpp | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index f432cc863..baa25f484 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -84,7 +84,7 @@ template struct TargetWrapper; template class Buffer; //! 
BM Tensor -INSTANTIATE_TENSOR(BM, AK_BM, NCHW); +INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW); template struct Env; diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index 6a109540e..d648bf94b 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -539,14 +539,14 @@ struct ConvParam > { #ifdef USE_BM template <> -struct ConvParam > { +struct ConvParam > { ConvParam() : group(-1), pad_h(-1), pad_w(-1), stride_h(-1), stride_w(-1), dilation_h(-1), dilation_w(-1), weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){} ConvParam(int group_in, int pad_h_in, int pad_w_in, int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, - Tensor* weight, Tensor* bias, + Tensor* weight, Tensor* bias, float alpha_in = 1.0, float beta_in = 0.0) : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) , stride_h(stride_h_in), stride_w(stride_w_in) @@ -592,16 +592,16 @@ struct ConvParam > { comp_eq = comp_eq && (beta == right.beta); return comp_eq; } - inline const Tensor* weight() { + inline const Tensor* weight() { return weight_tensor; } - inline const Tensor* bias() { + inline const Tensor* bias() { return bias_tensor; } - inline Tensor* mutable_weight() { + inline Tensor* mutable_weight() { return weight_tensor; } - inline Tensor* mutable_bias() { + inline Tensor* mutable_bias() { return bias_tensor; } int group; @@ -614,8 +614,8 @@ struct ConvParam > { float alpha; float beta; private: - Tensor* weight_tensor; - Tensor* bias_tensor; + Tensor* weight_tensor; + Tensor* bias_tensor; }; #endif //USE_BM diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index af279797e..ce0bad95a 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -6,7 +6,7 @@ using namespace anakin::saber; typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; typedef Tensor TensorHf4; -typedef Tensor TensorDf4; +typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; 
TEST(TestSaberTensorBM, test_tensor_constructor) { @@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorHf4 thost1(sh1); TensorDf4 tdev1(sh1); - /* //! test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; thost1.copy_from(thost0); @@ -66,6 +65,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { tdev1.copy_from(tdev0); print_tensor_host(thost1); + /* //! test tensor constructor with shape and real_shape LOG(INFO) << "test tensor constructor with shape and real_shape"; //! constructor with 3 shapes is removed From 1cc471ee9f0845ac0e59f422ebc7622338ae9947 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 17:10:11 +0800 Subject: [PATCH 015/318] Revert "test" This reverts commit 5ea5263eaf7f103b975e710d74f38617227fd117. --- saber/core/impl/bm/bm_impl.cpp | 2 +- saber/saber_funcs_param.h | 16 ++++++++-------- test/saber/bm/test_saber_tensor_BM.cpp | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index baa25f484..f432cc863 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -84,7 +84,7 @@ template struct TargetWrapper; template class Buffer; //! 
BM Tensor -INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW); +INSTANTIATE_TENSOR(BM, AK_BM, NCHW); template struct Env; diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index d648bf94b..6a109540e 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -539,14 +539,14 @@ struct ConvParam > { #ifdef USE_BM template <> -struct ConvParam > { +struct ConvParam > { ConvParam() : group(-1), pad_h(-1), pad_w(-1), stride_h(-1), stride_w(-1), dilation_h(-1), dilation_w(-1), weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){} ConvParam(int group_in, int pad_h_in, int pad_w_in, int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, - Tensor* weight, Tensor* bias, + Tensor* weight, Tensor* bias, float alpha_in = 1.0, float beta_in = 0.0) : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) , stride_h(stride_h_in), stride_w(stride_w_in) @@ -592,16 +592,16 @@ struct ConvParam > { comp_eq = comp_eq && (beta == right.beta); return comp_eq; } - inline const Tensor* weight() { + inline const Tensor* weight() { return weight_tensor; } - inline const Tensor* bias() { + inline const Tensor* bias() { return bias_tensor; } - inline Tensor* mutable_weight() { + inline Tensor* mutable_weight() { return weight_tensor; } - inline Tensor* mutable_bias() { + inline Tensor* mutable_bias() { return bias_tensor; } int group; @@ -614,8 +614,8 @@ struct ConvParam > { float alpha; float beta; private: - Tensor* weight_tensor; - Tensor* bias_tensor; + Tensor* weight_tensor; + Tensor* bias_tensor; }; #endif //USE_BM diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index ce0bad95a..af279797e 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -6,7 +6,7 @@ using namespace anakin::saber; typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; typedef Tensor TensorHf4; -typedef Tensor TensorDf4; +typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; 
TEST(TestSaberTensorBM, test_tensor_constructor) { @@ -55,6 +55,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorHf4 thost1(sh1); TensorDf4 tdev1(sh1); + /* //! test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; thost1.copy_from(thost0); @@ -65,7 +66,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { tdev1.copy_from(tdev0); print_tensor_host(thost1); - /* //! test tensor constructor with shape and real_shape LOG(INFO) << "test tensor constructor with shape and real_shape"; //! constructor with 3 shapes is removed From fa2e41ba4df5bfb4f8d739d94230bb709e6aaa18 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 17:14:21 +0800 Subject: [PATCH 016/318] Debug on copy_from --- test/saber/bm/test_saber_tensor_BM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index af279797e..13b9deff1 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorHf4 thost1(sh1); TensorDf4 tdev1(sh1); - /* //! test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; thost1.copy_from(thost0); @@ -66,6 +65,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { tdev1.copy_from(tdev0); print_tensor_host(thost1); + /* //! test tensor constructor with shape and real_shape LOG(INFO) << "test tensor constructor with shape and real_shape"; //! constructor with 3 shapes is removed From 75f5063122cdcae1045b78f7d29055ca6b058e42 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Wed, 20 Jun 2018 17:40:20 +0800 Subject: [PATCH 017/318] Revert "Revert "test"" This reverts commit 1cc471ee9f0845ac0e59f422ebc7622338ae9947. 
--- saber/core/impl/bm/bm_impl.cpp | 2 +- saber/saber_funcs_param.h | 16 ++++++++-------- test/saber/bm/test_saber_tensor_BM.cpp | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index f432cc863..baa25f484 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -84,7 +84,7 @@ template struct TargetWrapper; template class Buffer; //! BM Tensor -INSTANTIATE_TENSOR(BM, AK_BM, NCHW); +INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW); template struct Env; diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index 6a109540e..d648bf94b 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -539,14 +539,14 @@ struct ConvParam > { #ifdef USE_BM template <> -struct ConvParam > { +struct ConvParam > { ConvParam() : group(-1), pad_h(-1), pad_w(-1), stride_h(-1), stride_w(-1), dilation_h(-1), dilation_w(-1), weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){} ConvParam(int group_in, int pad_h_in, int pad_w_in, int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, - Tensor* weight, Tensor* bias, + Tensor* weight, Tensor* bias, float alpha_in = 1.0, float beta_in = 0.0) : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) , stride_h(stride_h_in), stride_w(stride_w_in) @@ -592,16 +592,16 @@ struct ConvParam > { comp_eq = comp_eq && (beta == right.beta); return comp_eq; } - inline const Tensor* weight() { + inline const Tensor* weight() { return weight_tensor; } - inline const Tensor* bias() { + inline const Tensor* bias() { return bias_tensor; } - inline Tensor* mutable_weight() { + inline Tensor* mutable_weight() { return weight_tensor; } - inline Tensor* mutable_bias() { + inline Tensor* mutable_bias() { return bias_tensor; } int group; @@ -614,8 +614,8 @@ struct ConvParam > { float alpha; float beta; private: - Tensor* weight_tensor; - Tensor* bias_tensor; + Tensor* weight_tensor; + Tensor* bias_tensor; }; #endif 
//USE_BM diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 13b9deff1..ce0bad95a 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -6,7 +6,7 @@ using namespace anakin::saber; typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; typedef Tensor TensorHf4; -typedef Tensor TensorDf4; +typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; TEST(TestSaberTensorBM, test_tensor_constructor) { From 35f96827ec2898616e7d191552a1f1a6e5ab8b1e Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Thu, 21 Jun 2018 09:44:59 +0800 Subject: [PATCH 018/318] Print tensor for BM --- test/saber/bm/test_saber_tensor_BM.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index ce0bad95a..cc2adc774 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -59,8 +59,8 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "test copy_from() function, input tensor could be any target"; thost1.copy_from(thost0); tdev1.copy_from(thost0); - print_tensor_device(tdev1); - //cudaDeviceSynchronize(); + //TODO: print tensor for BM device + print_tensor_host(tdev1); thost1.copy_from(tdev1); tdev1.copy_from(tdev0); print_tensor_host(thost1); From c32f16d678979042d44056cd297f15138aab93a7 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Thu, 21 Jun 2018 09:45:35 +0800 Subject: [PATCH 019/318] Revert "Revert "Revert "test""" This reverts commit 75f5063122cdcae1045b78f7d29055ca6b058e42. 
--- saber/core/impl/bm/bm_impl.cpp | 2 +- saber/saber_funcs_param.h | 16 ++++++++-------- test/saber/bm/test_saber_tensor_BM.cpp | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index baa25f484..f432cc863 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -84,7 +84,7 @@ template struct TargetWrapper; template class Buffer; //! BM Tensor -INSTANTIATE_TENSOR(BM, AK_FLOAT, NCHW); +INSTANTIATE_TENSOR(BM, AK_BM, NCHW); template struct Env; diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index d648bf94b..6a109540e 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -539,14 +539,14 @@ struct ConvParam > { #ifdef USE_BM template <> -struct ConvParam > { +struct ConvParam > { ConvParam() : group(-1), pad_h(-1), pad_w(-1), stride_h(-1), stride_w(-1), dilation_h(-1), dilation_w(-1), weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){} ConvParam(int group_in, int pad_h_in, int pad_w_in, int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, - Tensor* weight, Tensor* bias, + Tensor* weight, Tensor* bias, float alpha_in = 1.0, float beta_in = 0.0) : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) , stride_h(stride_h_in), stride_w(stride_w_in) @@ -592,16 +592,16 @@ struct ConvParam > { comp_eq = comp_eq && (beta == right.beta); return comp_eq; } - inline const Tensor* weight() { + inline const Tensor* weight() { return weight_tensor; } - inline const Tensor* bias() { + inline const Tensor* bias() { return bias_tensor; } - inline Tensor* mutable_weight() { + inline Tensor* mutable_weight() { return weight_tensor; } - inline Tensor* mutable_bias() { + inline Tensor* mutable_bias() { return bias_tensor; } int group; @@ -614,8 +614,8 @@ struct ConvParam > { float alpha; float beta; private: - Tensor* weight_tensor; - Tensor* bias_tensor; + Tensor* weight_tensor; + Tensor* bias_tensor; }; #endif 
//USE_BM diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index cc2adc774..db0edce6d 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -6,7 +6,7 @@ using namespace anakin::saber; typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; typedef Tensor TensorHf4; -typedef Tensor TensorDf4; +typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; TEST(TestSaberTensorBM, test_tensor_constructor) { From 65a1bbfdf0adc6c0fc4ff809fc75022af0881ccb Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Thu, 21 Jun 2018 10:00:22 +0800 Subject: [PATCH 020/318] Passing through BM handler --- saber/core/context.h | 12 ++++++++++++ saber/core/impl/bm/bm_impl.cpp | 5 +++++ saber/core/target_wrapper.h | 2 ++ 3 files changed, 19 insertions(+) diff --git a/saber/core/context.h b/saber/core/context.h index 847f91e81..15ec2e0b6 100644 --- a/saber/core/context.h +++ b/saber/core/context.h @@ -18,6 +18,12 @@ #include "core/env.h" #include "saber/saber_types.h" +#ifdef USE_BM +#include "bmlib_runtime.h" +#include "bmdnn_api.h" +#include "bmlib_utils.h" +#endif + namespace anakin{ namespace saber{ @@ -105,6 +111,12 @@ class Context final{ return _stream_compute; } +#ifdef USE_BM + bm_handle_t get_handler() { + return API::get_handler(); + } +#endif + #ifdef USE_ARM_PLACE void set_power_mode(PowerMode mode); void set_act_cores(std::vector ids); diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index f432cc863..ecfe755d6 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -37,8 +37,13 @@ namespace saber{ typedef TargetWrapper BM_API; +//TODO: check exception static bm_handle_t handle = get_bm_handle(); +bm_handle_t BM_API::get_handler() { + return handle; +} + void BM_API::get_device_count(int &count) { BMDNN_CHECK(bm_dev_getcount(&count)); } diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 7c6e2d2fb..e724235d8 
100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -414,6 +414,8 @@ struct TargetWrapper { * @return currently activated device id */ static int get_device_id(); + + static bm_handle_t get_handler(); }; #endif //USE_BM From 50aca5f8267b683df4a1a51a7a344da634296ccd Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Thu, 21 Jun 2018 11:20:14 +0800 Subject: [PATCH 021/318] Implement copy_from for BM; Add back test_TargetWrapper_BM --- saber/core/tensor.h | 17 +++++++++++++++++ test/saber/bm/test_TargetWrapper_BM.cpp | 16 ++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 33e655752..3ac4ae7a9 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -20,6 +20,12 @@ #include "core/events.h" #include "core/tensor_traits.h" +#ifdef USE_BM +#include "bmlib_runtime.h" +#include "bmdnn_api.h" +#include "bmlib_utils.h" +#endif + namespace anakin{ namespace saber{ @@ -570,6 +576,17 @@ class Tensor : public TensorBase { return SaberSuccess; } +#ifdef USE_BM + SaberStatus copy_from(const Tensor& tensor) { + CHECK_EQ(valid_size(), tensor.valid_size()) \ + << "sizes of two valid shapes must be the same"; + + BMDNN_CHECK(m_memcpy_s2d(API::get_handler(), mutable_data(), bm_mem_from_system(tensor.data()))); + + return SaberSuccess; + } +#endif + /** * \brief Deep copy data within region of interest from input tensor. 
*/ diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp new file mode 100644 index 000000000..c54b392d1 --- /dev/null +++ b/test/saber/bm/test_TargetWrapper_BM.cpp @@ -0,0 +1,16 @@ +#include "saber_types.h" +#include "target_wrapper.h" +#include + +#ifdef USE_BM +using namespace anakin::saber; +int main() { + typedef TargetWrapper API; + void *pmem; + int dev_count = 0; + API::get_device_count(dev_count); + API::mem_alloc(&pmem, 3*200*200); + API::mem_free(pmem); +} +#endif + From 65734d5dfdec189599aa607476945d524474be08 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Thu, 21 Jun 2018 11:35:11 +0800 Subject: [PATCH 022/318] check tensor target type --- saber/core/tensor.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 3ac4ae7a9..7fc829555 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -578,6 +578,9 @@ class Tensor : public TensorBase { #ifdef USE_BM SaberStatus copy_from(const Tensor& tensor) { + CHECK_EQ(typeof(BM), typeof(targetType_t)) \ + << "this method is only for BM tensor"; + CHECK_EQ(valid_size(), tensor.valid_size()) \ << "sizes of two valid shapes must be the same"; From 6ebd0287d6be3e9ac41b0d6c8c333771b958124f Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Thu, 21 Jun 2018 11:39:13 +0800 Subject: [PATCH 023/318] Change back to compliable version --- saber/core/tensor.h | 14 -------------- test/saber/bm/test_saber_tensor_BM.cpp | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 7fc829555..e543f7197 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -576,20 +576,6 @@ class Tensor : public TensorBase { return SaberSuccess; } -#ifdef USE_BM - SaberStatus copy_from(const Tensor& tensor) { - CHECK_EQ(typeof(BM), typeof(targetType_t)) \ - << "this method is only for BM tensor"; - - CHECK_EQ(valid_size(), tensor.valid_size()) \ - << "sizes of two valid shapes must be the 
same"; - - BMDNN_CHECK(m_memcpy_s2d(API::get_handler(), mutable_data(), bm_mem_from_system(tensor.data()))); - - return SaberSuccess; - } -#endif - /** * \brief Deep copy data within region of interest from input tensor. */ diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index db0edce6d..8aead4bb1 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -55,6 +55,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorHf4 thost1(sh1); TensorDf4 tdev1(sh1); + /* //! test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; thost1.copy_from(thost0); @@ -65,7 +66,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { tdev1.copy_from(tdev0); print_tensor_host(thost1); - /* //! test tensor constructor with shape and real_shape LOG(INFO) << "test tensor constructor with shape and real_shape"; //! constructor with 3 shapes is removed From b1aa39dcfc107d06fa1f150cfcda30ffbbe40bdf Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Thu, 21 Jun 2018 15:38:08 +0800 Subject: [PATCH 024/318] modify activation op and test --- saber/funcs/impl/bm/vender_activation.h | 38 +++----- saber/funcs/impl/bm/vender_fc.h | 46 ++------- .../bm/test_saber_func_activation_BM.cpp | 97 +------------------ 3 files changed, 21 insertions(+), 160 deletions(-) diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h index 45541add9..fadd817b9 100644 --- a/saber/funcs/impl/bm/vender_activation.h +++ b/saber/funcs/impl/bm/vender_activation.h @@ -27,17 +27,9 @@ class VenderActivation& inputs, std::vector& outputs, @@ -64,33 +56,29 @@ class VenderActivationnum(); switch (_active_type) { - case Active_sigmoid: - BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, _input_descs, input_n, input_dim, _output_descs)); - break; case Active_relu: - BMDNN_CHECK(bmdnn_relu_forward(_handle, _input_descs, input_n, input_dim, _output_descs)); + 
BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, input_n, input_dim, out_data)); + break; + case Active_sigmoid: + BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data)); break; case Active_tanh: - BMDNN_CHECK(bmdnn_tanh_forward(_handle, _input_descs, input_n, input_dim, _output_descs)); + BMDNN_CHECK(bmdnn_tanh_forward(_handle, in_data, input_n, input_dim, out_data)); + break; + case Active_elu: + BMDNN_CHECK(bmdnn_elu_forward(_handle, 1.0, in_data, input_n, input_dim, out_data)); break; } - /* BMDNN_CHECK(cudnnActivationForward(_handle, _active_descs, */ - /* cudnn::cudnnTypeWrapper::kOne(), */ - /* _input_descs, in_data, */ - /* cudnn::cudnnTypeWrapper::kZero(), */ - /* _output_descs, out_data */ - /* )); */ return SaberSuccess; } private: bm_handle_t _handle; - bm_device_mem_t _input_descs; - bm_device_mem_t _output_descs; ActiveType _active_type; }; + template class VenderActivation; -} -} +} // namespace saber +} // namespace anakin #endif //ANAKIN_SABER_FUNCS_BMDNN_ACT_H diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h index 5c7c23e67..3b018686c 100644 --- a/saber/funcs/impl/bm/vender_fc.h +++ b/saber/funcs/impl/bm/vender_fc.h @@ -1,20 +1,5 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H -#define ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H +#ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H +#define ANAKIN_SABER_FUNCS_BMDNN_FC_H #include "saber/funcs/impl/impl_fc.h" @@ -43,23 +28,12 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param, Context& ctx){ - // get context - this->_ctx = ctx; - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUBLAS_CHECK(cublasCreate(&_handle)); - CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); return create(inputs, outputs, param, ctx); } @@ -94,16 +68,10 @@ class VenderFc& inputs, std::vector& outputs, - FcParam& param); + FcParam& param){ + }; -private: - bool _flag_trans_weights{false}; - int _M; - int _K; - int _N; - cublasHandle_t _handle; - bool _is_continue_buf{true}; }; template class VenderFc; @@ -111,4 +79,4 @@ template class VenderFc; } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H +#endif // ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp index 5d30a6d64..523e94121 100644 --- a/test/saber/bm/test_saber_func_activation_BM.cpp +++ b/test/saber/bm/test_saber_func_activation_BM.cpp @@ -58,7 +58,7 @@ TEST(TestSaberFuncBM, test_func_constructor) { Context ctx1(0, 1, 1); - ActivationParam param(Active_elu, 0.1f, 0.1f); + ActivationParam param(Active_relu, 0.1f, 0.1f); std::vector input; std::vector output; @@ -74,102 +74,7 @@ TEST(TestSaberFuncBM, test_func_constructor) { act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); act(input, output, param, ctx1); - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); - output_dev.sync(); print_tensor_device(output_dev); - cudaDeviceSynchronize(); - CUDA_POST_KERNEL_CHECK; -} - -TEST(TestSaberFuncBM, test_func_sub_tensor) { - - typedef Tensor TensorHf4; - typedef Tensor TensorDf4; - - int img_num = 1; - int in_channels = 2; - int img_h = 8; 
- int img_w = 8; - - Shape img_s(img_num, in_channels, img_h, img_w); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1); - } - - img_dev.copy_from(img_host); - Shape img_s_t0(img_num, in_channels, 4, 4); - - TensorDf4 t0; - TensorDf4 t1; - - t0.share_sub_buffer(img_dev, img_s_t0, {0, 0, 0, 0}); - t1.share_sub_buffer(img_dev, img_s_t0, {0, 0, 4, 4}); - - print_tensor_shape("t0", t0); - print_tensor_shape("t1", t1); - - TensorDf4 output_dev; - - TensorDf4 out0; - TensorDf4 out1; - - // start Reshape & doInfer - Context ctx1(0, 1, 1); - Context ctx2(0, 2, 2); - - ActivationParam param1(Active_elu, 0.1f, 0.1f); - ActivationParam param2(Active_elu, 0.1f, 0.1f); - - std::vector input1, input2; - std::vector output1, output2; - - input1.push_back(&t0); - input2.push_back(&t1); - - output1.push_back(&out0); - output2.push_back(&out1); - - //FIXME where do I get img_s and all those shapes ???? - output_dev.re_alloc(img_s); - - out0.share_sub_buffer(output_dev, img_s_t0, {0, 0, 0, 0}); - out1.share_sub_buffer(output_dev, img_s_t0, {0, 0, 4, 4}); - - print_tensor_shape("output_dev", output_dev); - - Activation act1; - Activation act2; - - act1.compute_output_shape(output1, input1, param1); - act2.compute_output_shape(output2, input2, param2); - - print_tensor_shape("out0", out0); - print_tensor_shape("out1", out1); - - // init assume output tensor has been reshpaed by user. 
- act1.init(input1, output1, param1, SPECIFY, SABER_IMPL, ctx1); - act1(input1, output1, param1, ctx1); - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output1[0]->record_event(cuda_stream); - - act2.init(input2, output2, param2, SPECIFY, SABER_IMPL, ctx2); - act2(input2, output2, param2, ctx2); - cudaStream_t cuda_stream2 = ctx2.get_compute_stream(); - output2[0]->record_event(cuda_stream2); - - out0.sync(); - out1.sync(); - print_tensor_device(output_dev); - cudaDeviceSynchronize(); - CUDA_POST_KERNEL_CHECK; } int main(int argc, const char** argv) { From 250451cde19e1d677b2957c9c1ba0166d7cf0893 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Thu, 21 Jun 2018 18:31:38 +0800 Subject: [PATCH 025/318] Enable copy from tensor with different Dtype --- saber/core/data_traits.h | 11 +++++++++++ saber/core/tensor.h | 16 ++++++++++++++++ test/saber/bm/test_saber_tensor_BM.cpp | 6 ++++-- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h index 0bb732aba..64de4af9f 100644 --- a/saber/core/data_traits.h +++ b/saber/core/data_traits.h @@ -17,6 +17,12 @@ #include "saber_types.h" +#ifdef USE_BM +#include "bmlib_runtime.h" +#include "bmdnn_api.h" +#include "bmlib_utils.h" +#endif + namespace anakin{ namespace saber{ @@ -76,6 +82,11 @@ struct DataTrait { typedef unsigned int dtype; }; +template <> +struct DataTrait { + typedef bm_device_mem_t dtype; +}; + } //namespace saber } //namespace anakin diff --git a/saber/core/tensor.h b/saber/core/tensor.h index e543f7197..d24287c44 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -21,6 +21,7 @@ #include "core/tensor_traits.h" #ifdef USE_BM +#include #include "bmlib_runtime.h" #include "bmdnn_api.h" #include "bmlib_utils.h" @@ -728,6 +729,21 @@ class Tensor : public TensorBase { return SaberSuccess; } +#ifdef USE_BM + template + SaberStatus copy_from(const Tensor& tensor) { + if (typeid(BM) == typeid(targetType_t) && + typeid(X86) == 
typeid(TargetType_t) && + typeid(AK_FLOAT) == typeid(DataType_t)){ + + Dtype* device_data_ptr = mutable_data(); + BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data()))); + } + + return SaberSuccess; + }; +#endif + /** * \brief Asynchronously copy entire buffer from source tensor. */ diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 8aead4bb1..83eb472b7 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -55,11 +55,13 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorHf4 thost1(sh1); TensorDf4 tdev1(sh1); - /* + //! test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; - thost1.copy_from(thost0); + //thost1.copy_from(thost0); tdev1.copy_from(thost0); + + /* //TODO: print tensor for BM device print_tensor_host(tdev1); thost1.copy_from(tdev1); From 435ca51e18f046abf4708a28f18f6de7ac627e0f Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Fri, 22 Jun 2018 09:25:03 +0800 Subject: [PATCH 026/318] Complete copy_from method --- saber/core/tensor.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index d24287c44..b0e22ec20 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -732,15 +732,36 @@ class Tensor : public TensorBase { #ifdef USE_BM template SaberStatus copy_from(const Tensor& tensor) { + + CHECK_EQ(valid_size(), tensor.valid_size()) \ + << "sizes of two valid shapes must be the same"; + + /// copy from system to device if (typeid(BM) == typeid(targetType_t) && + typeid(AK_BM) == typeid(datatype) && typeid(X86) == typeid(TargetType_t) && typeid(AK_FLOAT) == typeid(DataType_t)){ Dtype* device_data_ptr = mutable_data(); BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data()))); + + return SaberSuccess; } - return SaberSuccess; + /// 
copy from device to system + if (typeid(X86) == typeid(targetType_t) && + typeid(AK_FLOAT) == typeid(datatype) && + typeid(BM) == typeid(TargetType_t) && + typeid(AK_BM) == typeid(DataType_t)){ + + Dtype* device_data_ptr = tensor.data(); + BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); + + return SaberSuccess; + } + + /// other types are not allowed here + return SaberInvalidValue; }; #endif From be04e3b2a541d22aab7b4f93a6b843b13af70e3f Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Fri, 22 Jun 2018 09:41:21 +0800 Subject: [PATCH 027/318] const_cast the immutable target data pointer --- saber/core/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index b0e22ec20..d43cedef1 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -754,7 +754,7 @@ class Tensor : public TensorBase { typeid(BM) == typeid(TargetType_t) && typeid(AK_BM) == typeid(DataType_t)){ - Dtype* device_data_ptr = tensor.data(); + Dtype* device_data_ptr = const_cast(tensor.data()); BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); return SaberSuccess; From 7cc4c5781c0375a23cc6dc354235e591c6885812 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Fri, 22 Jun 2018 10:06:33 +0800 Subject: [PATCH 028/318] Revert back to compilable version --- saber/core/tensor.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index d43cedef1..29ec1d006 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -749,16 +749,16 @@ class Tensor : public TensorBase { } /// copy from device to system - if (typeid(X86) == typeid(targetType_t) && + /*if (typeid(X86) == typeid(targetType_t) && typeid(AK_FLOAT) == typeid(datatype) && typeid(BM) == typeid(TargetType_t) && typeid(AK_BM) == typeid(DataType_t)){ - Dtype* device_data_ptr = const_cast(tensor.data()); + auto* 
device_data_ptr = tensor.data(); BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); return SaberSuccess; - } + }*/ /// other types are not allowed here return SaberInvalidValue; From 59f2c69d569990d7761e2e0ffdb37f2131ab820c Mon Sep 17 00:00:00 2001 From: root Date: Fri, 22 Jun 2018 02:43:01 +0000 Subject: [PATCH 029/318] Modify handle usage & mem_alloc function --- saber/core/impl/bm/bm_impl.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index ecfe755d6..6088b3af6 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -37,12 +37,17 @@ namespace saber{ typedef TargetWrapper BM_API; +<<<<<<< HEAD //TODO: check exception static bm_handle_t handle = get_bm_handle(); bm_handle_t BM_API::get_handler() { return handle; } +======= +//static bm_handle_t handle = get_bm_handle(); +static bm_handle_t handle; +>>>>>>> Modify handle usage & mem_alloc function void BM_API::get_device_count(int &count) { BMDNN_CHECK(bm_dev_getcount(&count)); @@ -60,18 +65,31 @@ int BM_API::get_device_id(){ void BM_API::mem_alloc(void** ptr, size_t n){ //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n) +<<<<<<< HEAD bm_device_mem_t mem = bm_mem_from_system(*ptr); BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n)); //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr); //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n)); +======= + //bm_device_mem_t mem = bm_mem_from_system(*ptr); + handle = get_bm_handle(); + bm_device_mem_t *mem = new bm_device_mem_t[1]; + mem = reinterpret_cast(ptr); + BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n)); +>>>>>>> Modify handle usage & mem_alloc function } void BM_API::mem_free(void* ptr){ //(bm_handle_t handle, bm_device_mem_t mem){ if(ptr != nullptr){ bm_free_device(handle, bm_mem_from_system(ptr)); +<<<<<<< HEAD //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); 
//bm_free_device(handle, *pmem); +======= + //handle = get_bm_handle(); + //bm_free_device(handle, reinterpret_cast(*ptr)); +>>>>>>> Modify handle usage & mem_alloc function } } From 428306bf0d42987f8031c0eb2f675f54bc99d7c8 Mon Sep 17 00:00:00 2001 From: "weihao.huang" Date: Fri, 22 Jun 2018 04:01:48 +0000 Subject: [PATCH 030/318] Modify handle usage & mem_alloc function --- saber/core/impl/bm/bm_impl.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 6088b3af6..5ad6af84e 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -37,17 +37,13 @@ namespace saber{ typedef TargetWrapper BM_API; -<<<<<<< HEAD //TODO: check exception -static bm_handle_t handle = get_bm_handle(); +//static bm_handle_t handle = get_bm_handle(); +static bm_handle_t handle; bm_handle_t BM_API::get_handler() { return handle; } -======= -//static bm_handle_t handle = get_bm_handle(); -static bm_handle_t handle; ->>>>>>> Modify handle usage & mem_alloc function void BM_API::get_device_count(int &count) { BMDNN_CHECK(bm_dev_getcount(&count)); @@ -65,31 +61,25 @@ int BM_API::get_device_id(){ void BM_API::mem_alloc(void** ptr, size_t n){ //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n) -<<<<<<< HEAD bm_device_mem_t mem = bm_mem_from_system(*ptr); BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n)); //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr); //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n)); -======= //bm_device_mem_t mem = bm_mem_from_system(*ptr); handle = get_bm_handle(); bm_device_mem_t *mem = new bm_device_mem_t[1]; mem = reinterpret_cast(ptr); BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n)); ->>>>>>> Modify handle usage & mem_alloc function } void BM_API::mem_free(void* ptr){ //(bm_handle_t handle, bm_device_mem_t mem){ if(ptr != nullptr){ bm_free_device(handle, bm_mem_from_system(ptr)); -<<<<<<< HEAD //bm_device_mem_t* pmem = 
(struct bm_mem_desc *)(ptr); //bm_free_device(handle, *pmem); -======= //handle = get_bm_handle(); //bm_free_device(handle, reinterpret_cast(*ptr)); ->>>>>>> Modify handle usage & mem_alloc function } } From 1fddf4e36cabce6398d6a78906a59872f958b448 Mon Sep 17 00:00:00 2001 From: "weihao.huang" Date: Fri, 22 Jun 2018 05:31:05 +0000 Subject: [PATCH 031/318] Modify test_TargetWrapper --- saber/core/impl/bm/bm_impl.cpp | 2 -- test/saber/bm/test_TargetWrapper_BM.cpp | 9 ++++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 5ad6af84e..4aecb169d 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -61,8 +61,6 @@ int BM_API::get_device_id(){ void BM_API::mem_alloc(void** ptr, size_t n){ //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n) - bm_device_mem_t mem = bm_mem_from_system(*ptr); - BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n)); //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr); //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n)); //bm_device_mem_t mem = bm_mem_from_system(*ptr); diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp index c54b392d1..a76bef279 100644 --- a/test/saber/bm/test_TargetWrapper_BM.cpp +++ b/test/saber/bm/test_TargetWrapper_BM.cpp @@ -4,13 +4,20 @@ #ifdef USE_BM using namespace anakin::saber; +static bm_handle_t handle; int main() { + bmdnn_init(&handle); typedef TargetWrapper API; void *pmem; int dev_count = 0; API::get_device_count(dev_count); + std::cout << dev_count << std::endl; API::mem_alloc(&pmem, 3*200*200); - API::mem_free(pmem); + //API::mem_free(pmem); + std::cout << "Press any key to finish execution." 
<< std::endl; + int a; + std::cin >> a; + bmdnn_deinit(handle); } #endif From e32c50a08175f59730dcca6bc2ee26848bf730bb Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Fri, 22 Jun 2018 13:52:18 +0800 Subject: [PATCH 032/318] fill activation and fc op; compile error --- saber/funcs/impl/bm/vender_activation.h | 1 - saber/funcs/impl/bm/vender_fc.h | 42 ++++++---------- saber/funcs/timer.h | 66 +++++++++++++++++++++++++ test/saber/bm/test_saber_buffer_BM.cpp | 2 +- test/saber/bm/test_saber_func_fc_BM.cpp | 6 +-- 5 files changed, 85 insertions(+), 32 deletions(-) diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h index fadd817b9..c4baf8365 100644 --- a/saber/funcs/impl/bm/vender_activation.h +++ b/saber/funcs/impl/bm/vender_activation.h @@ -49,7 +49,6 @@ class VenderActivation& inputs, std::vector& outputs, ActivationParam& param) { - const InDataType *in_data = (const InDataType *) inputs[0]->data(); OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width(); diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h index 3b018686c..82dd6000c 100644 --- a/saber/funcs/impl/bm/vender_fc.h +++ b/saber/funcs/impl/bm/vender_fc.h @@ -28,7 +28,7 @@ class VenderFc& inputs, @@ -40,38 +40,28 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param, Context& ctx){ - - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUBLAS_CHECK(cublasDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - CUBLAS_CHECK(cublasCreate(&_handle)); - CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); - } - - Shape shape_out = inputs[0]->valid_shape(); - _M = inputs[0]->count_valid(0, param.axis); - _K = inputs[0]->count_valid(param.axis, inputs[0]->dims()); - _N = param.num_output; - if (_N <= 0) { - int weight_size = param.weights->valid_size(); - _N = weight_size / _K; - } 
- //! weights dims must be in h and w - _flag_trans_weights = param.is_transpose_weights; return SaberSuccess; } virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, FcParam& param){ - + const InDataType *in_data = (const InDataType *) inputs[0]->data(); + const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data(); + const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data(); + OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + int batch_size = inputs[0]->num(); + int input_len = inputs[0]->channel(); + int output_len = param.num_output; + int is_transpose = param.is_transpose_weights ? 1 : 0; + BMDNN_CHECK(bmdnn_fc_forward(_handle, in_data, weights, bias, + batch_size, output_len, input_len, is_transpose, 1, 0, + out_data)); + return SaberSuccess; }; +private: + bm_handle_t _handle; }; template class VenderFc; @@ -79,4 +69,4 @@ template class VenderFc; } //namespace anakin -#endif // ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H +#endif // ANAKIN_SABER_FUNCS_BMDNN_FC_H diff --git a/saber/funcs/timer.h b/saber/funcs/timer.h index 4b1689383..e5014a9cb 100644 --- a/saber/funcs/timer.h +++ b/saber/funcs/timer.h @@ -173,6 +173,72 @@ class SaberTimer final { }; #endif +#ifdef USE_BM +template <> +class SaberTimer final { + +public: + SaberTimer() {} + + ~SaberTimer() {} + + void clear() { + ms_time.clear(); + } + + void start(Context &ctx) { + tstart = std::chrono::system_clock::now(); + } + + void end(Context &ctx) { + tend = std::chrono::system_clock::now(); + auto ts = std::chrono::duration_cast(tend - tstart); + float elapse_ms = 1000.f * float(ts.count()) * std::chrono::microseconds::period::num / \ + std::chrono::microseconds::period::den; + ms_time.push_back(elapse_ms); + } + + float get_average_ms() { + if (ms_time.size() == 0) { + return 0.f; + } + float sum = 0.f; + for (auto i : ms_time){ + sum += i; + } + return sum / ms_time.size(); + } + + // return tile (0-99) time. 
+ float get_tile_time(float tile) { + + if (tile <0 || tile > 100) { + return -1.f; + } + int total_items = (int)ms_time.size(); + if (total_items <= 0) { + return -2.f; + } + ms_time.sort(); + int pos = (int)(tile * total_items / 100); + auto it = ms_time.begin(); + for (int i = 0; i < pos; ++i) { + ++it; + } + return *it; + } + + const std::list get_time_stat() { + return ms_time; + } + +private: + std::chrono::time_point tstart; + std::chrono::time_point tend; + std::list ms_time; +}; +#endif // USE_BM + } } diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index a204e7807..93aa6d36e 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -1,4 +1,4 @@ -#include "test_saber_buffer_bm.h" +#include "test_saber_buffer_BM.h" #include "saber/core/buffer.h" #include "saber/core/data_traits.h" diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp index 5101c75f8..869ff1bfd 100644 --- a/test/saber/bm/test_saber_func_fc_BM.cpp +++ b/test/saber/bm/test_saber_func_fc_BM.cpp @@ -1,6 +1,6 @@ #include "core/context.h" #include "funcs/fc.h" -#include "test_saber_func_fc_BM.h" +#include "test_saber_func_BM.h" #include "tensor_op.h" #include "saber_types.h" #include @@ -41,7 +41,7 @@ void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \ } } -TEST(TestSaberFuncFcBM, test_func_fc) { +TEST(TestSaberFuncBM, test_func_fc) { int test_iter = 100; int w_in = 7; @@ -109,12 +109,10 @@ TEST(TestSaberFuncFcBM, test_func_fc) { //cudaDeviceSynchronize(); } - CUDA_POST_KERNEL_CHECK; t1.end(ctx_dev); float ts = t1.get_average_ms(); LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter; //print_tensor_device(*output_dev_4d[0]); - //cudaDeviceSynchronize(); //! 
check result TensorHf4 thin(shape_in); From 1c4439a7b40a654c9b2ea95ee44319f770607da7 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Sat, 23 Jun 2018 15:58:33 +0800 Subject: [PATCH 033/318] allow copy from tensor with different data type --- saber/core/tensor.cpp | 24 +++++++++++++++ saber/core/tensor.h | 42 ++------------------------ test/saber/bm/test_saber_tensor_BM.cpp | 18 ++++++++--- 3 files changed, 40 insertions(+), 44 deletions(-) create mode 100644 saber/core/tensor.cpp diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp new file mode 100644 index 000000000..9283aac90 --- /dev/null +++ b/saber/core/tensor.cpp @@ -0,0 +1,24 @@ +#include "tensor.h" + +#ifdef USE_BM + +#include "bmlib_runtime.h" +#include "bmdnn_api.h" +#include "bmlib_utils.h" + +template<> +template<> +SaberStatus Tensor::copy_from(const Tensor& tensor) { + //auto* device_data_ptr = mutable_data(); + //BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data()))); + return SaberSuccess; +} + +template<> +template<> +SaberStatus Tensor::copy_from(const Tensor& tensor) { + return SaberSuccess; +} + +#endif + diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 29ec1d006..2272549cd 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -20,13 +20,6 @@ #include "core/events.h" #include "core/tensor_traits.h" -#ifdef USE_BM -#include -#include "bmlib_runtime.h" -#include "bmdnn_api.h" -#include "bmlib_utils.h" -#endif - namespace anakin{ namespace saber{ @@ -730,39 +723,10 @@ class Tensor : public TensorBase { } #ifdef USE_BM - template - SaberStatus copy_from(const Tensor& tensor) { - - CHECK_EQ(valid_size(), tensor.valid_size()) \ - << "sizes of two valid shapes must be the same"; - - /// copy from system to device - if (typeid(BM) == typeid(targetType_t) && - typeid(AK_BM) == typeid(datatype) && - typeid(X86) == typeid(TargetType_t) && - typeid(AK_FLOAT) == typeid(DataType_t)){ - - Dtype* device_data_ptr = mutable_data(); - 
BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data()))); - - return SaberSuccess; - } - - /// copy from device to system - /*if (typeid(X86) == typeid(targetType_t) && - typeid(AK_FLOAT) == typeid(datatype) && - typeid(BM) == typeid(TargetType_t) && - typeid(AK_BM) == typeid(DataType_t)){ - - auto* device_data_ptr = tensor.data(); - BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); - - return SaberSuccess; - }*/ - - /// other types are not allowed here + template + SaberStatus copy_from(const Tensor& tensor) { return SaberInvalidValue; - }; + } #endif /** diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 83eb472b7..ed3ff0503 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -58,16 +58,24 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { //! test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; - //thost1.copy_from(thost0); - tdev1.copy_from(thost0); - /* + // host to host + thost1.copy_from(thost0); + print_tensor_host(thost1); + + // host to device + tdev1.copy_from(thost0); //TODO: print tensor for BM device - print_tensor_host(tdev1); + //print_tensor_host(tdev1); + + // device to host thost1.copy_from(tdev1); - tdev1.copy_from(tdev0); print_tensor_host(thost1); + /* + // device to device + tdev1.copy_from(tdev0); + //! test tensor constructor with shape and real_shape LOG(INFO) << "test tensor constructor with shape and real_shape"; //! 
constructor with 3 shapes is removed From cbf68a70b11ad3d4694108b138b707545b83615d Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Sat, 23 Jun 2018 16:28:04 +0800 Subject: [PATCH 034/318] AK_BM size should return 1 --- saber/core/tensor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp index 9283aac90..3a283c1f6 100644 --- a/saber/core/tensor.cpp +++ b/saber/core/tensor.cpp @@ -6,6 +6,9 @@ #include "bmdnn_api.h" #include "bmlib_utils.h" +template<> +size_t Tensor::_type_len{1}; + template<> template<> SaberStatus Tensor::copy_from(const Tensor& tensor) { From dd763744ff43886e33a24417221e4fa7e2962d8d Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Sat, 23 Jun 2018 19:23:11 +0800 Subject: [PATCH 035/318] Comment out specialization of _type_len for now. --- saber/core/tensor.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp index 3a283c1f6..3203f4779 100644 --- a/saber/core/tensor.cpp +++ b/saber/core/tensor.cpp @@ -1,13 +1,19 @@ #include "tensor.h" #ifdef USE_BM - #include "bmlib_runtime.h" #include "bmdnn_api.h" #include "bmlib_utils.h" +#endif -template<> -size_t Tensor::_type_len{1}; +namespace anakin { + +namespace saber { + +#ifdef USE_BM + + //template<> +//size_t Tensor::_type_len{1}; template<> template<> @@ -25,3 +31,5 @@ SaberStatus Tensor::copy_from(const Tensor #endif +} +} \ No newline at end of file From 166d7c8818679e32b7983bf295c623229fe4ced0 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Sat, 23 Jun 2018 19:59:29 +0800 Subject: [PATCH 036/318] Add implementation for copy_from between device and system --- saber/core/tensor.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp index 3203f4779..1978666bc 100644 --- a/saber/core/tensor.cpp +++ b/saber/core/tensor.cpp @@ -12,20 +12,22 @@ namespace saber { #ifdef USE_BM - //template<> +//template<> 
//size_t Tensor::_type_len{1}; template<> template<> SaberStatus Tensor::copy_from(const Tensor& tensor) { - //auto* device_data_ptr = mutable_data(); - //BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(tensor.data()))); + auto* device_data_ptr = mutable_data(); + BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast(tensor.data())))); return SaberSuccess; } template<> template<> SaberStatus Tensor::copy_from(const Tensor& tensor) { + auto* device_data_ptr = const_cast(tensor.data()); + BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); return SaberSuccess; } From 56d2054fbf46895e93278847da33a465e3b8b9ca Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Sat, 23 Jun 2018 22:46:42 +0800 Subject: [PATCH 037/318] Redefine _type_len as function so that we can do specialization --- saber/core/tensor.cpp | 6 ++++-- saber/core/tensor.h | 24 +++++++++++++----------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp index 1978666bc..081854c86 100644 --- a/saber/core/tensor.cpp +++ b/saber/core/tensor.cpp @@ -12,8 +12,10 @@ namespace saber { #ifdef USE_BM -//template<> -//size_t Tensor::_type_len{1}; +template<> +size_t Tensor::_type_len(){ + return 1; +} template<> template<> diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 2272549cd..086436c0e 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -90,7 +90,7 @@ class Tensor : public TensorBase { _shape = shape; _valid_shape = shape; _offset = Shape::zero(shape.dims()); - _buf = std::make_shared>(shape.count() * _type_len); + _buf = std::make_shared>(shape.count() * _type_len()); _is_subbuf = false; } #if 0 @@ -126,7 +126,7 @@ class Tensor : public TensorBase { _valid_shape = shape; _offset = Shape::zero(shape.dims()); std::shared_ptr> buf_from_date = \ - std::make_shared>(data_ptr, shape.count() * _type_len, id); 
+ std::make_shared>(data_ptr, shape.count() * _type_len(), id); BufferMemShare(_buf, buf_from_date); _is_subbuf = false; } @@ -224,7 +224,7 @@ class Tensor : public TensorBase { _shape = shape; _valid_shape = _shape; _offset =Shape::zero(_shape.dims()); - _buf->alloc(_shape.count() * _type_len); + _buf->alloc(_shape.count() * _type_len()); return SaberSuccess; } @@ -286,13 +286,13 @@ class Tensor : public TensorBase { CHECK_EQ(_valid_shape + _offset <= _shape, true) << \ "valid_shape + offet should <= shape"; } - bool exceed_flag = _shape.count() * _type_len > _buf->get_capacity() \ + bool exceed_flag = _shape.count() * _type_len() > _buf->get_capacity() \ && (_is_subbuf || _is_shared); //if (exceed_flag) { // return SaberOutOfAuthority; //} CHECK_EQ(exceed_flag, false) << "shared tensor shape exceed origin data buffer size"; - SABER_CHECK(_buf->re_alloc(_shape.count() * _type_len)); + SABER_CHECK(_buf->re_alloc(_shape.count() * _type_len())); return SaberSuccess; } @@ -529,7 +529,7 @@ class Tensor : public TensorBase { CHECK_EQ(_shape > Shape::zero(TensorAPI::layout_dims::value), true) << \ "current tensor is not initialized (no shape info, use set_shape)"; typedef typename Tensor_t::Dtype dtype_t; - CHECK_LE(size() * _type_len, tensor.size() * sizeof(dtype_t)) << \ + CHECK_LE(size() * _type_len(), tensor.size() * sizeof(dtype_t)) << \ "current tensor size should <= input tensor size"; _is_shared = BufferMemShare(_buf, tensor.get_buf()) > 0; @@ -599,7 +599,7 @@ class Tensor : public TensorBase { Dtype* ptr_dst = mutable_data(); const Dtype* ptr_src = tensor.data(); process_API::sync_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \ - _type_len * valid_size(), flag_type()); + _type_len() * valid_size(), flag_type()); return SaberSuccess; } @@ -717,7 +717,7 @@ class Tensor : public TensorBase { Dtype* ptr_dst = dst + idx_dst;//_buf->get_data_mutable() + idx_dst; const Dtype* ptr_src = src + idx_src;//tensor.get_buf()->get_data() + idx_src; 
process_API::sync_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \ - _type_len * cpy_len, flag_type()); + _type_len() * cpy_len, flag_type()); } return SaberSuccess; } @@ -758,7 +758,7 @@ class Tensor : public TensorBase { Dtype* ptr_dst = mutable_data(); const Dtype* ptr_src = tensor.data(); process_API::async_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \ - _type_len * valid_size(), stream, flag_type()); + _type_len() * valid_size(), stream, flag_type()); return SaberSuccess; } @@ -876,7 +876,7 @@ class Tensor : public TensorBase { Dtype* ptr_dst = dst + idx_dst;//_buf->get_data_mutable() + idx_dst; const Dtype* ptr_src = src + idx_src;//tensor.get_buf()->get_data() + idx_src; process_API::async_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \ - _type_len * cpy_len, stream, flag_type()); + _type_len() * cpy_len, stream, flag_type()); } return SaberSuccess; } @@ -906,7 +906,9 @@ class Tensor : public TensorBase { private: ///< Length of datatype. - size_t _type_len{sizeof(Dtype)}; + size_t _type_len(){ + return sizeof(Dtype); + } ///< Represent the raw mem shape. Shape _shape; ///< Represent the mem you have right to access shape. 
From 6ea9474ee6a18ab0698299ab46e1c40012e7e8cc Mon Sep 17 00:00:00 2001 From: "weihao.huang" Date: Sun, 24 Jun 2018 05:50:24 +0000 Subject: [PATCH 038/318] Fix mem_free function --- saber/core/impl/bm/bm_impl.cpp | 21 ++++++++++++++++----- test/saber/bm/test_TargetWrapper_BM.cpp | 19 ++++++++++++------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 4aecb169d..d4a312fcf 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -65,21 +65,32 @@ void BM_API::mem_alloc(void** ptr, size_t n){ //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n)); //bm_device_mem_t mem = bm_mem_from_system(*ptr); handle = get_bm_handle(); - bm_device_mem_t *mem = new bm_device_mem_t[1]; - mem = reinterpret_cast(ptr); + //bm_device_mem_t *mem = new bm_device_mem_t[1]; + bm_device_mem_t *mem = reinterpret_cast(*ptr); BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n)); } void BM_API::mem_free(void* ptr){ //(bm_handle_t handle, bm_device_mem_t mem){ if(ptr != nullptr){ - bm_free_device(handle, bm_mem_from_system(ptr)); + //bm_free_device(handle, bm_mem_from_system(ptr)); //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); //bm_free_device(handle, *pmem); - //handle = get_bm_handle(); - //bm_free_device(handle, reinterpret_cast(*ptr)); + handle = get_bm_handle(); + bm_device_mem_t *mem = reinterpret_cast(ptr); + //bm_free_device(handle, reinterpret_cast(ptr)); + bm_free_device(handle, *mem); } } + +void BM_API::mem_free_BM(bm_device_mem_t mem){ + //(bm_handle_t handle, bm_device_mem_t mem){ + if(&mem != nullptr){ + handle = get_bm_handle(); + bm_free_device(handle, mem); + } +} + void BM_API::mem_set(void* ptr, int value, size_t n){ //(bm_handle_t handle, const int value, bm_device_mem_t mem){ diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp index a76bef279..c50df3fa3 100644 --- a/test/saber/bm/test_TargetWrapper_BM.cpp +++ 
b/test/saber/bm/test_TargetWrapper_BM.cpp @@ -8,15 +8,20 @@ static bm_handle_t handle; int main() { bmdnn_init(&handle); typedef TargetWrapper API; - void *pmem; int dev_count = 0; API::get_device_count(dev_count); - std::cout << dev_count << std::endl; - API::mem_alloc(&pmem, 3*200*200); - //API::mem_free(pmem); - std::cout << "Press any key to finish execution." << std::endl; - int a; - std::cin >> a; + std::cout << "dev_count: " << dev_count << std::endl; + + //void *pmem; + bm_device_mem_t *pmem = new bm_device_mem_t(); + std::cout << "mem addr before mem_alloc: " << pmem << std::endl; + API::mem_alloc(&pmem, 3*200*400); + std::cout << "mem addr after mem_alloc: " << pmem << std::endl; + + bm_device_mem_t *test = reinterpret_cast(pmem); + API::mem_free_BM((bm_device_mem_t)(*test)); + std::cout << "End mem_free test." << std::endl; + delete pmem; bmdnn_deinit(handle); } #endif From 89a645f0b1fc40ca22d0602491c1495c726e0f3b Mon Sep 17 00:00:00 2001 From: "weihao.huang" Date: Sun, 24 Jun 2018 07:26:59 +0000 Subject: [PATCH 039/318] Fix mem_free function --- saber/core/target_wrapper.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index e724235d8..1f283a004 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -380,7 +380,9 @@ struct TargetWrapper { //template static void mem_free(void * ptr); - + + static void mem_free_BM(bm_device_mem_t mem); + //template static void mem_set(void* ptr, int value, size_t n); From 12e7c27e643514e3d3a7db6637e992f943c089aa Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Sun, 24 Jun 2018 19:55:07 +0800 Subject: [PATCH 040/318] change mem_free_BM to mem_free; tensor test passed --- saber/core/impl/bm/bm_impl.cpp | 22 +--------------------- test/saber/bm/test_TargetWrapper_BM.cpp | 12 +++++------- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 
d4a312fcf..e2e5b9e65 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -60,37 +60,17 @@ int BM_API::get_device_id(){ } void BM_API::mem_alloc(void** ptr, size_t n){ - //(bm_handle_t handle, bm_device_mem_t* pmem, unsigned int n) - //bm_device_mem_t* pmem = (struct bm_mem_desc *)(*ptr); - //BMDNN_CHECK(bm_malloc_device_byte(handle, pmem, n)); - //bm_device_mem_t mem = bm_mem_from_system(*ptr); handle = get_bm_handle(); - //bm_device_mem_t *mem = new bm_device_mem_t[1]; bm_device_mem_t *mem = reinterpret_cast(*ptr); BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n)); } void BM_API::mem_free(void* ptr){ - //(bm_handle_t handle, bm_device_mem_t mem){ if(ptr != nullptr){ - //bm_free_device(handle, bm_mem_from_system(ptr)); - //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); - //bm_free_device(handle, *pmem); handle = get_bm_handle(); - bm_device_mem_t *mem = reinterpret_cast(ptr); - //bm_free_device(handle, reinterpret_cast(ptr)); - bm_free_device(handle, *mem); + bm_free_device(handle, *(struct bm_mem_desc*)(ptr)); } } - -void BM_API::mem_free_BM(bm_device_mem_t mem){ - //(bm_handle_t handle, bm_device_mem_t mem){ - if(&mem != nullptr){ - handle = get_bm_handle(); - bm_free_device(handle, mem); - } -} - void BM_API::mem_set(void* ptr, int value, size_t n){ //(bm_handle_t handle, const int value, bm_device_mem_t mem){ diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp index c50df3fa3..9d445f16a 100644 --- a/test/saber/bm/test_TargetWrapper_BM.cpp +++ b/test/saber/bm/test_TargetWrapper_BM.cpp @@ -8,18 +8,16 @@ static bm_handle_t handle; int main() { bmdnn_init(&handle); typedef TargetWrapper API; - int dev_count = 0; - API::get_device_count(dev_count); - std::cout << "dev_count: " << dev_count << std::endl; + //int dev_count = 0; + //API::get_device_count(dev_count); + //std::cout << "dev_count: " << dev_count << std::endl; - //void *pmem; bm_device_mem_t *pmem = new bm_device_mem_t(); 
std::cout << "mem addr before mem_alloc: " << pmem << std::endl; API::mem_alloc(&pmem, 3*200*400); std::cout << "mem addr after mem_alloc: " << pmem << std::endl; - - bm_device_mem_t *test = reinterpret_cast(pmem); - API::mem_free_BM((bm_device_mem_t)(*test)); + std::cout << "Start mem_free test." << pmem << std::endl; + API::mem_free(pmem); std::cout << "End mem_free test." << std::endl; delete pmem; bmdnn_deinit(handle); From 8a14228683d62dee4c1611246f1b2cdb6b7715c8 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Mon, 25 Jun 2018 10:44:03 +0800 Subject: [PATCH 041/318] remove stream test in context --- test/saber/bm/test_saber_context_BM.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp index e221ba8f4..ed93866cf 100644 --- a/test/saber/bm/test_saber_context_BM.cpp +++ b/test/saber/bm/test_saber_context_BM.cpp @@ -12,11 +12,8 @@ TEST(TestSaberContextBM, test_BM_context) { LOG(INFO) << "test context constructor"; Context ctx0; Context ctx1(0, 1, 1); - LOG(INFO) << "test record event to context data stream and compute stream"; - API::record_event(event, ctx0.get_data_stream()); - API::record_event(event, ctx0.get_compute_stream()); - API::record_event(event, ctx1.get_data_stream()); - API::record_event(event, ctx1.get_compute_stream()); + + //for BM no need to test stream as it is not in use } #endif From e318008f7367822a7b4c6771aff457f63ba3d454 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Mon, 25 Jun 2018 11:46:19 +0800 Subject: [PATCH 042/318] Update buffer test for BM --- test/saber/bm/test_saber_buffer_BM.cpp | 68 +++++++++++++++----------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index 93aa6d36e..ea8d7101d 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -4,12 +4,22 @@ using namespace 
anakin::saber; -template +static bm_handle_t handle; + +int get_bm_size() { + return 1; +} + +template void test_buffer() { + //TODO: init in another place + bmdnn_init(&handle); + typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; - typedef typename DataTrait::dtype Dtype; + typedef typename DataTrait::dtype Ddtype; + typedef typename DataTrait::dtype Hdtype; typedef Buffer BufferH; typedef Buffer BufferD; @@ -17,30 +27,30 @@ void test_buffer() { int n1 = 2048; void* tmp_x86; - Dtype* x86_ptr; - X86_API::mem_alloc(&tmp_x86, sizeof(Dtype) * n0); - x86_ptr = static_cast(tmp_x86); + Hdtype* x86_ptr; + X86_API::mem_alloc(&tmp_x86, sizeof(Hdtype) * n0); + x86_ptr = static_cast(tmp_x86); for (int i = 0; i < n0; i++) { - x86_ptr[i] = static_cast(i); + x86_ptr[i] = static_cast(i); } void* tmp_bm; - Dtype* bm_ptr; - BM_API::mem_alloc(&tmp_bm, sizeof(Dtype) * n0); - bm_ptr = static_cast(tmp_bm); + Ddtype* bm_ptr; + BM_API::mem_alloc(&tmp_bm, get_bm_size() * n0); + bm_ptr = static_cast(tmp_bm); LOG(INFO) << "Buffer: test default(empty) constructor"; BufferH x86_buf0; BufferD bm_buf0; LOG(INFO) << "Buffer: test constructor with data size"; - BufferH x86_buf1(n0 * sizeof(Dtype)); - BufferD bm_buf1(n0 * sizeof(Dtype)); + BufferH x86_buf1(n0 * sizeof(Hdtype)); + BufferD bm_buf1(n0 * sizeof(Ddtype)); LOG(INFO) << "Buffer: test constructor with data pointer, size and device id"; - BufferH x86_buf2(x86_ptr, n0 * sizeof(Dtype), X86_API::get_device_id()); - BufferD bm_buf2(bm_ptr, n0 * sizeof(Dtype), BM_API::get_device_id()); + BufferH x86_buf2(x86_ptr, n0 * sizeof(Hdtype), X86_API::get_device_id()); + BufferD bm_buf2(bm_ptr, n0 * get_bm_size(), BM_API::get_device_id()); LOG(INFO) << "Buffer: test copy constructor"; BufferH x86_buf3(x86_buf2); @@ -62,18 +72,18 @@ void test_buffer() { "shared buffer should have same data count"; LOG(INFO) << "Buffer: test re_alloc"; - x86_buf1.re_alloc(n1 * sizeof(Dtype)); - bm_buf1.re_alloc(n1 * sizeof(Dtype)); - 
CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error"; - CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error"; - CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Dtype)) << "buffer count error"; - CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error"; - x86_buf1.re_alloc(n0 * sizeof(Dtype)); - bm_buf1.re_alloc(n0 * sizeof(Dtype)); - CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error"; - CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error"; - CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Dtype)) << "buffer count error"; - CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error"; + x86_buf1.re_alloc(n1 * sizeof(Hdtype)); + bm_buf1.re_alloc(n1 * sizeof(Ddtype)); + CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Hdtype)) << "buffer count error"; + CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error"; + CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Ddtype)) << "buffer count error"; + CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Ddtype)) << "buffer capacity error"; + x86_buf1.re_alloc(n0 * sizeof(Hdtype)); + bm_buf1.re_alloc(n0 * sizeof(Ddtype)); + CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error"; + CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error"; + CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error"; + CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error"; LOG(INFO) << "Buffer: test get_id()"; LOG(INFO) << "X86 device id: " << x86_buf0.get_id() << \ @@ -84,8 +94,8 @@ void test_buffer() { LOG(INFO) << "Buffer: test deep_cpy()"; x86_buf1.sync_copy_from(x86_buf2); LOG(INFO) << "deep copy between two host buffer: "; - const Dtype* ptr1 = static_cast(x86_buf1.get_data()); - const Dtype* ptr2 = static_cast(x86_buf1.get_data()); + const Hdtype* ptr1 = static_cast(x86_buf1.get_data()); + const Hdtype* ptr2 = 
static_cast(x86_buf1.get_data()); for (int i = 0; i < 10; i++) { std::cout << ptr1[i] << std::endl; @@ -96,7 +106,7 @@ void test_buffer() { bm_buf1.sync_copy_from(x86_buf2); x86_buf1.sync_copy_from(bm_buf1); LOG(INFO) << "deep copy from device buffer to host buffer: "; - ptr1 = static_cast(x86_buf1.get_data()); + ptr1 = static_cast(x86_buf1.get_data()); for (int i = 0; i < 10; i++) { std::cout << ptr1[i] << std::endl; @@ -104,7 +114,7 @@ void test_buffer() { } TEST(TestSaberBufferBM, test_buffer_memcpy) { - test_buffer(); + test_buffer(); } int main(int argc, const char** argv) { From fa95d52c35c18bd9e339517d95a7af4e173e7dbb Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Mon, 25 Jun 2018 13:26:51 +0800 Subject: [PATCH 043/318] Specialization for Env --- saber/core/env.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 saber/core/env.cpp diff --git a/saber/core/env.cpp b/saber/core/env.cpp new file mode 100644 index 000000000..b294fead4 --- /dev/null +++ b/saber/core/env.cpp @@ -0,0 +1,19 @@ +#include "env.h" + +namespace anakin { + + namespace saber { + +#ifdef USE_BM + + template<> + void Env::env_init(int max_stream){ + //TODO: decide what to put here + LOG(INFO) << "env init for BM"; + } + +#endif + + + } +} From d70704c7ab2141bf081f81018106a983f144274c Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Mon, 25 Jun 2018 14:07:30 +0800 Subject: [PATCH 044/318] env skip bm --- saber/core/env.cpp | 19 ------------------- saber/core/env.h | 5 +++++ 2 files changed, 5 insertions(+), 19 deletions(-) delete mode 100644 saber/core/env.cpp diff --git a/saber/core/env.cpp b/saber/core/env.cpp deleted file mode 100644 index b294fead4..000000000 --- a/saber/core/env.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include "env.h" - -namespace anakin { - - namespace saber { - -#ifdef USE_BM - - template<> - void Env::env_init(int max_stream){ - //TODO: decide what to put here - LOG(INFO) << "env init for BM"; - } - -#endif - - - } -} diff --git 
a/saber/core/env.h b/saber/core/env.h index 3ae42165b..ceabb868c 100644 --- a/saber/core/env.h +++ b/saber/core/env.h @@ -16,6 +16,7 @@ #define ANAKIN_SABER_CORE_ENV_H #include "core/device.h" +#include namespace anakin{ @@ -31,6 +32,10 @@ class Env { return *_g_env; } static void env_init(int max_stream = 4){ + if(std::is_same::value){ + LOG(INFO) << "env init for BM"; + return; + } Devs& devs = cur_env(); if (devs.size() > 0){ return; From 03fa00150613bc4be5f66112d4d074bdc59c58eb Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Mon, 25 Jun 2018 15:10:32 +0800 Subject: [PATCH 045/318] modify mem_alloc for void* --- saber/core/impl/bm/bm_impl.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index e2e5b9e65..c93703a5d 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -61,8 +61,10 @@ int BM_API::get_device_id(){ void BM_API::mem_alloc(void** ptr, size_t n){ handle = get_bm_handle(); - bm_device_mem_t *mem = reinterpret_cast(*ptr); + /* bm_device_mem_t *mem = reinterpret_cast(*ptr); */ + bm_device_mem_t *mem = new bm_device_mem_t(); BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n)); + *ptr = mem; } void BM_API::mem_free(void* ptr){ From f601c0b4b2ac6698ca528dec0c2aca0997554789 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Mon, 25 Jun 2018 15:40:43 +0800 Subject: [PATCH 046/318] Specialization for copy_from --- saber/core/tensor.cpp | 39 --------------------------------------- saber/core/tensor.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 39 deletions(-) delete mode 100644 saber/core/tensor.cpp diff --git a/saber/core/tensor.cpp b/saber/core/tensor.cpp deleted file mode 100644 index 081854c86..000000000 --- a/saber/core/tensor.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "tensor.h" - -#ifdef USE_BM -#include "bmlib_runtime.h" -#include "bmdnn_api.h" -#include "bmlib_utils.h" -#endif - -namespace anakin { - -namespace 
saber { - -#ifdef USE_BM - -template<> -size_t Tensor::_type_len(){ - return 1; -} - -template<> -template<> -SaberStatus Tensor::copy_from(const Tensor& tensor) { - auto* device_data_ptr = mutable_data(); - BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast(tensor.data())))); - return SaberSuccess; -} - -template<> -template<> -SaberStatus Tensor::copy_from(const Tensor& tensor) { - auto* device_data_ptr = const_cast(tensor.data()); - BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); - return SaberSuccess; -} - -#endif - -} -} \ No newline at end of file diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 086436c0e..3051e16c6 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -725,6 +725,7 @@ class Tensor : public TensorBase { #ifdef USE_BM template SaberStatus copy_from(const Tensor& tensor) { + LOG(INFO) << "base copy_from"; return SaberInvalidValue; } #endif @@ -939,6 +940,33 @@ class Tensor : public TensorBase { std::vector _seq_offset; }; +#ifdef USE_BM + +template<> inline +size_t Tensor::_type_len(){ + return 1; +} + +template<> +template<> inline +SaberStatus Tensor::copy_from(const Tensor& tensor) { + LOG(INFO) << "BM copy_from"; + auto* device_data_ptr = mutable_data(); + BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast(tensor.data())))); + return SaberSuccess; +} + +template<> +template<> inline +SaberStatus Tensor::copy_from(const Tensor& tensor) { + LOG(INFO) << "X86 copy_from"; + auto* device_data_ptr = const_cast(tensor.data()); + BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); + return SaberSuccess; +} + +#endif + } //namespace saber } //namespace anakin From a8eef8ba0730205c2e7a70784e4e71a2021759e7 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Mon, 25 Jun 2018 15:50:04 +0800 Subject: [PATCH 047/318] Revert speical 
handling for Env --- saber/core/env.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/saber/core/env.h b/saber/core/env.h index ceabb868c..3ae42165b 100644 --- a/saber/core/env.h +++ b/saber/core/env.h @@ -16,7 +16,6 @@ #define ANAKIN_SABER_CORE_ENV_H #include "core/device.h" -#include namespace anakin{ @@ -32,10 +31,6 @@ class Env { return *_g_env; } static void env_init(int max_stream = 4){ - if(std::is_same::value){ - LOG(INFO) << "env init for BM"; - return; - } Devs& devs = cur_env(); if (devs.size() > 0){ return; From d6457d9553b9ea382853c069df8d63dda9ed6786 Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Mon, 25 Jun 2018 17:29:29 +0800 Subject: [PATCH 048/318] add conv op, did't test --- saber/core/impl/bm/bm_impl.cpp | 1 + saber/funcs/impl/bm/vender_conv.h | 167 ++++-------------------- test/saber/bm/test_TargetWrapper_BM.cpp | 6 +- 3 files changed, 29 insertions(+), 145 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index c93703a5d..1bdb5d140 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -71,6 +71,7 @@ void BM_API::mem_free(void* ptr){ if(ptr != nullptr){ handle = get_bm_handle(); bm_free_device(handle, *(struct bm_mem_desc*)(ptr)); + delete ptr; } } diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 7efdfa611..a0a3b3fb5 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -1,18 +1,3 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H @@ -44,105 +29,13 @@ class VenderConv2D& inputs, std::vector& outputs, ConvParam& param, Context& ctx) { - - // ---- init cudnn resources ---- - - _workspaceSizeInBytes = 0; - _workspaceData = NULL; - - _workspace_fwd_sizes = 0; - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - _workspace = NULL; - - int in_channels = inputs[0]->channel(); - - // ---- create cudnn Descs ---- - cudnn::createFilterDesc(&_filter_desc); - - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - cudnn::createConvolutionDesc(&_conv_descs); - - if (param.bias()->size() > 0) { - cudnn::createTensorDesc(&_bias_desc); - } - - cudnnCreateTensorDescriptor(&_input_nchw_descs); - cudnnCreateTensorDescriptor(&_output_nchw_descs); - return create(inputs, outputs, param, ctx); } @@ -150,46 +43,36 @@ class VenderConv2D& outputs, ConvParam& param, Context& ctx); - //call cudnnConvolutionForward here virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, - ConvParam& param); + ConvParam& param) { + const InDataType *in_data = (const InDataType *) inputs[0]->data(); + const InDataType *weight = (const InDataType *) param.weight()->data(); + const InDataType *bias = (const InDataType *) param.bias()->data(); + OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + int input_n = inputs[0]->num(); + int input_c = inputs[0]->channel(); + int input_h = inputs[0]->height(); + int input_w = inputs[0]->width(); + int group = param.group; + int output_c = outputs[0]->channel(); + int kh = param.weight()->height(); + int kw = param.weight()->width(); + int pad_h = 
param.pad_h; + int pad_w = param.pad_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + BMDNN_CHECK(bmdnn_conv_forward(_handle, in_data, weights, bias, + input_n, input_c, input_h, input_w, group, output_c, + kh, kw, pad_h, pad_w, stride_h, stride_w, 1, 0, 0, + out_data, NULL)); + return SaberSuccess; + } private: cudnnHandle_t _handle; - cudnnConvolutionFwdAlgo_t _fwd_algo; - - cudnnTensorDescriptor_t _input_descs; - cudnnTensorDescriptor_t _output_descs; - cudnnTensorDescriptor_t _bias_desc; - - cudnnFilterDescriptor_t _filter_desc; - - cudnnConvolutionDescriptor_t _conv_descs; - - size_t _workspace_fwd_sizes; - size_t _workspaceSizeInBytes; // size of underlying storage - - void *_workspaceData; // underlying storage - void *_workspace; // aliases into _workspaceData - - const bool _use_tensor_core = true; - const size_t _workspace_limit_bytes = 64 * 1024 * 1024; - const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; - - // create transform descriptor - cudnnTensorDescriptor_t _input_nchw_descs; - cudnnTensorDescriptor_t _output_nchw_descs; - - void *x8_data; - void *y8_data; - - int x8_data_size; - int y8_data_size; }; - } - } #endif //ANAKIN_SABER_FUNCS_BMDNN_CONV2D_H diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp index 9d445f16a..b893183a2 100644 --- a/test/saber/bm/test_TargetWrapper_BM.cpp +++ b/test/saber/bm/test_TargetWrapper_BM.cpp @@ -12,14 +12,14 @@ int main() { //API::get_device_count(dev_count); //std::cout << "dev_count: " << dev_count << std::endl; - bm_device_mem_t *pmem = new bm_device_mem_t(); + //bm_device_mem_t *pmem = new bm_device_mem_t(); + void* pmem; std::cout << "mem addr before mem_alloc: " << pmem << std::endl; API::mem_alloc(&pmem, 3*200*400); std::cout << "mem addr after mem_alloc: " << pmem << std::endl; - std::cout << "Start mem_free test." << pmem << std::endl; + std::cout << "Start mem_free test." 
<< std::endl; API::mem_free(pmem); std::cout << "End mem_free test." << std::endl; - delete pmem; bmdnn_deinit(handle); } #endif From 08bd31288cc014bef30a11be509039cfba39c04b Mon Sep 17 00:00:00 2001 From: "weihao.huang" Date: Mon, 25 Jun 2018 13:21:36 +0000 Subject: [PATCH 049/318] Add sync_memcpy function & fix test_saber_buffer_BM --- saber/core/impl/bm/bm_impl.cpp | 19 +++++++++++++++++++ saber/core/target_wrapper.h | 6 ++---- test/saber/bm/test_saber_buffer_BM.cpp | 21 ++++++++++++++++++++- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 1bdb5d140..dacca58b6 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -82,6 +82,25 @@ void BM_API::mem_set(void* ptr, int value, size_t n){ //BMDNN_CHECK(bm_memset_device(handle, value, *pmem)); } +//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ +// size_t count, __DtoD) {}; + +//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ +// size_t count, __HtoD) {}; + +void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoH) { + handle = get_bm_handle(); + //auto* dev_ptr = const_cast(src); + BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); + //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *src)); + LOG(INFO) << "End sync_memcpy process"; +}; + +//static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ +// int src_dev, size_t count) {}; + + //! 
target wrapper template struct TargetWrapper; diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 1f283a004..2a2a4be88 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -379,9 +379,7 @@ struct TargetWrapper { static void mem_alloc(void** ptr, size_t n); //template - static void mem_free(void * ptr); - - static void mem_free_BM(bm_device_mem_t mem); + static void mem_free(void * ptr); //template static void mem_set(void* ptr, int value, size_t n); @@ -406,7 +404,7 @@ struct TargetWrapper { size_t count, __HtoD) {}; static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoH) {}; + size_t count, __DtoH); static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ int src_dev, size_t count) {}; diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index ea8d7101d..434bd221a 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -104,11 +104,30 @@ void test_buffer() { CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect"; LOG(INFO) << "deep copy from host buffer to device buffer"; bm_buf1.sync_copy_from(x86_buf2); + + /* + const Hdtype* x86_buf2_ptr = static_cast(x86_buf2.get_data()); + for (int i = 0; i < 10; i++) { + std::cout << "x86: " << x86_buf2_ptr[i] << std::endl; + } + + const Hdtype* bm_buf1_ptr = static_cast(bm_buf1.get_data()); + for (int i = 0; i < 10; i++) { + std::cout << "bm: " << bm_buf1_ptr[i] << std::endl; + } + + LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count(); + LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); + LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype); + LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype); + */ + + x86_buf1.re_alloc(bm_buf1.get_capacity()); x86_buf1.sync_copy_from(bm_buf1); LOG(INFO) << "deep copy from device buffer to host 
buffer: "; ptr1 = static_cast(x86_buf1.get_data()); - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 30; i++) { std::cout << ptr1[i] << std::endl; } } From b95facb0c9c3ff9539ada6016483344daffa094d Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Mon, 25 Jun 2018 21:55:27 +0800 Subject: [PATCH 050/318] init handle for tensor test --- test/saber/bm/test_saber_tensor_BM.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index ed3ff0503..d42665528 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -625,6 +625,10 @@ TEST(TestSaberTensorBM, test_tensor_base_type) { }*/ int main(int argc, const char** argv) { + //TODO: init in another place + static bm_handle_t handle; + bmdnn_init(&handle); + // initial logger logger::init(argv[0]); InitTest(); From 001f2bd4ecdd24815f8ad8d33fee34bed3f503a9 Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Mon, 25 Jun 2018 22:06:13 +0800 Subject: [PATCH 051/318] init handle for BM context test --- test/saber/bm/test_saber_context_BM.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp index ed93866cf..f2df59c88 100644 --- a/test/saber/bm/test_saber_context_BM.cpp +++ b/test/saber/bm/test_saber_context_BM.cpp @@ -19,6 +19,10 @@ TEST(TestSaberContextBM, test_BM_context) { #endif int main(int argc, const char** argv) { + //TODO: init in another place + static bm_handle_t handle; + bmdnn_init(&handle); + // initial logger logger::init(argv[0]); InitTest(); From 2f1a8bafc5ec164e7e467fbd8fa45081864bd715 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Tue, 26 Jun 2018 09:12:18 +0800 Subject: [PATCH 052/318] handle init rearrange --- test/saber/bm/test_saber_buffer_BM.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/saber/bm/test_saber_buffer_BM.cpp 
b/test/saber/bm/test_saber_buffer_BM.cpp index 434bd221a..00f77d308 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -4,8 +4,6 @@ using namespace anakin::saber; -static bm_handle_t handle; - int get_bm_size() { return 1; } @@ -13,9 +11,6 @@ int get_bm_size() { template void test_buffer() { - //TODO: init in another place - bmdnn_init(&handle); - typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; typedef typename DataTrait::dtype Ddtype; @@ -137,6 +132,10 @@ TEST(TestSaberBufferBM, test_buffer_memcpy) { } int main(int argc, const char** argv) { + //TODO: init in another place + static bm_handle_t handle; + bmdnn_init(&handle); + // initial logger logger::init(argv[0]); InitTest(); From 094e7b664e565ae907323e8f4a8a9278ed997e9f Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Tue, 26 Jun 2018 09:43:57 +0800 Subject: [PATCH 053/318] add pooling wrapper, didn't test --- saber/funcs/impl/bm/vender_pooling.h | 95 +++++--------------- test/saber/bm/test_saber_func_pooling_BM.cpp | 33 ++----- 2 files changed, 30 insertions(+), 98 deletions(-) diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h index 4990a5357..0da1a1106 100644 --- a/saber/funcs/impl/bm/vender_pooling.h +++ b/saber/funcs/impl/bm/vender_pooling.h @@ -44,78 +44,19 @@ class VenderPooling& inputs, std::vector& outputs, PoolingParam &pooling_param, Context &ctx) { - - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - - cudnn::create_pooling_des(&_pooling_descs); - return create(inputs, outputs, pooling_param, ctx); } virtual SaberStatus create(const std::vector& inputs, std::vector& outputs, PoolingParam &pooling_param, Context &ctx) { - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - 
CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - - Shape stride_in = inputs[0]->get_stride(); - Shape stride_out = outputs[0]->get_stride(); - - int dim_a[] = {input_num, input_channel, - input_height, input_width}; - - int dim_b[] = {input_num, output_channel, - output_height, output_width}; - - cudnn::setTensorNdDesc(&_input_descs, - inputs[0]->dims(), dim_a, &stride_in[0]); - - cudnn::setTensorNdDesc(&_output_descs, - outputs[0]->dims(), dim_b, &stride_out[0]); - - int windowHeight[] = {pooling_param.window_h, pooling_param.window_w}; - int padding[] = {pooling_param.pad_h, pooling_param.pad_w}; - - int stride[] = {pooling_param.stride_h, pooling_param.stride_w}; - - cudnn::set_nd_pooling_des(&_pooling_descs, pooling_param.pooling_type, - inputs[0]->dims() - 2, windowHeight, - padding,stride); - return SaberSuccess; } virtual SaberStatus dispatch(const std::vector& inputs, @@ -123,23 +64,31 @@ class VenderPooling ¶m) { const InDataType *in_data = inputs[0]->data(); OutDataType *out_data = outputs[0]->mutable_data(); - - CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - + int input_n = inputs[0]->num(); + int input_c = inputs[0]->channel(); + int input_h = inputs[0]->height(); + int input_w = inputs[0]->width(); + int kh = param.window_h; + int kw = param.window_w; + int pad_h = param.pad_h; + int pad_w = param.pad_w; + int stride_h = param.stride_h; + int 
stride_w = param.stride_w; + if(_pooling_type == Pooling_max){ + int is_avg_pooling = 0; + } else { + int is_avg_pooling = 1; + } + BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, + input_n, input_c, input_h, input_w, kh, hw, pad_h, pad_w, + stride_h, stride_w, is_avg_pooling, 0, + out_data, NULL, NULL)); return SaberSuccess; } private: - cudnnHandle_t _handle; - cudnnTensorDescriptor_t _input_descs; - cudnnTensorDescriptor_t _output_descs; - cudnnPoolingDescriptor_t _pooling_descs; - + bm_handle_t _handle; + PoolType _pooling_type; }; template class VenderPooling; diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index 04b963675..ce8e7f8f5 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -18,7 +18,7 @@ TEST(TestSaberFuncBM, test_func_pooling) { typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; typedef Tensor TensorHf4; - typedef Tensor TensorDf4; + typedef Tensor TensorDf4; int img_num = 1; int in_channels = 4; @@ -71,7 +71,7 @@ TEST(TestSaberFuncBM, test_func_pooling) { input.push_back(&img_dev); output.push_back(&output_dev); - Pooling pooling; + Pooling pooling; pooling.compute_output_shape(input, output, param); output_dev.re_alloc(output[0]->shape()); @@ -92,15 +92,12 @@ TEST(TestSaberFuncBM, test_func_pooling) { } output_dev.sync(); - cudaDeviceSynchronize(); LOG(INFO) << " average time: " << t1.get_average_ms() << " ms"; LOG(INFO) << " tile 10% time: " << t1.get_tile_time(10) << " ms"; LOG(INFO) << " tile 50% time: " << t1.get_tile_time(50) << " ms"; LOG(INFO) << " tile 90% time: " << t1.get_tile_time(90) << " ms"; LOG(INFO) << " tile 95% time: " << t1.get_tile_time(95) << " ms"; LOG(INFO) << " tile 99% time: " << t1.get_tile_time(99) << " ms"; - - CUDA_CHECK(cudaPeekAtLastError()); } TEST(TestSaberFuncBM, test_pooling_result) { @@ -113,7 +110,7 @@ TEST(TestSaberFuncBM, test_pooling_result) { typedef TargetWrapper X86_API; 
typedef TargetWrapper BM_API; typedef Tensor TensorHf4; - typedef Tensor TensorDf4; + typedef Tensor TensorDf4; int img_num = 1; int in_channels = 2; @@ -166,7 +163,7 @@ TEST(TestSaberFuncBM, test_pooling_result) { input.push_back(&img_dev); output.push_back(&output_dev); - Pooling pooling; + Pooling pooling; pooling.compute_output_shape(input, output, param); output_dev.re_alloc(output[0]->shape()); @@ -174,14 +171,9 @@ TEST(TestSaberFuncBM, test_pooling_result) { // init assume output tensor has been reshpaed by user. pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); pooling(input, output, param, ctx1); - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); output_dev.sync(); print_tensor_device(output_dev); - - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); } TEST(TestSaberFuncBM, test_pooling_shared_buffer) { @@ -194,7 +186,7 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; typedef Tensor TensorHf4; - typedef Tensor TensorDf4; + typedef Tensor TensorDf4; int img_num = 1; int in_channels = 2; @@ -257,9 +249,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { input.push_back(&img_dev); output.push_back(&output_dev); - Pooling pooling; - Pooling pooling0; - Pooling pooling1; + Pooling pooling; + Pooling pooling0; + Pooling pooling1; pooling.compute_output_shape(input,output, param); @@ -286,19 +278,10 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { pooling1.init(input1, output1, param, SPECIFY, VENDER_IMPL, ctx1); pooling1(input1, output1, param, ctx1); - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - out0.record_event(cuda_stream); - - cudaStream_t cuda_stream1 = ctx1.get_compute_stream(); - out1.record_event(cuda_stream1); - out0.sync(); out1.sync(); print_tensor_device(output_dev); - - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); } int main(int argc, const char** argv) { From 
b3e78bf64175766b8e18b0218431b8dbc8c114b0 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Tue, 26 Jun 2018 09:44:43 +0800 Subject: [PATCH 054/318] ptr2 should be from buf2 --- test/saber/bm/test_saber_buffer_BM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index 00f77d308..9910638fb 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -90,7 +90,7 @@ void test_buffer() { x86_buf1.sync_copy_from(x86_buf2); LOG(INFO) << "deep copy between two host buffer: "; const Hdtype* ptr1 = static_cast(x86_buf1.get_data()); - const Hdtype* ptr2 = static_cast(x86_buf1.get_data()); + const Hdtype* ptr2 = static_cast(x86_buf2.get_data()); for (int i = 0; i < 10; i++) { std::cout << ptr1[i] << std::endl; From 82f81aaa793e6c1244b5d9b518ed292021d6ec49 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Tue, 26 Jun 2018 10:03:24 +0800 Subject: [PATCH 055/318] Restrict copy_from for different types --- saber/core/tensor.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 3051e16c6..32ad81ac3 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -725,7 +725,7 @@ class Tensor : public TensorBase { #ifdef USE_BM template SaberStatus copy_from(const Tensor& tensor) { - LOG(INFO) << "base copy_from"; + LOG(WARNING) << "Invalid: copy_from is not allowed for current type."; return SaberInvalidValue; } #endif @@ -951,6 +951,8 @@ template<> template<> inline SaberStatus Tensor::copy_from(const Tensor& tensor) { LOG(INFO) << "BM copy_from"; + CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; + auto* device_data_ptr = mutable_data(); BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast(tensor.data())))); return SaberSuccess; @@ -960,6 +962,8 @@ template<> template<> inline SaberStatus 
Tensor::copy_from(const Tensor& tensor) { LOG(INFO) << "X86 copy_from"; + CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; + auto* device_data_ptr = const_cast(tensor.data()); BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); return SaberSuccess; From 2e84467d981da3d6a9a94ba294a35b65913d4732 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Tue, 26 Jun 2018 11:24:19 +0800 Subject: [PATCH 056/318] Implement fill_tensor_device_rand & fill_tensor_device_const for BM No test yet --- saber/core/tensor_op.cpp | 109 ++++++++++++--------------------------- saber/core/tensor_op.h | 10 ++++ 2 files changed, 44 insertions(+), 75 deletions(-) diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 046fef53c..56d8b7244 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -363,98 +363,57 @@ Context ctx) { #endif -/*#ifdef USE_BM +#ifdef USE_BM template<> -SaberStatus -DataTensorTransformHelper::convert_weights, - Tensor >(Tensor& out_tensor, - const Tensor& in_tensor, -Context ctx) { - int input_channel = in_tensor.channel(); - int output_channel = out_tensor.shape()[1]; - // LOG(INFO)<<"input_channel = "<(rand()); } - int o_num = out_tensor.num(); - int o_channel = output_channel; - int o_height = out_tensor.height(); - int o_width = out_tensor.width(); - - int out_n_stride = o_channel * o_height * o_width; - int out_c_stride = o_height * o_width; - int out_h_stride = o_width; - - Shape in_stride = in_tensor.get_stride(); - - in_weight_data = in_tensor.data(); - char* out_weight_data = out_tensor.mutable_data(); + bm_device_mem_t* device_data_ptr = tensor.mutable_data(); + BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input))); - for (int idx = 0; idx < o_num * o_channel * o_height * o_width; ++idx) { + delete [] host_mem_input; +} - int n = (idx / (out_n_stride)) % o_num; - int in_offset = ((idx / 
(out_n_stride)) % o_num) * in_stride[0] - + ((idx / (out_c_stride)) % o_channel) * (in_stride[1] * 4) - + ((idx / (out_h_stride)) % o_height) * in_stride[2] - + (idx % o_width) * in_stride[3]; +void fill_tensor_device_rand(Tensor& tensor, float vstart, \ + float vend, typename Tensor::API::stream_t stream = NULL){ - int out_offset = ((idx / (out_n_stride)) % o_num) * out_n_stride - + ((idx / (out_c_stride)) % o_channel) * out_c_stride - + ((idx / (out_h_stride)) % o_height) * out_h_stride - + (idx % o_width); - out_weight_data[out_offset * 4 + 0] = (char)(round( - in_weight_data[in_offset + 0 * in_stride[1]] / _vector_weight_scale[n])); - out_weight_data[out_offset * 4 + 1] = (char)(round( - in_weight_data[in_offset + 1 * in_stride[1]] / _vector_weight_scale[n])); - out_weight_data[out_offset * 4 + 2] = (char)(round( - in_weight_data[in_offset + 2 * in_stride[1]] / _vector_weight_scale[n])); - out_weight_data[out_offset * 4 + 3] = (char)(round( - in_weight_data[in_offset + 3 * in_stride[1]] / _vector_weight_scale[n])); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(0, 1.f); + float *host_mem_input = new float[tensor.size()]; + for (int i = 0; i < tensor.size(); ++i) { + float random_num = vstart + (vend - vstart) * dis(gen); + host_mem_input[i] = random_num; } - return SaberSuccess; + bm_device_mem_t* device_data_ptr = tensor.mutable_data(); + BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input))); + + delete [] host_mem_input; } -template<> -SaberStatus -DataTensorTransformHelper::convert_bias, - Tensor >(Tensor& out_tensor, - const Tensor& in_tensor, -Context ctx) { - unsigned long weight_size = _vector_weight_scale.size(); - unsigned long bias_size = in_tensor.size(); - CHECK_GT(_in_scale, 0); - CHECK_GT(weight_size, 0); - CHECK_EQ(bias_size, weight_size); - const float* in_data = in_tensor.data(); - float* out_data = out_tensor.mutable_data(); +void 
fill_tensor_device_const(Tensor& tensor, float value, \ + typename Tensor::API::stream_t stream = NULL){ - for (int i = 0; i < bias_size; ++i) { - out_data[i] = in_data[i] / _in_scale / _vector_weight_scale[i]; + float *host_mem_input = new float[tensor.size()]; + for (int i = 0; i < tensor.size(); ++i) { + host_mem_input[i] = value; } - return SaberSuccess; + bm_device_mem_t* device_data_ptr = tensor.mutable_data(); + BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input))); + + delete [] host_mem_input; } -#endif*/ + +#endif } //namespace saber diff --git a/saber/core/tensor_op.h b/saber/core/tensor_op.h index 166d8f32b..c4d7a7661 100644 --- a/saber/core/tensor_op.h +++ b/saber/core/tensor_op.h @@ -171,6 +171,16 @@ class DataTensorTransformHelper{ #endif +#ifdef USE_BM + +void fill_tensor_device_const(Tensor& tensor, float value, \ + typename Tensor::API::stream_t stream = NULL); + +void fill_tensor_device_rand(Tensor& tensor, float vstart, \ + float vend, typename Tensor::API::stream_t stream = NULL); + +#endif + } // namespace saber } // namespace anakin From d9e9669c9ffcd1cf9e9c45400f838bd0eefdf5c4 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 11:51:04 +0800 Subject: [PATCH 057/318] get handle directly by calling get_handler() --- saber/core/context.h | 5 ----- saber/core/impl/bm/bm_impl.cpp | 4 ---- saber/core/target_wrapper.h | 2 -- saber/core/tensor.h | 4 ++-- 4 files changed, 2 insertions(+), 13 deletions(-) diff --git a/saber/core/context.h b/saber/core/context.h index 15ec2e0b6..1667f36e0 100644 --- a/saber/core/context.h +++ b/saber/core/context.h @@ -111,11 +111,6 @@ class Context final{ return _stream_compute; } -#ifdef USE_BM - bm_handle_t get_handler() { - return API::get_handler(); - } -#endif #ifdef USE_ARM_PLACE void set_power_mode(PowerMode mode); diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index dacca58b6..d2790d0a9 100644 --- 
a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -41,10 +41,6 @@ typedef TargetWrapper BM_API; //static bm_handle_t handle = get_bm_handle(); static bm_handle_t handle; -bm_handle_t BM_API::get_handler() { - return handle; -} - void BM_API::get_device_count(int &count) { BMDNN_CHECK(bm_dev_getcount(&count)); } diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 2a2a4be88..6e6f67b55 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -414,8 +414,6 @@ struct TargetWrapper { * @return currently activated device id */ static int get_device_id(); - - static bm_handle_t get_handler(); }; #endif //USE_BM diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 32ad81ac3..d8a319cd5 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -954,7 +954,7 @@ SaberStatus Tensor::copy_from(const Tensor CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; auto* device_data_ptr = mutable_data(); - BMDNN_CHECK(bm_memcpy_s2d(API::get_handler(), *device_data_ptr, bm_mem_from_system(const_cast(tensor.data())))); + BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast(tensor.data())))); return SaberSuccess; } @@ -965,7 +965,7 @@ SaberStatus Tensor::copy_from(const Tensor CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; auto* device_data_ptr = const_cast(tensor.data()); - BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); + BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr)); return SaberSuccess; } From 1e75380499f3d67d32e2a6a8f1a731c13c6ef5b9 Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Tue, 26 Jun 2018 12:38:48 +0800 Subject: [PATCH 058/318] modify pooling, test failed --- saber/funcs/impl/bm/vender_pooling.h | 50 +++++++------------- saber/funcs/pooling.h | 4 ++ 
test/saber/bm/test_saber_func_pooling_BM.cpp | 6 --- 3 files changed, 22 insertions(+), 38 deletions(-) diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h index 0da1a1106..b857eacdd 100644 --- a/saber/funcs/impl/bm/vender_pooling.h +++ b/saber/funcs/impl/bm/vender_pooling.h @@ -1,23 +1,7 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H #include "saber/funcs/impl/impl_pooling.h" -#include "saber/funcs/impl/cuda/cudnn_helper.h" namespace anakin{ @@ -29,12 +13,12 @@ template -class VenderPooling:\ +class VenderPooling:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PoolingParam>> { + Tensor, + Tensor, + Tensor, + PoolingParam>> { public: typedef Tensor DataTensor_in; typedef Tensor DataTensor_out; @@ -62,8 +46,8 @@ class VenderPooling& inputs, std::vector& outputs, PoolingParam ¶m) { - const InDataType *in_data = inputs[0]->data(); - OutDataType *out_data = outputs[0]->mutable_data(); + const InDataType in_data = *(inputs[0]->data()); + OutDataType out_data = *(outputs[0]->mutable_data()); int input_n = inputs[0]->num(); int input_c = inputs[0]->channel(); int input_h = inputs[0]->height(); @@ -74,27 +58,29 @@ class VenderPooling; +template class VenderPooling; } //namespace saber } // 
namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H +#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H diff --git a/saber/funcs/pooling.h b/saber/funcs/pooling.h index 09ab8029e..739d05851 100644 --- a/saber/funcs/pooling.h +++ b/saber/funcs/pooling.h @@ -27,6 +27,10 @@ #include "saber/funcs/impl/x86/saber_pooling.h" #endif +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_pooling.h" +#endif + namespace anakin { namespace saber { diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index ce8e7f8f5..2a490c588 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -12,8 +12,6 @@ TEST(TestSaberFuncBM, test_func_pooling) { Env::env_init(); typedef TargetWrapper API; - typename API::event_t event; - API::create_event(event); typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; @@ -104,8 +102,6 @@ TEST(TestSaberFuncBM, test_pooling_result) { Env::env_init(); typedef TargetWrapper API; - typename API::event_t event; - API::create_event(event); typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; @@ -180,8 +176,6 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { Env::env_init(); typedef TargetWrapper API; - typename API::event_t event; - API::create_event(event); typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; From 21b23796d4411c61c8a6f1d46578116ebf12139d Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 13:18:23 +0800 Subject: [PATCH 059/318] Implement print_tensor_device for BM --- saber/core/tensor_op.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 56d8b7244..841c9c208 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -413,6 +413,42 @@ void fill_tensor_device_const(Tensor& tensor, float value, \ delete [] host_mem_input; } +template <> +void print_tensor_device>(Tensor& 
tensor, \ + typename Tensor::API::stream_t stream) { + + LOG(INFO) << "BM device tensor data:" << tensor.size(); + + /* + const bm_device_mem_t* device_data_ptr = tensor.data(); + unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr); + bm_flush(get_bm_handle()); + float* device_data = (float*)bm_get_global_addr(gaddr); + + for (int i = 0; i < tensor.size(); ++i) { + printf("%.2f ", device_data[i]); + + if ((i + 1) % (4 * tensor.width()) == 0) { + printf("\n"); + } + }*/ + + float *host_mem = new float[tensor.size()]; + auto* device_data_ptr = const_cast(tensor.data()); + bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr); + + for (int i = 0; i < tensor.size(); ++i) { + printf("%.2f ", host_mem[i]); + + if ((i + 1) % (4 * tensor.width()) == 0) { + printf("\n"); + } + } + printf("\n"); + + delete [] host_mem; +} + #endif } //namespace saber From b5583187c7f647f50ed05fd562f3c4b8e04a21e5 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 13:25:48 +0800 Subject: [PATCH 060/318] Update BM tensor test --- saber/core/tensor_op.cpp | 2 ++ test/saber/bm/test_saber_tensor_BM.cpp | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 841c9c208..72de1d0b3 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -434,6 +434,8 @@ void print_tensor_device>(Tensor& tenso }*/ float *host_mem = new float[tensor.size()]; + bm_flush(get_bm_handle()); + auto* device_data_ptr = const_cast(tensor.data()); bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr); diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index d42665528..dfd8d90c9 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorHf4 thost1(sh1); TensorDf4 tdev1(sh1); - //! 
test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; @@ -65,17 +64,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { // host to device tdev1.copy_from(thost0); - //TODO: print tensor for BM device - //print_tensor_host(tdev1); + print_tensor_device(tdev1); // device to host thost1.copy_from(tdev1); print_tensor_host(thost1); - /* - // device to device + //device to device tdev1.copy_from(tdev0); + print_tensor_device(tdev1); + /* //! test tensor constructor with shape and real_shape LOG(INFO) << "test tensor constructor with shape and real_shape"; //! constructor with 3 shapes is removed From 99493a44021b5468d8400e7a0e9373ae4d9f4464 Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Tue, 26 Jun 2018 13:38:28 +0800 Subject: [PATCH 061/318] fix pooling api error --- saber/funcs/impl/bm/vender_pooling.h | 3 +-- saber/funcs/pooling.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h index b857eacdd..108a70708 100644 --- a/saber/funcs/impl/bm/vender_pooling.h +++ b/saber/funcs/impl/bm/vender_pooling.h @@ -67,8 +67,7 @@ class VenderPooling +#else template +#endif class Pooling : public BaseFunc< Tensor, Tensor, From 1f02e147afea710c19869c523b73e8086436ed5e Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 14:50:10 +0800 Subject: [PATCH 062/318] Update pooling test --- test/saber/bm/test_saber_func_pooling_BM.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index 2a490c588..944ab6a18 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -9,8 +9,6 @@ using namespace anakin::saber; TEST(TestSaberFuncBM, test_func_pooling) { - - Env::env_init(); typedef TargetWrapper API; typedef TargetWrapper X86_API; @@ -42,6 +40,8 @@ 
TEST(TestSaberFuncBM, test_func_pooling) { // start Reshape & doInfer + LOG(INFO) << "init env..."; + Env::env_init(); Context ctx1(0, 1, 1); int window_h = 2; int window_w = 2; @@ -279,6 +279,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { } int main(int argc, const char** argv) { + //TODO: init in another place + static bm_handle_t handle; + bmdnn_init(&handle); // initial logger //logger::init(argv[0]); InitTest(); From a1e82149076024dc85006d4cf8794db96069b06f Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 15:01:16 +0800 Subject: [PATCH 063/318] Skip context init for BM --- saber/core/context.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/saber/core/context.h b/saber/core/context.h index 1667f36e0..a661cce46 100644 --- a/saber/core/context.h +++ b/saber/core/context.h @@ -17,6 +17,7 @@ #include "core/env.h" #include "saber/saber_types.h" +#include #ifdef USE_BM #include "bmlib_runtime.h" @@ -40,6 +41,11 @@ class Context final{ * @param compute_stream_id */ Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){ + if(std::is_same::value){ + LOG(INFO) << "context init for BM"; + return; + } + CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!"; if (device_id >= devs.size()){ LOG(WARNING) << "device index exceeds the number of devices, set to default device(0)!"; @@ -63,6 +69,11 @@ class Context final{ } Context(const Context& ctx){ + if(std::is_same::value){ + LOG(INFO) << "context init for BM"; + return; + } + _device_id = ctx._device_id; _data_stream_id = ctx._data_stream_id; _compute_stream_id = ctx._compute_stream_id; From b1b9f7c920617d023783b47e9227c14fba7b3b32 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 15:09:02 +0800 Subject: [PATCH 064/318] remove flush action in print --- saber/core/tensor_op.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 72de1d0b3..841c9c208 100644 
--- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -434,8 +434,6 @@ void print_tensor_device>(Tensor& tenso }*/ float *host_mem = new float[tensor.size()]; - bm_flush(get_bm_handle()); - auto* device_data_ptr = const_cast(tensor.data()); bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr); From 27517ca93eb72070d13ff220c8072bdab5640080 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 16:09:58 +0800 Subject: [PATCH 065/318] ignore set_device for BM for now --- saber/core/impl/bm/bm_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index d2790d0a9..fa51bf2d7 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -47,7 +47,7 @@ void BM_API::get_device_count(int &count) { void BM_API::set_device(int id){ //(bm_handle_t &handle, bool bmkernel_used, int id){ - BMDNN_CHECK(bm_dev_request(&handle, 0, id)); + //BMDNN_CHECK(bm_dev_request(&handle, 0, id)); } //TODO: Do we have this functionality? 
From 949c4c49fc359a54ad87eec17502e307b1715a4d Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 16:26:19 +0800 Subject: [PATCH 066/318] Update logs for copy_from --- saber/core/tensor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index d8a319cd5..af3495b1f 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -950,7 +950,7 @@ size_t Tensor::_type_len(){ template<> template<> inline SaberStatus Tensor::copy_from(const Tensor& tensor) { - LOG(INFO) << "BM copy_from"; + LOG(INFO) << "BM copy_from X86"; CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; auto* device_data_ptr = mutable_data(); @@ -961,7 +961,7 @@ SaberStatus Tensor::copy_from(const Tensor template<> template<> inline SaberStatus Tensor::copy_from(const Tensor& tensor) { - LOG(INFO) << "X86 copy_from"; + LOG(INFO) << "X86 copy_from BM"; CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; auto* device_data_ptr = const_cast(tensor.data()); From 51f0f2b4df677e5bad5ccfd5eb057f90dc4d423d Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 17:42:26 +0800 Subject: [PATCH 067/318] Initialize bm handle only in one place --- saber/core/impl/bm/bm_impl.cpp | 4 ++-- test/saber/bm/test_TargetWrapper_BM.cpp | 6 +++--- test/saber/bm/test_saber_buffer_BM.cpp | 4 ---- test/saber/bm/test_saber_func_pooling_BM.cpp | 3 --- test/saber/bm/test_saber_tensor_BM.cpp | 4 ---- 5 files changed, 5 insertions(+), 16 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index fa51bf2d7..60e52088e 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -37,9 +37,9 @@ namespace saber{ typedef TargetWrapper BM_API; -//TODO: check exception -//static bm_handle_t handle = get_bm_handle(); +// Init handle only once in the lifetime static bm_handle_t handle; +static bm_status_t 
init_handle{bmdnn_init(&handle)}; void BM_API::get_device_count(int &count) { BMDNN_CHECK(bm_dev_getcount(&count)); diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp index b893183a2..8de77498a 100644 --- a/test/saber/bm/test_TargetWrapper_BM.cpp +++ b/test/saber/bm/test_TargetWrapper_BM.cpp @@ -4,9 +4,9 @@ #ifdef USE_BM using namespace anakin::saber; -static bm_handle_t handle; +//static bm_handle_t handle; int main() { - bmdnn_init(&handle); + //bmdnn_init(&handle); typedef TargetWrapper API; //int dev_count = 0; //API::get_device_count(dev_count); @@ -20,7 +20,7 @@ int main() { std::cout << "Start mem_free test." << std::endl; API::mem_free(pmem); std::cout << "End mem_free test." << std::endl; - bmdnn_deinit(handle); + //bmdnn_deinit(handle); } #endif diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index 9910638fb..dce1fae15 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -132,10 +132,6 @@ TEST(TestSaberBufferBM, test_buffer_memcpy) { } int main(int argc, const char** argv) { - //TODO: init in another place - static bm_handle_t handle; - bmdnn_init(&handle); - // initial logger logger::init(argv[0]); InitTest(); diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index 944ab6a18..e988bc573 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -279,9 +279,6 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { } int main(int argc, const char** argv) { - //TODO: init in another place - static bm_handle_t handle; - bmdnn_init(&handle); // initial logger //logger::init(argv[0]); InitTest(); diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index dfd8d90c9..2dcd61c41 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -624,10 
+624,6 @@ TEST(TestSaberTensorBM, test_tensor_base_type) { }*/ int main(int argc, const char** argv) { - //TODO: init in another place - static bm_handle_t handle; - bmdnn_init(&handle); - // initial logger logger::init(argv[0]); InitTest(); From 1fe4f195b8d736a78e1f8dbd131640de8342d070 Mon Sep 17 00:00:00 2001 From: lian <327842846@qq.com> Date: Tue, 26 Jun 2018 10:46:30 +0000 Subject: [PATCH 068/318] chage tensor type_len --- saber/core/target_wrapper.h | 11 + saber/core/tensor.h | 7 +- test/framework/core/base_types_test.cpp | 143 ---- test/framework/core/core_test.h | 46 -- test/framework/graph/graph_base_test.cpp | 82 -- .../graph/graph_parser_from_model_test.cpp | 88 --- test/framework/graph/graph_test.h | 47 -- test/framework/net/benchmark.cpp | 162 ---- test/framework/net/chinese_ner_test.cpp | 213 ----- test/framework/net/model_test.cpp | 175 ----- .../net/net_exec_multi_thread_test.cpp | 149 ---- test/framework/net/net_exec_test.cpp | 273 ------- test/framework/net/net_test.h | 98 --- test/framework/net/padde_api_test.cpp | 121 --- test/framework/net/paddle_api.h | 87 --- test/framework/operators/operator_tests.h | 47 -- test/framework/operators/pooling_test.cpp | 43 -- test/saber/bm/test_saber_buffer_BM.cpp | 126 --- test/saber/bm/test_saber_buffer_BM.h | 20 - test/saber/bm/test_saber_context_BM.cpp | 28 - test/saber/bm/test_saber_context_BM.h | 21 - test/saber/bm/test_saber_device_BM.cpp | 20 - test/saber/bm/test_saber_device_BM.h | 21 - test/saber/bm/test_saber_func_BM.h | 38 - .../bm/test_saber_func_activation_BM.cpp | 88 --- test/saber/bm/test_saber_func_conv_BM.cpp | 725 ------------------ test/saber/bm/test_saber_func_fc_BM.cpp | 146 ---- test/saber/bm/test_saber_func_pooling_BM.cpp | 311 -------- test/saber/bm/test_saber_shape_BM.cpp | 126 --- test/saber/bm/test_saber_shape_BM.h | 25 - test/saber/bm/test_saber_tensor_BM.cpp | 40 +- 31 files changed, 43 insertions(+), 3484 deletions(-) delete mode 100644 test/framework/core/base_types_test.cpp 
delete mode 100644 test/framework/core/core_test.h delete mode 100644 test/framework/graph/graph_base_test.cpp delete mode 100644 test/framework/graph/graph_parser_from_model_test.cpp delete mode 100644 test/framework/graph/graph_test.h delete mode 100644 test/framework/net/benchmark.cpp delete mode 100644 test/framework/net/chinese_ner_test.cpp delete mode 100644 test/framework/net/model_test.cpp delete mode 100644 test/framework/net/net_exec_multi_thread_test.cpp delete mode 100644 test/framework/net/net_exec_test.cpp delete mode 100644 test/framework/net/net_test.h delete mode 100644 test/framework/net/padde_api_test.cpp delete mode 100644 test/framework/net/paddle_api.h delete mode 100644 test/framework/operators/operator_tests.h delete mode 100644 test/framework/operators/pooling_test.cpp delete mode 100644 test/saber/bm/test_saber_buffer_BM.cpp delete mode 100644 test/saber/bm/test_saber_buffer_BM.h delete mode 100644 test/saber/bm/test_saber_context_BM.cpp delete mode 100644 test/saber/bm/test_saber_context_BM.h delete mode 100644 test/saber/bm/test_saber_device_BM.cpp delete mode 100644 test/saber/bm/test_saber_device_BM.h delete mode 100644 test/saber/bm/test_saber_func_BM.h delete mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp delete mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp delete mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp delete mode 100644 test/saber/bm/test_saber_func_pooling_BM.cpp delete mode 100644 test/saber/bm/test_saber_shape_BM.cpp delete mode 100644 test/saber/bm/test_saber_shape_BM.h diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 1f283a004..c1325f7fb 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -368,6 +368,15 @@ struct TargetWrapper { */ template <> struct TargetWrapper { +// TargetWrapper () +// { +// CHECK_EQ(bmdnn_init(&handle),BM_SUCCESS) << "Error:bmdnn_init failed"; +// } +// ~TargetWrapper () +// { +// 
CHECK_EQ(bmdnn_deinit(handle),BM_SUCCESS) << "Error:bmdnn_deinit failed"; +// } + typedef void* event_t; typedef void* stream_t; @@ -418,6 +427,8 @@ struct TargetWrapper { static int get_device_id(); static bm_handle_t get_handler(); + +// bm_handle_t handle; }; #endif //USE_BM diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 3051e16c6..945c46d00 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -941,10 +941,11 @@ class Tensor : public TensorBase { }; #ifdef USE_BM - +#ifndef BM_TENSOR_COPY +#define BM_TENSOR_COPY template<> inline size_t Tensor::_type_len(){ - return 1; + return 4; } template<> @@ -964,7 +965,7 @@ SaberStatus Tensor::copy_from(const Tensor BMDNN_CHECK(bm_memcpy_d2s(TargetWrapper::get_handler(), bm_mem_from_system(mutable_data()), *device_data_ptr)); return SaberSuccess; } - +#endif #endif } //namespace saber diff --git a/test/framework/core/base_types_test.cpp b/test/framework/core/base_types_test.cpp deleted file mode 100644 index 0109493bf..000000000 --- a/test/framework/core/base_types_test.cpp +++ /dev/null @@ -1,143 +0,0 @@ -#include "core_test.h" -#include "any.h" -#include "singleton.h" -#include "tls.h" -#include "parameter.h" -#include "thread_pool.h" - -#ifdef USE_CUDA -#include "cuda_funcs.h" -#include "sass_funcs.h" -#endif - -#include "tensor.h" - -#ifdef USE_CUDA -TEST(CoreComponentsTest, sass_test) { - LOG(INFO) << "test for cuda code function"; - //anakin::saber::Tensor<3, RTCUDA, float, NCHW> ts; - //LOG(WARNING) << " tensor num " << ts.num(); - //ts.set_offset(8); - //my_print(); - LOG(INFO) << "test for sass code function 1"; - invoke_test(); - LOG(INFO) << "test for sass code function 2"; - invoke_test_2(); -} -#endif - -TEST(CoreComponentsTest, core_base_types_any_test) { - LOG(INFO) << "test for any class ."; - LOG(WARNING) << " level 1 : base type int (set 42 to any)"; - const int a = 42; - any any_a(42); - int result_a = any_cast(any_a); - - LOG(INFO) << "casted result : " << result_a; - LOG(WARNING) 
<< " level 2 : base type float (set 42.8 to any)"; - float b = 42.8; - any any_b = b; - float result_b = any_cast(any_b); - LOG(INFO) << "casted result : " << result_b << " decide: "; - - LOG(WARNING) << " level 3 : ptuple type (set PTuple to any)"; - PTuple p_tuple_float(3.2f, 3.3f, 3.5f); - p_tuple_float.push_back(4.3); // push_back - - any p_tuple_float_any = p_tuple_float; - auto result_p_tuple_float_any = any_cast>(p_tuple_float_any); - - for (int i = 0; i < result_p_tuple_float_any.size(); i++) { - LOG(INFO) << " any casted PTuple[" << i << "]: " << result_p_tuple_float_any[i]; - } - - struct target { - void print() { - LOG(INFO) << " target struct Successfully recovered."; - } - }; - - LOG(WARNING) << " level 5 : struct type"; - - target tg; - - any any_tg = tg; - - target result_tg = any_cast(any_tg); - - result_tg.print(); - - LOG(WARNING) << " level other : struct type"; - - any any_tg_copy = any_tg; - - target result_tg_copy = any_cast(any_tg); - - result_tg_copy.print(); -} - -void at_exit_in_test() { - LOG(WARNING) << "core_base_types_singleton_test exit successfully!"; -} - -TEST(CoreComponentsTest, core_base_types_singleton_test) { - struct target { - target() { - LOG(INFO) << " singleton target constructed"; - } - }; - typedef Singleton sg_target; - sg_target::Global(); -} - -typedef AnakinThreadLocalVar sg_tls; -void thread_func_0() { - int* tmp = sg_tls::value(); - *tmp = 3; - LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value()); -} -void thread_func_1() { - int* tmp = sg_tls::value(); - *tmp = 4; - - LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value()); -} -TEST(CoreComponentsTest, core_base_types_tls_test) { - LOG(INFO) << " Create tls var 0 , check in two thread."; - std::thread first(thread_func_0); - std::thread sec(thread_func_1); - first.join(); - sec.join(); - LOG(INFO) << " main thread var: " << *(sg_tls::value()); -} - -int thread_pool_func(int i) { - LOG(INFO) << " thread_pool_func input : " << i; - 
//std::this_thread::sleep_for(std::chrono::seconds(0)); - return i; -} - -TEST(CoreComponentsTest, core_base_types_thread_pool_test) { - LOG(INFO) << " Create thread pool with thread num = 12 "; - ThreadPool thread_pool_test(100); - thread_pool_test.launch(); - std::function test = thread_pool_func; - - for (int i = 0; i < 50; i++) { - // run async - auto ret = thread_pool_test.RunAsync(test, i); - LOG(INFO) << " return : " << ret.get(); - - // run sync - //auto sync_ret = thread_pool_test.RunSync(test, i); - } -} - - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/framework/core/core_test.h b/test/framework/core/core_test.h deleted file mode 100644 index 6107eef4b..000000000 --- a/test/framework/core/core_test.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_GRAPH_TEST_H -#define ANAKIN_GRAPH_TEST_H - -#include -#include -#include -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" - -using namespace anakin; -using ::anakin::test::Test; - -class CoreComponentsTest : public Test { -public: - CoreComponentsTest(){} - - void SetUp(){} - - void TearDown(){} - -protected: -}; - - - - - - -#endif - - diff --git a/test/framework/graph/graph_base_test.cpp b/test/framework/graph/graph_base_test.cpp deleted file mode 100644 index d42e86c02..000000000 --- a/test/framework/graph/graph_base_test.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include -#include "graph_test.h" -#include "graph_base.h" - -using namespace anakin; -using namespace anakin::graph; - -//! Usage sample -class GraphTestClass : public GraphBase { -public: - GraphTestClass() {} - ~GraphTestClass() {} - virtual bool directed() { - return true; - }; -}; -class edge : public Arc { -public: - edge(std::string btm, std::string top, int weight): Arc(btm, top, weight) {} - ~edge() {} -}; - -TEST(GraphTest, graph_base_test) { - LOG(INFO) << "test for graph base ."; - - GraphTestClass graph; - graph.add_vertex("a", 42); - graph.add_vertex("b", 43); - graph.add_vertex("c", 44); - graph.add_vertex("d", 45); - graph.add_vertex("e", 46); - graph.add_vertex("f", 47); - - edge arc0("a", "b", 0); - edge arc1("b", "c", 1); - edge arc2("c", "d", 2); - edge arc3("d", "e", 3); - edge arc4("e", "f", 4); - edge arc5("f", "a", 5); - - graph.add_in_arc(arc0); - graph.add_in_arc(arc1); - graph.add_in_arc(arc2); - graph.add_in_arc(arc3); - graph.add_in_arc(arc4); - graph.add_in_arc(arc5); - graph.add_out_arc(arc0); - graph.add_out_arc(arc1); - graph.add_out_arc(arc2); - graph.add_out_arc(arc3); - graph.add_out_arc(arc4); - graph.add_out_arc(arc5); - - LOG(WARNING) << "Construction of graph."; - LOG(INFO) << graph.to_string(); - - LOG(WARNING) << "Remove a from graph."; - graph.remove("a"); - LOG(INFO) << graph.to_string(); - - LOG(WARNING) << "Add arc: 
f->b to graph."; - edge arc_f_b("f", "b", 10); - graph.add_in_arc(arc_f_b); - graph.add_out_arc(arc_f_b); - LOG(INFO) << graph.to_string(); - - LOG(WARNING) << "Add vertex:a and arc: a->e to graph."; - graph.add_vertex("a", 47); - edge arc_a_e("a", "e", 10); - graph.add_out_arc(arc_a_e); - graph.add_in_arc(arc_a_e); - LOG(INFO) << graph.to_string(); -} - - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/framework/graph/graph_parser_from_model_test.cpp b/test/framework/graph/graph_parser_from_model_test.cpp deleted file mode 100644 index 883a12858..000000000 --- a/test/framework/graph/graph_parser_from_model_test.cpp +++ /dev/null @@ -1,88 +0,0 @@ -#include -#include "graph_test.h" -#include "graph_base.h" -#include "graph.h" -#include "scheduler.h" - -using namespace anakin; -using namespace anakin::graph; - -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/google_net/googlenet.anakin.bin"; -std::string model_path = "/home/chaowen/anakin_v2/model_v2/yolo/yolo.anakin.bin"; - - -TEST(GraphTest, graph_load_model) { - /*Graph* graph = new Graph(); - LOG(WARNING) << "load anakin model file from " << model_path << " ..."; - // load anakin model files. - graph->load(model_path); - - DLOG(INFO) << graph->to_string(); - // exec optimization - graph->Optimize(); */ -} - -#ifdef USE_CUDA -TEST(GraphTest, nvidia_graph_save_model) { - Graph* graph = new Graph(); - // load anakin model files. - LOG(INFO) << "load anakin model file from " << model_path << " ..."; - graph->load(model_path); - - // regisiter output tensor - //graph->RegistOut("data_perm", "data_scale"); - - // exec optimization - graph->Optimize(); - - // save the optimized model to disk. 
- std::string save_model_path = model_path + std::string(".saved"); - Status status = graph->save(save_model_path); -} -#endif - -#ifdef USE_X86_PLACE -TEST(GraphTest, x86_graph_save_model) { - Graph* graph = new Graph(); - // load anakin model files. - LOG(INFO) << "load anakin model file from " << model_path << " ..."; - graph->load(model_path); - - // regisiter output tensor - //graph->RegistOut("data_perm", "data_scale"); - - // exec optimization - graph->Optimize(); - - // save the optimized model to disk. - std::string save_model_path = model_path + std::string(".saved"); - Status status = graph->save(save_model_path); -} -#endif - -#ifdef USE_ARM_PLACE -TEST(GraphTest, arm_graph_save_model) { - Graph* graph = new Graph(); - // load anakin model files. - LOG(INFO) << "load anakin model file from " << model_path << " ..."; - graph->load(model_path); - - // regisiter output tensor - //graph->RegistOut("data_perm", "data_scale"); - - // exec optimization - graph->Optimize(); - - // save the optimized model to disk. - std::string save_model_path = model_path + std::string(".saved"); - Status status = graph->save(save_model_path); -} -#endif - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/framework/graph/graph_test.h b/test/framework/graph/graph_test.h deleted file mode 100644 index db837c84a..000000000 --- a/test/framework/graph/graph_test.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_GRAPH_TEST_H -#define ANAKIN_GRAPH_TEST_H - -#include -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" - -using namespace anakin; -using ::anakin::test::Test; - -/** - * \brief Graph test is base Test class for anakin graph funciton. - */ -class GraphTest: public Test { -public: - GraphTest(){} - - void SetUp(){} - - void TearDown(){} - -protected: -}; - - - - - - -#endif - - diff --git a/test/framework/net/benchmark.cpp b/test/framework/net/benchmark.cpp deleted file mode 100644 index 41c31c83e..000000000 --- a/test/framework/net/benchmark.cpp +++ /dev/null @@ -1,162 +0,0 @@ -#include -#include "net_test.h" -#include "saber/funcs/timer.h" -#include -#include "saber/core/tensor_op.h" -#include -#include -#include -#include -#include -#include - -#ifdef USE_GFLAGS -#include - -DEFINE_string(model_dir, "", "model dir"); -DEFINE_string(model_file, "", "model file"); -DEFINE_int32(num, 1, "batchSize"); -DEFINE_int32(warmup_iter, 10, "warm up iterations"); -DEFINE_int32(epoch, 1000, "time statistic epoch"); -#else -std::string FLAGS_model_dir; -std::string FLAGS_model_file; -int FLAGS_num = 1; -int FLAGS_warmup_iter = 10; -int FLAGS_epoch = 1000; -#endif - -#ifdef USE_CUDA -typedef NV Target; -#elif defined(USE_X86_PLACE) -typedef X86 Target; -#else -typedef ARM Target; -#endif - -void getModels(std::string path, std::vector& files) { - DIR *dir; - struct dirent *ptr; - if ((dir = opendir(path.c_str())) == NULL) { - perror("Open dri error..."); - exit(1); - } - while((ptr = readdir(dir)) != NULL) { - if (strcmp(ptr->d_name, 
".") == 0 || strcmp(ptr->d_name, "..") == 0) - continue; - else if (ptr->d_type == 8)//file - files.push_back(path + "/" + ptr->d_name); - else if (ptr->d_type == 4) { - getModels(path + "/" + ptr->d_name, files); - } - } - closedir(dir); -} -TEST(NetTest, net_execute_base_test) { - std::vector models; - if (FLAGS_model_file == "") { - getModels(FLAGS_model_dir, models); - } else { - models.push_back(FLAGS_model_dir + FLAGS_model_file); - } - for (auto iter = models.begin(); iter < models.end(); iter++) - { - LOG(WARNING) << "load anakin model file from " << *iter << " ..."; - Graph graph; - auto status = graph.load(*iter); - if (!status) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - graph.ResetBatchSize("input_0", FLAGS_num); - graph.Optimize(); - // constructs the executer net - Net net_executer(graph, true); - // get in - auto d_tensor_in_p = net_executer.get_in("input_0"); - Tensor4d h_tensor_in; - auto valid_shape_in = d_tensor_in_p->valid_shape(); - for (int i = 0; i < valid_shape_in.size(); i++) { - LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; - } - h_tensor_in.re_alloc(valid_shape_in); - fill_tensor_host_rand(h_tensor_in, -1.0f,1.0f); - d_tensor_in_p->copy_from(h_tensor_in); - // do inference - Context ctx(0, 0, 0); - saber::SaberTimer my_time; - LOG(WARNING) << "EXECUTER !!!!!!!! 
"; - for (int i = 0; i < FLAGS_warmup_iter; i++) { - net_executer.prediction(); - } -#ifdef ENABLE_OP_TIMER - net_executer.reset_op_time(); -#endif - my_time.start(ctx); - //auto start = std::chrono::system_clock::now(); - for (int i = 0; i < FLAGS_epoch; i++) { - //DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") "; - net_executer.prediction(); - } - my_time.end(ctx); -#ifdef ENABLE_OP_TIMER - std::vector op_time = net_executer.get_op_time(); - auto exec_funcs = net_executer.get_exec_funcs(); - auto op_param = net_executer.get_op_param(); - for (int i = 0; i < op_time.size(); i++) { - LOG(INFO) << "name: " << exec_funcs[i].name << " op_type: " << exec_funcs[i].op_name << " op_param: " << op_param[i] << " time " << op_time[i]/FLAGS_epoch; - } - std::map op_map; - for (int i = 0; i < op_time.size(); i++) { - auto it = op_map.find(op_param[i]); - if (it != op_map.end()) - op_map[op_param[i]] += op_time[i]; - else - op_map.insert(std::pair(op_param[i], op_time[i])); - } - for (auto it = op_map.begin(); it != op_map.end(); ++it) { - LOG(INFO)<< it->first << " " << (it->second) / FLAGS_epoch<< " ms"; - } -#endif - size_t end = (*iter).find(".anakin.bin"); - size_t start = FLAGS_model_dir.length(); - std::string model_name = (*iter).substr(start, end-start); - - LOG(INFO) << model_name << " batch_size " << FLAGS_num << " average time "<< my_time.get_average_ms() / FLAGS_epoch << " ms"; - } -} -int main(int argc, const char** argv){ - // initial logger - logger::init(argv[0]); - -#ifdef USE_GFLAGS - google::ParseCommandLineFlags(&argc, &argv, true); -#else - LOG(INFO)<< "BenchMark usage:"; - LOG(INFO)<< " $benchmark "; - LOG(INFO)<< " model_dir: model directory"; - LOG(INFO)<< " model_file: path to model"; - LOG(INFO)<< " num: batchSize default to 1"; - LOG(INFO)<< " warmup_iter: warm up iterations default to 10"; - LOG(INFO)<< " epoch: time statistic epoch default to 1000"; - if(argc < 3) { - LOG(ERROR) << "You should fill in the variable model_dir and model_file at 
least."; - return 0; - } - FLAGS_model_dir = argv[1]; - if(argc > 2) { - FLAGS_model_file = argv[2]; - } - if(argc > 3) { - FLAGS_num = atoi(argv[3]); - } - if(argc > 4) { - FLAGS_warmup_iter = atoi(argv[4]); - } - if(argc > 5) { - FLAGS_epoch = atoi(argv[5]); - } -#endif - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/framework/net/chinese_ner_test.cpp b/test/framework/net/chinese_ner_test.cpp deleted file mode 100644 index 37785f721..000000000 --- a/test/framework/net/chinese_ner_test.cpp +++ /dev/null @@ -1,213 +0,0 @@ -#include "anakin_config.h" -#include -#include -#include "net_test.h" -#include "saber/funcs/timer.h" -#include -#include "saber/core/tensor_op.h" -#include -#include -#include -#include -#include -#include - -#define DEFINE_GLOBAL(type, var, value) \ - type (GLB_##var) = (value) -DEFINE_GLOBAL(std::string, model_dir, ""); -DEFINE_GLOBAL(std::string, input_file, ""); - -//#define WITH_MENTION - -void getModels(std::string path, std::vector& files) { - DIR* dir= nullptr; - struct dirent* ptr; - - if ((dir = opendir(path.c_str())) == NULL) { - perror("Open dri error..."); - exit(1); - } - - while ((ptr = readdir(dir)) != NULL) { - if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) { - continue; - } else if (ptr->d_type == 8) { //file - files.push_back(path + "/" + ptr->d_name); - } else if (ptr->d_type == 4) { - //files.push_back(ptr->d_name);//dir - getModels(path + "/" + ptr->d_name, files); - } - } - closedir(dir); -} -void SplitString(const std::string& s, - std::vector& v, const std::string& c) -{ - std::string::size_type pos1, pos2; - pos2 = s.find(c); - pos1 = 0; - while(std::string::npos != pos2) - { - v.push_back(s.substr(pos1, pos2-pos1)); - - pos1 = pos2 + c.size(); - pos2 = s.find(c, pos1); - } - if(pos1 != s.length()) - v.push_back(s.substr(pos1)); -} - -bool split_word_mention_idx_from_file( - std::vector > &word_idx, - std::vector > &mention_idx, - const std::string input_file_path) { - - 
std::ifstream infile(input_file_path.c_str()); - if (!infile.good()) { - std::cout << "Cannot open " << std::endl; - return false; - } - LOG(INFO)<<"found filename: "< split_v; - std::vector split_w; - std::vector split_m; - while (std::getline(infile, line)) { - split_v.clear(); - SplitString(line, split_v, ";"); - CHECK_GE(split_v.size(), 4) << " file need ; split"; - std::vector word; - std::vector mention; - split_w.clear(); - SplitString(split_v[1], split_w, " "); - split_m.clear(); - SplitString(split_v[3], split_m, " "); - for (auto w : split_w) { - word.push_back(atof(w.c_str())); - } - for (auto m : split_m) { - mention.push_back(atof(m.c_str())); - } - word_idx.push_back(word); - mention_idx.push_back(mention); - } - return true; -} - -int get_batch_data_offset( - std::vector &out_data, - const std::vector > &seq_data, - std::vector &seq_offset, - const int start_idx, - const int batch_num) { - seq_offset.clear(); - out_data.clear(); - seq_offset.push_back(0); - int len = 0; - for (int i = 0; i < batch_num; ++i) { - for (auto d : seq_data[i + start_idx]) { - len += 1; - out_data.push_back(d); - } - seq_offset.push_back(len); - } - return len; -} - -#ifdef USE_X86_PLACE -TEST(NetTest, chinese_ner_executor) { - std::vector models; - getModels(GLB_model_dir, models); - std::vector > word_idx; - std::vector > mention_idx; - split_word_mention_idx_from_file(word_idx, mention_idx, GLB_input_file); - std::vector word_idx_data; - std::vector mention_idx_data; - std::vector word_seq_offset; - std::vector mention_seq_offset; - int batch_num = 6; - - Graph* graph = new Graph(); - LOG(WARNING) << "load anakin model file from " << models[0] << " ..."; - // load anakin model files. 
- auto status = graph->load(models[0]); - if(!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - graph->Reshape("input_0", {1000, 1, 1, 1}); -#ifdef WITH_MENTION - graph->Reshape("input_1", {1000, 1, 1, 1}); -#endif - //anakin graph optimization - graph->Optimize(); - Net net_executer(*graph, true); - SaberTimer timer; - Context ctx; - for (int i = 0; i < word_idx.size(); i += batch_num) { -// { -// int i = 0; - int word_len = get_batch_data_offset(word_idx_data, word_idx, word_seq_offset, i, batch_num); -#ifdef WITH_MENTION - int mention_len = get_batch_data_offset(mention_idx_data, mention_idx, mention_seq_offset, i, batch_num); -#endif -// for (auto w : word_idx_data) { -// std::cout << w << ","; -// } -// std::cout << std::endl; -// for (auto s : word_seq_offset) { -// std::cout << s << ", "; -// } -// std::cout << std::endl << std::endl << std::endl; -// word_idx_data = {20, 21, 22, 23, 24, 25, 26}; -// word_seq_offset = {0, 5, 7}; -// int word_len = 7; -// mention_idx_data = {2, 1, 22, 23, 24, 25, 26}; -// mention_seq_offset = {0, 5, 7}; -// int mention_len = 7; - - auto word_in_p = net_executer.get_in("input_0"); - word_in_p->reshape({word_len, 1, 1, 1}); - for (int j = 0; j < word_idx_data.size(); ++j) { - word_in_p->mutable_data()[j] = word_idx_data[j]; - } - word_in_p->set_seq_offset(word_seq_offset); -#ifdef WITH_MENTION - auto mention_in_p = net_executer.get_in("input_1"); - mention_in_p->reshape({mention_len, 1, 1, 1}); - for (int j = 0; j < mention_idx_data.size(); ++j) { - mention_in_p->mutable_data()[j] = mention_idx_data[j]; - } - mention_in_p->set_seq_offset(mention_seq_offset); -#endif - timer.start(ctx); - net_executer.prediction(); - timer.end(ctx); -// auto tensor_out_5_p = net_executer.get_out("crf_decoding_0.tmp_0_out"); -// int v_size = tensor_out_5_p->valid_size(); -// for (int j = 0; j < v_size; ++j) { -// std::cout << tensor_out_5_p->data()[j]<<" "; -// } -// std::cout << std::endl; - } - LOG(INFO)<<"elapse time: "< -#include 
"net_test.h" -#include "saber/funcs/timer.h" -#include -#include "saber/core/tensor_op.h" -#include -#include -#include -#include -#include -#include -#define DEFINE_GLOBAL(type, var, value) \ - type (GLB_##var) = (value) -DEFINE_GLOBAL(std::string, model_dir, ""); -DEFINE_GLOBAL(int, num, 1); -DEFINE_GLOBAL(int, channel, 8); -DEFINE_GLOBAL(int, height, 640); -DEFINE_GLOBAL(int, width, 640); -DEFINE_GLOBAL(bool, is_input_shape, false); - -void getModels(std::string path, std::vector& files) { - DIR* dir= nullptr; - struct dirent* ptr; - - if ((dir = opendir(path.c_str())) == NULL) { - perror("Open dri error..."); - exit(1); - } - - while ((ptr = readdir(dir)) != NULL) { - if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) { - continue; - } else if (ptr->d_type == 8) { //file - files.push_back(path + "/" + ptr->d_name); - } else if (ptr->d_type == 4) { - //files.push_back(ptr->d_name);//dir - getModels(path + "/" + ptr->d_name, files); - } - } - - closedir(dir); -} - -#ifdef USE_CUDA -TEST(NetTest, nv_net_execute_base_test) { - std::vector models; - getModels(GLB_model_dir, models); - - for (auto iter = models.begin(); iter < models.end(); iter++) { - LOG(WARNING) << "load anakin model file from " << *iter << " ..."; -#if 1 - Graph graph; - auto status = graph.load(*iter); - - if (!status) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - - if (GLB_is_input_shape) { - graph.Reshape("input_0", {GLB_num, GLB_channel, GLB_height, GLB_width}); - } else { - graph.ResetBatchSize("input_0", GLB_num); - } - - graph.Optimize(); - // constructs the executer net - Net net_executer(graph, true); - // get in - auto d_tensor_in_p = net_executer.get_in("input_0"); - Tensor4d h_tensor_in; - auto valid_shape_in = d_tensor_in_p->valid_shape(); - - for (int i = 0; i < valid_shape_in.size(); i++) { - LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; - } - - h_tensor_in.re_alloc(valid_shape_in); - fill_tensor_host_rand(h_tensor_in, -1.0f, 1.0f); - 
d_tensor_in_p->copy_from(h_tensor_in); - int warmup_iter = 10; - int epoch = 1000; - // do inference - Context ctx(0, 0, 0); - saber::SaberTimer my_time; - LOG(WARNING) << "EXECUTER !!!!!!!! "; - - for (int i = 0; i < warmup_iter; i++) { - net_executer.prediction(); - } - -#ifdef ENABLE_OP_TIMER - net_executer.reset_op_time(); -#endif - my_time.start(ctx); - - //auto start = std::chrono::system_clock::now(); - for (int i = 0; i < epoch; i++) { - //DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") "; - net_executer.prediction(); - } - - my_time.end(ctx); -#ifdef ENABLE_OP_TIMER - std::vector op_time = net_executer.get_op_time(); - auto exec_funcs = net_executer.get_exec_funcs(); - auto op_param = net_executer.get_op_param(); - - for (int i = 0; i < op_time.size(); i++) { - LOG(INFO) << "name: " << exec_funcs[i].name << " op_type: " << exec_funcs[i].op_name << - " op_param: " << op_param[i] << " time " << op_time[i] / epoch; - } - - std::map op_map; - - for (int i = 0; i < op_time.size(); i++) { - auto it = op_map.find(op_param[i]); - - if (it != op_map.end()) { - op_map[op_param[i]] += op_time[i]; - } else { - op_map.insert(std::pair(op_param[i], op_time[i])); - } - } - - for (auto it = op_map.begin(); it != op_map.end(); ++it) { - LOG(INFO) << it->first << " " << (it->second) / epoch << " ms"; - } - -#endif - LOG(INFO) << *iter << " aveage time " << my_time.get_average_ms() / epoch << " ms"; - // save the optimized model to disk. 
- // std::string save_model_path = GLB_model_dir + std::string("opt.saved"); - // status = graph.save(save_model_path); - // if (!status ) { - // LOG(FATAL) << " [ERROR] " << status.info(); - // } -#endif - } -} -#endif - -int main(int argc, const char** argv) { - // initial logger - LOG(INFO) << "argc " << argc; - - if (argc < 1) { - LOG(INFO) << "Example of Usage:\n \ - ./output/unit_test/model_test\n \ - anakin_models\n \ - num\n \ - channel\n \ - height\n \ - width\n "; - exit(0); - } else if (argc == 2) { - GLB_model_dir = std::string(argv[1]); - GLB_is_input_shape = false; - } else if (argc == 3) { - GLB_model_dir = std::string(argv[1]); - GLB_num = atoi(argv[2]); - GLB_is_input_shape = false; - } else { - GLB_model_dir = std::string(argv[1]); - GLB_num = atoi(argv[2]); - GLB_channel = atoi(argv[3]); - GLB_height = atoi(argv[4]); - GLB_width = atoi(argv[5]); - GLB_is_input_shape = true; - } - - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/framework/net/net_exec_multi_thread_test.cpp b/test/framework/net/net_exec_multi_thread_test.cpp deleted file mode 100644 index 7a8bf5401..000000000 --- a/test/framework/net/net_exec_multi_thread_test.cpp +++ /dev/null @@ -1,149 +0,0 @@ -#include -#include "net_test.h" -#include "saber/funcs/timer.h" -#include - -std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_lane_v2.anakin.bin"; - -#ifdef USE_CUDA -#if 1 -TEST(NetTest, nv_net_execute_muti_thread_sync_test) { -#if 1 // use host input - //Env::env_init(1); - LOG(WARNING) << "Sync Runing multi_threads for model: " << model_path; - Worker workers(model_path, 10); - workers.register_inputs({"input_0"}); - workers.register_outputs({"softmax_out"}); - workers.Reshape("input_0", {1, 384, 960, 3}); - - workers.launch(); - - std::vector::type, AK_FLOAT> > host_tensor_p_in_list; - // get in - saber::Shape valid_shape_in({1, 384, 960, 3}); - Tensor4dPtr::type, 
AK_FLOAT> h_tensor_in = new Tensor4d::type, AK_FLOAT>(valid_shape_in); - float* h_data = h_tensor_in->mutable_data(); - for (int i=0; isize(); i++) { - h_data[i] = 1.0f; - } - host_tensor_p_in_list.push_back(h_tensor_in); - - int epoch = 1000; - - // Running - for(int i=0; ifirst << " processing " << it->second.size() << " tasks"; - for (auto time_in_ms : it->second) { - LOG(INFO) << " \\__task avg time: " << time_in_ms; - } - } -#endif - -#endif - -#if 0 // use device input - Env::env_init(1); - LOG(WARNING) << "Sync Runing multi_threads for model: " << model_path; - Worker workers(model_path, 1); - workers.register_inputs({"input_0"}); - workers.register_outputs({"softmax_out"}); - workers.Reshape("input_0", {1, 384, 960, 3}); - - workers.launch(); - - std::vector::type, AK_FLOAT> > host_tensor_p_in_list; - // get in - saber::Shape valid_shape_in({1, 384, 960, 3}); - Tensor4dPtr::type, AK_FLOAT> h_tensor_in = new Tensor4d::type, AK_FLOAT>(valid_shape_in); - float* h_data = h_tensor_in->mutable_data(); - for (int i=0; isize(); i++) { - h_data[i] = 1.0f; - } - host_tensor_p_in_list.push_back(h_tensor_in); - - std::vector > device_tensor_p_in_list; - for (int i=0; i d_tensor_in = new Tensor4d(host_tensor_p_in_list[i]->valid_shape()); - d_tensor_in->copy_from(*(host_tensor_p_in_list[i])); - device_tensor_p_in_list.push_back(d_tensor_in); - } - - int epoch = 10; - - // Running - for (int i=0; i ctx(0, 0, 0); - saber::SaberTimer my_time; - - my_time.start(ctx); - auto d_tensor_p_out_list = workers.sync_prediction_device(device_tensor_p_in_list); - my_time.end(ctx); - LOG(INFO)<<"muti thread single task exec time: "< workers(model_path, 10); - workers.register_inputs({"input_0"}); - workers.register_outputs({"softmax_out"}); - workers.Reshape("input_0", {1, 384, 960, 3}); - - workers.launch(); - - std::vector::type, AK_FLOAT> > host_tensor_p_in_list; - // get in - saber::Shape valid_shape_in({1, 384, 960, 3}); - Tensor4dPtr::type, AK_FLOAT> h_tensor_in = new 
Tensor4d::type, AK_FLOAT>(valid_shape_in); - float* h_data = h_tensor_in->mutable_data(); - for (int i=0; isize(); i++) { - h_data[i] = 1.0f; - } - host_tensor_p_in_list.push_back(h_tensor_in); - - int epoch = 10000; - - // Running - for(int i=0; i -#include "net_test.h" -#include "saber/funcs/timer.h" -#include - -//#define USE_DIEPSE - -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/diepsie_light_head.anakin.bin"; - -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/diepsie_light_head_base.anakin.bin"; - - -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/densebox.anakin.bin"; - -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/cnn_seg.anakin.bin"; - -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_camera_detector.anakin.bin"; - -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_lane_v2.anakin.bin"; - -// alignment of face -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/net_deploy_stageI.anakin.bin"; - -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/net_deploy_stageII.anakin.bin"; - -// residual 7 patch of face -//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/residual_net_7patch_3hc.anakin.bin"; - -// resnet 50 -//std::string model_path = "/home/cuichaowen/anakin2/anakin2/benchmark/CNN/mobilenet_v2.anakin.bin"; - -// vgg16 -std::string model_path = "/home/cuichaowen/anakin2/anakin2/benchmark/CNN/models/vgg16.anakin.bin"; - -#ifdef USE_CUDA -#if 1 -TEST(NetTest, net_execute_base_test) { - 
Graph* graph = new Graph(); - LOG(WARNING) << "load anakin model file from " << model_path << " ..."; - // load anakin model files. - auto status = graph->load(model_path); - if(!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - - // reshape the input_0 's shape for graph model - //graph->Reshape("input_0", {1, 8, 640, 640}); - - // register all tensor inside graph - //graph->RegistAllOut(); - - // register edge - // graph->RegistOut("conv2_2/expand/scale", "relu2_2/expand"); - - //anakin graph optimization - graph->Optimize(); - - // constructs the executer net - { // inner scope -#ifdef USE_DIEPSE - Net net_executer(*graph, true); -#else - Net net_executer(*graph, true); -#endif - - // get in - auto d_tensor_in_p = net_executer.get_in("input_0"); - Tensor4d h_tensor_in; - - auto valid_shape_in = d_tensor_in_p->valid_shape(); - for (int i=0; icopy_from(h_tensor_in); - -#ifdef USE_DIEPSE - // for diepse model - auto d_tensor_in_1_p = net_executer.get_in("input_1"); - Tensor4d h_tensor_in_1; - - h_tensor_in_1.re_alloc(d_tensor_in_1_p->valid_shape()); - for (int i=0; ivalid_shape().size(); i++) { - LOG(INFO) << "detect input_1 dims[" << i << "]" << d_tensor_in_1_p->valid_shape()[i]; - } - h_data = h_tensor_in_1.mutable_data(); - h_data[0] = 1408; - h_data[1] = 800; - h_data[2] = 0.733333; - h_data[3] = 0.733333; - h_data[4] = 0; - h_data[5] = 0; - d_tensor_in_1_p->copy_from(h_tensor_in_1); - - auto d_tensor_in_2_p = net_executer.get_in("input_2"); - Tensor4d h_tensor_in_2; - - h_tensor_in_2.re_alloc(d_tensor_in_2_p->valid_shape()); - for (int i=0; ivalid_shape().size(); i++) { - LOG(INFO) << "detect input_2 dims[" << i << "]" << d_tensor_in_2_p->valid_shape()[i]; - } - h_data = h_tensor_in_2.mutable_data(); - h_data[0] = 2022.56; - h_data[1] = 989.389; - h_data[2] = 2014.05; - h_data[3] = 570.615; - h_data[4] = 1.489; - h_data[5] = -0.02; - d_tensor_in_2_p->copy_from(h_tensor_in_2); -#endif - - int epoch = 1; - // do inference - Context ctx(0, 0, 0); - 
saber::SaberTimer my_time; - LOG(WARNING) << "EXECUTER !!!!!!!! "; - // warm up - /*for(int i=0; i<10; i++) { - net_executer.prediction(); - }*/ - - my_time.start(ctx); - - - //auto start = std::chrono::system_clock::now(); - for(int i=0; i(end - start).count(); - //LOG(WARNING) << "avg time : " << time/epoch <<" ms"; - - my_time.end(ctx); - LOG(INFO)<<"aveage time "<(tensor_out_4_p); - - - // save the optimized model to disk. - /*std::string save_model_path = model_path + std::string(".saved"); - status = graph->save(save_model_path); - if (!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); - }*/ -} -#endif -#endif - -#if 0 -TEST(NetTest, net_execute_reconstruction_test) { - graph = new Graph(); - LOG(WARNING) << "load anakin model file from optimized model " << model_saved_path << " ..."; - // load anakin model files. - auto status = graph->load(model_saved_path); - if (!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - - // regisiter output tensor - //graph->RegistOut("data_perm", "data_scale"); - graph->RegistOut("data_perm", "conv1"); - - //anakin graph optimization - graph->Optimize(); - - // constructs the executer net - Net net_executer(*graph); - - // get in - auto d_tensor_in_p = net_executer.get_in("input_0"); - Tensor4d h_tensor_in; - - auto valid_shape_in = d_tensor_in_p->valid_shape(); - for (int i=0; icopy_from(h_tensor_in); - - // do inference - Context ctx(0, 0, 0); - saber::SaberTimer my_time; - my_time.start(ctx); - - LOG(WARNING) << "EXECUTER !!!!!!!! 
"; - for (int i=0; i<1; i++) { - net_executer.prediction(); - - } - my_time.end(ctx); - LOG(INFO)<<"aveage time "<(tensor_out_inner_p); -} -#endif - -int main(int argc, const char** argv){ - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/framework/net/net_test.h b/test/framework/net/net_test.h deleted file mode 100644 index c240afbf0..000000000 --- a/test/framework/net/net_test.h +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_NET_TEST_H -#define ANAKIN_NET_TEST_H - -#include -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "graph_base.h" -#include "graph.h" -#include "scheduler.h" -#include "net.h" -#include "worker.h" - -using namespace anakin; -using ::anakin::test::Test; - -using namespace anakin::graph; - -/** - * \brief Graph test is base Test class for anakin graph funciton. 
- */ -class NetTest: public Test { -public: - NetTest(){} - - void SetUp(){} - - void TearDown(){} - -protected: -}; - -#ifdef USE_CUDA -void test_print(Tensor4dPtr& out_tensor_p) { - Tensor4d::type, AK_FLOAT> h_tensor_result; - h_tensor_result.re_alloc(out_tensor_p->valid_shape()); - LOG(ERROR) << "result count : " << h_tensor_result.valid_shape().count(); - h_tensor_result.copy_from(*out_tensor_p); - for (int i = 0; i < h_tensor_result.valid_size(); i++) { - LOG(INFO) << " GET OUT (" << i << ") " << h_tensor_result.mutable_data()[i]; - } -} -#endif - -template -double tensor_average(Tensor4dPtr& out_tensor_p) { - double sum = 0.0f; -#ifdef USE_CUDA - float* h_data = new float[out_tensor_p->valid_size()]; - const float* d_data = out_tensor_p->data(); - CUDA_CHECK(cudaMemcpy(h_data, d_data, out_tensor_p->valid_size()*sizeof(float), cudaMemcpyDeviceToHost)); -#else - float* h_data = out_tensor_p->data(); -#endif - for (int i=0; ivalid_size(); i++) { - sum+=h_data[i]; - } - return sum/out_tensor_p->valid_size(); -} - - -#ifdef USE_X86_PLACE -static int record_dev_tensorfile(const Tensor4d* dev_tensor, const char* locate) { - Tensor::type, AK_FLOAT, NCHW> host_temp; - host_temp.re_alloc(dev_tensor->valid_shape()); - host_temp.copy_from(*dev_tensor); - FILE* fp = fopen(locate, "w+"); - int size = host_temp.valid_shape().count(); - if (fp == 0) { - LOG(ERROR) << "[ FAILED ] file open target txt: " << locate; - } else { - for (int i = 0; i < size; ++i) { - fprintf(fp, "%.18f \n", i, (host_temp.data()[i])); - } - fclose(fp); - } - LOG(INFO) << "[ SUCCESS ] Write " << size << " data to: " << locate; - return 0; -} -#endif - -#endif - - diff --git a/test/framework/net/padde_api_test.cpp b/test/framework/net/padde_api_test.cpp deleted file mode 100644 index 6e0dfe878..000000000 --- a/test/framework/net/padde_api_test.cpp +++ /dev/null @@ -1,121 +0,0 @@ -#include -#include "net_test.h" -#include "saber/funcs/timer.h" -#include -#include "saber/core/tensor_op.h" -#include 
-#include -#include -#include -#include -#include -#include "paddle_api.h" -#define DEFINE_GLOBAL(type, var, value) \ - type (GLB_##var) = (value) -DEFINE_GLOBAL(std::string, model_dir, ""); -DEFINE_GLOBAL(int, num, 1); -DEFINE_GLOBAL(int, channel, 8); -DEFINE_GLOBAL(int, height, 640); -DEFINE_GLOBAL(int, width, 640); -DEFINE_GLOBAL(bool, is_input_shape, false); - -#ifdef USE_CUDA -typedef NV Target; -#elif defined(USE_X86_PLACE) -typedef X86 Target; -#else -typedef ARM Target; -#endif - -void getModels(std::string path, std::vector& files) -{ - DIR *dir; - struct dirent *ptr; - if((dir=opendir(path.c_str()))==NULL){ - perror("Open dri error..."); - exit(1); - } - while((ptr=readdir(dir))!=NULL){ - if(strcmp(ptr->d_name,".")==0||strcmp(ptr->d_name,"..")==0) - continue; - else if(ptr->d_type==8)//file - files.push_back(path+"/"+ptr->d_name); - else if(ptr->d_type==4){ - //files.push_back(ptr->d_name);//dir - getModels(path+"/"+ptr->d_name,files); - } - } - closedir(dir); -} - - -TEST(NetTest, net_execute_base_test) { - std::vector models; - getModels(GLB_model_dir, models); - for (auto iter = models.begin(); iter < models.end(); iter++) - { - AnakinEngine anakin_engine; - LOG(WARNING) << "load anakin model file from " << *iter << " ..."; - std::vector shape{GLB_num, GLB_channel, GLB_height, GLB_width}; - //anakin_engine.Build(*iter, shape); - anakin_engine.Build(*iter); - - printf("Args = %d %d %d %d\n",GLB_num, GLB_channel, GLB_height, GLB_width); - //fill input - Tensor4d h_tensor_in; - h_tensor_in.re_alloc({GLB_num, GLB_channel, GLB_height, GLB_width}); - fill_tensor_host_rand(h_tensor_in, -1.0f,1.0f); - - anakin_engine.SetInputFromCPU("input_0", h_tensor_in.data(), h_tensor_in.valid_size()); - - int warmup_iter = 10; - int epoch = 1000; - // do inference - Context ctx(0, 0, 0); - saber::SaberTimer my_time; - LOG(WARNING) << "EXECUTER !!!!!!!! 
"; - for (int i = 0; i < warmup_iter; i++) { - anakin_engine.Execute(); - } - my_time.start(ctx); - //auto start = std::chrono::system_clock::now(); - for (int i = 0; i < epoch; i++) { - anakin_engine.Execute(); - } - my_time.end(ctx); - LOG(INFO) << *iter << " aveage time "<< my_time.get_average_ms() / epoch << " ms"; - } -} - -int main(int argc, const char** argv){ - // initial logger - LOG(INFO)<<"argc"< -#include "saber/funcs/timer.h" -#include -#include "saber/core/tensor_op.h" -#include "saber/saber_types.h" -#include -#include -#include -#include -#include -#include - -class EngineBase { - public: - // Build the model and do some preparation, for example, in TensorRT, run - // createInferBuilder, buildCudaEngine. - virtual void Build(const std::string& model_file, int batch_size = 1) = 0; - virtual void Build(const std::string& model_file, const std::vector& shape) = 0; - // Execute the engine, that will run the inference network. - virtual void Execute() = 0; - - virtual ~EngineBase() {} -}; // class EngineBase - -template -class AnakinEngine : public EngineBase { -public: - typedef typename anakin::saber::DataTrait::dtype Dtype_t; - typedef anakin::saber::TargetWrapper X86_API; - typedef anakin::saber::TargetWrapper NV_API; - AnakinEngine(){} - - ~AnakinEngine(){}; - - void Build(const std::string& model_file, int batch_size = 1) override - { - _graph.load(model_file); - _graph.ResetBatchSize("input_0", batch_size); - _graph.Optimize(); - _net_executer.init(_graph); - }; - - void Build(const std::string& model_file, const std::vector& shape) override - { - _graph.load(model_file); - _graph.Reshape("input_0", shape); - _graph.Optimize(); - _net_executer.init(_graph); - }; - - void Execute() override - { - _net_executer.prediction(); - }; - - // Fill an input from CPU memory with name and size. 
- void SetInputFromCPU(const std::string name, Dtype_t* data, size_t size) - { - auto input_tensor = _net_executer.get_in(name); - anakin::Tensor tmp_tensor(data, anakin::saber::X86(), X86_API::get_device_id(), input_tensor->valid_shape()); - *input_tensor = tmp_tensor; - }; - - // accessed directly. Fill an input from GPU memory with name and size. - void SetInputFromGPU(const std::string& name, Dtype_t* data, size_t size) - { - auto input_tensor = _net_executer.get_in(name); - CHECK_EQ(size, input_tensor->valid_size()); - anakin::Tensor tmp_tensor(data, NV(), NV_API::get_device_id(), input_tensor->valid_shape()); - *input_tensor = tmp_tensor; - }; - // Get an output called name, the output of tensorrt is in GPU, so this method - // will just return the output's GPU memory address. - anakin::Tensor* GetOutputInGPU(const std::string& name) - { - return _net_executer.get_out(name); - } - -private: - anakin::graph::Graph _graph; - anakin::Net _net_executer; -}; // class TensorRTEngine -template -class AnakinEngine; - - diff --git a/test/framework/operators/operator_tests.h b/test/framework/operators/operator_tests.h deleted file mode 100644 index 38f16b87d..000000000 --- a/test/framework/operators/operator_tests.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_OPERATOR_TESTS_H -#define ANAKIN_OPERATOR_TESTS_H - -#include -#include -#include -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "framework/operators/ops.h" - -using namespace anakin; -using ::anakin::test::Test; - -class OperatorsTest : public Test { -public: - OperatorsTest(){} - - void SetUp(){} - - void TearDown(){} - -protected: -}; - - - - - - -#endif - - diff --git a/test/framework/operators/pooling_test.cpp b/test/framework/operators/pooling_test.cpp deleted file mode 100644 index 47b66be23..000000000 --- a/test/framework/operators/pooling_test.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "operator_tests.h" -#include "thread_pool.h" - -#ifdef USE_CUDA -using Target = NV; -#elif defined(USE_X86_PLACE) -using Target = X86; -#else -using Target = ARM; -#endif - -TEST(OperatorsTest, PoolingFactoryTest) { - OpContext opctx; - std::vector > in; - std::vector > out; - - - /*Operator*/ auto* Op_name1 = - OpFactory::Global()["pooling"]; - /*Operator**/auto* Op_name2 = - OpFactory::Global()["pool"]; - auto& op_list = OpFactory::Global().get_list_op_name(); - - for (auto& item : op_list) { - LOG(INFO) << " op: " << item; - } - - LOG(WARNING) << " op name alias 1 : pooling"; - LOG(INFO) << " run forward function"; - (*Op_name1)(opctx, in, out); - LOG(WARNING) << " op name alias 2 : pool"; - LOG(INFO) << " run forward function"; - (*Op_name2)(opctx, in, out); -} - - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp deleted file mode 100644 index ea8d7101d..000000000 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ /dev/null @@ -1,126 +0,0 @@ -#include "test_saber_buffer_BM.h" -#include "saber/core/buffer.h" -#include "saber/core/data_traits.h" - -using namespace anakin::saber; - -static bm_handle_t handle; - -int get_bm_size() { - 
return 1; -} - -template -void test_buffer() { - - //TODO: init in another place - bmdnn_init(&handle); - - typedef TargetWrapper X86_API; - typedef TargetWrapper BM_API; - typedef typename DataTrait::dtype Ddtype; - typedef typename DataTrait::dtype Hdtype; - typedef Buffer BufferH; - typedef Buffer BufferD; - - int n0 = 1024; - int n1 = 2048; - - void* tmp_x86; - Hdtype* x86_ptr; - X86_API::mem_alloc(&tmp_x86, sizeof(Hdtype) * n0); - x86_ptr = static_cast(tmp_x86); - - for (int i = 0; i < n0; i++) { - x86_ptr[i] = static_cast(i); - } - - void* tmp_bm; - Ddtype* bm_ptr; - BM_API::mem_alloc(&tmp_bm, get_bm_size() * n0); - bm_ptr = static_cast(tmp_bm); - - LOG(INFO) << "Buffer: test default(empty) constructor"; - BufferH x86_buf0; - BufferD bm_buf0; - - LOG(INFO) << "Buffer: test constructor with data size"; - BufferH x86_buf1(n0 * sizeof(Hdtype)); - BufferD bm_buf1(n0 * sizeof(Ddtype)); - - LOG(INFO) << "Buffer: test constructor with data pointer, size and device id"; - BufferH x86_buf2(x86_ptr, n0 * sizeof(Hdtype), X86_API::get_device_id()); - BufferD bm_buf2(bm_ptr, n0 * get_bm_size(), BM_API::get_device_id()); - - LOG(INFO) << "Buffer: test copy constructor"; - BufferH x86_buf3(x86_buf2); - LOG(INFO) << "BM Buffer copy constructor"; - LOG(INFO) << "bm target id: " << BM_API::get_device_id(); - LOG(INFO) << "bm buffer target id: " << bm_buf2.get_id(); - BufferD bm_buf3(bm_buf2); - CHECK_EQ(x86_buf3.get_count(), x86_buf2.get_count()) << \ - "shared buffer should have same data count"; - CHECK_EQ(bm_buf3.get_count(), bm_buf2.get_count()) << \ - "shared buffer should have same data count"; - - LOG(INFO) << "Buffer: test operator ="; - x86_buf0 = x86_buf2; - bm_buf0 = bm_buf2; - CHECK_EQ(x86_buf0.get_count(), x86_buf2.get_count()) << \ - "shared buffer should have same data count"; - CHECK_EQ(bm_buf0.get_count(), bm_buf2.get_count()) << \ - "shared buffer should have same data count"; - - LOG(INFO) << "Buffer: test re_alloc"; - x86_buf1.re_alloc(n1 * sizeof(Hdtype)); 
- bm_buf1.re_alloc(n1 * sizeof(Ddtype)); - CHECK_EQ(x86_buf1.get_count(), n1 * sizeof(Hdtype)) << "buffer count error"; - CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error"; - CHECK_EQ(bm_buf1.get_count(), n1 * sizeof(Ddtype)) << "buffer count error"; - CHECK_EQ(bm_buf1.get_capacity(), n1 * sizeof(Ddtype)) << "buffer capacity error"; - x86_buf1.re_alloc(n0 * sizeof(Hdtype)); - bm_buf1.re_alloc(n0 * sizeof(Ddtype)); - CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error"; - CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error"; - CHECK_EQ(x86_buf1.get_count(), n0 * sizeof(Hdtype)) << "buffer count error"; - CHECK_EQ(x86_buf1.get_capacity(), n1 * sizeof(Hdtype)) << "buffer capacity error"; - - LOG(INFO) << "Buffer: test get_id()"; - LOG(INFO) << "X86 device id: " << x86_buf0.get_id() << \ - ", bm device id: " << bm_buf0.get_id(); - CHECK_EQ(X86_API::get_device_id(), x86_buf0.get_id()) << "x86 device id error"; - CHECK_EQ(BM_API::get_device_id(), bm_buf0.get_id()) << "bm device id error"; - - LOG(INFO) << "Buffer: test deep_cpy()"; - x86_buf1.sync_copy_from(x86_buf2); - LOG(INFO) << "deep copy between two host buffer: "; - const Hdtype* ptr1 = static_cast(x86_buf1.get_data()); - const Hdtype* ptr2 = static_cast(x86_buf1.get_data()); - - for (int i = 0; i < 10; i++) { - std::cout << ptr1[i] << std::endl; - } - - CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect"; - LOG(INFO) << "deep copy from host buffer to device buffer"; - bm_buf1.sync_copy_from(x86_buf2); - x86_buf1.sync_copy_from(bm_buf1); - LOG(INFO) << "deep copy from device buffer to host buffer: "; - ptr1 = static_cast(x86_buf1.get_data()); - - for (int i = 0; i < 10; i++) { - std::cout << ptr1[i] << std::endl; - } -} - -TEST(TestSaberBufferBM, test_buffer_memcpy) { - test_buffer(); -} - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - 
RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h deleted file mode 100644 index 8bbbe4511..000000000 --- a/test/saber/bm/test_saber_buffer_BM.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H -#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" - -using namespace anakin::test; - -class TestSaberBufferBM : public Test { -public: - TestSaberBufferBM() {} - ~TestSaberBufferBM() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H diff --git a/test/saber/bm/test_saber_context_BM.cpp b/test/saber/bm/test_saber_context_BM.cpp deleted file mode 100644 index ed93866cf..000000000 --- a/test/saber/bm/test_saber_context_BM.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "test_saber_context_BM.h" - -#ifdef USE_BM - -using namespace anakin::saber; - -TEST(TestSaberContextBM, test_BM_context) { - Env::env_init(); - typedef TargetWrapper API; - typename API::event_t event; - API::create_event(event); - LOG(INFO) << "test context constructor"; - Context ctx0; - Context ctx1(0, 1, 1); - - //for BM no need to test stream as it is not in use -} - -#endif - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h deleted file mode 100644 index 653ee11fd..000000000 --- a/test/saber/bm/test_saber_context_BM.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef SABER_TEST_SABER_CONTEXT_BM_H -#define SABER_TEST_SABER_CONTEXT_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/context.h" - -using namespace anakin::test; - -class TestSaberContextBM : public Test { -public: - TestSaberContextBM() {} - ~TestSaberContextBM() {} - 
-protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //SABER_TEST_SABER_CONTEXT_BM_H diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp deleted file mode 100644 index 1c7086cf1..000000000 --- a/test/saber/bm/test_saber_device_BM.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include "test_saber_device_BM.h" - -#ifdef USE_BM - -using namespace anakin::saber; - -TEST(TestSaberDeviceBM, test_BM_device) { - Device dev_BM; -} - -#endif - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h deleted file mode 100644 index 3a6d61236..000000000 --- a/test/saber/bm/test_saber_device_BM.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef SABER_TEST_SABER_DEVICE_BM_H -#define SABER_TEST_SABER_DEVICE_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/device.h" - -using namespace anakin::test; - -class TestSaberDeviceBM : public Test { -public: - TestSaberDeviceBM() {} - ~TestSaberDeviceBM() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //SABER_TEST_SABER_DEVICE_BM_H diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h deleted file mode 100644 index 61d27d6f9..000000000 --- a/test/saber/bm/test_saber_func_BM.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H -#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/tensor.h" -#include -#include - -using namespace anakin::test; - -int read_file(std::vector &results, const char* file_name) { - - std::ifstream infile(file_name); - if (!infile.good()) { - std::cout << "Cannot open " << std::endl; - return false; - } - LOG(INFO)<<"found filename: "< - -using namespace anakin::saber; - 
-template -void print_tensor_shape(std::string name, Tensor& t0) { - - LOG(INFO) << name << " valid shape is [" - << t0.valid_shape()[0] << ", " - << t0.valid_shape()[1] << ", " - << t0.valid_shape()[2] << ", " - << t0.valid_shape()[3] << "]."; - - LOG(INFO) << name << " real shape is [" - << t0.shape()[0] << ", " - << t0.shape()[1] << ", " - << t0.shape()[2] << ", " - << t0.shape()[3] << "]."; - - LOG(INFO) << name << " offset is [" - << t0.offset()[0] << ", " - << t0.offset()[1] << ", " - << t0.offset()[2] << ", " - << t0.offset()[3] << "]."; -} - -TEST(TestSaberFuncBM, test_func_constructor) { - - typedef Tensor TensorHf4; - typedef Tensor TensorDf4; - - int img_num = 1; - int in_channels = 2; - int img_h = 8; - int img_w = 8; - - Shape img_s(img_num, in_channels, img_h, img_w); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1); - } - - img_dev.copy_from(img_host); - TensorDf4 output_dev; - - // start Reshape & doInfer - - Context ctx1(0, 1, 1); - - ActivationParam param(Active_relu, 0.1f, 0.1f); - - std::vector input; - std::vector output; - - input.push_back(&img_dev); - output.push_back(&output_dev); - - Activation act; - act.compute_output_shape(input, output, param); - output_dev.re_alloc(output[0]->shape()); - - // init assume output tensor has been reshpaed by user. 
- act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); - act(input, output, param, ctx1); - - print_tensor_device(output_dev); -} - -int main(int argc, const char** argv) { - Env::env_init(); - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp deleted file mode 100644 index 7881cdb97..000000000 --- a/test/saber/bm/test_saber_func_conv_BM.cpp +++ /dev/null @@ -1,725 +0,0 @@ -#include "core/context.h" -#include "funcs/conv.h" -#include "test_saber_func_BM.h" -#include "tensor_op.h" -#include "saber_types.h" -#include -//#include "cublas.h" - -using namespace anakin::saber; - -typedef Tensor TensorHf4; -typedef Tensor TensorDf4; - -template -void print_tensor_shape(std::string name, Tensor &t0) { - - LOG(INFO) << name << " valid shape is [" - << t0.valid_shape()[0] << ", " - << t0.valid_shape()[1] << ", " - << t0.valid_shape()[2] << ", " - << t0.valid_shape()[3] << "]."; - - LOG(INFO) << name << " real shape is [" - << t0.shape()[0] << ", " - << t0.shape()[1] << ", " - << t0.shape()[2] << ", " - << t0.shape()[3] << "]."; - - LOG(INFO) << name << " offset is [" - << t0.offset()[0] << ", " - << t0.offset()[1] << ", " - << t0.offset()[2] << ", " - << t0.offset()[3] << "]."; -} - - - -#if 1 -TEST(TestSaberFuncBM, test_depthwise_conv) { - - int group = 2; - int pad_h = 1; - int pad_w = 1; - int stride_h = 1; - int stride_w = 1; - int dilation_h = 1; - int dilation_w = 1; - - int kernel_h = 3; - int kernel_w = 3; - int out_channels = 2; - - int img_num = 1; - int in_channels = 2; - int img_h = 8; - int img_w = 8; - - bool bias_term = true; - - LOG(INFO) << "conv param: "; - LOG(INFO) << " img_num = " << img_num; - LOG(INFO) << " in_channels = " << in_channels; - LOG(INFO) << " img_h = " << img_h; - LOG(INFO) << " img_w = " << img_w; - LOG(INFO) << " group = " << group; - LOG(INFO) << " pad_h = " << pad_h; - LOG(INFO) 
<< " pad_w = " << pad_w; - LOG(INFO) << " stride_h = " << stride_h; - LOG(INFO) << " stride_w = " << stride_w; - LOG(INFO) << " dilation_h = " << dilation_h; - LOG(INFO) << " dilation_w = " << dilation_w; - LOG(INFO) << " kernel_h = " << kernel_h; - LOG(INFO) << " kernel_w = " << kernel_w; - LOG(INFO) << " out_channels = " << out_channels; - - Shape img_s(img_num, in_channels, img_h, img_w); - Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); - Shape bias_s(1, out_channels, 1, 1); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 63 & i; - } - - img_dev.copy_from(img_host); - - TensorHf4 weights_host; - TensorDf4 weights_dev; - - weights_host.re_alloc(weights_s); - weights_dev.re_alloc(weights_s); - - fill_tensor_host_const(weights_host, 1.f); - weights_dev.copy_from(weights_host); - - TensorHf4 bias_host; - TensorDf4 bias_dev; - - if (bias_term) { - bias_host.re_alloc(bias_s); - bias_dev.re_alloc(bias_s); - - fill_tensor_host_const(bias_host, 1.f); - bias_dev.copy_from(bias_host); - } - - TensorHf4 output_host; - TensorDf4 output_dev; - - // start Reshape & doInfer - Context ctx1(0, 1, 1); - - ConvParam param(group, pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - &weights_dev, &bias_dev); - - std::vector input; - std::vector output; - - input.push_back(&img_dev); - output.push_back(&output_dev); - - Conv conv; - conv.compute_output_shape(input, output, param); - - output_dev.re_alloc(output[0]->shape()); - output_host.re_alloc(output[0]->shape()); - - LOG(INFO) << "regular start with group = " << group; - // init assume output tensor has been reshpaed by user. 
- conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); - - conv(input, output, param, ctx1); - - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); - - output_dev.sync(); - print_tensor_device(output_dev); - -// param.group = 1; -// param.pad_h = 1; -// param.pad_w = 1; -// -// LOG(INFO) << " param changed start with group = "< ctx1(0, 1, 1); - Context ctx2(0, 2, 2); - - TensorDf4 out0; - TensorDf4 out1; - - ConvParam param0(group, pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - &weights_dev, &bias_dev); - - ConvParam param1(group, pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - &weights_dev, &bias_dev); - - std::vector input0, input1; - std::vector output0, output1; - - input0.push_back(&t0); - input1.push_back(&t1); - - output0.push_back(&out0); - output1.push_back(&out1); - - // FIXME ? where do i get output shape - output_dev.re_alloc(img_s); - - Conv conv0; - Conv conv1; - - conv0.compute_output_shape(input0, output0, param0); - conv1.compute_output_shape(input1, output1, param1); - - out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0}); - out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4}); - - conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1); - conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2); - - conv0(input0, output0, param0, ctx1); - conv1(input1, output1, param1, ctx2); - - cudaStream_t cuda_stream1 = ctx1.get_compute_stream(); - output0[0]->record_event(cuda_stream1); - - cudaStream_t cuda_stream2 = ctx2.get_compute_stream(); - output1[0]->record_event(cuda_stream2); - - out0.sync(); - out1.sync(); - - print_tensor_device(output_dev); - -// print_tensor_device(output_dev); - - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); -} -#endif - -TEST(TestSaberFuncBM, test_conv_fp32_speed_test) { - - int group = 1; - int pad_h = 1; - int pad_w = 1; - int stride_h = 1; - int stride_w = 1; - int 
dilation_h = 1; - int dilation_w = 1; - - int kernel_h = 1; - int kernel_w = 1; - int out_channels = 128; - - int img_num = 7; - int in_channels = 13; - int img_h = 32; - int img_w = 32; - - bool bias_term = false; - - LOG(INFO) << "conv param: "; - LOG(INFO) << " img_num = " << img_num; - LOG(INFO) << " in_channels = " << in_channels; - LOG(INFO) << " img_h = " << img_h; - LOG(INFO) << " img_w = " << img_w; - LOG(INFO) << " group = " << group; - LOG(INFO) << " pad_h = " << pad_h; - LOG(INFO) << " pad_w = " << pad_w; - LOG(INFO) << " stride_h = " << stride_h; - LOG(INFO) << " stride_w = " << stride_w; - LOG(INFO) << " dilation_h = " << dilation_h; - LOG(INFO) << " dilation_w = " << dilation_w; - LOG(INFO) << " kernel_h = " << kernel_h; - LOG(INFO) << " kernel_w = " << kernel_w; - LOG(INFO) << " out_channels = " << out_channels; - Shape img_s(img_num, in_channels, img_h, img_w); - Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); - Shape bias_s(1, out_channels, 1, 1); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 1; - } - - img_dev.copy_from(img_host); - - TensorHf4 weights_host; - TensorDf4 weights_dev; - - weights_host.re_alloc(weights_s); - weights_dev.re_alloc(weights_s); - - fill_tensor_host_const(weights_host, 1.f); - weights_dev.copy_from(weights_host); - - TensorHf4 bias_host; - TensorDf4 bias_dev; - - if (bias_term) { - bias_host.re_alloc(bias_s); - bias_dev.re_alloc(bias_s); - - fill_tensor_host_const(bias_host, 1.f); - bias_dev.copy_from(bias_host); - } - - TensorDf4 output_dev; - - // start Reshape & doInfer - Context ctx1(0, 1, 1); - - ConvParam param(group, pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - &weights_dev, &bias_dev); - - std::vector input; - std::vector output; - - input.push_back(&img_dev); - output.push_back(&output_dev); - - Conv conv; - conv.compute_output_shape(input, 
output, param); - - output_dev.re_alloc(output[0]->shape()); - LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \ - << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]"; - //LOG(INFO) << " blocks = [ " << i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; - //选择k最小的那一组,如果一样,则选128*N,N最大的那一组 - int k0 = i_div_up(out_channels, 128) * 128 - out_channels; - int k1 = i_div_up(out_channels, 64) * 64 - out_channels; - int k2 = i_div_up(out_channels, 32) * 32 - out_channels; - int kk = std::min(std::min(k0,k1),k2); - LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk; - if (kk == k0) - LOG(INFO) << "thread = [256,1,1] 128*128" ; - if (kk == k1) - LOG(INFO) << "thread = [128,1,1] 128*64" ; - if (kk == k2) - LOG(INFO) << "thread = [128,1,1] 128*32" ; - - LOG(INFO) << "saber conv init"; - conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1); - - LOG(INFO) << "saber conv dispatch"; - conv(input, output, param, ctx1); - - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); - - output_dev.sync(); - - SaberTimer t1; - int ts = 1; - - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - conv(input, output, param, ctx1); - output_dev.sync(); - t1.end(ctx1); - } - - LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms"; - - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); -} - -void test_conv_fp32_speed(std::vector &inputs, std::vector &outputs, - TensorDf4 &weights, int kernel_size, int stride, int pad, - int in_channel, int out_channel, TensorDf4 &bias, - anakin::saber::ImplEnum impl) { - - ConvParam conv_param(1, pad, pad, - stride, stride, - 1, 1, - &weights, &bias); - Conv conv; - conv.compute_output_shape(inputs, outputs, conv_param); - outputs[0]->re_alloc(outputs[0]->shape()); - Context ctx1(0, 1, 1); - - SABER_CHECK(conv.init(inputs, outputs, 
conv_param, SPECIFY, impl, ctx1)); - - conv(inputs, outputs, conv_param, ctx1); - outputs[0]->record_event(ctx1.get_compute_stream()); - outputs[0]->sync(); - - cudaDeviceSynchronize(); - - SaberTimer t1; - int ts = 100; - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - conv(inputs, outputs, conv_param, ctx1); - outputs[0]->record_event(ctx1.get_compute_stream()); - outputs[0]->sync(); - t1.end(ctx1); - } - LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; - - cudaDeviceSynchronize(); -} - - -cublasHandle_t cublas_handle; - -void caffe_gemm(const int M, const int N, const int K,\ - const float alpha, const float* A,\ - const float* B, const float beta, float* C) { - int lda = K; - int ldb = N; - CUBLAS_CHECK(cublasSgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - N, M, K, - &alpha, B, - ldb, A, - lda, &beta, - C, N)); -} - -TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) { - int img_num = 1; - int kernel = 1; - -// int out_channels = 32; -// int in_channels = 128; -// int img_h = 52; -// int img_w = 112; -// int out_channels = 64; -// int in_channels = 256; -// int img_h = 26; -// int img_w = 56; - int out_channels = 128; - int in_channels = 512; - int img_h = 13; - int img_w = 28; - -// int out_channels = 512; -// int in_channels = 128; -// int img_h = 13; -// int img_w = 28; - - int pad = 0; - int stride = 1; - Context ctx1(0, 1, 1); - - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream())); - - TensorDf4 weights; - weights.re_alloc({out_channels, in_channels, 1, 1}); - - TensorDf4 img; - img.re_alloc({1, in_channels, img_h, img_w}); - - TensorDf4 out; - out.re_alloc({1, out_channels, img_h, img_w}); - TensorDf4 out_gemm; - out_gemm.re_alloc({1, out_channels, img_h, img_w}); - - fill_tensor_device_rand(weights, -1.f, 1.f); - fill_tensor_device_rand(img, -1.f, 1.f); - - LOG(INFO) << "img_num: " << img_num; - LOG(INFO) << "kernel: " << kernel; - LOG(INFO) << "out_channels: " << 
out_channels; - LOG(INFO) << "in_channels: " << in_channels; - LOG(INFO) << "img_h: " << img_h; - LOG(INFO) << "img_w: " << img_w; - LOG(INFO) << "pad: " << pad; - LOG(INFO) << "stride: " << stride; - - TensorDf4 bias; - - std::vector input_v; - std::vector output_gemm_v, output_v; - - input_v.push_back(&img); - output_v.push_back(&out); - output_gemm_v.push_back(&out_gemm); - cudaDeviceSynchronize(); - test_conv_fp32_speed(input_v, output_v, - weights, kernel, stride, pad, - in_channels, out_channels, bias, - SABER_IMPL); - cudaDeviceSynchronize(); - caffe_gemm(out_channels, img_h * img_w, in_channels,\ - 1.f, weights.data(),\ - img.data(), 0.f, out_gemm.mutable_data()); - cudaDeviceSynchronize(); - SaberTimer t1; - int ts = 100; - - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - caffe_gemm(out_channels, img_h * img_w, in_channels,\ - 1.f, weights.data(),\ - img.data(), 0.f, out_gemm.mutable_data()); - out_gemm.record_event(ctx1.get_compute_stream()); - out_gemm.sync(); - t1.end(ctx1); - } - LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; - - cudaDeviceSynchronize(); -// print_tensor_device(out); -// print_tensor_device(out_gemm); - TensorHf4 out_host; - TensorHf4 out_gemm_host; - out_host.re_alloc(out.shape()); - out_host.copy_from(out); - - out_gemm_host.re_alloc(out_gemm.shape()); - out_gemm_host.copy_from(out_gemm); - double max_r, max_d; - tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d); - LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d; -} - -int main(int argc, const char** argv){ - anakin::saber::Env::env_init(); - - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp deleted file mode 100644 index 869ff1bfd..000000000 --- a/test/saber/bm/test_saber_func_fc_BM.cpp +++ /dev/null @@ -1,146 +0,0 @@ -#include "core/context.h" -#include 
"funcs/fc.h" -#include "test_saber_func_BM.h" -#include "tensor_op.h" -#include "saber_types.h" -#include - -using namespace anakin::saber; -typedef TargetWrapper API; -typedef Tensor TensorDf4; -typedef Tensor TensorHf4; -typedef TensorDf4::Dtype ftype; - -void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \ - const TensorHf4& bias, TensorHf4& tout) { - - int m = tin.num(); - int k = tin.valid_size() / m; - int n = weight.valid_size() / k; - bool bias_term = bias.valid_size() > 0; - - const float* din = tin.data(); - const float* w = weight.data(); - float* dout = tout.mutable_data(); - - for (int i = 0; i < m; ++i) { - float* pdout = dout + i * n; - const float* pdin = din + i * k; - - for (int j = 0; j < n; ++j) { - if (bias_term) { - pdout[j] = bias.data()[j]; - } else { - pdout[j] = 0; - } - - for (int l = 0; l < k; ++l) { - pdout[j] += pdin[l] * w[l * n + j]; - } - } - } -} - -TEST(TestSaberFuncBM, test_func_fc) { - - int test_iter = 100; - int w_in = 7; - int h_in = 7; - int ch_in = 512; - int num_in = 1; - - int num_out = 4096; - int axis = 1; - - Shape shape_in(num_in, ch_in, h_in, w_in); - Shape shape_out = {num_in, num_out, 1, 1}; - - Shape sh_w{1, 1, w_in* h_in * ch_in, num_out}; - TensorDf4 weight(sh_w); - Shape sh_b{1, 1, 1, num_out}; - TensorDf4 bias(sh_b); - fill_tensor_device_const(weight, 1.f); - fill_tensor_device_const(bias, 1.f); - - LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ - ch_in << ", height=" << h_in << ", width=" << w_in; - - std::vector input_dev_4d; - std::vector output_dev_4d; - - TensorDf4 tdin; - TensorDf4 tdout; - tdin.re_alloc(shape_in); - fill_tensor_device_const(tdin, 1.f); - input_dev_4d.push_back(&tdin); - output_dev_4d.push_back(&tdout); - - // start Reshape & doInfer - Context ctx_dev(0, 1, 1); - - FcParam param(&weight, &bias, num_out, axis); - - Fc fc; - - LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ - shape_out[2] << ", " << shape_out[3]; - - 
SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param)); - - LOG(INFO) << "re-alloc tensor buffer"; - output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape()); - Shape va_sh = tdout.valid_shape(); - LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \ - va_sh[2] << ", " << va_sh[3]; - CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error"; - - LOG(INFO) << "FC initialization"; - SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev)); - - LOG(INFO) << "FC compute"; - SaberTimer t1; - t1.clear(); - t1.start(ctx_dev); - - for (int i = 0; i < test_iter; ++i) { - SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev)); - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); - //cudaDeviceSynchronize(); - } - - t1.end(ctx_dev); - float ts = t1.get_average_ms(); - LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter; - //print_tensor_device(*output_dev_4d[0]); - - //! 
check result - TensorHf4 thin(shape_in); - TensorHf4 thout(shape_out); - TensorHf4 thw(sh_w); - TensorHf4 thb(sh_b); - thin.copy_from(tdin); - thw.copy_from(weight); - thb.copy_from(bias); - fc_compute(thin, thw, thb, thout); - //print_tensor_host(thout); - - TensorHf4 thout_d(shape_out); - thout_d.copy_from(tdout); - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result"; - -} - -int main(int argc, const char** argv) { - // initial logger - //logger::init(argv[0]); - Env::env_init(); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp deleted file mode 100644 index 04b963675..000000000 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ /dev/null @@ -1,311 +0,0 @@ -#include "core/context.h" -#include "funcs/pooling.h" -#include "test_saber_func_BM.h" -#include "tensor_op.h" -#include "saber_types.h" -#include "funcs/timer.h" -#include - -using namespace anakin::saber; - -TEST(TestSaberFuncBM, test_func_pooling) { - - Env::env_init(); - typedef TargetWrapper API; - typename API::event_t event; - API::create_event(event); - - typedef TargetWrapper X86_API; - typedef TargetWrapper BM_API; - typedef Tensor TensorHf4; - typedef Tensor TensorDf4; - - int img_num = 1; - int in_channels = 4; - int img_h = 800; - int img_w = 1440; - - Shape img_s(img_num, in_channels, img_h, img_w); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 0x7f & i; - } - - img_dev.copy_from(img_host); - - TensorHf4 output_host; - TensorDf4 output_dev; - - // start Reshape & doInfer - - Context ctx1(0, 1, 1); - int window_h = 2; - int 
window_w = 2; - int pad_h = 1; - int pad_w = 1; - int stride_h = 1; - int stride_w = 1; - LOG(INFO) << " img_num: " << img_num; - LOG(INFO) << " in_channels: " << in_channels; - LOG(INFO) << " img_h: " << img_h; - LOG(INFO) << " img_w: " << img_w; - LOG(INFO) << " window_h: " << window_h; - LOG(INFO) << " window_w: " << window_w; - LOG(INFO) << " pad_h: " << pad_h; - LOG(INFO) << " pad_w: " << pad_w; - LOG(INFO) << " stride_h: " << stride_h; - LOG(INFO) << " stride_w: " << stride_w; - - PoolingParam param(window_h, window_w, pad_h, pad_w - , stride_h, stride_w, Pooling_max); - - std::vector input; - std::vector output; - - input.push_back(&img_dev); - output.push_back(&output_dev); - - Pooling pooling; - pooling.compute_output_shape(input, output, param); - - output_dev.re_alloc(output[0]->shape()); - output_host.re_alloc(output[0]->shape()); - - // init assume output tensor has been reshpaed by user. - pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); - pooling(input, output, param, ctx1); - - SaberTimer t1; - int ts = 1000; - - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - pooling(input, output, param, ctx1); - output[0]->sync(); - t1.end(ctx1); - } - - output_dev.sync(); - cudaDeviceSynchronize(); - LOG(INFO) << " average time: " << t1.get_average_ms() << " ms"; - LOG(INFO) << " tile 10% time: " << t1.get_tile_time(10) << " ms"; - LOG(INFO) << " tile 50% time: " << t1.get_tile_time(50) << " ms"; - LOG(INFO) << " tile 90% time: " << t1.get_tile_time(90) << " ms"; - LOG(INFO) << " tile 95% time: " << t1.get_tile_time(95) << " ms"; - LOG(INFO) << " tile 99% time: " << t1.get_tile_time(99) << " ms"; - - CUDA_CHECK(cudaPeekAtLastError()); -} - -TEST(TestSaberFuncBM, test_pooling_result) { - - Env::env_init(); - typedef TargetWrapper API; - typename API::event_t event; - API::create_event(event); - - typedef TargetWrapper X86_API; - typedef TargetWrapper BM_API; - typedef Tensor TensorHf4; - typedef Tensor TensorDf4; - - int img_num = 1; - int 
in_channels = 2; - int img_h = 8; - int img_w = 8; - - Shape img_s(img_num, in_channels, img_h, img_w); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 0x7f & i; - } - - img_dev.copy_from(img_host); - - TensorDf4 output_dev; - - // start Reshape & doInfer - - Context ctx1(0, 1, 1); - int window_h = 2; - int window_w = 2; - int pad_h = 1; - int pad_w = 1; - int stride_h = 1; - int stride_w = 1; - - LOG(INFO) << " img_num: " << img_num; - LOG(INFO) << " in_channels: " << in_channels; - LOG(INFO) << " img_h: " << img_h; - LOG(INFO) << " img_w: " << img_w; - LOG(INFO) << " window_h: " << window_h; - LOG(INFO) << " window_w: " << window_w; - LOG(INFO) << " pad_h: " << pad_h; - LOG(INFO) << " pad_w: " << pad_w; - LOG(INFO) << " stride_h: " << stride_h; - LOG(INFO) << " stride_w: " << stride_w; - - PoolingParam param(window_h, window_w, pad_h, pad_w - , stride_h, stride_w, Pooling_max); - - std::vector input; - std::vector output; - - input.push_back(&img_dev); - output.push_back(&output_dev); - - Pooling pooling; - pooling.compute_output_shape(input, output, param); - - output_dev.re_alloc(output[0]->shape()); - - // init assume output tensor has been reshpaed by user. 
- pooling.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); - pooling(input, output, param, ctx1); - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); - - output_dev.sync(); - print_tensor_device(output_dev); - - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); -} - -TEST(TestSaberFuncBM, test_pooling_shared_buffer) { - - Env::env_init(); - typedef TargetWrapper API; - typename API::event_t event; - API::create_event(event); - - typedef TargetWrapper X86_API; - typedef TargetWrapper BM_API; - typedef Tensor TensorHf4; - typedef Tensor TensorDf4; - - int img_num = 1; - int in_channels = 2; - int img_h = 8; - int img_w = 8; - - Shape img_s(img_num, in_channels, img_h, img_w); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 0x7f & i; - } - - img_dev.copy_from(img_host); - - TensorDf4 t0; - TensorDf4 t1; - Shape img_s_sub(img_num, in_channels, img_h / 2, img_w / 2); - - t0.share_sub_buffer(img_dev, img_s_sub, {0, 0, 0, 0}); - t1.share_sub_buffer(img_dev, img_s_sub, {0, 0, 4, 4}); - - TensorDf4 output_dev; - - TensorDf4 out0; - TensorDf4 out1; - - // start Reshape & doInfer - - Context ctx1(0, 1, 1); - int window_h = 2; - int window_w = 2; - int pad_h = 1; - int pad_w = 1; - int stride_h = 1; - int stride_w = 1; - - LOG(INFO) << " img_num: " << img_num; - LOG(INFO) << " in_channels: " << in_channels; - LOG(INFO) << " img_h: " << img_h; - LOG(INFO) << " img_w: " << img_w; - LOG(INFO) << " window_h: " << window_h; - LOG(INFO) << " window_w: " << window_w; - LOG(INFO) << " pad_h: " << pad_h; - LOG(INFO) << " pad_w: " << pad_w; - LOG(INFO) << " stride_h: " << stride_h; - LOG(INFO) << " stride_w: " << stride_w; - - PoolingParam param(window_h, window_w, pad_h, pad_w - , stride_h, stride_w, Pooling_max); - - std::vector input; - std::vector output; - - input.push_back(&img_dev); - 
output.push_back(&output_dev); - - Pooling pooling; - Pooling pooling0; - Pooling pooling1; - - pooling.compute_output_shape(input,output, param); - - Shape total_shape = output[0]->shape(); - - output_dev.re_alloc(total_shape); - Shape out_sub_shape = {total_shape[0], total_shape[1], total_shape[2] / 2, total_shape[3] / 2}; - - out0.share_sub_buffer(output_dev, out_sub_shape, {0, 0, 0, 0}); - out1.share_sub_buffer(output_dev, out_sub_shape, {0, 0, out_sub_shape[2], out_sub_shape[3]}); - - std::vector input0, input1; - std::vector output0, output1; - - input0.push_back(&t0); - input1.push_back(&t1); - output0.push_back(&out0); - output1.push_back(&out1); - - // init assume output tensor has been reshpaed by user. - pooling0.init(input0, output0, param, SPECIFY, VENDER_IMPL, ctx1); - pooling0(input0, output0, param, ctx1); - - pooling1.init(input1, output1, param, SPECIFY, VENDER_IMPL, ctx1); - pooling1(input1, output1, param, ctx1); - - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - out0.record_event(cuda_stream); - - cudaStream_t cuda_stream1 = ctx1.get_compute_stream(); - out1.record_event(cuda_stream1); - - out0.sync(); - out1.sync(); - - print_tensor_device(output_dev); - - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); -} - -int main(int argc, const char** argv) { - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp deleted file mode 100644 index 18479cd18..000000000 --- a/test/saber/bm/test_saber_shape_BM.cpp +++ /dev/null @@ -1,126 +0,0 @@ -#include "test_saber_shape_BM.h" -#include "shape.h" -#include "anakin_config.h" - -#ifdef USE_OPENMP -#include -#include -#endif - -using namespace anakin; -using namespace saber; - - -TEST(TestSaberShapeBM, test_saber_shape) { - - int dim = 4; - Shape sh4d0{0, 0, 0, 0}; - CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error"; - - for (int i = 0; i < dim; 
++i) { - CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error"; - } - - CHECK_EQ(sh4d0.count(), 0) << "check shape count error"; - - int N = 1; - int C = 3; - int H = 11; - int W = 11; - std::vector sh_size = {N, C, H, W}; - //Shape sh4d1(sh_size); - Shape sh4d1(N, C, H, W); - LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size(); - CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!"; - //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!"; - - CHECK_EQ(sh4d1[0], N) << "get shape size error"; - CHECK_EQ(sh4d1[1], C) << "get shape size error"; - CHECK_EQ(sh4d1[2], H) << "get shape size error"; - CHECK_EQ(sh4d1[3], W) << "get shape size error"; - - //CHECK_EQ(sh4d2[0], N) << "get shape size error"; - //CHECK_EQ(sh4d2[1], C) << "get shape size error"; - //CHECK_EQ(sh4d2[2], H) << "get shape size error"; - //CHECK_EQ(sh4d2[3], W) << "get shape size error"; - - CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed"; - - C = 10; - sh4d1[1] = C; - CHECK_EQ(sh4d1[1], C) << "set shape size error"; - - bool is_equal = (sh4d0 == sh4d1); - CHECK_EQ(is_equal, false) << "check shape is_equal failed"; - - sh4d0 = sh4d1; - CHECK_EQ(sh4d1[0], N) << "constructor failed"; - CHECK_EQ(sh4d1[1], C) << "get shape size error"; - CHECK_EQ(sh4d1[2], H) << "get shape size error"; - CHECK_EQ(sh4d1[3], W) << "get shape size error"; - - Shape sh4d3 = sh4d1; - CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error"; - - Shape sh4d4(sh4d1); - CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error"; - - Shape sh1d0{0}; - //std::vector sh1d_size = {W}; - - //Shape sh1d1(sh1d_size); - //Shape sh1d0{W}; - Shape sh1d1(W); - - Shape sh1d3 = sh1d1; - Shape sh1d4(sh1d1); - - CHECK_EQ(sh1d0.dims(), 1) << "shape dim error"; - - CHECK_EQ(sh1d0.count(), 0) << "shape size error"; - - CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error"; - - CHECK_EQ(sh1d1[0], W) << "get shape size error"; - - //CHECK_EQ(sh1d2.count(0), 
W) << "shape dim error"; - - CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error"; - - CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error"; - - CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error"; - - Shape sh0{2, 2, 3, 4}; - Shape sh1{2, 1, 1, 24}; - Shape sh2{2, 2, 3, 4}; - Shape sh3{1, 1, 2, 3}; - - CHECK_EQ(sh0 == sh2, true) << "error =="; - CHECK_EQ(sh3 < sh0, true) << "error <"; - CHECK_EQ(sh3 >= sh0, false) << "error >="; - CHECK_EQ(sh3 > sh0, false) << "error >"; - CHECK_EQ(sh0 > sh3, true) << "error >"; - CHECK_EQ(sh0 < sh1, false) << "error <"; - CHECK_EQ(sh0 <= sh2, true) << "error <="; - CHECK_EQ(sh0 >= sh2, true) << "error >="; - - Shape sh001 = Shape::zero(2); - Shape sh002 = Shape::zero(3); - - if (sh001 > sh002) { - LOG(ERROR) << "error <"; - } - -} - - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - - diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h deleted file mode 100644 index a2ca02c9b..000000000 --- a/test/saber/bm/test_saber_shape_BM.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H -#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "saber/core/shape.h" - -using namespace anakin::test; - -class TestSaberShapeBM : public Test { -public: - TestSaberShapeBM() {} - ~TestSaberShapeBM() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -protected: - std::string name; - std::string _test; -}; - -#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H - diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index ed3ff0503..69b1ccbfc 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -9,7 +9,9 @@ typedef Tensor TensorHf4; typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; +static 
bm_handle_t handle; TEST(TestSaberTensorBM, test_tensor_constructor) { + bmdnn_init(&handle); //! test empty constructor LOG(INFO) << "test default (empty) constructor"; @@ -28,13 +30,13 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { //! test tensor re_alloc function on tensor with data LOG(INFO) << "|--test tensor re_alloc function on tensor with data"; - Shape sh1(1, 2, 4, 4); + Shape sh1(2, 4, 4, 2); thost0.re_alloc(sh1); tdev0.re_alloc(sh1); LOG(INFO) << "|--tensor size of host: " << thost0.size(); LOG(INFO) << "|--tensor size of device: " << tdev0.size(); - CHECK_EQ(thost0.size(), 32) << "error with tensor size"; - CHECK_EQ(tdev0.size(), 32) << "error with tensor size"; + CHECK_EQ(thost0.size(), 64) << "error with tensor size"; + CHECK_EQ(tdev0.size(), 64) << "error with tensor size"; //! test tensor shape() function LOG(INFO) << "|--test tensor shape() function"; @@ -45,9 +47,9 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { << thost0.height() << ", width = " << thost0.width(); //! test tensor mutable_data() function - LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 1.f"; - fill_tensor_host_const(thost0, 1.f); - LOG(INFO) << "|--test tensor data() function, show the const data, 1.f"; + LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 2.f"; + fill_tensor_host_const(thost0, 2.f); + LOG(INFO) << "|--test tensor data() function, show the const data, 2.f"; print_tensor_host(thost0); //! 
test tensor constructor with shape @@ -72,7 +74,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { thost1.copy_from(tdev1); print_tensor_host(thost1); - /* + // device to device tdev1.copy_from(tdev0); @@ -98,22 +100,35 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count()); dev_data_ptr = static_cast(tmp_pt_dev); - cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); +// bm_memcpy_d2s(handle,host_data_ptr,dev_data_ptr) +// cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); LOG(INFO) << "|--construct host tensor from host data ptr"; TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); LOG(INFO) << "|--constructor device tensor from host data ptr"; - TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); + +// TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); + + TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); + print_tensor_host(thost3); - print_tensor_device(tdev3); - //cudaDeviceSynchronize(); + TensorHf4 thost_lian(sh1); + thost_lian.copy_from(tdev3); + print_tensor_host(thost_lian); + + thost_lian.copy_from(thost3); + print_tensor_host(thost_lian); + + //cudaDeviceSynchronize(); + // +/* LOG(INFO) << "|--construct host tensor from device data ptr"; TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); LOG(INFO) << "|--constructor device tensor from device data ptr"; TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); print_tensor_host(thost4); print_tensor_device(tdev4); - +/* //BM_API::stream_t dev_stream0; //BM_API::create_stream_with_flag(dev_stream0, 1); //cudaDeviceSynchronize(); @@ -203,6 +218,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "|--show root tensor while data is changed by shared tensor"; 
print_tensor_host(thost4); */ +// bmdnn_deinit(handle); } /* From 683969cd6c2b99b723e896aacada065f8330be04 Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Tue, 26 Jun 2018 19:55:32 +0800 Subject: [PATCH 069/318] Return correct size for AK_BM --- saber/core/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index af3495b1f..6824869dd 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -944,7 +944,7 @@ class Tensor : public TensorBase { template<> inline size_t Tensor::_type_len(){ - return 1; + return 4; } template<> From adcac0eef01ea6144454039b8d53d59a74fb3c17 Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Tue, 26 Jun 2018 21:12:57 +0800 Subject: [PATCH 070/318] Implement conv for BM --- saber/funcs/conv.h | 11 ++++ saber/funcs/impl/bm/vender_conv.h | 41 +++++++++++-- test/saber/bm/test_saber_func_conv_BM.cpp | 71 ++++++++++++----------- 3 files changed, 86 insertions(+), 37 deletions(-) diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h index 4e5ca762f..fd5ebc2de 100644 --- a/saber/funcs/conv.h +++ b/saber/funcs/conv.h @@ -30,6 +30,16 @@ namespace anakin { namespace saber { +#ifdef USE_BM +template +#else template +#endif class Conv : public BaseFunc< Tensor, Tensor, diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index a0a3b3fb5..778094886 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -62,10 +62,43 @@ class VenderConv2D TensorHf4; -typedef Tensor TensorDf4; +typedef Tensor TensorDf4; template void print_tensor_shape(std::string name, Tensor &t0) { @@ -33,7 +33,11 @@ void print_tensor_shape(std::string name, Tensor &t0) { << t0.offset()[3] << "]."; } - +//Round a / b to nearest higher integer value +inline int i_div_up(int a, int b) +{ + return (a % b != 0) ? 
(a / b + 1) : (a / b); +} #if 1 TEST(TestSaberFuncBM, test_depthwise_conv) { @@ -126,7 +130,7 @@ TEST(TestSaberFuncBM, test_depthwise_conv) { input.push_back(&img_dev); output.push_back(&output_dev); - Conv conv; + Conv conv; conv.compute_output_shape(input, output, param); output_dev.re_alloc(output[0]->shape()); @@ -138,10 +142,10 @@ TEST(TestSaberFuncBM, test_depthwise_conv) { conv(input, output, param, ctx1); - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); + //cudaStream_t cuda_stream = ctx1.get_compute_stream(); + //output[0]->record_event(cuda_stream); - output_dev.sync(); + //output_dev.sync(); print_tensor_device(output_dev); // param.group = 1; @@ -153,8 +157,8 @@ TEST(TestSaberFuncBM, test_depthwise_conv) { // // output_dev.sync(); // print_tensor_device(output_dev); - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); + //cudaDeviceSynchronize(); + //CUDA_CHECK(cudaPeekAtLastError()); } TEST(TestSaberFuncBM, test_conv_param_change) { @@ -247,7 +251,7 @@ TEST(TestSaberFuncBM, test_conv_param_change) { input.push_back(&img_dev); output.push_back(&output_dev); - Conv conv; + Conv conv; conv.compute_output_shape(input, output, param); output_dev.re_alloc(output[0]->shape()); @@ -258,7 +262,7 @@ TEST(TestSaberFuncBM, test_conv_param_change) { conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); conv(input, output, param, ctx1); - output_dev.sync(); + //output_dev.sync(); // print_tensor_device(output_dev); param.group = 1; @@ -268,13 +272,13 @@ TEST(TestSaberFuncBM, test_conv_param_change) { LOG(INFO)<<" param changed start with group = "<record_event(cuda_stream); + //cudaStream_t cuda_stream = ctx1.get_compute_stream(); + //output[0]->record_event(cuda_stream); - output_dev.sync(); + //output_dev.sync(); // print_tensor_device(output_dev); - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); + //cudaDeviceSynchronize(); + //CUDA_CHECK(cudaPeekAtLastError()); } 
TEST(TestSaberFuncBM, test_conv_share_sub_tensor) { @@ -392,8 +396,8 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) { // FIXME ? where do i get output shape output_dev.re_alloc(img_s); - Conv conv0; - Conv conv1; + Conv conv0; + Conv conv1; conv0.compute_output_shape(input0, output0, param0); conv1.compute_output_shape(input1, output1, param1); @@ -407,6 +411,7 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) { conv0(input0, output0, param0, ctx1); conv1(input1, output1, param1, ctx2); + /* cudaStream_t cuda_stream1 = ctx1.get_compute_stream(); output0[0]->record_event(cuda_stream1); @@ -415,13 +420,13 @@ TEST(TestSaberFuncBM, test_conv_share_sub_tensor) { out0.sync(); out1.sync(); - + */ print_tensor_device(output_dev); // print_tensor_device(output_dev); - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); + //cudaDeviceSynchronize(); + //CUDA_CHECK(cudaPeekAtLastError()); } #endif @@ -513,7 +518,7 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) { input.push_back(&img_dev); output.push_back(&output_dev); - Conv conv; + Conv conv; conv.compute_output_shape(input, output, param); output_dev.re_alloc(output[0]->shape()); @@ -539,10 +544,10 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) { LOG(INFO) << "saber conv dispatch"; conv(input, output, param, ctx1); - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); + //cudaStream_t cuda_stream = ctx1.get_compute_stream(); + //output[0]->record_event(cuda_stream); - output_dev.sync(); + //output_dev.sync(); SaberTimer t1; int ts = 1; @@ -556,8 +561,8 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) { LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms"; - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); + //cudaDeviceSynchronize(); + //CUDA_CHECK(cudaPeekAtLastError()); } void test_conv_fp32_speed(std::vector &inputs, std::vector &outputs, @@ -569,7 +574,7 @@ void test_conv_fp32_speed(std::vector &inputs, std::vector conv; 
+ Conv conv; conv.compute_output_shape(inputs, outputs, conv_param); outputs[0]->re_alloc(outputs[0]->shape()); Context ctx1(0, 1, 1); @@ -580,7 +585,7 @@ void test_conv_fp32_speed(std::vector &inputs, std::vectorrecord_event(ctx1.get_compute_stream()); outputs[0]->sync(); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); SaberTimer t1; int ts = 100; @@ -593,7 +598,7 @@ void test_conv_fp32_speed(std::vector &inputs, std::vector t1; int ts = 100; @@ -698,7 +703,7 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) { } LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); // print_tensor_device(out); // print_tensor_device(out_gemm); TensorHf4 out_host; From a4ed82ebd5095ab6f75d865c4b8ab34d3c6ad760 Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Tue, 26 Jun 2018 21:21:24 +0800 Subject: [PATCH 071/318] Comment out last conv test for now --- test/saber/bm/test_saber_func_conv_BM.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp index 025a1074c..9a25d00b3 100644 --- a/test/saber/bm/test_saber_func_conv_BM.cpp +++ b/test/saber/bm/test_saber_func_conv_BM.cpp @@ -601,7 +601,7 @@ void test_conv_fp32_speed(std::vector &inputs, std::vector Date: Tue, 26 Jun 2018 13:42:52 +0000 Subject: [PATCH 072/318] Modify sync_memcpy & add bm_mem_from_device --- saber/core/impl/bm/bm_impl.cpp | 16 ++++++++++------ saber/core/target_wrapper.h | 2 +- .../impl/bm/base/include/bmlib/bmlib_runtime.h | 3 +++ test/saber/bm/test_saber_buffer_BM.cpp | 10 ++++++---- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 60e52088e..ef26884b2 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -81,16 +81,20 @@ void BM_API::mem_set(void* ptr, int value, size_t n){ //static void sync_memcpy(void* dst, int 
dst_id, const void* src, int src_id, \ // size_t count, __DtoD) {}; -//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ -// size_t count, __HtoD) {}; +void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __HtoD) { + handle = get_bm_handle(); + BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), bm_mem_from_system(src))); + LOG(INFO) << "BM sync_memcpy: host to device, finished"; +}; void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __DtoH) { handle = get_bm_handle(); - //auto* dev_ptr = const_cast(src); - BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); - //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *src)); - LOG(INFO) << "End sync_memcpy process"; + BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), bm_mem_from_device(src))); + //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); + //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(reinterpret_cast(src)))); + LOG(INFO) << "BM sync_memcpy: device to host, finished"; }; //static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 6e6f67b55..925f4dd39 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -401,7 +401,7 @@ struct TargetWrapper { size_t count, __DtoD) {}; static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __HtoD) {}; + size_t count, __HtoD); static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __DtoH); diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h index 932b17138..7d537401c 100644 --- a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h +++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h @@ 
-148,6 +148,9 @@ bm_status_t bm_memset_device( bm_device_mem_t bm_mem_from_system( void * system_addr); +bm_device_mem_t bm_mem_from_device( + void * device_addr); + /* *brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to device mem if need_copy is true diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index dce1fae15..555e22675 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -5,7 +5,7 @@ using namespace anakin::saber; int get_bm_size() { - return 1; + return 4; } template @@ -27,7 +27,7 @@ void test_buffer() { x86_ptr = static_cast(tmp_x86); for (int i = 0; i < n0; i++) { - x86_ptr[i] = static_cast(i); + x86_ptr[i] = static_cast(100); } void* tmp_bm; @@ -105,6 +105,7 @@ void test_buffer() { for (int i = 0; i < 10; i++) { std::cout << "x86: " << x86_buf2_ptr[i] << std::endl; } + */ const Hdtype* bm_buf1_ptr = static_cast(bm_buf1.get_data()); for (int i = 0; i < 10; i++) { @@ -115,16 +116,17 @@ void test_buffer() { LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype); LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype); - */ + x86_buf1.re_alloc(bm_buf1.get_capacity()); x86_buf1.sync_copy_from(bm_buf1); LOG(INFO) << "deep copy from device buffer to host buffer: "; ptr1 = static_cast(x86_buf1.get_data()); - for (int i = 0; i < 30; i++) { + for (int i = 0; i < 10; i++) { std::cout << ptr1[i] << std::endl; } + } TEST(TestSaberBufferBM, test_buffer_memcpy) { From 19b5ace798f5c49b26868d7947783fd647ab971a Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 09:39:38 +0800 Subject: [PATCH 073/318] Update BM conv params --- saber/funcs/impl/bm/vender_conv.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 778094886..530eef528 100644 
--- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -36,6 +36,8 @@ class VenderConv2D& inputs, std::vector& outputs, ConvParam& param, Context& ctx) { + + _handle = get_bm_handle(); return create(inputs, outputs, param, ctx); } @@ -50,18 +52,26 @@ class VenderConv2Ddata(); const InDataType *bias = (const InDataType *) param.bias()->data(); OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + int input_n = inputs[0]->num(); int input_c = inputs[0]->channel(); int input_h = inputs[0]->height(); int input_w = inputs[0]->width(); - int group = param.group; + + int output_n = outputs[0]->num(); int output_c = outputs[0]->channel(); + int output_h = outputs[0]->height(); + int output_w = outputs[0]->width(); + + int group = param.group; int kh = param.weight()->height(); int kw = param.weight()->width(); int pad_h = param.pad_h; int pad_w = param.pad_w; int stride_h = param.stride_h; int stride_w = param.stride_w; + int dilation_h = param.dilation_h; + int dilation_w = param.dilation_w; bm_tensor_4d_t input_shape = { input_n, @@ -71,10 +81,10 @@ class VenderConv2D Date: Wed, 27 Jun 2018 09:41:52 +0800 Subject: [PATCH 074/318] Init handle in init function --- saber/funcs/impl/bm/vender_pooling.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h index 108a70708..6e5de79a4 100644 --- a/saber/funcs/impl/bm/vender_pooling.h +++ b/saber/funcs/impl/bm/vender_pooling.h @@ -35,6 +35,8 @@ class VenderPooling& inputs, std::vector& outputs, PoolingParam &pooling_param, Context &ctx) { + + _handle = get_bm_handle(); return create(inputs, outputs, pooling_param, ctx); } @@ -64,7 +66,7 @@ class VenderPooling Date: Wed, 27 Jun 2018 10:07:03 +0800 Subject: [PATCH 075/318] Include BM conv implementation --- saber/funcs/conv.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h index fd5ebc2de..596964dbe 
100644 --- a/saber/funcs/conv.h +++ b/saber/funcs/conv.h @@ -27,6 +27,10 @@ #include "saber/funcs/impl/impl_conv.h" #endif +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_conv.h" +#endif + namespace anakin { namespace saber { From e1c82c4557089fd6d0985c5e4cfb2148d0bb88cb Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 10:12:18 +0800 Subject: [PATCH 076/318] remove unecessary include --- saber/funcs/impl/bm/vender_conv.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 530eef528..924bf736c 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -1,8 +1,7 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H -#include "saber/funcs/impl/impl_conv.h" -#include "saber/funcs/impl/bm/bmdnn_api.h" +#include "saber/funcs/impl/impl_conv.h" namespace anakin{ From 6905020377a0f1f9337be76dcdf7f5b296faad67 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 10:26:09 +0800 Subject: [PATCH 077/318] empty create function --- saber/funcs/impl/bm/vender_conv.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 924bf736c..14e52af8e 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -42,7 +42,9 @@ class VenderConv2D& inputs, std::vector& outputs, - ConvParam& param, Context& ctx); + ConvParam& param, Context& ctx) { + + } virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, From 59dba0558133103131df77ccf4acca4f901c582b Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 11:18:00 +0800 Subject: [PATCH 078/318] unit test for BM conv --- saber/funcs/impl/bm/vender_conv.h | 6 +- test/saber/bm/test_saber_func_conv_BM.cpp | 88 ++--------------------- 2 files changed, 8 insertions(+), 86 deletions(-) diff --git 
a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 14e52af8e..220b8a14e 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -1,7 +1,7 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H -#include "saber/funcs/impl/impl_conv.h" +#include "saber/funcs/impl/impl_conv.h" namespace anakin{ @@ -74,6 +74,8 @@ class VenderConv2Dsize() > 0; + bm_tensor_4d_t input_shape = { input_n, input_c, @@ -107,7 +109,7 @@ class VenderConv2D &inputs, std::vector &outputs, @@ -601,23 +573,6 @@ void test_conv_fp32_speed(std::vector &inputs, std::vector ctx1(0, 1, 1); - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream())); - TensorDf4 weights; weights.re_alloc({out_channels, in_channels, 1, 1}); @@ -684,40 +636,8 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) { weights, kernel, stride, pad, in_channels, out_channels, bias, SABER_IMPL); - //cudaDeviceSynchronize(); - caffe_gemm(out_channels, img_h * img_w, in_channels,\ - 1.f, weights.data(),\ - img.data(), 0.f, out_gemm.mutable_data()); - //cudaDeviceSynchronize(); - SaberTimer t1; - int ts = 100; - - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - caffe_gemm(out_channels, img_h * img_w, in_channels,\ - 1.f, weights.data(),\ - img.data(), 0.f, out_gemm.mutable_data()); - out_gemm.record_event(ctx1.get_compute_stream()); - out_gemm.sync(); - t1.end(ctx1); - } - LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; - - //cudaDeviceSynchronize(); -// print_tensor_device(out); -// print_tensor_device(out_gemm); - TensorHf4 out_host; - TensorHf4 out_gemm_host; - out_host.re_alloc(out.shape()); - out_host.copy_from(out); - - out_gemm_host.re_alloc(out_gemm.shape()); - out_gemm_host.copy_from(out_gemm); - double max_r, max_d; - tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d); - LOG(INFO) << "cmp result: max_r = " << max_r 
<< " max_d = " << max_d; } -*/ + int main(int argc, const char** argv){ anakin::saber::Env::env_init(); From c27573a42bc43a670a347a2fcefe36ac752791cd Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 11:26:06 +0800 Subject: [PATCH 079/318] Update BM tensor print function --- saber/core/tensor_op.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 841c9c208..3d6494b9d 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -440,7 +440,7 @@ void print_tensor_device>(Tensor& tenso for (int i = 0; i < tensor.size(); ++i) { printf("%.2f ", host_mem[i]); - if ((i + 1) % (4 * tensor.width()) == 0) { + if ((i + 1) % tensor.width() == 0){ printf("\n"); } } From 679ae3fca424df5a7e92f2b4138616b062c13d50 Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Wed, 27 Jun 2018 05:17:48 +0000 Subject: [PATCH 080/318] modify activation op, test pass --- saber/funcs/activation.h | 17 ++++++++++++- saber/funcs/impl/bm/vender_activation.h | 15 ++++++------ saber/saber_funcs_param.h | 24 +++++++++++++++++++ .../bm/test_saber_func_activation_BM.cpp | 13 ++++++---- test/saber/bm/test_saber_func_pooling_BM.cpp | 2 +- 5 files changed, 56 insertions(+), 15 deletions(-) diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h index f39747a27..e1167bc9a 100644 --- a/saber/funcs/activation.h +++ b/saber/funcs/activation.h @@ -29,9 +29,23 @@ #include "saber/funcs/impl/x86/saber_activation.h" #endif +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_activation.h" +#endif + namespace anakin { namespace saber { +#ifdef USE_BM +template +#else template +#endif class Activation : public BaseFunc< Tensor, Tensor, @@ -110,4 +125,4 @@ class Activation : public BaseFunc< } // namespace saber } // namespace anakin -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h index c4baf8365..ec27ac054 100644 --- 
a/saber/funcs/impl/bm/vender_activation.h +++ b/saber/funcs/impl/bm/vender_activation.h @@ -27,7 +27,7 @@ class VenderActivation& outputs, ActivationParam& param, Context& ctx) { // not sure + _handle = get_bm_handle(); return create(inputs, outputs, param, ctx); } @@ -49,14 +50,15 @@ class VenderActivation& inputs, std::vector& outputs, ActivationParam& param) { - const InDataType *in_data = (const InDataType *) inputs[0]->data(); - OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + const InDataType in_data = *(inputs[0]->data()); + OutDataType out_data = *(outputs[0]->mutable_data()); int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width(); int input_n = inputs[0]->num(); + _active_type = param.active; switch (_active_type) { case Active_relu: - BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, input_n, input_dim, out_data)); + BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, 0.0, input_n, input_dim, out_data)); break; case Active_sigmoid: BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data)); @@ -64,9 +66,6 @@ class VenderActivation; +template class VenderActivation; } // namespace saber } // namespace anakin diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index 6a109540e..284fbcbc5 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -18,6 +18,7 @@ #include "anakin_config.h" #include #include +#include #include "saber/core/shape.h" #include "saber/core/tensor.h" #include "saber/saber_types.h" @@ -858,6 +859,29 @@ struct ActivationParam { DataDtype negative_slope; DataDtype coef; }; + +#ifdef USE_BM +template <> +struct ActivationParam > { + ActivationParam(): active(Active_unknow) {} + ActivationParam(ActiveType act): active(act) {} + ActivationParam(const ActivationParam &right): active(right.active) {} + ActivationParam &operator=(const ActivationParam &right) { + active = right.active; + return *this; + } + bool operator==(const ActivationParam 
&right) { + bool comp_eq = true; + comp_eq = comp_eq && (active == right.active); + return comp_eq; + } + bool has_negative_slope(){ + return (active == Active_relu); + } + ActiveType active; +}; +#endif + template struct ScaleParam { typedef typename opTensor::Dtype DataDtype; diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp index 523e94121..42f33e58d 100644 --- a/test/saber/bm/test_saber_func_activation_BM.cpp +++ b/test/saber/bm/test_saber_func_activation_BM.cpp @@ -32,10 +32,10 @@ void print_tensor_shape(std::string name, Tensor& t0) { TEST(TestSaberFuncBM, test_func_constructor) { typedef Tensor TensorHf4; - typedef Tensor TensorDf4; + typedef Tensor TensorDf4; int img_num = 1; - int in_channels = 2; + int in_channels = 1; int img_h = 8; int img_w = 8; @@ -47,18 +47,21 @@ TEST(TestSaberFuncBM, test_func_constructor) { img_host.re_alloc(img_s); img_dev.re_alloc(img_s); + int sign = -1; for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1); + sign = i % 2 ? 
-1 : 1; + img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * sign); } img_dev.copy_from(img_host); TensorDf4 output_dev; + print_tensor_device(img_dev); // start Reshape & doInfer Context ctx1(0, 1, 1); - ActivationParam param(Active_relu, 0.1f, 0.1f); + ActivationParam param(Active_relu); std::vector input; std::vector output; @@ -66,7 +69,7 @@ TEST(TestSaberFuncBM, test_func_constructor) { input.push_back(&img_dev); output.push_back(&output_dev); - Activation act; + Activation act; act.compute_output_shape(input, output, param); output_dev.re_alloc(output[0]->shape()); diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index e988bc573..fb1a7398d 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -80,7 +80,7 @@ TEST(TestSaberFuncBM, test_func_pooling) { pooling(input, output, param, ctx1); SaberTimer t1; - int ts = 1000; + int ts = 100; for (int i = 0; i < ts; ++i) { t1.start(ctx1); From 1ab43e0aae2eb7dc532166b7948f3fa717418380 Mon Sep 17 00:00:00 2001 From: hlzy <327842846@qq.com> Date: Wed, 27 Jun 2018 01:28:34 -0400 Subject: [PATCH 081/318] tensor_test --- test/saber/bm/test_saber_tensor_BM.cpp | 49 ++++++++++++++------------ 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 69b1ccbfc..de787908b 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -8,6 +8,8 @@ typedef TargetWrapper BM_API; typedef Tensor TensorHf4; typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; +typedef TensorDf4::Dtype dtype2; + static bm_handle_t handle; TEST(TestSaberTensorBM, test_tensor_constructor) { @@ -47,7 +49,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { << thost0.height() << ", width = " << thost0.width(); //! 
test tensor mutable_data() function - LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 2.f"; + LOG(INFO) << "|--xxxxxxxxtest tensor mutable_data() function, write tensor data buffer with 2.f"; fill_tensor_host_const(thost0, 2.f); LOG(INFO) << "|--test tensor data() function, show the const data, 2.f"; print_tensor_host(thost0); @@ -88,7 +90,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "test tensor constructor with data, if target is different, create buffer, and copy the data"; dtype* host_data_ptr; - dtype* dev_data_ptr; +// dtype2* dev_data_ptr; void* tmp_pt_host; void* tmp_pt_dev; X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count()); @@ -98,26 +100,28 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { host_data_ptr[i] = i; } - BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count()); - dev_data_ptr = static_cast(tmp_pt_dev); -// bm_memcpy_d2s(handle,host_data_ptr,dev_data_ptr) -// cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); + BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype2) * sh1.count()); +// dev_data_ptr = static_cast(tmp_pt_dev); +// bm_memcpy_d2s(handle,*dev_data_ptr,bm_mem_from_system(const_cast(host_data_ptr))); + +//--- cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); + LOG(INFO) << "|--construct host tensor from host data ptr"; TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); LOG(INFO) << "|--constructor device tensor from host data ptr"; -// TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); - TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); print_tensor_host(thost3); - TensorHf4 thost_lian(sh1); - thost_lian.copy_from(tdev3); - print_tensor_host(thost_lian); + print_tensor_device(tdev3); - thost_lian.copy_from(thost3); - print_tensor_host(thost_lian); +// 
TensorHf4 thost_lian(sh1); +// thost_lian.copy_from(tdev3); +// print_tensor_host(thost_lian); +// +// thost_lian.copy_from(thost3); +// print_tensor_host(thost_lian); //cudaDeviceSynchronize(); // @@ -128,16 +132,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); print_tensor_host(thost4); print_tensor_device(tdev4); -/* +*/ + //BM_API::stream_t dev_stream0; //BM_API::create_stream_with_flag(dev_stream0, 1); //cudaDeviceSynchronize(); - +/* //! test tensor copy constructor LOG(INFO) << "test tensor copy constructor"; LOG(INFO) << "|--normal copy constructor"; - TensorHf4 thost5(thost4); - TensorDf4 tdev5(tdev4); +// TensorHf4 thost5(thost4); +// TensorDf4 tdev5(tdev4); LOG(INFO) << "|--push back to vector"; std::vector vthost; @@ -146,18 +151,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { vthost.push_back(thost1); vthost.push_back(thost2); vthost.push_back(thost3); - vthost.push_back(thost4); - vthost.push_back(thost5); +// vthost.push_back(thost4); +// vthost.push_back(thost5); vtdev.push_back(tdev0); vtdev.push_back(tdev1); vtdev.push_back(tdev2); vtdev.push_back(tdev3); - vtdev.push_back(tdev4); - vtdev.push_back(tdev5); +// vtdev.push_back(tdev4); +// vtdev.push_back(tdev5); print_tensor_host(vthost[5]); print_tensor_device(vtdev[5]); //cudaDeviceSynchronize(); - +/* //! 
test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied LOG(INFO) << "test share_from function"; TensorHf4 thost6, thost7; From 80f57fb390b27fac834b6ac5e4ec5f1971e9d612 Mon Sep 17 00:00:00 2001 From: "weihao.huang" Date: Wed, 27 Jun 2018 06:14:17 +0000 Subject: [PATCH 082/318] Fix sync_memcpy functions & test_saber_buffer_BM all passes --- saber/core/impl/bm/bm_impl.cpp | 28 ++++++++++++++++++-------- saber/core/target_wrapper.h | 4 ++-- test/saber/bm/test_saber_buffer_BM.cpp | 24 ++++++---------------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index ef26884b2..a50994a60 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -78,27 +78,39 @@ void BM_API::mem_set(void* ptr, int value, size_t n){ //BMDNN_CHECK(bm_memset_device(handle, value, *pmem)); } -//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ -// size_t count, __DtoD) {}; +void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoD) { + handle = get_bm_handle(); + //BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count)); + BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count)); + LOG(INFO) << "BM sync_memcpy: device to device, finished"; +}; void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __HtoD) { handle = get_bm_handle(); - BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), bm_mem_from_system(src))); + BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src))); + for(int i=0; i<10; i++) + std::cout << "HtoD src: " << *((float *)(src)+i) << std::endl; + LOG(INFO) << "BM sync_memcpy: host to device, finished"; }; void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ 
size_t count, __DtoH) { handle = get_bm_handle(); - BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), bm_mem_from_device(src))); - //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); - //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(reinterpret_cast(src)))); + BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); + for(int i=0; i<10; i++) + std::cout << "DtoH dst: " << *((float *)(dst)+i) << std::endl; + LOG(INFO) << "BM sync_memcpy: device to host, finished"; }; -//static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ -// int src_dev, size_t count) {}; +void BM_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ + int src_dev, size_t count) { + + LOG(INFO) << "BM sync_memcpy_p2p: temporarily no used"; +}; //! target wrapper diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 925f4dd39..d87b2ae03 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -398,7 +398,7 @@ struct TargetWrapper { // brief create event, empty function for bitmain target static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoD) {}; + size_t count, __DtoD); static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __HtoD); @@ -407,7 +407,7 @@ struct TargetWrapper { size_t count, __DtoH); static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count) {}; + int src_dev, size_t count); /** * \brief device target return currently used device id diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index 555e22675..f8c8f46bb 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -27,7 +27,7 @@ void test_buffer() { x86_ptr = static_cast(tmp_x86); for (int i = 0; i < n0; i++) { - x86_ptr[i] = static_cast(100); + x86_ptr[i] = 
static_cast(i); } void* tmp_bm; @@ -97,25 +97,13 @@ void test_buffer() { } CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect"; + bm_buf1.sync_copy_from(x86_buf2); LOG(INFO) << "deep copy from host buffer to device buffer"; - bm_buf1.sync_copy_from(x86_buf2); - - /* - const Hdtype* x86_buf2_ptr = static_cast(x86_buf2.get_data()); - for (int i = 0; i < 10; i++) { - std::cout << "x86: " << x86_buf2_ptr[i] << std::endl; - } - */ - - const Hdtype* bm_buf1_ptr = static_cast(bm_buf1.get_data()); - for (int i = 0; i < 10; i++) { - std::cout << "bm: " << bm_buf1_ptr[i] << std::endl; - } - LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count(); - LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); - LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype); - LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype); + //LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count(); + //LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); + //LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype); + //LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype); x86_buf1.re_alloc(bm_buf1.get_capacity()); From a1bd3fdcbaf82268a6f60a56cdbebd34a17ffa11 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 14:22:38 +0800 Subject: [PATCH 083/318] Implement BM softmax --- saber/funcs/impl/bm/vender_softmax.h | 106 ++++++++++ saber/funcs/softmax.h | 15 ++ test/saber/bm/test_saber_func_softmax_BM.cpp | 194 ++++++++++++++++++ test/saber/bm/test_saber_func_softmax_BM.h | 21 ++ .../saber/x86/test_saber_func_softmax_x86.cpp | 2 +- 5 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 saber/funcs/impl/bm/vender_softmax.h create mode 100644 test/saber/bm/test_saber_func_softmax_BM.cpp create mode 100644 test/saber/bm/test_saber_func_softmax_BM.h diff --git a/saber/funcs/impl/bm/vender_softmax.h 
b/saber/funcs/impl/bm/vender_softmax.h new file mode 100644 index 000000000..fb2595e87 --- /dev/null +++ b/saber/funcs/impl/bm/vender_softmax.h @@ -0,0 +1,106 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H +#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H + +#include "saber/funcs/impl/impl_softmax.h" +#include "saber/saber_funcs_param.h" +#include "saber/saber_types.h" + +namespace anakin{ + +namespace saber{ + +template +class VenderSoftmax : \ + public ImplBase< + Tensor, + Tensor, + Tensor, + SoftmaxParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + VenderSoftmax(): _handle(NULL) {} + ~VenderSoftmax() {} + + /** + * \brief initial all bmdnn resources here + * @param inputs + * @param outputs + * @param param + * @param ctx + */ + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + SoftmaxParam& param, Context& ctx) { + + _handle = get_bm_handle(); + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + SoftmaxParam& param, Context &ctx) { + + } + + //call cudnnConvolutionForward here + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + SoftmaxParam ¶m){ + + const InDataType *in_data = (const InDataType *) inputs[0]->data(); + OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + + int input_n = inputs[0]->num(); + int input_c = inputs[0]->channel(); + int input_h = inputs[0]->height(); + int input_w = inputs[0]->width(); + + /* + int outer_num = inputs[0]->count(0, param.axis); + int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims()); + + int N = outer_num; + int K = inputs[0]->valid_shape()[param.axis]; + int H = inner_num; + int W = 1; + + const int stride_w = 1; + const int 
stride_h = W * stride_w; + const int stride_c = H * stride_h; + const int stride_n = K * stride_c; + */ + + bmdnn_softmax_forward( + _handle, + *in_data, + input_n, + input_c, + input_h * input_w, + *out_data + ); + + return SaberSuccess; + } + +private: + bm_handle_t _handle; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H diff --git a/saber/funcs/softmax.h b/saber/funcs/softmax.h index 3fa5d850e..4a1e631f0 100644 --- a/saber/funcs/softmax.h +++ b/saber/funcs/softmax.h @@ -27,10 +27,24 @@ #include "saber/funcs/impl/x86/saber_softmax.h" #endif +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_softmax.h" +#endif + namespace anakin{ namespace saber{ +#ifdef USE_BM +template +#else template +#endif class Softmax : public BaseFunc< Tensor, Tensor, diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp new file mode 100644 index 000000000..2da0d2e62 --- /dev/null +++ b/test/saber/bm/test_saber_func_softmax_BM.cpp @@ -0,0 +1,194 @@ +#include "core/context.h" +#include "funcs/softmax.h" +#include "test_saber_func_softmax_BM.h" +#include "tensor_op.h" +#include "saber_types.h" +#include + +using namespace anakin::saber; + +TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { + + Env::env_init(); + typedef TargetWrapper API; + + typedef Tensor TensorDf4; + + typedef TensorDf4::Dtype dtype; + + int test_iter = 1000; + + int softmax_axis = 3; // channel + int w_in = 3; + int h_in = 225; + int ch_in = 40; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_out = shape_in; + + SoftmaxParam param(softmax_axis); + + LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ + ch_in << ", height=" << h_in << ", width=" << w_in; + + LOG(INFO) << "softmax axis= " << param.axis; + + std::vector input_dev_4d; + std::vector output_dev_4d; + + Tensor thin(shape_in); + + for (int i = 0; i < thin.size(); ++i) { + thin.mutable_data()[i] = i % 4; 
+ } + + TensorDf4 tdin, tdout; + tdin.re_alloc(shape_in); + tdin.copy_from(thin); + input_dev_4d.push_back(&tdin); + + // start Reshape & doInfer + Context ctx_dev(0, 1, 1); + + Softmax softmax_dev; + + typedef std::vector Shape_v; + + LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ + shape_out[2] << ", " << shape_out[3]; + + output_dev_4d.push_back(&tdout); + softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + LOG(INFO) << "re-alloc tensor buffer"; + output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "softmax initialized to cudnn impl"; + softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + LOG(INFO) << "cudnn softmax compute"; + SaberTimer t1; + t1.clear(); + t1.start(ctx_dev); + + for (int i = 0; i < test_iter; ++i) { + softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + } + + t1.end(ctx_dev); + float ts = t1.get_average_ms(); + printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter); + + LOG(INFO) << "softmax initialized to saber impl"; + softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev); + + LOG(INFO) << "saber softmax compute"; + t1.clear(); + t1.start(ctx_dev); + + for (int i = 0; i < test_iter; ++i) { + softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + } + + t1.end(ctx_dev); + ts = t1.get_average_ms(); + printf("saber softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter); + //print_tensor_device(*output_dev_4d[0]); +} + +TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) { + + Env::env_init(); + typedef TargetWrapper API; + + typedef Tensor TensorDf4; + + typedef TensorDf4::Dtype dtype; + + int test_iter = 1; + + int softmax_axis = 3; // channel + int w_in = 3; + int h_in = 
10; + int ch_in = 10; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_in_roi{num_in, ch_in / 2, h_in / 2, w_in}; + Shape shape_out = shape_in_roi; + + SoftmaxParam param(softmax_axis); + + LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ + ch_in << ", height=" << h_in << ", width=" << w_in; + + LOG(INFO) << "softmax axis= " << param.axis; + + std::vector input_dev_4d; + std::vector output_dev_4d; + + Tensor thin(shape_in); + + for (int i = 0; i < thin.size(); ++i) { + thin.mutable_data()[i] = (i % 3); + } + + TensorDf4 tdin, tdin_roi, tdout, tdout_roi; + tdin.re_alloc(shape_in); + tdout.re_alloc(shape_in); + tdin.copy_from(thin); + tdin_roi.share_sub_buffer(tdin, shape_in_roi, Shape(0, 0, 0, 0)); + input_dev_4d.push_back(&tdin_roi); + output_dev_4d.push_back(&tdout_roi); + + // start Reshape & doInfer + Context ctx_dev(0, 1, 1); + + Softmax softmax_dev; + + LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ + shape_out[2] << ", " << shape_out[3]; + + softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + LOG(INFO) << "re-alloc tensor buffer"; + output_dev_4d[0]->share_sub_buffer(tdout, shape_in_roi, Shape(0, 0, 0, 0)); + //output_dev_4d[0]->reshape(output_dev_4d[0]->valid_shape()); + + LOG(INFO) << "softmax initialization"; + softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev); + + LOG(INFO) << "softmax compute"; + SaberTimer t1; + t1.clear(); + t1.start(ctx_dev); + + for (int i = 0; i < test_iter; ++i) { + softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + } + + t1.end(ctx_dev); + float ts = t1.get_average_ms(); + printf("total time : %.4f, avg time : %.4f\n", ts, ts / test_iter); + print_tensor_device(*output_dev_4d[0]); + + TensorDf4 troi(output_dev_4d[0]->valid_shape()); + troi.copy_from(*output_dev_4d[0]); + 
print_tensor_device(troi); +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_func_softmax_BM.h b/test/saber/bm/test_saber_func_softmax_BM.h new file mode 100644 index 000000000..d5c5b6986 --- /dev/null +++ b/test/saber/bm/test_saber_func_softmax_BM.h @@ -0,0 +1,21 @@ +#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H +#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "core/tensor.h" + +using namespace anakin::test; + +class TestSaberFuncSoftmaxBM : public Test { +public: + TestSaberFuncSoftmaxBM() {} + ~TestSaberFuncSoftmaxBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +}; + +#endif //ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H diff --git a/test/saber/x86/test_saber_func_softmax_x86.cpp b/test/saber/x86/test_saber_func_softmax_x86.cpp index c4942c302..179806e02 100644 --- a/test/saber/x86/test_saber_func_softmax_x86.cpp +++ b/test/saber/x86/test_saber_func_softmax_x86.cpp @@ -63,7 +63,7 @@ void test(int num, int channel) { dst_saber.re_alloc(shape_out); output_softmax.push_back(&dst_saber); - Softmax op_softmax; + Softmax op_softmax; SoftmaxParam smx_pm; op_softmax.init(input_softmax, output_softmax, smx_pm, SPECIFY, SABER_IMPL, ctx_host); From 7c0a0f0118475617a60995314370759dfeea032c Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 14:53:56 +0800 Subject: [PATCH 084/318] only print in DEBUG --- saber/core/impl/bm/bm_impl.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index a50994a60..4d24dedf0 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -90,8 +90,11 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __HtoD) 
{ handle = get_bm_handle(); BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src))); + + #ifdef DEBUG for(int i=0; i<10; i++) - std::cout << "HtoD src: " << *((float *)(src)+i) << std::endl; + LOG(INFO) << "HtoD src: " << *((float *)(src)+i); + #endif LOG(INFO) << "BM sync_memcpy: host to device, finished"; }; @@ -100,8 +103,11 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __DtoH) { handle = get_bm_handle(); BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); + + #ifdef DEBUG for(int i=0; i<10; i++) - std::cout << "DtoH dst: " << *((float *)(dst)+i) << std::endl; + LOG(INFO) << "DtoH dst: " << *((float *)(dst)+i); + #endif LOG(INFO) << "BM sync_memcpy: device to host, finished"; }; From 635ff4260496f98657440461c7f251c2b6a4c907 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 15:05:37 +0800 Subject: [PATCH 085/318] reduce iteration --- test/saber/bm/test_saber_func_softmax_BM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp index 2da0d2e62..8176a9e51 100644 --- a/test/saber/bm/test_saber_func_softmax_BM.cpp +++ b/test/saber/bm/test_saber_func_softmax_BM.cpp @@ -16,7 +16,7 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { typedef TensorDf4::Dtype dtype; - int test_iter = 1000; + int test_iter = 10; int softmax_axis = 3; // channel int w_in = 3; From 4a9863f59da04a26ef151208ec84bc31a1386d8e Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 15:11:57 +0800 Subject: [PATCH 086/318] Revert "reduce iteration" This reverts commit 635ff4260496f98657440461c7f251c2b6a4c907. 
--- test/saber/bm/test_saber_func_softmax_BM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp index 8176a9e51..2da0d2e62 100644 --- a/test/saber/bm/test_saber_func_softmax_BM.cpp +++ b/test/saber/bm/test_saber_func_softmax_BM.cpp @@ -16,7 +16,7 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { typedef TensorDf4::Dtype dtype; - int test_iter = 10; + int test_iter = 1000; int softmax_axis = 3; // channel int w_in = 3; From 2997faf062e8ef4bf6310c425ab369059fec335d Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Wed, 27 Jun 2018 08:19:32 +0000 Subject: [PATCH 087/318] modify fc op, compile error --- saber/funcs/fc.h | 18 ++++++++++++++++++ saber/funcs/impl/bm/vender_fc.h | 12 ++++++------ test/saber/bm/test_saber_func_fc_BM.cpp | 4 ++-- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/saber/funcs/fc.h b/saber/funcs/fc.h index 06dc8695a..035d85934 100644 --- a/saber/funcs/fc.h +++ b/saber/funcs/fc.h @@ -27,10 +27,24 @@ #include "saber/funcs/impl/x86/vender_fc.h" #endif +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_fc.h" +#endif + namespace anakin{ namespace saber{ +#ifdef USE_BM +template +#else template +#endif class Fc : public BaseFunc< Tensor, Tensor, @@ -125,6 +140,9 @@ class Fc : public BaseFunc< #endif #ifdef USE_X86_PLACE this->_best_impl = this->_impl[0]; +#endif +#ifdef USE_BM + this->_best_impl = this->_impl[0]; #endif } diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h index 82dd6000c..5004ad349 100644 --- a/saber/funcs/impl/bm/vender_fc.h +++ b/saber/funcs/impl/bm/vender_fc.h @@ -1,6 +1,5 @@ #ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H #define ANAKIN_SABER_FUNCS_BMDNN_FC_H - #include "saber/funcs/impl/impl_fc.h" namespace anakin{ @@ -34,6 +33,7 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param, Context& ctx){ + _handle = get_bm_handle(); return create(inputs, outputs, param, ctx); } 
@@ -46,10 +46,10 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param){ - const InDataType *in_data = (const InDataType *) inputs[0]->data(); - const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data(); - const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data(); - OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + const InDataType in_data = *(inputs[0]->data()); + const InDataType weights = *(InDataType*)(param.weights->get_buf()->get_data()); + const InDataType bias = *(InDataType*)(param.bias->get_buf()->get_data()); + OutDataType out_data = *(outputs[0]->mutable_data()); int batch_size = inputs[0]->num(); int input_len = inputs[0]->channel(); int output_len = param.num_output; @@ -64,7 +64,7 @@ class VenderFc; +template class VenderFc; } //namespace saber } //namespace anakin diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp index 869ff1bfd..5acbc453e 100644 --- a/test/saber/bm/test_saber_func_fc_BM.cpp +++ b/test/saber/bm/test_saber_func_fc_BM.cpp @@ -7,7 +7,7 @@ using namespace anakin::saber; typedef TargetWrapper API; -typedef Tensor TensorDf4; +typedef Tensor TensorDf4; typedef Tensor TensorHf4; typedef TensorDf4::Dtype ftype; @@ -80,7 +80,7 @@ TEST(TestSaberFuncBM, test_func_fc) { FcParam param(&weight, &bias, num_out, axis); - Fc fc; + Fc fc; LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ shape_out[2] << ", " << shape_out[3]; From ff5039ff63bb89a32f57c48a14ef0a5e8e0061c7 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 16:53:20 +0800 Subject: [PATCH 088/318] Update for BM softmax --- saber/funcs/impl/bm/vender_softmax.h | 14 +++++++----- test/saber/bm/test_saber_func_softmax_BM.cpp | 23 ++++++++++---------- test/saber/bm/test_saber_func_softmax_BM.h | 21 ------------------ 3 files changed, 20 insertions(+), 38 deletions(-) delete mode 100644 
test/saber/bm/test_saber_func_softmax_BM.h diff --git a/saber/funcs/impl/bm/vender_softmax.h b/saber/funcs/impl/bm/vender_softmax.h index fb2595e87..55612f66a 100644 --- a/saber/funcs/impl/bm/vender_softmax.h +++ b/saber/funcs/impl/bm/vender_softmax.h @@ -63,12 +63,13 @@ class VenderSoftmaxdata(); OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + /* int input_n = inputs[0]->num(); int input_c = inputs[0]->channel(); int input_h = inputs[0]->height(); int input_w = inputs[0]->width(); + */ - /* int outer_num = inputs[0]->count(0, param.axis); int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims()); @@ -77,18 +78,19 @@ class VenderSoftmax using namespace anakin::saber; -TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { - Env::env_init(); +TEST(TestSaberFuncBM, test_func_softmax_BM) { + + //Env::env_init(); typedef TargetWrapper API; typedef Tensor TensorDf4; @@ -74,8 +75,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { for (int i = 0; i < test_iter; ++i) { softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); + //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + //output_dev_4d[0]->sync(); } t1.end(ctx_dev); @@ -91,8 +92,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { for (int i = 0; i < test_iter; ++i) { softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); + //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + //output_dev_4d[0]->sync(); } t1.end(ctx_dev); @@ -101,9 +102,9 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { //print_tensor_device(*output_dev_4d[0]); } -TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) { +TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) { - Env::env_init(); + //Env::env_init(); typedef TargetWrapper API; typedef Tensor TensorDf4; @@ -170,8 +171,8 @@ 
TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) { for (int i = 0; i < test_iter; ++i) { softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); + //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + //output_dev_4d[0]->sync(); } t1.end(ctx_dev); diff --git a/test/saber/bm/test_saber_func_softmax_BM.h b/test/saber/bm/test_saber_func_softmax_BM.h deleted file mode 100644 index d5c5b6986..000000000 --- a/test/saber/bm/test_saber_func_softmax_BM.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H -#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/tensor.h" - -using namespace anakin::test; - -class TestSaberFuncSoftmaxBM : public Test { -public: - TestSaberFuncSoftmaxBM() {} - ~TestSaberFuncSoftmaxBM() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H From ebb12b4bde4f87a1087a51e53f43d3866694f7c1 Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Wed, 27 Jun 2018 17:39:42 +0800 Subject: [PATCH 089/318] xRevert "modify fc op, compile error" This reverts commit 2997faf062e8ef4bf6310c425ab369059fec335d. 
--- saber/funcs/fc.h | 18 ------------------ saber/funcs/impl/bm/vender_fc.h | 12 ++++++------ test/saber/bm/test_saber_func_fc_BM.cpp | 4 ++-- 3 files changed, 8 insertions(+), 26 deletions(-) diff --git a/saber/funcs/fc.h b/saber/funcs/fc.h index 035d85934..06dc8695a 100644 --- a/saber/funcs/fc.h +++ b/saber/funcs/fc.h @@ -27,24 +27,10 @@ #include "saber/funcs/impl/x86/vender_fc.h" #endif -#ifdef USE_BM -#include "saber/funcs/impl/bm/vender_fc.h" -#endif - namespace anakin{ namespace saber{ -#ifdef USE_BM -template -#else template -#endif class Fc : public BaseFunc< Tensor, Tensor, @@ -140,9 +125,6 @@ class Fc : public BaseFunc< #endif #ifdef USE_X86_PLACE this->_best_impl = this->_impl[0]; -#endif -#ifdef USE_BM - this->_best_impl = this->_impl[0]; #endif } diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h index 5004ad349..82dd6000c 100644 --- a/saber/funcs/impl/bm/vender_fc.h +++ b/saber/funcs/impl/bm/vender_fc.h @@ -1,5 +1,6 @@ #ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H #define ANAKIN_SABER_FUNCS_BMDNN_FC_H + #include "saber/funcs/impl/impl_fc.h" namespace anakin{ @@ -33,7 +34,6 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param, Context& ctx){ - _handle = get_bm_handle(); return create(inputs, outputs, param, ctx); } @@ -46,10 +46,10 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param){ - const InDataType in_data = *(inputs[0]->data()); - const InDataType weights = *(InDataType*)(param.weights->get_buf()->get_data()); - const InDataType bias = *(InDataType*)(param.bias->get_buf()->get_data()); - OutDataType out_data = *(outputs[0]->mutable_data()); + const InDataType *in_data = (const InDataType *) inputs[0]->data(); + const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data(); + const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data(); + OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); int batch_size = inputs[0]->num(); int input_len = 
inputs[0]->channel(); int output_len = param.num_output; @@ -64,7 +64,7 @@ class VenderFc; +template class VenderFc; } //namespace saber } //namespace anakin diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp index 5acbc453e..869ff1bfd 100644 --- a/test/saber/bm/test_saber_func_fc_BM.cpp +++ b/test/saber/bm/test_saber_func_fc_BM.cpp @@ -7,7 +7,7 @@ using namespace anakin::saber; typedef TargetWrapper API; -typedef Tensor TensorDf4; +typedef Tensor TensorDf4; typedef Tensor TensorHf4; typedef TensorDf4::Dtype ftype; @@ -80,7 +80,7 @@ TEST(TestSaberFuncBM, test_func_fc) { FcParam param(&weight, &bias, num_out, axis); - Fc fc; + Fc fc; LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ shape_out[2] << ", " << shape_out[3]; From 56f6122e47ae6d4286a353e8a5b01dc199913e73 Mon Sep 17 00:00:00 2001 From: hlzy <327842846@qq.com> Date: Wed, 27 Jun 2018 07:46:39 -0400 Subject: [PATCH 090/318] change tensor_test_bm --- saber/core/target_wrapper.h | 5 +---- test/saber/bm/test_saber_tensor_BM.cpp | 6 ------ 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 8f84ca759..aafbf3648 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -423,13 +423,10 @@ struct TargetWrapper { * @return currently activated device id */ static int get_device_id(); -<<<<<<< HEAD - static bm_handle_t get_handler(); +// static bm_handle_t get_handler(); // bm_handle_t handle; -======= ->>>>>>> c0edd55a1bdd22e12dc62c9463d229285e5f5d80 }; #endif //USE_BM diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index f720581ef..9fb62d989 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -72,12 +72,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { thost1.copy_from(tdev1); print_tensor_host(thost1); -<<<<<<< HEAD - - // device to device -======= - 
//device to device ->>>>>>> c0edd55a1bdd22e12dc62c9463d229285e5f5d80 tdev1.copy_from(tdev0); print_tensor_device(tdev1); From 571e3a43f3dfe3ec05ceae290624e98d2941718b Mon Sep 17 00:00:00 2001 From: hlzy <327842846@qq.com> Date: Wed, 27 Jun 2018 20:19:31 -0400 Subject: [PATCH 091/318] tensor test update --- saber/core/tensor.h | 37 ++++++++++++++-- test/saber/bm/test_saber_tensor_BM.cpp | 61 +++++++++++++++----------- 2 files changed, 69 insertions(+), 29 deletions(-) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 0beaa7b04..7c1d00052 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -19,7 +19,7 @@ #include "core/shape.h" #include "core/events.h" #include "core/tensor_traits.h" - +#include namespace anakin{ namespace saber{ @@ -117,20 +117,49 @@ class Tensor : public TensorBase { /** * \brief Constructor with allocated data ptr and entire memory shape. */ - template - Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape) { +// template +// Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape) { +// +// CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \ +// "shape dims is not matched to layout type"; +// _shape = shape; +// _valid_shape = shape; +// _offset = Shape::zero(shape.dims()); +// std::shared_ptr> buf_from_date = \ +// std::make_shared>(data_ptr, shape.count() * _type_len(), id); +// BufferMemShare(_buf, buf_from_date); +// _is_subbuf = false; +// } +#ifdef USE_BM + /** + * \brief Constructor with allocated data ptr and entire memory shape. 
only for BM + */ + template + Tensor(Dtype_s* data_ptr, TargetType_t target, int id, Shape shape) { CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \ "shape dims is not matched to layout type"; _shape = shape; _valid_shape = shape; _offset = Shape::zero(shape.dims()); + + if(typeid(Dtype_s) == typeid(AK_FLOAT)) + { + std::shared_ptr> buf_from_date = \ + std::make_shared>(&bm_mem_from_system(const_cast(data_ptr)), shape.count() * _type_len(), id); + + BufferMemShare(_buf, buf_from_date); + } + else + { std::shared_ptr> buf_from_date = \ std::make_shared>(data_ptr, shape.count() * _type_len(), id); + BufferMemShare(_buf, buf_from_date); + } _is_subbuf = false; } - +#endif /** * \brief Copy constructor, shallow copy. */ diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 429f61673..423ffe221 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -32,7 +32,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { //! test tensor re_alloc function on tensor with data LOG(INFO) << "|--test tensor re_alloc function on tensor with data"; - Shape sh1(2, 4, 4, 2); + Shape sh1(1, 4, 4, 4); thost0.re_alloc(sh1); tdev0.re_alloc(sh1); LOG(INFO) << "|--tensor size of host: " << thost0.size(); @@ -74,10 +74,12 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { thost1.copy_from(tdev1); print_tensor_host(thost1); + LOG(INFO) << "test copy_from() function device to device"; + tdev1.copy_from(tdev0); print_tensor_device(tdev1); - /* + //! test tensor constructor with shape and real_shape LOG(INFO) << "test tensor constructor with shape and real_shape"; //! 
constructor with 3 shapes is removed @@ -88,7 +90,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "test tensor constructor with data, if target is different, create buffer, and copy the data"; dtype* host_data_ptr; -// dtype2* dev_data_ptr; + dtype2* dev_data_ptr; void* tmp_pt_host; void* tmp_pt_dev; X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count()); @@ -99,16 +101,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { } BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype2) * sh1.count()); -// dev_data_ptr = static_cast(tmp_pt_dev); -// bm_memcpy_d2s(handle,*dev_data_ptr,bm_mem_from_system(const_cast(host_data_ptr))); - + dev_data_ptr = static_cast(tmp_pt_dev); //--- cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); - + BM_API::sync_memcpy(dev_data_ptr,0,host_data_ptr,0,0,__HtoD()); LOG(INFO) << "|--construct host tensor from host data ptr"; TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); LOG(INFO) << "|--constructor device tensor from host data ptr"; - TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); +// TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); + + TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); + print_tensor_host(thost3); @@ -123,24 +126,30 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { //cudaDeviceSynchronize(); // -/* + LOG(INFO) << "|--construct host tensor from device data ptr"; - TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); - LOG(INFO) << "|--constructor device tensor from device data ptr"; - TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); - print_tensor_host(thost4); - print_tensor_device(tdev4); -*/ + TensorHf4 thost4(host_data_ptr, X86(), X86_API::get_device_id(), sh1); + + TensorDf4 tdev4(host_data_ptr, X86(), X86_API::get_device_id(), sh1); + +// TensorDf4 
tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); + +// TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); +// LOG(INFO) << "|--constructor device tensor from device data ptr"; +// TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); +// print_tensor_host(thost4); +// print_tensor_device(tdev4); + //BM_API::stream_t dev_stream0; //BM_API::create_stream_with_flag(dev_stream0, 1); //cudaDeviceSynchronize(); -/* + //! test tensor copy constructor LOG(INFO) << "test tensor copy constructor"; LOG(INFO) << "|--normal copy constructor"; -// TensorHf4 thost5(thost4); -// TensorDf4 tdev5(tdev4); + TensorHf4 thost5(thost4); + TensorDf4 tdev5(tdev4); LOG(INFO) << "|--push back to vector"; std::vector vthost; @@ -149,18 +158,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { vthost.push_back(thost1); vthost.push_back(thost2); vthost.push_back(thost3); -// vthost.push_back(thost4); -// vthost.push_back(thost5); + vthost.push_back(thost4); + vthost.push_back(thost5); vtdev.push_back(tdev0); vtdev.push_back(tdev1); vtdev.push_back(tdev2); vtdev.push_back(tdev3); -// vtdev.push_back(tdev4); -// vtdev.push_back(tdev5); + vtdev.push_back(tdev4); + vtdev.push_back(tdev5); print_tensor_host(vthost[5]); print_tensor_device(vtdev[5]); //cudaDeviceSynchronize(); -/* + //! 
test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied LOG(INFO) << "test share_from function"; TensorHf4 thost6, thost7; @@ -172,7 +181,9 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { Shape sh2(1, 2, 2, 2); Shape offset(0, 0, 1, 1); LOG(INFO) << "|--shared host"; + thost6.share_sub_buffer(thost4, sh2, offset); + LOG(INFO) << "|--copied host"; tdev6.share_from(thost4); LOG(INFO) << "|--copied device"; @@ -180,6 +191,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "|--shared device"; tdev7.share_from(tdev4); + LOG(INFO) << "|--change data in shared tensor"; //Shape sh_real = thost6.shape(); @@ -220,8 +232,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "|--show root tensor while data is changed by shared tensor"; print_tensor_host(thost4); - */ -// bmdnn_deinit(handle); + bmdnn_deinit(handle); } /* From 62a04c8f1994447bfded4b9d2e3e03db7fb07b6d Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 28 Jun 2018 09:42:35 +0800 Subject: [PATCH 092/318] Add back missing files --- test/saber/bm/test_saber_buffer_BM.h | 20 ++++ test/saber/bm/test_saber_context_BM.h | 21 ++++ test/saber/bm/test_saber_device_BM.cpp | 20 ++++ test/saber/bm/test_saber_device_BM.h | 21 ++++ test/saber/bm/test_saber_func_BM.h | 38 ++++++ test/saber/bm/test_saber_func_fc_BM.cpp | 146 ++++++++++++++++++++++++ test/saber/bm/test_saber_shape_BM.cpp | 126 ++++++++++++++++++++ test/saber/bm/test_saber_shape_BM.h | 25 ++++ 8 files changed, 417 insertions(+) create mode 100644 test/saber/bm/test_saber_buffer_BM.h create mode 100644 test/saber/bm/test_saber_context_BM.h create mode 100644 test/saber/bm/test_saber_device_BM.cpp create mode 100644 test/saber/bm/test_saber_device_BM.h create mode 100644 test/saber/bm/test_saber_func_BM.h create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp create mode 100644 test/saber/bm/test_saber_shape_BM.cpp create mode 100644 test/saber/bm/test_saber_shape_BM.h diff --git 
a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h new file mode 100644 index 000000000..8bbbe4511 --- /dev/null +++ b/test/saber/bm/test_saber_buffer_BM.h @@ -0,0 +1,20 @@ +#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H +#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" + +using namespace anakin::test; + +class TestSaberBufferBM : public Test { +public: + TestSaberBufferBM() {} + ~TestSaberBufferBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +}; + +#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h new file mode 100644 index 000000000..653ee11fd --- /dev/null +++ b/test/saber/bm/test_saber_context_BM.h @@ -0,0 +1,21 @@ +#ifndef SABER_TEST_SABER_CONTEXT_BM_H +#define SABER_TEST_SABER_CONTEXT_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "core/context.h" + +using namespace anakin::test; + +class TestSaberContextBM : public Test { +public: + TestSaberContextBM() {} + ~TestSaberContextBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +}; + +#endif //SABER_TEST_SABER_CONTEXT_BM_H diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp new file mode 100644 index 000000000..1c7086cf1 --- /dev/null +++ b/test/saber/bm/test_saber_device_BM.cpp @@ -0,0 +1,20 @@ +#include "test_saber_device_BM.h" + +#ifdef USE_BM + +using namespace anakin::saber; + +TEST(TestSaberDeviceBM, test_BM_device) { + Device dev_BM; +} + +#endif + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h new file mode 100644 index 000000000..3a6d61236 --- /dev/null +++ 
b/test/saber/bm/test_saber_device_BM.h @@ -0,0 +1,21 @@ +#ifndef SABER_TEST_SABER_DEVICE_BM_H +#define SABER_TEST_SABER_DEVICE_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "core/device.h" + +using namespace anakin::test; + +class TestSaberDeviceBM : public Test { +public: + TestSaberDeviceBM() {} + ~TestSaberDeviceBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +}; + +#endif //SABER_TEST_SABER_DEVICE_BM_H diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h new file mode 100644 index 000000000..61d27d6f9 --- /dev/null +++ b/test/saber/bm/test_saber_func_BM.h @@ -0,0 +1,38 @@ +#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H +#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "core/tensor.h" +#include +#include + +using namespace anakin::test; + +int read_file(std::vector &results, const char* file_name) { + + std::ifstream infile(file_name); + if (!infile.good()) { + std::cout << "Cannot open " << std::endl; + return false; + } + LOG(INFO)<<"found filename: "< + +using namespace anakin::saber; +typedef TargetWrapper API; +typedef Tensor TensorDf4; +typedef Tensor TensorHf4; +typedef TensorDf4::Dtype ftype; + +void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \ + const TensorHf4& bias, TensorHf4& tout) { + + int m = tin.num(); + int k = tin.valid_size() / m; + int n = weight.valid_size() / k; + bool bias_term = bias.valid_size() > 0; + + const float* din = tin.data(); + const float* w = weight.data(); + float* dout = tout.mutable_data(); + + for (int i = 0; i < m; ++i) { + float* pdout = dout + i * n; + const float* pdin = din + i * k; + + for (int j = 0; j < n; ++j) { + if (bias_term) { + pdout[j] = bias.data()[j]; + } else { + pdout[j] = 0; + } + + for (int l = 0; l < k; ++l) { + pdout[j] += pdin[l] * w[l * n + j]; + } + } + } +} + +TEST(TestSaberFuncBM, test_func_fc) { + 
+ int test_iter = 100; + int w_in = 7; + int h_in = 7; + int ch_in = 512; + int num_in = 1; + + int num_out = 4096; + int axis = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_out = {num_in, num_out, 1, 1}; + + Shape sh_w{1, 1, w_in* h_in * ch_in, num_out}; + TensorDf4 weight(sh_w); + Shape sh_b{1, 1, 1, num_out}; + TensorDf4 bias(sh_b); + fill_tensor_device_const(weight, 1.f); + fill_tensor_device_const(bias, 1.f); + + LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ + ch_in << ", height=" << h_in << ", width=" << w_in; + + std::vector input_dev_4d; + std::vector output_dev_4d; + + TensorDf4 tdin; + TensorDf4 tdout; + tdin.re_alloc(shape_in); + fill_tensor_device_const(tdin, 1.f); + input_dev_4d.push_back(&tdin); + output_dev_4d.push_back(&tdout); + + // start Reshape & doInfer + Context ctx_dev(0, 1, 1); + + FcParam param(&weight, &bias, num_out, axis); + + Fc fc; + + LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ + shape_out[2] << ", " << shape_out[3]; + + SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param)); + + LOG(INFO) << "re-alloc tensor buffer"; + output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape()); + Shape va_sh = tdout.valid_shape(); + LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \ + va_sh[2] << ", " << va_sh[3]; + CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error"; + + LOG(INFO) << "FC initialization"; + SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev)); + + LOG(INFO) << "FC compute"; + SaberTimer t1; + t1.clear(); + t1.start(ctx_dev); + + for (int i = 0; i < test_iter; ++i) { + SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev)); + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + //cudaDeviceSynchronize(); + } + + t1.end(ctx_dev); + float ts = t1.get_average_ms(); + LOG(INFO) << "total time: " << ts << 
"avg time: " << ts / test_iter; + //print_tensor_device(*output_dev_4d[0]); + + //! check result + TensorHf4 thin(shape_in); + TensorHf4 thout(shape_out); + TensorHf4 thw(sh_w); + TensorHf4 thb(sh_b); + thin.copy_from(tdin); + thw.copy_from(weight); + thb.copy_from(bias); + fc_compute(thin, thw, thb, thout); + //print_tensor_host(thout); + + TensorHf4 thout_d(shape_out); + thout_d.copy_from(tdout); + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result"; + +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + Env::env_init(); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp new file mode 100644 index 000000000..18479cd18 --- /dev/null +++ b/test/saber/bm/test_saber_shape_BM.cpp @@ -0,0 +1,126 @@ +#include "test_saber_shape_BM.h" +#include "shape.h" +#include "anakin_config.h" + +#ifdef USE_OPENMP +#include +#include +#endif + +using namespace anakin; +using namespace saber; + + +TEST(TestSaberShapeBM, test_saber_shape) { + + int dim = 4; + Shape sh4d0{0, 0, 0, 0}; + CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error"; + + for (int i = 0; i < dim; ++i) { + CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error"; + } + + CHECK_EQ(sh4d0.count(), 0) << "check shape count error"; + + int N = 1; + int C = 3; + int H = 11; + int W = 11; + std::vector sh_size = {N, C, H, W}; + //Shape sh4d1(sh_size); + Shape sh4d1(N, C, H, W); + LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size(); + CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!"; + //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with args constructor!"; + + CHECK_EQ(sh4d1[0], N) << "get shape size 
error"; + CHECK_EQ(sh4d1[1], C) << "get shape size error"; + CHECK_EQ(sh4d1[2], H) << "get shape size error"; + CHECK_EQ(sh4d1[3], W) << "get shape size error"; + + //CHECK_EQ(sh4d2[0], N) << "get shape size error"; + //CHECK_EQ(sh4d2[1], C) << "get shape size error"; + //CHECK_EQ(sh4d2[2], H) << "get shape size error"; + //CHECK_EQ(sh4d2[3], W) << "get shape size error"; + + CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed"; + + C = 10; + sh4d1[1] = C; + CHECK_EQ(sh4d1[1], C) << "set shape size error"; + + bool is_equal = (sh4d0 == sh4d1); + CHECK_EQ(is_equal, false) << "check shape is_equal failed"; + + sh4d0 = sh4d1; + CHECK_EQ(sh4d1[0], N) << "constructor failed"; + CHECK_EQ(sh4d1[1], C) << "get shape size error"; + CHECK_EQ(sh4d1[2], H) << "get shape size error"; + CHECK_EQ(sh4d1[3], W) << "get shape size error"; + + Shape sh4d3 = sh4d1; + CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error"; + + Shape sh4d4(sh4d1); + CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error"; + + Shape sh1d0{0}; + //std::vector sh1d_size = {W}; + + //Shape sh1d1(sh1d_size); + //Shape sh1d0{W}; + Shape sh1d1(W); + + Shape sh1d3 = sh1d1; + Shape sh1d4(sh1d1); + + CHECK_EQ(sh1d0.dims(), 1) << "shape dim error"; + + CHECK_EQ(sh1d0.count(), 0) << "shape size error"; + + CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error"; + + CHECK_EQ(sh1d1[0], W) << "get shape size error"; + + //CHECK_EQ(sh1d2.count(0), W) << "shape dim error"; + + CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error"; + + CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error"; + + CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error"; + + Shape sh0{2, 2, 3, 4}; + Shape sh1{2, 1, 1, 24}; + Shape sh2{2, 2, 3, 4}; + Shape sh3{1, 1, 2, 3}; + + CHECK_EQ(sh0 == sh2, true) << "error =="; + CHECK_EQ(sh3 < sh0, true) << "error <"; + CHECK_EQ(sh3 >= sh0, false) << "error >="; + CHECK_EQ(sh3 > sh0, false) << "error >"; + CHECK_EQ(sh0 > sh3, true) << "error >"; + CHECK_EQ(sh0 < sh1, false) << 
"error <"; + CHECK_EQ(sh0 <= sh2, true) << "error <="; + CHECK_EQ(sh0 >= sh2, true) << "error >="; + + Shape sh001 = Shape::zero(2); + Shape sh002 = Shape::zero(3); + + if (sh001 > sh002) { + LOG(ERROR) << "error <"; + } + +} + + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + + diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h new file mode 100644 index 000000000..a2ca02c9b --- /dev/null +++ b/test/saber/bm/test_saber_shape_BM.h @@ -0,0 +1,25 @@ +#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H +#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H + +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "saber/core/shape.h" + +using namespace anakin::test; + +class TestSaberShapeBM : public Test { +public: + TestSaberShapeBM() {} + ~TestSaberShapeBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +protected: + std::string name; + std::string _test; +}; + +#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H + From bff601c294d62502ee92754df621a2f557c2760f Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 28 Jun 2018 09:43:19 +0800 Subject: [PATCH 093/318] Add back missing files --- test/framework/core/base_types_test.cpp | 143 +++++++++ test/framework/core/core_test.h | 46 +++ test/framework/graph/graph_base_test.cpp | 82 ++++++ .../graph/graph_parser_from_model_test.cpp | 88 ++++++ test/framework/graph/graph_test.h | 47 +++ test/framework/net/benchmark.cpp | 162 +++++++++++ test/framework/net/chinese_ner_test.cpp | 213 ++++++++++++++ test/framework/net/model_test.cpp | 175 +++++++++++ .../net/net_exec_multi_thread_test.cpp | 149 ++++++++++ test/framework/net/net_exec_test.cpp | 273 ++++++++++++++++++ test/framework/net/net_test.h | 98 +++++++ test/framework/net/padde_api_test.cpp | 121 ++++++++ test/framework/net/paddle_api.h | 87 ++++++ test/framework/operators/operator_tests.h | 47 +++ 
test/framework/operators/pooling_test.cpp | 43 +++ 15 files changed, 1774 insertions(+) create mode 100644 test/framework/core/base_types_test.cpp create mode 100644 test/framework/core/core_test.h create mode 100644 test/framework/graph/graph_base_test.cpp create mode 100644 test/framework/graph/graph_parser_from_model_test.cpp create mode 100644 test/framework/graph/graph_test.h create mode 100644 test/framework/net/benchmark.cpp create mode 100644 test/framework/net/chinese_ner_test.cpp create mode 100644 test/framework/net/model_test.cpp create mode 100644 test/framework/net/net_exec_multi_thread_test.cpp create mode 100644 test/framework/net/net_exec_test.cpp create mode 100644 test/framework/net/net_test.h create mode 100644 test/framework/net/padde_api_test.cpp create mode 100644 test/framework/net/paddle_api.h create mode 100644 test/framework/operators/operator_tests.h create mode 100644 test/framework/operators/pooling_test.cpp diff --git a/test/framework/core/base_types_test.cpp b/test/framework/core/base_types_test.cpp new file mode 100644 index 000000000..0109493bf --- /dev/null +++ b/test/framework/core/base_types_test.cpp @@ -0,0 +1,143 @@ +#include "core_test.h" +#include "any.h" +#include "singleton.h" +#include "tls.h" +#include "parameter.h" +#include "thread_pool.h" + +#ifdef USE_CUDA +#include "cuda_funcs.h" +#include "sass_funcs.h" +#endif + +#include "tensor.h" + +#ifdef USE_CUDA +TEST(CoreComponentsTest, sass_test) { + LOG(INFO) << "test for cuda code function"; + //anakin::saber::Tensor<3, RTCUDA, float, NCHW> ts; + //LOG(WARNING) << " tensor num " << ts.num(); + //ts.set_offset(8); + //my_print(); + LOG(INFO) << "test for sass code function 1"; + invoke_test(); + LOG(INFO) << "test for sass code function 2"; + invoke_test_2(); +} +#endif + +TEST(CoreComponentsTest, core_base_types_any_test) { + LOG(INFO) << "test for any class ."; + LOG(WARNING) << " level 1 : base type int (set 42 to any)"; + const int a = 42; + any any_a(42); + int 
result_a = any_cast(any_a); + + LOG(INFO) << "casted result : " << result_a; + LOG(WARNING) << " level 2 : base type float (set 42.8 to any)"; + float b = 42.8; + any any_b = b; + float result_b = any_cast(any_b); + LOG(INFO) << "casted result : " << result_b << " decide: "; + + LOG(WARNING) << " level 3 : ptuple type (set PTuple to any)"; + PTuple p_tuple_float(3.2f, 3.3f, 3.5f); + p_tuple_float.push_back(4.3); // push_back + + any p_tuple_float_any = p_tuple_float; + auto result_p_tuple_float_any = any_cast>(p_tuple_float_any); + + for (int i = 0; i < result_p_tuple_float_any.size(); i++) { + LOG(INFO) << " any casted PTuple[" << i << "]: " << result_p_tuple_float_any[i]; + } + + struct target { + void print() { + LOG(INFO) << " target struct Successfully recovered."; + } + }; + + LOG(WARNING) << " level 5 : struct type"; + + target tg; + + any any_tg = tg; + + target result_tg = any_cast(any_tg); + + result_tg.print(); + + LOG(WARNING) << " level other : struct type"; + + any any_tg_copy = any_tg; + + target result_tg_copy = any_cast(any_tg); + + result_tg_copy.print(); +} + +void at_exit_in_test() { + LOG(WARNING) << "core_base_types_singleton_test exit successfully!"; +} + +TEST(CoreComponentsTest, core_base_types_singleton_test) { + struct target { + target() { + LOG(INFO) << " singleton target constructed"; + } + }; + typedef Singleton sg_target; + sg_target::Global(); +} + +typedef AnakinThreadLocalVar sg_tls; +void thread_func_0() { + int* tmp = sg_tls::value(); + *tmp = 3; + LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value()); +} +void thread_func_1() { + int* tmp = sg_tls::value(); + *tmp = 4; + + LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value()); +} +TEST(CoreComponentsTest, core_base_types_tls_test) { + LOG(INFO) << " Create tls var 0 , check in two thread."; + std::thread first(thread_func_0); + std::thread sec(thread_func_1); + first.join(); + sec.join(); + LOG(INFO) << " main thread var: " << *(sg_tls::value()); +} + +int 
thread_pool_func(int i) { + LOG(INFO) << " thread_pool_func input : " << i; + //std::this_thread::sleep_for(std::chrono::seconds(0)); + return i; +} + +TEST(CoreComponentsTest, core_base_types_thread_pool_test) { + LOG(INFO) << " Create thread pool with thread num = 12 "; + ThreadPool thread_pool_test(100); + thread_pool_test.launch(); + std::function test = thread_pool_func; + + for (int i = 0; i < 50; i++) { + // run async + auto ret = thread_pool_test.RunAsync(test, i); + LOG(INFO) << " return : " << ret.get(); + + // run sync + //auto sync_ret = thread_pool_test.RunSync(test, i); + } +} + + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/core/core_test.h b/test/framework/core/core_test.h new file mode 100644 index 000000000..6107eef4b --- /dev/null +++ b/test/framework/core/core_test.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_GRAPH_TEST_H +#define ANAKIN_GRAPH_TEST_H + +#include +#include +#include +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" + +using namespace anakin; +using ::anakin::test::Test; + +class CoreComponentsTest : public Test { +public: + CoreComponentsTest(){} + + void SetUp(){} + + void TearDown(){} + +protected: +}; + + + + + + +#endif + + diff --git a/test/framework/graph/graph_base_test.cpp b/test/framework/graph/graph_base_test.cpp new file mode 100644 index 000000000..d42e86c02 --- /dev/null +++ b/test/framework/graph/graph_base_test.cpp @@ -0,0 +1,82 @@ +#include +#include "graph_test.h" +#include "graph_base.h" + +using namespace anakin; +using namespace anakin::graph; + +//! Usage sample +class GraphTestClass : public GraphBase { +public: + GraphTestClass() {} + ~GraphTestClass() {} + virtual bool directed() { + return true; + }; +}; +class edge : public Arc { +public: + edge(std::string btm, std::string top, int weight): Arc(btm, top, weight) {} + ~edge() {} +}; + +TEST(GraphTest, graph_base_test) { + LOG(INFO) << "test for graph base ."; + + GraphTestClass graph; + graph.add_vertex("a", 42); + graph.add_vertex("b", 43); + graph.add_vertex("c", 44); + graph.add_vertex("d", 45); + graph.add_vertex("e", 46); + graph.add_vertex("f", 47); + + edge arc0("a", "b", 0); + edge arc1("b", "c", 1); + edge arc2("c", "d", 2); + edge arc3("d", "e", 3); + edge arc4("e", "f", 4); + edge arc5("f", "a", 5); + + graph.add_in_arc(arc0); + graph.add_in_arc(arc1); + graph.add_in_arc(arc2); + graph.add_in_arc(arc3); + graph.add_in_arc(arc4); + graph.add_in_arc(arc5); + graph.add_out_arc(arc0); + graph.add_out_arc(arc1); + graph.add_out_arc(arc2); + graph.add_out_arc(arc3); + graph.add_out_arc(arc4); + graph.add_out_arc(arc5); + + LOG(WARNING) << "Construction of graph."; + LOG(INFO) << graph.to_string(); + + LOG(WARNING) << "Remove a from graph."; + graph.remove("a"); + LOG(INFO) << graph.to_string(); + + LOG(WARNING) << "Add arc: f->b to 
graph."; + edge arc_f_b("f", "b", 10); + graph.add_in_arc(arc_f_b); + graph.add_out_arc(arc_f_b); + LOG(INFO) << graph.to_string(); + + LOG(WARNING) << "Add vertex:a and arc: a->e to graph."; + graph.add_vertex("a", 47); + edge arc_a_e("a", "e", 10); + graph.add_out_arc(arc_a_e); + graph.add_in_arc(arc_a_e); + LOG(INFO) << graph.to_string(); +} + + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/graph/graph_parser_from_model_test.cpp b/test/framework/graph/graph_parser_from_model_test.cpp new file mode 100644 index 000000000..883a12858 --- /dev/null +++ b/test/framework/graph/graph_parser_from_model_test.cpp @@ -0,0 +1,88 @@ +#include +#include "graph_test.h" +#include "graph_base.h" +#include "graph.h" +#include "scheduler.h" + +using namespace anakin; +using namespace anakin::graph; + +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/google_net/googlenet.anakin.bin"; +std::string model_path = "/home/chaowen/anakin_v2/model_v2/yolo/yolo.anakin.bin"; + + +TEST(GraphTest, graph_load_model) { + /*Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << model_path << " ..."; + // load anakin model files. + graph->load(model_path); + + DLOG(INFO) << graph->to_string(); + // exec optimization + graph->Optimize(); */ +} + +#ifdef USE_CUDA +TEST(GraphTest, nvidia_graph_save_model) { + Graph* graph = new Graph(); + // load anakin model files. + LOG(INFO) << "load anakin model file from " << model_path << " ..."; + graph->load(model_path); + + // regisiter output tensor + //graph->RegistOut("data_perm", "data_scale"); + + // exec optimization + graph->Optimize(); + + // save the optimized model to disk. 
+ std::string save_model_path = model_path + std::string(".saved"); + Status status = graph->save(save_model_path); +} +#endif + +#ifdef USE_X86_PLACE +TEST(GraphTest, x86_graph_save_model) { + Graph* graph = new Graph(); + // load anakin model files. + LOG(INFO) << "load anakin model file from " << model_path << " ..."; + graph->load(model_path); + + // regisiter output tensor + //graph->RegistOut("data_perm", "data_scale"); + + // exec optimization + graph->Optimize(); + + // save the optimized model to disk. + std::string save_model_path = model_path + std::string(".saved"); + Status status = graph->save(save_model_path); +} +#endif + +#ifdef USE_ARM_PLACE +TEST(GraphTest, arm_graph_save_model) { + Graph* graph = new Graph(); + // load anakin model files. + LOG(INFO) << "load anakin model file from " << model_path << " ..."; + graph->load(model_path); + + // regisiter output tensor + //graph->RegistOut("data_perm", "data_scale"); + + // exec optimization + graph->Optimize(); + + // save the optimized model to disk. + std::string save_model_path = model_path + std::string(".saved"); + Status status = graph->save(save_model_path); +} +#endif + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/graph/graph_test.h b/test/framework/graph/graph_test.h new file mode 100644 index 000000000..db837c84a --- /dev/null +++ b/test/framework/graph/graph_test.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_GRAPH_TEST_H +#define ANAKIN_GRAPH_TEST_H + +#include +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" + +using namespace anakin; +using ::anakin::test::Test; + +/** + * \brief Graph test is base Test class for anakin graph funciton. + */ +class GraphTest: public Test { +public: + GraphTest(){} + + void SetUp(){} + + void TearDown(){} + +protected: +}; + + + + + + +#endif + + diff --git a/test/framework/net/benchmark.cpp b/test/framework/net/benchmark.cpp new file mode 100644 index 000000000..41c31c83e --- /dev/null +++ b/test/framework/net/benchmark.cpp @@ -0,0 +1,162 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "saber/core/tensor_op.h" +#include +#include +#include +#include +#include +#include + +#ifdef USE_GFLAGS +#include + +DEFINE_string(model_dir, "", "model dir"); +DEFINE_string(model_file, "", "model file"); +DEFINE_int32(num, 1, "batchSize"); +DEFINE_int32(warmup_iter, 10, "warm up iterations"); +DEFINE_int32(epoch, 1000, "time statistic epoch"); +#else +std::string FLAGS_model_dir; +std::string FLAGS_model_file; +int FLAGS_num = 1; +int FLAGS_warmup_iter = 10; +int FLAGS_epoch = 1000; +#endif + +#ifdef USE_CUDA +typedef NV Target; +#elif defined(USE_X86_PLACE) +typedef X86 Target; +#else +typedef ARM Target; +#endif + +void getModels(std::string path, std::vector& files) { + DIR *dir; + struct dirent *ptr; + if ((dir = opendir(path.c_str())) == NULL) { + perror("Open dri error..."); + exit(1); + } + while((ptr = readdir(dir)) != NULL) { + if (strcmp(ptr->d_name, ".") == 
0 || strcmp(ptr->d_name, "..") == 0) + continue; + else if (ptr->d_type == 8)//file + files.push_back(path + "/" + ptr->d_name); + else if (ptr->d_type == 4) { + getModels(path + "/" + ptr->d_name, files); + } + } + closedir(dir); +} +TEST(NetTest, net_execute_base_test) { + std::vector models; + if (FLAGS_model_file == "") { + getModels(FLAGS_model_dir, models); + } else { + models.push_back(FLAGS_model_dir + FLAGS_model_file); + } + for (auto iter = models.begin(); iter < models.end(); iter++) + { + LOG(WARNING) << "load anakin model file from " << *iter << " ..."; + Graph graph; + auto status = graph.load(*iter); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + graph.ResetBatchSize("input_0", FLAGS_num); + graph.Optimize(); + // constructs the executer net + Net net_executer(graph, true); + // get in + auto d_tensor_in_p = net_executer.get_in("input_0"); + Tensor4d h_tensor_in; + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i = 0; i < valid_shape_in.size(); i++) { + LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; + } + h_tensor_in.re_alloc(valid_shape_in); + fill_tensor_host_rand(h_tensor_in, -1.0f,1.0f); + d_tensor_in_p->copy_from(h_tensor_in); + // do inference + Context ctx(0, 0, 0); + saber::SaberTimer my_time; + LOG(WARNING) << "EXECUTER !!!!!!!! 
"; + for (int i = 0; i < FLAGS_warmup_iter; i++) { + net_executer.prediction(); + } +#ifdef ENABLE_OP_TIMER + net_executer.reset_op_time(); +#endif + my_time.start(ctx); + //auto start = std::chrono::system_clock::now(); + for (int i = 0; i < FLAGS_epoch; i++) { + //DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") "; + net_executer.prediction(); + } + my_time.end(ctx); +#ifdef ENABLE_OP_TIMER + std::vector op_time = net_executer.get_op_time(); + auto exec_funcs = net_executer.get_exec_funcs(); + auto op_param = net_executer.get_op_param(); + for (int i = 0; i < op_time.size(); i++) { + LOG(INFO) << "name: " << exec_funcs[i].name << " op_type: " << exec_funcs[i].op_name << " op_param: " << op_param[i] << " time " << op_time[i]/FLAGS_epoch; + } + std::map op_map; + for (int i = 0; i < op_time.size(); i++) { + auto it = op_map.find(op_param[i]); + if (it != op_map.end()) + op_map[op_param[i]] += op_time[i]; + else + op_map.insert(std::pair(op_param[i], op_time[i])); + } + for (auto it = op_map.begin(); it != op_map.end(); ++it) { + LOG(INFO)<< it->first << " " << (it->second) / FLAGS_epoch<< " ms"; + } +#endif + size_t end = (*iter).find(".anakin.bin"); + size_t start = FLAGS_model_dir.length(); + std::string model_name = (*iter).substr(start, end-start); + + LOG(INFO) << model_name << " batch_size " << FLAGS_num << " average time "<< my_time.get_average_ms() / FLAGS_epoch << " ms"; + } +} +int main(int argc, const char** argv){ + // initial logger + logger::init(argv[0]); + +#ifdef USE_GFLAGS + google::ParseCommandLineFlags(&argc, &argv, true); +#else + LOG(INFO)<< "BenchMark usage:"; + LOG(INFO)<< " $benchmark "; + LOG(INFO)<< " model_dir: model directory"; + LOG(INFO)<< " model_file: path to model"; + LOG(INFO)<< " num: batchSize default to 1"; + LOG(INFO)<< " warmup_iter: warm up iterations default to 10"; + LOG(INFO)<< " epoch: time statistic epoch default to 1000"; + if(argc < 3) { + LOG(ERROR) << "You should fill in the variable model_dir and model_file at 
least."; + return 0; + } + FLAGS_model_dir = argv[1]; + if(argc > 2) { + FLAGS_model_file = argv[2]; + } + if(argc > 3) { + FLAGS_num = atoi(argv[3]); + } + if(argc > 4) { + FLAGS_warmup_iter = atoi(argv[4]); + } + if(argc > 5) { + FLAGS_epoch = atoi(argv[5]); + } +#endif + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/chinese_ner_test.cpp b/test/framework/net/chinese_ner_test.cpp new file mode 100644 index 000000000..37785f721 --- /dev/null +++ b/test/framework/net/chinese_ner_test.cpp @@ -0,0 +1,213 @@ +#include "anakin_config.h" +#include +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "saber/core/tensor_op.h" +#include +#include +#include +#include +#include +#include + +#define DEFINE_GLOBAL(type, var, value) \ + type (GLB_##var) = (value) +DEFINE_GLOBAL(std::string, model_dir, ""); +DEFINE_GLOBAL(std::string, input_file, ""); + +//#define WITH_MENTION + +void getModels(std::string path, std::vector& files) { + DIR* dir= nullptr; + struct dirent* ptr; + + if ((dir = opendir(path.c_str())) == NULL) { + perror("Open dri error..."); + exit(1); + } + + while ((ptr = readdir(dir)) != NULL) { + if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) { + continue; + } else if (ptr->d_type == 8) { //file + files.push_back(path + "/" + ptr->d_name); + } else if (ptr->d_type == 4) { + //files.push_back(ptr->d_name);//dir + getModels(path + "/" + ptr->d_name, files); + } + } + closedir(dir); +} +void SplitString(const std::string& s, + std::vector& v, const std::string& c) +{ + std::string::size_type pos1, pos2; + pos2 = s.find(c); + pos1 = 0; + while(std::string::npos != pos2) + { + v.push_back(s.substr(pos1, pos2-pos1)); + + pos1 = pos2 + c.size(); + pos2 = s.find(c, pos1); + } + if(pos1 != s.length()) + v.push_back(s.substr(pos1)); +} + +bool split_word_mention_idx_from_file( + std::vector > &word_idx, + std::vector > &mention_idx, + const std::string input_file_path) { + + 
std::ifstream infile(input_file_path.c_str()); + if (!infile.good()) { + std::cout << "Cannot open " << std::endl; + return false; + } + LOG(INFO)<<"found filename: "< split_v; + std::vector split_w; + std::vector split_m; + while (std::getline(infile, line)) { + split_v.clear(); + SplitString(line, split_v, ";"); + CHECK_GE(split_v.size(), 4) << " file need ; split"; + std::vector word; + std::vector mention; + split_w.clear(); + SplitString(split_v[1], split_w, " "); + split_m.clear(); + SplitString(split_v[3], split_m, " "); + for (auto w : split_w) { + word.push_back(atof(w.c_str())); + } + for (auto m : split_m) { + mention.push_back(atof(m.c_str())); + } + word_idx.push_back(word); + mention_idx.push_back(mention); + } + return true; +} + +int get_batch_data_offset( + std::vector &out_data, + const std::vector > &seq_data, + std::vector &seq_offset, + const int start_idx, + const int batch_num) { + seq_offset.clear(); + out_data.clear(); + seq_offset.push_back(0); + int len = 0; + for (int i = 0; i < batch_num; ++i) { + for (auto d : seq_data[i + start_idx]) { + len += 1; + out_data.push_back(d); + } + seq_offset.push_back(len); + } + return len; +} + +#ifdef USE_X86_PLACE +TEST(NetTest, chinese_ner_executor) { + std::vector models; + getModels(GLB_model_dir, models); + std::vector > word_idx; + std::vector > mention_idx; + split_word_mention_idx_from_file(word_idx, mention_idx, GLB_input_file); + std::vector word_idx_data; + std::vector mention_idx_data; + std::vector word_seq_offset; + std::vector mention_seq_offset; + int batch_num = 6; + + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << models[0] << " ..."; + // load anakin model files. 
+ auto status = graph->load(models[0]); + if(!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + graph->Reshape("input_0", {1000, 1, 1, 1}); +#ifdef WITH_MENTION + graph->Reshape("input_1", {1000, 1, 1, 1}); +#endif + //anakin graph optimization + graph->Optimize(); + Net net_executer(*graph, true); + SaberTimer timer; + Context ctx; + for (int i = 0; i < word_idx.size(); i += batch_num) { +// { +// int i = 0; + int word_len = get_batch_data_offset(word_idx_data, word_idx, word_seq_offset, i, batch_num); +#ifdef WITH_MENTION + int mention_len = get_batch_data_offset(mention_idx_data, mention_idx, mention_seq_offset, i, batch_num); +#endif +// for (auto w : word_idx_data) { +// std::cout << w << ","; +// } +// std::cout << std::endl; +// for (auto s : word_seq_offset) { +// std::cout << s << ", "; +// } +// std::cout << std::endl << std::endl << std::endl; +// word_idx_data = {20, 21, 22, 23, 24, 25, 26}; +// word_seq_offset = {0, 5, 7}; +// int word_len = 7; +// mention_idx_data = {2, 1, 22, 23, 24, 25, 26}; +// mention_seq_offset = {0, 5, 7}; +// int mention_len = 7; + + auto word_in_p = net_executer.get_in("input_0"); + word_in_p->reshape({word_len, 1, 1, 1}); + for (int j = 0; j < word_idx_data.size(); ++j) { + word_in_p->mutable_data()[j] = word_idx_data[j]; + } + word_in_p->set_seq_offset(word_seq_offset); +#ifdef WITH_MENTION + auto mention_in_p = net_executer.get_in("input_1"); + mention_in_p->reshape({mention_len, 1, 1, 1}); + for (int j = 0; j < mention_idx_data.size(); ++j) { + mention_in_p->mutable_data()[j] = mention_idx_data[j]; + } + mention_in_p->set_seq_offset(mention_seq_offset); +#endif + timer.start(ctx); + net_executer.prediction(); + timer.end(ctx); +// auto tensor_out_5_p = net_executer.get_out("crf_decoding_0.tmp_0_out"); +// int v_size = tensor_out_5_p->valid_size(); +// for (int j = 0; j < v_size; ++j) { +// std::cout << tensor_out_5_p->data()[j]<<" "; +// } +// std::cout << std::endl; + } + LOG(INFO)<<"elapse time: "< +#include 
"net_test.h" +#include "saber/funcs/timer.h" +#include +#include "saber/core/tensor_op.h" +#include +#include +#include +#include +#include +#include +#define DEFINE_GLOBAL(type, var, value) \ + type (GLB_##var) = (value) +DEFINE_GLOBAL(std::string, model_dir, ""); +DEFINE_GLOBAL(int, num, 1); +DEFINE_GLOBAL(int, channel, 8); +DEFINE_GLOBAL(int, height, 640); +DEFINE_GLOBAL(int, width, 640); +DEFINE_GLOBAL(bool, is_input_shape, false); + +void getModels(std::string path, std::vector& files) { + DIR* dir= nullptr; + struct dirent* ptr; + + if ((dir = opendir(path.c_str())) == NULL) { + perror("Open dri error..."); + exit(1); + } + + while ((ptr = readdir(dir)) != NULL) { + if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) { + continue; + } else if (ptr->d_type == 8) { //file + files.push_back(path + "/" + ptr->d_name); + } else if (ptr->d_type == 4) { + //files.push_back(ptr->d_name);//dir + getModels(path + "/" + ptr->d_name, files); + } + } + + closedir(dir); +} + +#ifdef USE_CUDA +TEST(NetTest, nv_net_execute_base_test) { + std::vector models; + getModels(GLB_model_dir, models); + + for (auto iter = models.begin(); iter < models.end(); iter++) { + LOG(WARNING) << "load anakin model file from " << *iter << " ..."; +#if 1 + Graph graph; + auto status = graph.load(*iter); + + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + if (GLB_is_input_shape) { + graph.Reshape("input_0", {GLB_num, GLB_channel, GLB_height, GLB_width}); + } else { + graph.ResetBatchSize("input_0", GLB_num); + } + + graph.Optimize(); + // constructs the executer net + Net net_executer(graph, true); + // get in + auto d_tensor_in_p = net_executer.get_in("input_0"); + Tensor4d h_tensor_in; + auto valid_shape_in = d_tensor_in_p->valid_shape(); + + for (int i = 0; i < valid_shape_in.size(); i++) { + LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; + } + + h_tensor_in.re_alloc(valid_shape_in); + fill_tensor_host_rand(h_tensor_in, -1.0f, 1.0f); + 
d_tensor_in_p->copy_from(h_tensor_in); + int warmup_iter = 10; + int epoch = 1000; + // do inference + Context ctx(0, 0, 0); + saber::SaberTimer my_time; + LOG(WARNING) << "EXECUTER !!!!!!!! "; + + for (int i = 0; i < warmup_iter; i++) { + net_executer.prediction(); + } + +#ifdef ENABLE_OP_TIMER + net_executer.reset_op_time(); +#endif + my_time.start(ctx); + + //auto start = std::chrono::system_clock::now(); + for (int i = 0; i < epoch; i++) { + //DLOG(ERROR) << " epoch(" << i << "/" << epoch << ") "; + net_executer.prediction(); + } + + my_time.end(ctx); +#ifdef ENABLE_OP_TIMER + std::vector op_time = net_executer.get_op_time(); + auto exec_funcs = net_executer.get_exec_funcs(); + auto op_param = net_executer.get_op_param(); + + for (int i = 0; i < op_time.size(); i++) { + LOG(INFO) << "name: " << exec_funcs[i].name << " op_type: " << exec_funcs[i].op_name << + " op_param: " << op_param[i] << " time " << op_time[i] / epoch; + } + + std::map op_map; + + for (int i = 0; i < op_time.size(); i++) { + auto it = op_map.find(op_param[i]); + + if (it != op_map.end()) { + op_map[op_param[i]] += op_time[i]; + } else { + op_map.insert(std::pair(op_param[i], op_time[i])); + } + } + + for (auto it = op_map.begin(); it != op_map.end(); ++it) { + LOG(INFO) << it->first << " " << (it->second) / epoch << " ms"; + } + +#endif + LOG(INFO) << *iter << " aveage time " << my_time.get_average_ms() / epoch << " ms"; + // save the optimized model to disk. 
+ // std::string save_model_path = GLB_model_dir + std::string("opt.saved"); + // status = graph.save(save_model_path); + // if (!status ) { + // LOG(FATAL) << " [ERROR] " << status.info(); + // } +#endif + } +} +#endif + +int main(int argc, const char** argv) { + // initial logger + LOG(INFO) << "argc " << argc; + + if (argc < 1) { + LOG(INFO) << "Example of Usage:\n \ + ./output/unit_test/model_test\n \ + anakin_models\n \ + num\n \ + channel\n \ + height\n \ + width\n "; + exit(0); + } else if (argc == 2) { + GLB_model_dir = std::string(argv[1]); + GLB_is_input_shape = false; + } else if (argc == 3) { + GLB_model_dir = std::string(argv[1]); + GLB_num = atoi(argv[2]); + GLB_is_input_shape = false; + } else { + GLB_model_dir = std::string(argv[1]); + GLB_num = atoi(argv[2]); + GLB_channel = atoi(argv[3]); + GLB_height = atoi(argv[4]); + GLB_width = atoi(argv[5]); + GLB_is_input_shape = true; + } + + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/net_exec_multi_thread_test.cpp b/test/framework/net/net_exec_multi_thread_test.cpp new file mode 100644 index 000000000..7a8bf5401 --- /dev/null +++ b/test/framework/net/net_exec_multi_thread_test.cpp @@ -0,0 +1,149 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include + +std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_lane_v2.anakin.bin"; + +#ifdef USE_CUDA +#if 1 +TEST(NetTest, nv_net_execute_muti_thread_sync_test) { +#if 1 // use host input + //Env::env_init(1); + LOG(WARNING) << "Sync Runing multi_threads for model: " << model_path; + Worker workers(model_path, 10); + workers.register_inputs({"input_0"}); + workers.register_outputs({"softmax_out"}); + workers.Reshape("input_0", {1, 384, 960, 3}); + + workers.launch(); + + std::vector::type, AK_FLOAT> > host_tensor_p_in_list; + // get in + saber::Shape valid_shape_in({1, 384, 960, 3}); + Tensor4dPtr::type, AK_FLOAT> 
h_tensor_in = new Tensor4d::type, AK_FLOAT>(valid_shape_in); + float* h_data = h_tensor_in->mutable_data(); + for (int i=0; isize(); i++) { + h_data[i] = 1.0f; + } + host_tensor_p_in_list.push_back(h_tensor_in); + + int epoch = 1000; + + // Running + for(int i=0; ifirst << " processing " << it->second.size() << " tasks"; + for (auto time_in_ms : it->second) { + LOG(INFO) << " \\__task avg time: " << time_in_ms; + } + } +#endif + +#endif + +#if 0 // use device input + Env::env_init(1); + LOG(WARNING) << "Sync Runing multi_threads for model: " << model_path; + Worker workers(model_path, 1); + workers.register_inputs({"input_0"}); + workers.register_outputs({"softmax_out"}); + workers.Reshape("input_0", {1, 384, 960, 3}); + + workers.launch(); + + std::vector::type, AK_FLOAT> > host_tensor_p_in_list; + // get in + saber::Shape valid_shape_in({1, 384, 960, 3}); + Tensor4dPtr::type, AK_FLOAT> h_tensor_in = new Tensor4d::type, AK_FLOAT>(valid_shape_in); + float* h_data = h_tensor_in->mutable_data(); + for (int i=0; isize(); i++) { + h_data[i] = 1.0f; + } + host_tensor_p_in_list.push_back(h_tensor_in); + + std::vector > device_tensor_p_in_list; + for (int i=0; i d_tensor_in = new Tensor4d(host_tensor_p_in_list[i]->valid_shape()); + d_tensor_in->copy_from(*(host_tensor_p_in_list[i])); + device_tensor_p_in_list.push_back(d_tensor_in); + } + + int epoch = 10; + + // Running + for (int i=0; i ctx(0, 0, 0); + saber::SaberTimer my_time; + + my_time.start(ctx); + auto d_tensor_p_out_list = workers.sync_prediction_device(device_tensor_p_in_list); + my_time.end(ctx); + LOG(INFO)<<"muti thread single task exec time: "< workers(model_path, 10); + workers.register_inputs({"input_0"}); + workers.register_outputs({"softmax_out"}); + workers.Reshape("input_0", {1, 384, 960, 3}); + + workers.launch(); + + std::vector::type, AK_FLOAT> > host_tensor_p_in_list; + // get in + saber::Shape valid_shape_in({1, 384, 960, 3}); + Tensor4dPtr::type, AK_FLOAT> h_tensor_in = new Tensor4d::type, 
AK_FLOAT>(valid_shape_in); + float* h_data = h_tensor_in->mutable_data(); + for (int i=0; isize(); i++) { + h_data[i] = 1.0f; + } + host_tensor_p_in_list.push_back(h_tensor_in); + + int epoch = 10000; + + // Running + for(int i=0; i +#include "net_test.h" +#include "saber/funcs/timer.h" +#include + +//#define USE_DIEPSE + +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/diepsie_light_head.anakin.bin"; + +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/diepsie_light_head_base.anakin.bin"; + + +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/densebox.anakin.bin"; + +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/cnn_seg.anakin.bin"; + +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_camera_detector.anakin.bin"; + +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/yolo_lane_v2.anakin.bin"; + +// alignment of face +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/net_deploy_stageI.anakin.bin"; + +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/net_deploy_stageII.anakin.bin"; + +// residual 7 patch of face +//std::string model_path = "/home/chaowen/anakin_v2/model_v2/anakin-models/adu/anakin_models/diepsie_light_head/residual_net_7patch_3hc.anakin.bin"; + +// resnet 50 +//std::string model_path = "/home/cuichaowen/anakin2/anakin2/benchmark/CNN/mobilenet_v2.anakin.bin"; + +// vgg16 +std::string model_path = "/home/cuichaowen/anakin2/anakin2/benchmark/CNN/models/vgg16.anakin.bin"; + +#ifdef USE_CUDA +#if 1 +TEST(NetTest, net_execute_base_test) { + Graph* graph = new 
Graph(); + LOG(WARNING) << "load anakin model file from " << model_path << " ..."; + // load anakin model files. + auto status = graph->load(model_path); + if(!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + // reshape the input_0 's shape for graph model + //graph->Reshape("input_0", {1, 8, 640, 640}); + + // register all tensor inside graph + //graph->RegistAllOut(); + + // register edge + // graph->RegistOut("conv2_2/expand/scale", "relu2_2/expand"); + + //anakin graph optimization + graph->Optimize(); + + // constructs the executer net + { // inner scope +#ifdef USE_DIEPSE + Net net_executer(*graph, true); +#else + Net net_executer(*graph, true); +#endif + + // get in + auto d_tensor_in_p = net_executer.get_in("input_0"); + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + +#ifdef USE_DIEPSE + // for diepse model + auto d_tensor_in_1_p = net_executer.get_in("input_1"); + Tensor4d h_tensor_in_1; + + h_tensor_in_1.re_alloc(d_tensor_in_1_p->valid_shape()); + for (int i=0; ivalid_shape().size(); i++) { + LOG(INFO) << "detect input_1 dims[" << i << "]" << d_tensor_in_1_p->valid_shape()[i]; + } + h_data = h_tensor_in_1.mutable_data(); + h_data[0] = 1408; + h_data[1] = 800; + h_data[2] = 0.733333; + h_data[3] = 0.733333; + h_data[4] = 0; + h_data[5] = 0; + d_tensor_in_1_p->copy_from(h_tensor_in_1); + + auto d_tensor_in_2_p = net_executer.get_in("input_2"); + Tensor4d h_tensor_in_2; + + h_tensor_in_2.re_alloc(d_tensor_in_2_p->valid_shape()); + for (int i=0; ivalid_shape().size(); i++) { + LOG(INFO) << "detect input_2 dims[" << i << "]" << d_tensor_in_2_p->valid_shape()[i]; + } + h_data = h_tensor_in_2.mutable_data(); + h_data[0] = 2022.56; + h_data[1] = 989.389; + h_data[2] = 2014.05; + h_data[3] = 570.615; + h_data[4] = 1.489; + h_data[5] = -0.02; + d_tensor_in_2_p->copy_from(h_tensor_in_2); +#endif + + int epoch = 1; + // do inference + Context ctx(0, 0, 0); + saber::SaberTimer 
my_time; + LOG(WARNING) << "EXECUTER !!!!!!!! "; + // warm up + /*for(int i=0; i<10; i++) { + net_executer.prediction(); + }*/ + + my_time.start(ctx); + + + //auto start = std::chrono::system_clock::now(); + for(int i=0; i(end - start).count(); + //LOG(WARNING) << "avg time : " << time/epoch <<" ms"; + + my_time.end(ctx); + LOG(INFO)<<"aveage time "<(tensor_out_4_p); + + + // save the optimized model to disk. + /*std::string save_model_path = model_path + std::string(".saved"); + status = graph->save(save_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + }*/ +} +#endif +#endif + +#if 0 +TEST(NetTest, net_execute_reconstruction_test) { + graph = new Graph(); + LOG(WARNING) << "load anakin model file from optimized model " << model_saved_path << " ..."; + // load anakin model files. + auto status = graph->load(model_saved_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + // regisiter output tensor + //graph->RegistOut("data_perm", "data_scale"); + graph->RegistOut("data_perm", "conv1"); + + //anakin graph optimization + graph->Optimize(); + + // constructs the executer net + Net net_executer(*graph); + + // get in + auto d_tensor_in_p = net_executer.get_in("input_0"); + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + + // do inference + Context ctx(0, 0, 0); + saber::SaberTimer my_time; + my_time.start(ctx); + + LOG(WARNING) << "EXECUTER !!!!!!!! 
"; + for (int i=0; i<1; i++) { + net_executer.prediction(); + + } + my_time.end(ctx); + LOG(INFO)<<"aveage time "<(tensor_out_inner_p); +} +#endif + +int main(int argc, const char** argv){ + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/net_test.h b/test/framework/net/net_test.h new file mode 100644 index 000000000..c240afbf0 --- /dev/null +++ b/test/framework/net/net_test.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_NET_TEST_H +#define ANAKIN_NET_TEST_H + +#include +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "graph_base.h" +#include "graph.h" +#include "scheduler.h" +#include "net.h" +#include "worker.h" + +using namespace anakin; +using ::anakin::test::Test; + +using namespace anakin::graph; + +/** + * \brief Graph test is base Test class for anakin graph funciton. 
+ */ +class NetTest: public Test { +public: + NetTest(){} + + void SetUp(){} + + void TearDown(){} + +protected: +}; + +#ifdef USE_CUDA +void test_print(Tensor4dPtr& out_tensor_p) { + Tensor4d::type, AK_FLOAT> h_tensor_result; + h_tensor_result.re_alloc(out_tensor_p->valid_shape()); + LOG(ERROR) << "result count : " << h_tensor_result.valid_shape().count(); + h_tensor_result.copy_from(*out_tensor_p); + for (int i = 0; i < h_tensor_result.valid_size(); i++) { + LOG(INFO) << " GET OUT (" << i << ") " << h_tensor_result.mutable_data()[i]; + } +} +#endif + +template +double tensor_average(Tensor4dPtr& out_tensor_p) { + double sum = 0.0f; +#ifdef USE_CUDA + float* h_data = new float[out_tensor_p->valid_size()]; + const float* d_data = out_tensor_p->data(); + CUDA_CHECK(cudaMemcpy(h_data, d_data, out_tensor_p->valid_size()*sizeof(float), cudaMemcpyDeviceToHost)); +#else + float* h_data = out_tensor_p->data(); +#endif + for (int i=0; ivalid_size(); i++) { + sum+=h_data[i]; + } + return sum/out_tensor_p->valid_size(); +} + + +#ifdef USE_X86_PLACE +static int record_dev_tensorfile(const Tensor4d* dev_tensor, const char* locate) { + Tensor::type, AK_FLOAT, NCHW> host_temp; + host_temp.re_alloc(dev_tensor->valid_shape()); + host_temp.copy_from(*dev_tensor); + FILE* fp = fopen(locate, "w+"); + int size = host_temp.valid_shape().count(); + if (fp == 0) { + LOG(ERROR) << "[ FAILED ] file open target txt: " << locate; + } else { + for (int i = 0; i < size; ++i) { + fprintf(fp, "%.18f \n", i, (host_temp.data()[i])); + } + fclose(fp); + } + LOG(INFO) << "[ SUCCESS ] Write " << size << " data to: " << locate; + return 0; +} +#endif + +#endif + + diff --git a/test/framework/net/padde_api_test.cpp b/test/framework/net/padde_api_test.cpp new file mode 100644 index 000000000..6e0dfe878 --- /dev/null +++ b/test/framework/net/padde_api_test.cpp @@ -0,0 +1,121 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "saber/core/tensor_op.h" +#include 
+#include +#include +#include +#include +#include +#include "paddle_api.h" +#define DEFINE_GLOBAL(type, var, value) \ + type (GLB_##var) = (value) +DEFINE_GLOBAL(std::string, model_dir, ""); +DEFINE_GLOBAL(int, num, 1); +DEFINE_GLOBAL(int, channel, 8); +DEFINE_GLOBAL(int, height, 640); +DEFINE_GLOBAL(int, width, 640); +DEFINE_GLOBAL(bool, is_input_shape, false); + +#ifdef USE_CUDA +typedef NV Target; +#elif defined(USE_X86_PLACE) +typedef X86 Target; +#else +typedef ARM Target; +#endif + +void getModels(std::string path, std::vector& files) +{ + DIR *dir; + struct dirent *ptr; + if((dir=opendir(path.c_str()))==NULL){ + perror("Open dri error..."); + exit(1); + } + while((ptr=readdir(dir))!=NULL){ + if(strcmp(ptr->d_name,".")==0||strcmp(ptr->d_name,"..")==0) + continue; + else if(ptr->d_type==8)//file + files.push_back(path+"/"+ptr->d_name); + else if(ptr->d_type==4){ + //files.push_back(ptr->d_name);//dir + getModels(path+"/"+ptr->d_name,files); + } + } + closedir(dir); +} + + +TEST(NetTest, net_execute_base_test) { + std::vector models; + getModels(GLB_model_dir, models); + for (auto iter = models.begin(); iter < models.end(); iter++) + { + AnakinEngine anakin_engine; + LOG(WARNING) << "load anakin model file from " << *iter << " ..."; + std::vector shape{GLB_num, GLB_channel, GLB_height, GLB_width}; + //anakin_engine.Build(*iter, shape); + anakin_engine.Build(*iter); + + printf("Args = %d %d %d %d\n",GLB_num, GLB_channel, GLB_height, GLB_width); + //fill input + Tensor4d h_tensor_in; + h_tensor_in.re_alloc({GLB_num, GLB_channel, GLB_height, GLB_width}); + fill_tensor_host_rand(h_tensor_in, -1.0f,1.0f); + + anakin_engine.SetInputFromCPU("input_0", h_tensor_in.data(), h_tensor_in.valid_size()); + + int warmup_iter = 10; + int epoch = 1000; + // do inference + Context ctx(0, 0, 0); + saber::SaberTimer my_time; + LOG(WARNING) << "EXECUTER !!!!!!!! 
"; + for (int i = 0; i < warmup_iter; i++) { + anakin_engine.Execute(); + } + my_time.start(ctx); + //auto start = std::chrono::system_clock::now(); + for (int i = 0; i < epoch; i++) { + anakin_engine.Execute(); + } + my_time.end(ctx); + LOG(INFO) << *iter << " aveage time "<< my_time.get_average_ms() / epoch << " ms"; + } +} + +int main(int argc, const char** argv){ + // initial logger + LOG(INFO)<<"argc"< +#include "saber/funcs/timer.h" +#include +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include +#include +#include +#include +#include +#include + +class EngineBase { + public: + // Build the model and do some preparation, for example, in TensorRT, run + // createInferBuilder, buildCudaEngine. + virtual void Build(const std::string& model_file, int batch_size = 1) = 0; + virtual void Build(const std::string& model_file, const std::vector& shape) = 0; + // Execute the engine, that will run the inference network. + virtual void Execute() = 0; + + virtual ~EngineBase() {} +}; // class EngineBase + +template +class AnakinEngine : public EngineBase { +public: + typedef typename anakin::saber::DataTrait::dtype Dtype_t; + typedef anakin::saber::TargetWrapper X86_API; + typedef anakin::saber::TargetWrapper NV_API; + AnakinEngine(){} + + ~AnakinEngine(){}; + + void Build(const std::string& model_file, int batch_size = 1) override + { + _graph.load(model_file); + _graph.ResetBatchSize("input_0", batch_size); + _graph.Optimize(); + _net_executer.init(_graph); + }; + + void Build(const std::string& model_file, const std::vector& shape) override + { + _graph.load(model_file); + _graph.Reshape("input_0", shape); + _graph.Optimize(); + _net_executer.init(_graph); + }; + + void Execute() override + { + _net_executer.prediction(); + }; + + // Fill an input from CPU memory with name and size. 
+ void SetInputFromCPU(const std::string name, Dtype_t* data, size_t size) + { + auto input_tensor = _net_executer.get_in(name); + anakin::Tensor tmp_tensor(data, anakin::saber::X86(), X86_API::get_device_id(), input_tensor->valid_shape()); + *input_tensor = tmp_tensor; + }; + + // accessed directly. Fill an input from GPU memory with name and size. + void SetInputFromGPU(const std::string& name, Dtype_t* data, size_t size) + { + auto input_tensor = _net_executer.get_in(name); + CHECK_EQ(size, input_tensor->valid_size()); + anakin::Tensor tmp_tensor(data, NV(), NV_API::get_device_id(), input_tensor->valid_shape()); + *input_tensor = tmp_tensor; + }; + // Get an output called name, the output of tensorrt is in GPU, so this method + // will just return the output's GPU memory address. + anakin::Tensor* GetOutputInGPU(const std::string& name) + { + return _net_executer.get_out(name); + } + +private: + anakin::graph::Graph _graph; + anakin::Net _net_executer; +}; // class TensorRTEngine +template +class AnakinEngine; + + diff --git a/test/framework/operators/operator_tests.h b/test/framework/operators/operator_tests.h new file mode 100644 index 000000000..38f16b87d --- /dev/null +++ b/test/framework/operators/operator_tests.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_TESTS_H +#define ANAKIN_OPERATOR_TESTS_H + +#include +#include +#include +#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "framework/operators/ops.h" + +using namespace anakin; +using ::anakin::test::Test; + +class OperatorsTest : public Test { +public: + OperatorsTest(){} + + void SetUp(){} + + void TearDown(){} + +protected: +}; + + + + + + +#endif + + diff --git a/test/framework/operators/pooling_test.cpp b/test/framework/operators/pooling_test.cpp new file mode 100644 index 000000000..47b66be23 --- /dev/null +++ b/test/framework/operators/pooling_test.cpp @@ -0,0 +1,43 @@ +#include "operator_tests.h" +#include "thread_pool.h" + +#ifdef USE_CUDA +using Target = NV; +#elif defined(USE_X86_PLACE) +using Target = X86; +#else +using Target = ARM; +#endif + +TEST(OperatorsTest, PoolingFactoryTest) { + OpContext opctx; + std::vector > in; + std::vector > out; + + + /*Operator*/ auto* Op_name1 = + OpFactory::Global()["pooling"]; + /*Operator**/auto* Op_name2 = + OpFactory::Global()["pool"]; + auto& op_list = OpFactory::Global().get_list_op_name(); + + for (auto& item : op_list) { + LOG(INFO) << " op: " << item; + } + + LOG(WARNING) << " op name alias 1 : pooling"; + LOG(INFO) << " run forward function"; + (*Op_name1)(opctx, in, out); + LOG(WARNING) << " op name alias 2 : pool"; + LOG(INFO) << " run forward function"; + (*Op_name2)(opctx, in, out); +} + + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} From 19413c53d66ac9cb4e8fa2dce74408c46b21f8f9 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 28 Jun 2018 11:01:56 +0800 Subject: [PATCH 094/318] Implement BM scale --- saber/funcs/impl/bm/vender_scale.h | 114 +++++++++++++++++++++++++++++ saber/funcs/scale.h | 19 +++++ 2 files changed, 133 insertions(+) create mode 100644 saber/funcs/impl/bm/vender_scale.h diff --git a/saber/funcs/impl/bm/vender_scale.h 
b/saber/funcs/impl/bm/vender_scale.h new file mode 100644 index 000000000..e019f1b21 --- /dev/null +++ b/saber/funcs/impl/bm/vender_scale.h @@ -0,0 +1,114 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H +#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H + +#include "saber/funcs/impl/impl_scale.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberScale : \ + public ImplBase< + Tensor, + Tensor, + Tensor, + ScaleParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + SaberScale() + {} + + ~SaberScale() {} + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + ScaleParam& param, Context& ctx) { + + _handle = get_bm_handle(); + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + ScaleParam& param, Context &ctx) { + + } + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + ScaleParam& param) { + + const InDataType *in_data = (const InDataType *) inputs[0]->data(); + OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + + int input_n = inputs[0]->num(); + int input_c = inputs[0]->channel(); + int input_h = inputs[0]->height(); + int input_w = inputs[0]->width(); + + int axis = (param.num_axes == 0) ? 0 : param.axis; + int num_axes = param.num_axes >=0 ? 
param.num_axes : inputs[0]->shape().dims() - axis; + + int outer_dim = inputs[0]->count(0, axis); + int inner_dim = inputs[0]->count(axis + num_axes, inputs[0]->shape().dims()); + int scale_dim = inputs[0]->count(axis, axis + num_axes); + if (inputs.size() == 1) { + CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; + } + + OpDataType* scale_data = param.scale_w[0]; + bmdnn_scale_forward( + _handle, + //input + *in_data, + *scale_data, + input_n, + input_c, + input_h, + input_w, + scale_dim, + inner_dim, + 0, + //output + new bm_device_mem_t(), + *out_data + ); + + if (param.bias_term) { + OpDataType* bias_data = param.scale_b[0]; + bmdnn_bias_forward( + _handle, + //input + *out_data, + *bias_data, + outer_dim, + inner_dim, + //output + *out_data + ); + } + + return SaberSuccess; + } +private: + bm_handle_t _handle; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H diff --git a/saber/funcs/scale.h b/saber/funcs/scale.h index 1cf9d6212..95e0e6263 100644 --- a/saber/funcs/scale.h +++ b/saber/funcs/scale.h @@ -28,11 +28,29 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_scale.h" #endif +#ifdef USE_ARM_PLACE +//todo +#include "saber/funcs/impl/impl_scale.h" +#endif + +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_scale.h" +#endif namespace anakin { namespace saber { +#ifdef USE_BM template +#else +template +#endif class Scale : public BaseFunc< Tensor, Tensor, From 25fa4815da5315d386dc36c07a710a61ec564f5f Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Thu, 28 Jun 2018 11:05:25 +0800 Subject: [PATCH 095/318] pooling test --- saber/core/tensor_op.cpp | 2 +- saber/funcs/impl/bm/vender_pooling.h | 4 +-- test/saber/bm/test_saber_func_pooling_BM.cpp | 28 +++++++------------- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 3d6494b9d..d7ee91231 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -438,7 +438,7 @@ void 
print_tensor_device>(Tensor& tenso bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr); for (int i = 0; i < tensor.size(); ++i) { - printf("%.2f ", host_mem[i]); + printf("%.2f\t", host_mem[i]); if ((i + 1) % tensor.width() == 0){ printf("\n"); diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h index 6e5de79a4..1bdcfdecb 100644 --- a/saber/funcs/impl/bm/vender_pooling.h +++ b/saber/funcs/impl/bm/vender_pooling.h @@ -28,7 +28,7 @@ class VenderPooling param(window_h, window_w, pad_h, pad_w , stride_h, stride_w, Pooling_max); @@ -80,7 +70,7 @@ TEST(TestSaberFuncBM, test_func_pooling) { pooling(input, output, param, ctx1); SaberTimer t1; - int ts = 100; + int ts = 10; for (int i = 0; i < ts; ++i) { t1.start(ctx1); @@ -100,7 +90,6 @@ TEST(TestSaberFuncBM, test_func_pooling) { TEST(TestSaberFuncBM, test_pooling_result) { - Env::env_init(); typedef TargetWrapper API; typedef TargetWrapper X86_API; @@ -109,7 +98,7 @@ TEST(TestSaberFuncBM, test_pooling_result) { typedef Tensor TensorDf4; int img_num = 1; - int in_channels = 2; + int in_channels = 1; int img_h = 8; int img_w = 8; @@ -122,7 +111,7 @@ TEST(TestSaberFuncBM, test_pooling_result) { img_dev.re_alloc(img_s); for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 0x7f & i; + img_host.mutable_data()[i] = rand() % 20; } img_dev.copy_from(img_host); @@ -150,8 +139,8 @@ TEST(TestSaberFuncBM, test_pooling_result) { LOG(INFO) << " stride_h: " << stride_h; LOG(INFO) << " stride_w: " << stride_w; - PoolingParam param(window_h, window_w, pad_h, pad_w - , stride_h, stride_w, Pooling_max); + PoolingParam param(window_h, window_w, pad_h, pad_w, + stride_h, stride_w, Pooling_average_include_padding); std::vector input; std::vector output; @@ -169,12 +158,14 @@ TEST(TestSaberFuncBM, test_pooling_result) { pooling(input, output, param, ctx1); output_dev.sync(); + LOG(INFO) << "tensor data before pooling: "; + print_tensor_device(img_dev); + 
LOG(INFO) << "tensor data after pooling: "; print_tensor_device(output_dev); } TEST(TestSaberFuncBM, test_pooling_shared_buffer) { - Env::env_init(); typedef TargetWrapper API; typedef TargetWrapper X86_API; @@ -275,12 +266,13 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { out0.sync(); out1.sync(); - print_tensor_device(output_dev); + /* print_tensor_device(output_dev); */ } int main(int argc, const char** argv) { // initial logger //logger::init(argv[0]); + Env::env_init(); InitTest(); RUN_ALL_TESTS(argv[0]); return 0; From 56271d43c89aa9494fbf45212b7cf7de8161d912 Mon Sep 17 00:00:00 2001 From: "weihao.huang" Date: Thu, 28 Jun 2018 03:22:08 +0000 Subject: [PATCH 096/318] Fix d2d mem copy --- saber/core/impl/bm/bm_impl.cpp | 2 +- saber/core/tensor.h | 54 +++++++++++++++++++- test/saber/bm/test_saber_func_softmax_BM.cpp | 5 +- 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 4d24dedf0..e73e355b7 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -81,7 +81,7 @@ void BM_API::mem_set(void* ptr, int value, size_t n){ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __DtoD) { handle = get_bm_handle(); - //BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count)); + //BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count)); BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count)); LOG(INFO) << "BM sync_memcpy: device to device, finished"; }; diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 7c1d00052..860749981 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -609,7 +609,7 @@ class Tensor : public TensorBase { } CHECK_EQ(valid_size(), tensor.valid_size()) \ << "sizes of two valid shapes must be the same"; - + /// get the proper 
process target wrapper typedef TargetWrapper API_t; typedef typename TargetTypeTraits::target_type target_type_t; @@ -756,7 +756,8 @@ class Tensor : public TensorBase { SaberStatus copy_from(const Tensor& tensor) { LOG(WARNING) << "Invalid: copy_from is not allowed for current type."; return SaberInvalidValue; - } + } + #endif /** @@ -970,8 +971,11 @@ class Tensor : public TensorBase { }; #ifdef USE_BM + #ifndef BM_TENSOR_COPY #define BM_TENSOR_COPY + + template<> inline size_t Tensor::_type_len(){ return 4; @@ -998,9 +1002,55 @@ SaberStatus Tensor::copy_from(const Tensor BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr)); return SaberSuccess; } + +/* + template<> inline + size_t Tensor::_type_len(){ + return 4; + } + + template<> + template<> inline + SaberStatus Tensor::copy_from(const Tensor& tensor) { + LOG(INFO) << "BM copy_from X86"; + CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; + + auto* device_data_ptr = mutable_data(); + BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast(tensor.data())))); + //BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *(bm_device_mem_t *)(mutable_data()), bm_mem_from_system(tensor.data()))); + return SaberSuccess; + } + + template<> + template<> inline + SaberStatus Tensor::copy_from(const Tensor& tensor) { + LOG(INFO) << "X86 copy_from BM"; + CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; + + auto* device_data_ptr = const_cast(tensor.data()); + BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr)); + //BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data()))); + return SaberSuccess; + } + + template<> + template<> inline + SaberStatus Tensor::copy_from(const Tensor& tensor) { + LOG(INFO) << "BM copy_from BM"; + CHECK_EQ(valid_size(), tensor.valid_size()) << 
"sizes of two valid shapes must be the same"; + + auto* device_data_ptr = const_cast(tensor.data()); + //BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr)); + //BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data()))); + return SaberSuccess; + } +*/ + #endif + #endif + } //namespace saber } //namespace anakin diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp index 6c38c7534..d7707fad7 100644 --- a/test/saber/bm/test_saber_func_softmax_BM.cpp +++ b/test/saber/bm/test_saber_func_softmax_BM.cpp @@ -17,7 +17,7 @@ TEST(TestSaberFuncBM, test_func_softmax_BM) { typedef TensorDf4::Dtype dtype; - int test_iter = 1000; + int test_iter = 10; int softmax_axis = 3; // channel int w_in = 3; @@ -182,12 +182,13 @@ TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) { TensorDf4 troi(output_dev_4d[0]->valid_shape()); troi.copy_from(*output_dev_4d[0]); - print_tensor_device(troi); + //print_tensor_device(troi); } int main(int argc, const char** argv) { // initial logger //logger::init(argv[0]); + Env::env_init(); InitTest(); RUN_ALL_TESTS(argv[0]); return 0; From c5a30a79192a4d0cc7b2c4aa5238b4d3d3d6df97 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 28 Jun 2018 11:39:02 +0800 Subject: [PATCH 097/318] Add batch norm operation --- saber/funcs/batch_norm.h | 115 ++++++++++++++++++++++++ saber/funcs/impl/bm/vender_batch_norm.h | 63 +++++++++++++ saber/funcs/impl/bm/vender_scale.h | 6 +- saber/funcs/impl/impl_batch_norm.h | 14 +++ saber/funcs/scale.h | 6 +- 5 files changed, 200 insertions(+), 4 deletions(-) create mode 100644 saber/funcs/batch_norm.h create mode 100644 saber/funcs/impl/bm/vender_batch_norm.h create mode 100644 saber/funcs/impl/impl_batch_norm.h diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h new file mode 100644 index 000000000..604687303 --- /dev/null +++ b/saber/funcs/batch_norm.h @@ -0,0 
+1,115 @@ +#ifndef ANAKIN_SABER_FUNCS_BATCH_NORM_H +#define ANAKIN_SABER_FUNCS_BATCH_NORM_H + +#include "saber/core/tensor.h" +#include "saber/funcs/base.h" +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_batch_norm.h" + +#ifdef NVIDIA_GPU +//todo +#include "saber/funcs/impl/impl_batch_norm.h" +#endif + +#ifdef USE_X86_PLACE +//todo +#include "saber/funcs/impl/impl_batch_norm.h" +#endif + +#ifdef USE_ARM_PLACE +//todo +#include "saber/funcs/impl/impl_batch_norm.h" +#endif + +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_batch_norm.h" +#endif + +namespace anakin { +namespace saber { + +#ifdef USE_BM +template +#else +template +#endif +class BatchNorm : public BaseFunc< + Tensor, + Tensor, + Tensor, + ImplBase, + BatchNormParam +> { +public: + using BaseFunc< + Tensor, + Tensor, + Tensor, + ImplBase, + BatchNormParam>::BaseFunc; + + BatchNorm() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef BatchNormParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + Shape output_shape = (input[0]->valid_shape()); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderBatchNorm ); + return SaberSuccess; + + case SABER_IMPL: + return SaberUnImplError; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) // some condition? 
+ this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h new file mode 100644 index 000000000..cf767cd22 --- /dev/null +++ b/saber/funcs/impl/bm/vender_batch_norm.h @@ -0,0 +1,63 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H +#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H + +#include "saber/funcs/impl/impl_batch_norm.h" + +namespace anakin{ + +namespace saber { + +template +class VenderBatchNorm:\ + public ImplBase< + Tensor, + Tensor, + Tensor, + BatchNormParam>> { +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + VenderBatchNorm() : _handle(NULL) {} + + ~VenderBatchNorm() {} + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + BatchNormParam &batch_norm_param, Context &ctx) { + + _handle = get_bm_handle(); + return create(inputs, outputs, batch_norm_param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + BatchNormParam &batch_norm_param, Context &ctx) { + } + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + BatchNormParam ¶m) { + + return SaberSuccess; + } + +private: + bm_handle_t _handle; +}; + +} //namespace saber + +} // namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index e019f1b21..9ed364173 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -13,7 +13,7 @@ template -class SaberScale : \ public ImplBase< Tensor, @@ -29,10 +29,10 @@ class 
SaberScale& inputs, std::vector& outputs, diff --git a/saber/funcs/impl/impl_batch_norm.h b/saber/funcs/impl/impl_batch_norm.h new file mode 100644 index 000000000..5a09220c7 --- /dev/null +++ b/saber/funcs/impl/impl_batch_norm.h @@ -0,0 +1,14 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H +#define ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(BatchNorm, BatchnormParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H diff --git a/saber/funcs/scale.h b/saber/funcs/scale.h index 95e0e6263..9a1762df6 100644 --- a/saber/funcs/scale.h +++ b/saber/funcs/scale.h @@ -28,6 +28,7 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_scale.h" #endif + #ifdef USE_ARM_PLACE //todo #include "saber/funcs/impl/impl_scale.h" @@ -94,7 +95,10 @@ class Scale : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - return SaberUnImplError; + this->_impl.push_back(new VenderScale ); + return SaberSuccess; case SABER_IMPL: this->_impl.push_back(new SaberScale Date: Thu, 28 Jun 2018 14:22:34 +0800 Subject: [PATCH 098/318] Implement batch norm for BM --- saber/funcs/impl/bm/vender_batch_norm.h | 31 ++++++++++++++++ saber/saber_funcs_param.h | 48 +++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h index cf767cd22..917dc7219 100644 --- a/saber/funcs/impl/bm/vender_batch_norm.h +++ b/saber/funcs/impl/bm/vender_batch_norm.h @@ -49,6 +49,37 @@ class VenderBatchNorm& outputs, BatchNormParam ¶m) { + const InDataType *in_data = (const InDataType *) inputs[0]->data(); + OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + + int input_n = inputs[0]->num(); + int input_c = inputs[0]->channel(); + int input_h = inputs[0]->height(); + int input_w = inputs[0]->width(); + + OpDataType eps = param.eps; + OpDataType 
scale = param.scale; + + bm_device_mem_t mean_ma = bm_mem_from_system(¶m.mean); + bm_device_mem_t variance_ma = bm_mem_from_system(¶m.variance); + + bmdnn_batchnorm_forward_inference( + _handle, + //input + *in_data, + mean_ma, + variance_ma, + scale, + new bm_device_mem_t(), + eps, + input_n, + input_c, + input_h, + input_w, + //output + *out_data + ); + return SaberSuccess; } diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index 284fbcbc5..1a32c9c87 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -776,6 +776,53 @@ struct SoftmaxParam { } int axis; }; + +#ifdef USE_BM +template +struct BatchnormParam { + typedef typename opTensor::Dtype DataDtype; + BatchnormParam() + : scale(float(0)) + , use_global_stats(true) + , moving_average_fraction(float(0.999)) + , eps(float(1e-5)) + , mean(), variance() + {} + //scale_factor = 1 / scale; + BatchnormParam(std::vector mean_in, std::vector variance_in, + float scale_in, float moving_average_fraction_in = float(0.999), + float eps_in = float(1e-5), bool use_global_stats_in = true) + : mean(mean_in), variance(variance_in), scale(scale_in) + , moving_average_fraction(moving_average_fraction_in) + , eps(eps_in), use_global_stats(use_global_stats_in) + {} + BatchnormParam &operator=(const BatchnormParam &right) { + scale = right.scale; + moving_average_fraction = right.moving_average_fraction; + eps = right.eps; + use_global_stats = right.use_global_stats; + mean = right.mean; + variance = right.variance; + return *this; + } + bool operator==(const BatchnormParam &right) { + bool comp_eq = true; + comp_eq = comp_eq && (scale == right.scale); + comp_eq = comp_eq && (moving_average_fraction == right.moving_average_fraction); + comp_eq = comp_eq && (eps == right.eps); + comp_eq = comp_eq && (use_global_stats == right.use_global_stats); + comp_eq = comp_eq && (mean == right.mean); + comp_eq = comp_eq && (variance == right.variance); + return comp_eq; + } + float scale; + float 
moving_average_fraction; + float eps; + bool use_global_stats; + std::vector mean; + std::vector variance; +}; +#else template struct BatchnormParam { typedef typename opTensor::Dtype DataDtype; @@ -820,6 +867,7 @@ struct BatchnormParam { std::vector mean; std::vector variance; }; +#endif template struct ActivationParam { From 5c6ec7f965638ffe41a7c1c0db14ab8ec45921aa Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 28 Jun 2018 14:57:18 +0800 Subject: [PATCH 099/318] Use template specifications instead of macro --- saber/funcs/activation.h | 11 ----------- saber/funcs/batch_norm.h | 11 ----------- saber/funcs/conv.h | 11 ----------- saber/funcs/pooling.h | 11 ----------- saber/funcs/scale.h | 11 ----------- saber/funcs/softmax.h | 11 ----------- test/saber/bm/test_saber_func_conv_BM.cpp | 4 ++-- test/saber/bm/test_saber_func_pooling_BM.cpp | 8 ++++---- test/saber/bm/test_saber_func_softmax_BM.cpp | 4 ++-- 9 files changed, 8 insertions(+), 74 deletions(-) diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h index e1167bc9a..7af7a6f80 100644 --- a/saber/funcs/activation.h +++ b/saber/funcs/activation.h @@ -36,16 +36,6 @@ namespace anakin { namespace saber { -#ifdef USE_BM -template -#else template -#endif class Activation : public BaseFunc< Tensor, Tensor, diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h index 604687303..f8cf3e693 100644 --- a/saber/funcs/batch_norm.h +++ b/saber/funcs/batch_norm.h @@ -29,16 +29,6 @@ namespace anakin { namespace saber { -#ifdef USE_BM -template -#else template -#endif class BatchNorm : public BaseFunc< Tensor, Tensor, diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h index 596964dbe..e527f3d6f 100644 --- a/saber/funcs/conv.h +++ b/saber/funcs/conv.h @@ -34,16 +34,6 @@ namespace anakin { namespace saber { -#ifdef USE_BM -template -#else template -#endif class Conv : public BaseFunc< Tensor, Tensor, diff --git a/saber/funcs/pooling.h b/saber/funcs/pooling.h index aff883505..739d05851 100644 --- 
a/saber/funcs/pooling.h +++ b/saber/funcs/pooling.h @@ -34,16 +34,6 @@ namespace anakin { namespace saber { -#ifdef USE_BM -template -#else template -#endif class Pooling : public BaseFunc< Tensor, Tensor, diff --git a/saber/funcs/scale.h b/saber/funcs/scale.h index 9a1762df6..24e45138f 100644 --- a/saber/funcs/scale.h +++ b/saber/funcs/scale.h @@ -41,16 +41,6 @@ namespace anakin { namespace saber { -#ifdef USE_BM -template -#else template -#endif class Scale : public BaseFunc< Tensor, Tensor, diff --git a/saber/funcs/softmax.h b/saber/funcs/softmax.h index 4a1e631f0..1ad324908 100644 --- a/saber/funcs/softmax.h +++ b/saber/funcs/softmax.h @@ -35,16 +35,6 @@ namespace anakin{ namespace saber{ -#ifdef USE_BM -template -#else template -#endif class Softmax : public BaseFunc< Tensor, Tensor, diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp index 554bcf843..35ffc6006 100644 --- a/test/saber/bm/test_saber_func_conv_BM.cpp +++ b/test/saber/bm/test_saber_func_conv_BM.cpp @@ -492,7 +492,7 @@ TEST(TestSaberFuncBM, test_conv_fp32_speed_test) { input.push_back(&img_dev); output.push_back(&output_dev); - Conv conv; + Conv conv; conv.compute_output_shape(input, output, param); output_dev.re_alloc(output[0]->shape()); @@ -546,7 +546,7 @@ void test_conv_fp32_speed(std::vector &inputs, std::vector conv; + Conv conv; conv.compute_output_shape(inputs, outputs, conv_param); outputs[0]->re_alloc(outputs[0]->shape()); Context ctx1(0, 1, 1); diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index 7edfc677b..943ed130b 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -148,7 +148,7 @@ TEST(TestSaberFuncBM, test_pooling_result) { input.push_back(&img_dev); output.push_back(&output_dev); - Pooling pooling; + Pooling pooling; pooling.compute_output_shape(input, output, param); output_dev.re_alloc(output[0]->shape()); @@ -234,9 
+234,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { input.push_back(&img_dev); output.push_back(&output_dev); - Pooling pooling; - Pooling pooling0; - Pooling pooling1; + Pooling pooling; + Pooling pooling0; + Pooling pooling1; pooling.compute_output_shape(input,output, param); diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp index d7707fad7..645d081f1 100644 --- a/test/saber/bm/test_saber_func_softmax_BM.cpp +++ b/test/saber/bm/test_saber_func_softmax_BM.cpp @@ -52,7 +52,7 @@ TEST(TestSaberFuncBM, test_func_softmax_BM) { // start Reshape & doInfer Context ctx_dev(0, 1, 1); - Softmax softmax_dev; + Softmax softmax_dev; typedef std::vector Shape_v; @@ -150,7 +150,7 @@ TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) { // start Reshape & doInfer Context ctx_dev(0, 1, 1); - Softmax softmax_dev; + Softmax softmax_dev; LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ shape_out[2] << ", " << shape_out[3]; From 597fc4c1d87974989df86065955f0b1aaa8a4035 Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Thu, 28 Jun 2018 15:54:53 +0800 Subject: [PATCH 100/318] conv test --- test/saber/bm/test_saber_func_conv_BM.cpp | 283 +++++++--------------- 1 file changed, 92 insertions(+), 191 deletions(-) diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp index 35ffc6006..75663cb8a 100644 --- a/test/saber/bm/test_saber_func_conv_BM.cpp +++ b/test/saber/bm/test_saber_func_conv_BM.cpp @@ -4,7 +4,6 @@ #include "tensor_op.h" #include "saber_types.h" #include -//#include "cublas.h" using namespace anakin::saber; @@ -39,10 +38,10 @@ inline int i_div_up(int a, int b) return (a % b != 0) ? 
(a / b + 1) : (a / b); } -#if 1 -TEST(TestSaberFuncBM, test_depthwise_conv) { - int group = 2; +TEST(TestSaberFuncBM, test_conv_result) { + + int group = 1; int pad_h = 1; int pad_w = 1; int stride_h = 1; @@ -52,30 +51,30 @@ TEST(TestSaberFuncBM, test_depthwise_conv) { int kernel_h = 3; int kernel_w = 3; - int out_channels = 2; + int out_channels = 1; int img_num = 1; - int in_channels = 2; + int in_channels = 1; int img_h = 8; int img_w = 8; bool bias_term = true; LOG(INFO) << "conv param: "; - LOG(INFO) << " img_num = " << img_num; - LOG(INFO) << " in_channels = " << in_channels; - LOG(INFO) << " img_h = " << img_h; - LOG(INFO) << " img_w = " << img_w; - LOG(INFO) << " group = " << group; - LOG(INFO) << " pad_h = " << pad_h; - LOG(INFO) << " pad_w = " << pad_w; - LOG(INFO) << " stride_h = " << stride_h; - LOG(INFO) << " stride_w = " << stride_w; - LOG(INFO) << " dilation_h = " << dilation_h; - LOG(INFO) << " dilation_w = " << dilation_w; - LOG(INFO) << " kernel_h = " << kernel_h; - LOG(INFO) << " kernel_w = " << kernel_w; - LOG(INFO) << " out_channels = " << out_channels; + LOG(INFO) << " img_num = " << img_num; + LOG(INFO) << " in_channels = " << in_channels; + LOG(INFO) << " img_h = " << img_h; + LOG(INFO) << " img_w = " << img_w; + LOG(INFO) << " group = " << group; + LOG(INFO) << " pad_h = " << pad_h; + LOG(INFO) << " pad_w = " << pad_w; + LOG(INFO) << " stride_h = " << stride_h; + LOG(INFO) << " stride_w = " << stride_w; + LOG(INFO) << " dilation_h = " << dilation_h; + LOG(INFO) << " dilation_w = " << dilation_w; + LOG(INFO) << " kernel_h = " << kernel_h; + LOG(INFO) << " kernel_w = " << kernel_w; + LOG(INFO) << " out_channels = " << out_channels; Shape img_s(img_num, in_channels, img_h, img_w); Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); @@ -88,7 +87,7 @@ TEST(TestSaberFuncBM, test_depthwise_conv) { img_dev.re_alloc(img_s); for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 63 & i; + img_host.mutable_data()[i] = 
i; } img_dev.copy_from(img_host); @@ -142,10 +141,9 @@ TEST(TestSaberFuncBM, test_depthwise_conv) { conv(input, output, param, ctx1); - //cudaStream_t cuda_stream = ctx1.get_compute_stream(); - //output[0]->record_event(cuda_stream); + output_dev.sync(); - //output_dev.sync(); + print_tensor_device(img_dev); print_tensor_device(output_dev); } @@ -165,27 +163,11 @@ TEST(TestSaberFuncBM, test_conv_param_change) { int img_num = 1; int in_channels = 4; - int img_h = 65; - int img_w = 63; + int img_h = 64; + int img_w = 64; bool bias_term = true; - LOG(INFO) << "conv param: "; - LOG(INFO) << " img_num = " << img_num; - LOG(INFO) << " in_channels = " << in_channels; - LOG(INFO) << " img_h = " << img_h; - LOG(INFO) << " img_w = " << img_w; - LOG(INFO) << " group = " << group; - LOG(INFO) << " pad_h = " << pad_h; - LOG(INFO) << " pad_w = " << pad_w; - LOG(INFO) << " stride_h = " << stride_h; - LOG(INFO) << " stride_w = " << stride_w; - LOG(INFO) << " dilation_h = " << dilation_h; - LOG(INFO) << " dilation_w = " << dilation_w; - LOG(INFO) << " kernel_h = " << kernel_h; - LOG(INFO) << " kernel_w = " << kernel_w; - LOG(INFO) << " out_channels = " << out_channels; - Shape img_s(img_num, in_channels, img_h, img_w); Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); Shape bias_s(1, out_channels, 1, 1); @@ -197,7 +179,7 @@ TEST(TestSaberFuncBM, test_conv_param_change) { img_dev.re_alloc(img_s); for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 0x7f & i; + img_host.mutable_data()[i] = i; } img_dev.copy_from(img_host); @@ -245,25 +227,20 @@ TEST(TestSaberFuncBM, test_conv_param_change) { output_dev.re_alloc(output[0]->shape()); output_host.re_alloc(output[0]->shape()); - LOG(INFO)<<"regular start with group = "<shape()[0] << " " << output[0]->shape()[1] << " " \ << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]"; - //LOG(INFO) << " blocks = [ " << i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << 
i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; - //选择k最小的那一组,如果一样,则选128*N,N最大的那一组 - int k0 = i_div_up(out_channels, 128) * 128 - out_channels; - int k1 = i_div_up(out_channels, 64) * 64 - out_channels; - int k2 = i_div_up(out_channels, 32) * 32 - out_channels; - int kk = std::min(std::min(k0,k1),k2); - LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk; - if (kk == k0) - LOG(INFO) << "thread = [256,1,1] 128*128" ; - if (kk == k1) - LOG(INFO) << "thread = [128,1,1] 128*64" ; - if (kk == k2) - LOG(INFO) << "thread = [128,1,1] 128*32" ; LOG(INFO) << "saber conv init"; - conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1); - - LOG(INFO) << "saber conv dispatch"; - conv(input, output, param, ctx1); - - //cudaStream_t cuda_stream = ctx1.get_compute_stream(); - //output[0]->record_event(cuda_stream); + conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); - //output_dev.sync(); + /* conv(input, output, param, ctx1); */ + /* output_dev.sync(); */ + LOG(INFO) << "saber conv dispatch"; SaberTimer t1; - int ts = 1; - + int ts = 100; + t1.start(ctx1); for (int i = 0; i < ts; ++i) { - t1.start(ctx1); conv(input, output, param, ctx1); output_dev.sync(); - t1.end(ctx1); } - - LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms"; - + t1.end(ctx1); + LOG(INFO) << "elapse time: " << t1.get_average_ms()/ts << " ms"; } -void test_conv_fp32_speed(std::vector &inputs, std::vector &outputs, - TensorDf4 &weights, int kernel_size, int stride, int pad, - int in_channel, int out_channel, TensorDf4 &bias, - anakin::saber::ImplEnum impl) { - - ConvParam conv_param(1, pad, pad, - stride, stride, - 1, 1, - &weights, &bias); - Conv conv; - conv.compute_output_shape(inputs, outputs, conv_param); - outputs[0]->re_alloc(outputs[0]->shape()); - Context ctx1(0, 1, 1); - - SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1)); - - conv(inputs, outputs, conv_param, ctx1); - outputs[0]->record_event(ctx1.get_compute_stream()); 
- outputs[0]->sync(); - - //cudaDeviceSynchronize(); - - SaberTimer t1; - int ts = 100; - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - conv(inputs, outputs, conv_param, ctx1); - outputs[0]->record_event(ctx1.get_compute_stream()); - outputs[0]->sync(); - t1.end(ctx1); - } - LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; - - //cudaDeviceSynchronize(); -} - - TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) { int img_num = 1; int kernel = 1; - -// int out_channels = 32; -// int in_channels = 128; -// int img_h = 52; -// int img_w = 112; -// int out_channels = 64; -// int in_channels = 256; -// int img_h = 26; -// int img_w = 56; int out_channels = 128; int in_channels = 512; - int img_h = 13; - int img_w = 28; - -// int out_channels = 512; -// int in_channels = 128; -// int img_h = 13; -// int img_w = 28; - + int img_h = 32; + int img_w = 32; int pad = 0; int stride = 1; - Context ctx1(0, 1, 1); TensorDf4 weights; + TensorDf4 bias; weights.re_alloc({out_channels, in_channels, 1, 1}); + bias.re_alloc({1, out_channels, 1, 1}); - TensorDf4 img; + TensorDf4 img, out; img.re_alloc({1, in_channels, img_h, img_w}); - TensorDf4 out; - out.re_alloc({1, out_channels, img_h, img_w}); - TensorDf4 out_gemm; - out_gemm.re_alloc({1, out_channels, img_h, img_w}); - fill_tensor_device_rand(weights, -1.f, 1.f); + fill_tensor_device_rand(bias, -1.f, 1.f); fill_tensor_device_rand(img, -1.f, 1.f); - LOG(INFO) << "img_num: " << img_num; - LOG(INFO) << "kernel: " << kernel; - LOG(INFO) << "out_channels: " << out_channels; - LOG(INFO) << "in_channels: " << in_channels; - LOG(INFO) << "img_h: " << img_h; - LOG(INFO) << "img_w: " << img_w; - LOG(INFO) << "pad: " << pad; - LOG(INFO) << "stride: " << stride; + LOG(INFO) << "conv param: "; + LOG(INFO) << " img_num: " << img_num; + LOG(INFO) << " kernel: " << kernel; + LOG(INFO) << " out_channels: " << out_channels; + LOG(INFO) << " in_channels: " << in_channels; + LOG(INFO) << " img_h: " << img_h; + LOG(INFO) << " img_w: " << 
img_w; + LOG(INFO) << " pad: " << pad; + LOG(INFO) << " stride: " << stride; - TensorDf4 bias; + std::vector input; + std::vector output; + + input.push_back(&img); + output.push_back(&out); + + ConvParam conv_param(1, pad, pad, + stride, stride, + 1, 1, + &weights, &bias); + Conv conv; + conv.compute_output_shape(input, output, conv_param); + out.re_alloc(output[0]->shape()); + Context ctx1(0, 1, 1); + conv.init(input, output, conv_param, SPECIFY, VENDER_IMPL, ctx1); - std::vector input_v; - std::vector output_gemm_v, output_v; - - input_v.push_back(&img); - output_v.push_back(&out); - output_gemm_v.push_back(&out_gemm); - //cudaDeviceSynchronize(); - test_conv_fp32_speed(input_v, output_v, - weights, kernel, stride, pad, - in_channels, out_channels, bias, - SABER_IMPL); + SaberTimer t1; + int ts = 100; + t1.start(ctx1); + for (int i = 0; i < ts; ++i) { + conv(input, output, conv_param, ctx1); + out.sync(); + } + t1.end(ctx1); + LOG(INFO) << "elapse time: " << t1.get_average_ms()/ts << " ms"; } int main(int argc, const char** argv){ - anakin::saber::Env::env_init(); - + Env::env_init(); // initial logger //logger::init(argv[0]); InitTest(); From a941292347101ec6fa0a266b672aee3a8b48ecc0 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 28 Jun 2018 16:41:18 +0800 Subject: [PATCH 101/318] Add test for batch norm --- saber/funcs/batch_norm.h | 6 +- saber/funcs/impl/bm/vender_batch_norm.h | 16 ++-- saber/funcs/impl/bm/vender_scale.h | 3 +- .../bm/test_saber_func_batch_norm_BM.cpp | 81 +++++++++++++++++++ 4 files changed, 95 insertions(+), 11 deletions(-) create mode 100644 test/saber/bm/test_saber_func_batch_norm_BM.cpp diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h index f8cf3e693..2e817c734 100644 --- a/saber/funcs/batch_norm.h +++ b/saber/funcs/batch_norm.h @@ -42,7 +42,7 @@ class BatchNorm : public BaseFunc< Tensor, Tensor, ImplBase, - BatchNormParam + BatchnormParam > { public: using BaseFunc< @@ -50,14 +50,14 @@ class BatchNorm : public 
BaseFunc< Tensor, Tensor, ImplBase, - BatchNormParam>::BaseFunc; + BatchnormParam>::BaseFunc; BatchNorm() = default; typedef Tensor InDataTensor; typedef Tensor OutDataTensor; typedef Tensor OpTensor; - typedef BatchNormParam Param_t; + typedef BatchnormParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h index 917dc7219..e362a256f 100644 --- a/saber/funcs/impl/bm/vender_batch_norm.h +++ b/saber/funcs/impl/bm/vender_batch_norm.h @@ -18,7 +18,7 @@ class VenderBatchNorm, Tensor, Tensor, - BatchNormParam>> { + BatchnormParam>> { public: typedef Tensor DataTensor_in; typedef Tensor DataTensor_out; @@ -34,7 +34,7 @@ class VenderBatchNorm& inputs, std::vector& outputs, - BatchNormParam &batch_norm_param, Context &ctx) { + BatchnormParam &batch_norm_param, Context &ctx) { _handle = get_bm_handle(); return create(inputs, outputs, batch_norm_param, ctx); @@ -42,12 +42,12 @@ class VenderBatchNorm& inputs, std::vector& outputs, - BatchNormParam &batch_norm_param, Context &ctx) { + BatchnormParam &batch_norm_param, Context &ctx) { } virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, - BatchNormParam ¶m) { + BatchnormParam ¶m) { const InDataType *in_data = (const InDataType *) inputs[0]->data(); OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); @@ -57,12 +57,14 @@ class VenderBatchNormheight(); int input_w = inputs[0]->width(); - OpDataType eps = param.eps; - OpDataType scale = param.scale; + float eps = param.eps; + float scale = param.scale; bm_device_mem_t mean_ma = bm_mem_from_system(¶m.mean); bm_device_mem_t variance_ma = bm_mem_from_system(¶m.variance); + bm_device_mem_t* variance_holder = new bm_device_mem_t(); + bmdnn_batchnorm_forward_inference( _handle, //input @@ -70,7 +72,7 @@ class VenderBatchNorm + +using namespace anakin::saber; + + +TEST(TestSaberFuncBM, 
test_func_batch_norm_BM) { + + typedef TargetWrapper API; + typedef Tensor TensorDf4; + typedef TensorDf4::Dtype dtype; + + //Input / output tensor + Shape shape_in(1, 1, 2, 2); + Shape shape_out = shape_in; + + std::vector input_dev_4d; + std::vector output_dev_4d; + + Tensor thin(shape_in); + for (int i = 0; i < thin.size(); ++i) { + thin.mutable_data()[i] = 10; + } + + TensorDf4 tdin, tdout; + tdin.re_alloc(shape_in); + tdin.copy_from(thin); + input_dev_4d.push_back(&tdin); + + //Batch norm param + std::vector mean; + mean.push_back(10); + + std::vector variance; + variance.push_back(0); + + float scale_in = 1; + float eps_in = float(1e-5); + + BatchnormParam param(mean, variance, scale_in); + + //BatachNorm + BatchNorm batchNorm; + + output_dev_4d.push_back(&tdout); + batchNorm.compute_output_shape(input_dev_4d, output_dev_4d, param); + + LOG(INFO) << "re-alloc tensor buffer"; + output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "batch norm initialized to bm impl"; + Context ctx_dev(0, 1, 1); + batchNorm.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + LOG(INFO) << "bm batch norm compute"; + SaberTimer t1; + t1.clear(); + t1.start(ctx_dev); + + batchNorm(input_dev_4d, output_dev_4d, param, ctx_dev); + + t1.end(ctx_dev); + float ts = t1.get_average_ms(); + printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts); + + print_tensor_device(*output_dev_4d[0]); +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + //Env::env_init(); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + From 64325fedf3597c26da2cbae429c690b2cbaba241 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 28 Jun 2018 17:50:07 +0800 Subject: [PATCH 102/318] Use specialization --- saber/saber_funcs_param.h | 52 +++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index 
1a32c9c87..a758b5881 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -777,21 +777,20 @@ struct SoftmaxParam { int axis; }; -#ifdef USE_BM template struct BatchnormParam { typedef typename opTensor::Dtype DataDtype; BatchnormParam() - : scale(float(0)) + : scale(DataDtype(0)) , use_global_stats(true) - , moving_average_fraction(float(0.999)) - , eps(float(1e-5)) + , moving_average_fraction(DataDtype(0.999)) + , eps(DataDtype(1e-5)) , mean(), variance() {} //scale_factor = 1 / scale; - BatchnormParam(std::vector mean_in, std::vector variance_in, - float scale_in, float moving_average_fraction_in = float(0.999), - float eps_in = float(1e-5), bool use_global_stats_in = true) + BatchnormParam(std::vector mean_in, std::vector variance_in, + DataDtype scale_in, DataDtype moving_average_fraction_in = DataDtype(0.999), + DataDtype eps_in = DataDtype(1e-5), bool use_global_stats_in = true) : mean(mean_in), variance(variance_in), scale(scale_in) , moving_average_fraction(moving_average_fraction_in) , eps(eps_in), use_global_stats(use_global_stats_in) @@ -815,28 +814,27 @@ struct BatchnormParam { comp_eq = comp_eq && (variance == right.variance); return comp_eq; } - float scale; - float moving_average_fraction; - float eps; + DataDtype scale; + DataDtype moving_average_fraction; + DataDtype eps; bool use_global_stats; - std::vector mean; - std::vector variance; + std::vector mean; + std::vector variance; }; -#else -template -struct BatchnormParam { - typedef typename opTensor::Dtype DataDtype; +#ifdef USE_BM +template <> +struct BatchnormParam> { BatchnormParam() - : scale(DataDtype(0)) + : scale(float(0)) , use_global_stats(true) - , moving_average_fraction(DataDtype(0.999)) - , eps(DataDtype(1e-5)) + , moving_average_fraction(float(0.999)) + , eps(float(1e-5)) , mean(), variance() {} //scale_factor = 1 / scale; - BatchnormParam(std::vector mean_in, std::vector variance_in, - DataDtype scale_in, DataDtype moving_average_fraction_in = 
DataDtype(0.999), - DataDtype eps_in = DataDtype(1e-5), bool use_global_stats_in = true) + BatchnormParam(std::vector mean_in, std::vector variance_in, + float scale_in, float moving_average_fraction_in = float(0.999), + float eps_in = float(1e-5), bool use_global_stats_in = true) : mean(mean_in), variance(variance_in), scale(scale_in) , moving_average_fraction(moving_average_fraction_in) , eps(eps_in), use_global_stats(use_global_stats_in) @@ -860,12 +858,12 @@ struct BatchnormParam { comp_eq = comp_eq && (variance == right.variance); return comp_eq; } - DataDtype scale; - DataDtype moving_average_fraction; - DataDtype eps; + float scale; + float moving_average_fraction; + float eps; bool use_global_stats; - std::vector mean; - std::vector variance; + std::vector mean; + std::vector variance; }; #endif From da713a96794a5f65538420a2513167eeb6b88998 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 28 Jun 2018 18:45:39 +0800 Subject: [PATCH 103/318] Update batch norm test for BM --- test/saber/bm/test_saber_func_batch_norm_BM.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp index 659d0f699..0453f818a 100644 --- a/test/saber/bm/test_saber_func_batch_norm_BM.cpp +++ b/test/saber/bm/test_saber_func_batch_norm_BM.cpp @@ -23,7 +23,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) { Tensor thin(shape_in); for (int i = 0; i < thin.size(); ++i) { - thin.mutable_data()[i] = 10; + thin.mutable_data()[i] = 1+i; } TensorDf4 tdin, tdout; @@ -31,9 +31,12 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) { tdin.copy_from(thin); input_dev_4d.push_back(&tdin); + LOG(INFO) << "Input tensor is:"; + print_tensor_device(*input_dev_4d[0]); + //Batch norm param std::vector mean; - mean.push_back(10); + mean.push_back(1); std::vector variance; variance.push_back(0); @@ -65,7 +68,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) { t1.end(ctx_dev); float ts 
= t1.get_average_ms(); - printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts); + printf("bm batch norm total time : %.4f, avg time : %.4f\n", ts, ts); print_tensor_device(*output_dev_4d[0]); } From efd4524105661afec01a5c2d42120e74416f342b Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 28 Jun 2018 18:45:39 +0800 Subject: [PATCH 104/318] Update batch norm test for BM --- saber/core/common.h | 1 + saber/funcs/impl/bm/vender_scale.h | 52 ++------ saber/saber_funcs_param.h | 42 ++++++ .../bm/test_saber_func_batch_norm_BM.cpp | 9 +- test/saber/bm/test_saber_func_scale_BM.cpp | 121 ++++++++++++++++++ 5 files changed, 184 insertions(+), 41 deletions(-) create mode 100644 test/saber/bm/test_saber_func_scale_BM.cpp diff --git a/saber/core/common.h b/saber/core/common.h index 2e7cd2650..54d6c56dd 100644 --- a/saber/core/common.h +++ b/saber/core/common.h @@ -150,6 +150,7 @@ const char* cudnn_get_errorstring(cudnnStatus_t status); #include "bmlib_runtime.h" #include "bmdnn_api.h" +#include "bmdnn_ext_api.h" #include "bmlib_utils.h" #define BMDNN_CHECK(condition) \ diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index b47716a03..13f1d6322 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -29,8 +29,7 @@ class VenderScale& outputs, ScaleParam& param) { - const InDataType *in_data = (const InDataType *) inputs[0]->data(); - OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + const InDataType in_data = *(inputs[0]->data()); + OutDataType out_data = *(outputs[0]->mutable_data()); int input_n = inputs[0]->num(); int input_c = inputs[0]->channel(); @@ -66,43 +65,21 @@ class VenderScalecount(0, axis); int inner_dim = inputs[0]->count(axis + num_axes, inputs[0]->shape().dims()); int scale_dim = inputs[0]->count(axis, axis + num_axes); - if (inputs.size() == 1) { - CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; - } + /* if (inputs.size() == 1) { */ 
+ /* CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; */ + /* } */ - bm_device_mem_t* scale_extension = new bm_device_mem_t(); - OpDataType* scale_data = param.scale_w[0]; - bmdnn_scale_forward( - _handle, - //input - *in_data, - *scale_data, - input_n, - input_c, - input_h, - input_w, - scale_dim, - inner_dim, - 0, - //output - *scale_extension, - *out_data - ); + OpDataType scale_data = param.scale_w; + BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, scale_data, + input_n, input_c, input_h, input_w, + scale_dim, inner_dim, 0, + bm_mem_null(), out_data)); if (param.bias_term) { - OpDataType* bias_data = param.scale_b[0]; - bmdnn_bias_forward( - _handle, - //input - *out_data, - *bias_data, - outer_dim, - inner_dim, - //output - *out_data - ); + OpDataType bias_data = param.scale_b; + BMDNN_CHECK(bmdnn_bias_forward(_handle, in_data, bias_data, + outer_dim, inner_dim, out_data)); } - return SaberSuccess; } private: @@ -110,6 +87,5 @@ class VenderScale scale_w; std::vector scale_b; }; +#ifdef USE_BM +template <> +struct ScaleParam> { + ScaleParam(): axis(1), num_axes(1), bias_term(false) {} + ScaleParam(bm_device_mem_t scale_w_in, bm_device_mem_t scale_b_in, + bool bias_term_in = true, int axis_in = 1, int num_axes_in = 1) + : scale_w(scale_w_in), scale_b(scale_b_in) + , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) + {} + ScaleParam(bm_device_mem_t scale_w_in, + bool bias_term_in = false, int axis_in = 1, int num_axes_in = 1) + : scale_w(scale_w_in) + , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) + {} + ScaleParam(const ScaleParam &right) + : scale_w(right.scale_w), scale_b(right.scale_b) + , bias_term(right.bias_term), axis(right.axis), num_axes(right.num_axes) + {} + ScaleParam &operator=(const ScaleParam &right) { + scale_w = right.scale_w; + scale_b = right.scale_b; + bias_term = right.bias_term; + axis = right.axis; + num_axes = right.num_axes; + return *this; + } + bool operator==(const ScaleParam &right) 
{ + bool comp_eq = true; + /* comp_eq = comp_eq && (scale_w == right.scale_w); */ + /* comp_eq = comp_eq && (scale_b == right.scale_b); */ + comp_eq = comp_eq && (bias_term == right.bias_term); + comp_eq = comp_eq && (axis == right.axis); + comp_eq = comp_eq && (num_axes == right.num_axes); + return comp_eq; + } + int axis; // default is 1 + int num_axes; // default is 1 + bool bias_term; // default false + bm_device_mem_t scale_w; + bm_device_mem_t scale_b; +}; +#endif template struct PoolingParam { PoolingParam() : window_h(-1), window_w(-1) diff --git a/test/saber/bm/test_saber_func_batch_norm_BM.cpp b/test/saber/bm/test_saber_func_batch_norm_BM.cpp index 659d0f699..0453f818a 100644 --- a/test/saber/bm/test_saber_func_batch_norm_BM.cpp +++ b/test/saber/bm/test_saber_func_batch_norm_BM.cpp @@ -23,7 +23,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) { Tensor thin(shape_in); for (int i = 0; i < thin.size(); ++i) { - thin.mutable_data()[i] = 10; + thin.mutable_data()[i] = 1+i; } TensorDf4 tdin, tdout; @@ -31,9 +31,12 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) { tdin.copy_from(thin); input_dev_4d.push_back(&tdin); + LOG(INFO) << "Input tensor is:"; + print_tensor_device(*input_dev_4d[0]); + //Batch norm param std::vector mean; - mean.push_back(10); + mean.push_back(1); std::vector variance; variance.push_back(0); @@ -65,7 +68,7 @@ TEST(TestSaberFuncBM, test_func_batch_norm_BM) { t1.end(ctx_dev); float ts = t1.get_average_ms(); - printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts); + printf("bm batch norm total time : %.4f, avg time : %.4f\n", ts, ts); print_tensor_device(*output_dev_4d[0]); } diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp new file mode 100644 index 000000000..c746a67ff --- /dev/null +++ b/test/saber/bm/test_saber_func_scale_BM.cpp @@ -0,0 +1,121 @@ +#include "core/context.h" +#include "funcs/scale.h" +#include "test_saber_func_BM.h" +#include "tensor_op.h" +#include 
"saber_types.h" +#include + +using namespace anakin::saber; + +template +void print_tensor_shape(std::string name, Tensor& t0) { + + LOG(INFO) << name << " valid shape is [" + << t0.valid_shape()[0] << ", " + << t0.valid_shape()[1] << ", " + << t0.valid_shape()[2] << ", " + << t0.valid_shape()[3] << "]."; + + LOG(INFO) << name << " real shape is [" + << t0.shape()[0] << ", " + << t0.shape()[1] << ", " + << t0.shape()[2] << ", " + << t0.shape()[3] << "]."; + + LOG(INFO) << name << " offset is [" + << t0.offset()[0] << ", " + << t0.offset()[1] << ", " + << t0.offset()[2] << ", " + << t0.offset()[3] << "]."; +} +void fill_vector_rand(std::vector& vec) { + for (int i = 0; i < vec.size(); i++) { + vec[i] = rand() *1.0f/RAND_MAX - 0.5; + } +} +void print_vector_data(std::vector& vec) { + for (int i = 0; i < vec.size(); i++) { + printf("%d, %f\n", i, vec[i]); + } +} + +void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_term, int scale_dim) { + + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + int img_num = n; + int in_channels = c; + int img_h = h; + int img_w = w; + + Shape img_s(img_num, in_channels, img_h, img_w); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + fill_tensor_host_rand(img_host, -0.5, 0.5); + img_dev.copy_from(img_host); + + TensorDf4 output_dev; + + Context ctx1(0, 1, 1); + std::vector scale_w; + std::vector scale_b; + scale_w.resize(scale_dim); + fill_vector_rand(scale_w); + if (bias_term) { + scale_b.resize(scale_dim); + fill_vector_rand(scale_b); + } + + ScaleParam param(bm_mem_from_system(&scale_w[0]), + bm_mem_from_system(&scale_b[0]), + bias_term, axis, num_axes); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Scale scale; + scale.compute_output_shape(input, output, param); + output_dev.re_alloc(output[0]->valid_shape()); + + // init assume output tensor has been reshpaed by user. 
+ scale.init(input, output, param, SPECIFY, SABER_IMPL, ctx1); + scale(input, output, param, ctx1); + + output_dev.sync(); + LOG(INFO) << "input data: "; + print_tensor_device(img_dev); + LOG(INFO) << "output data: "; + print_tensor_device(output_dev); + LOG(INFO) << "scale_w data: "; + print_vector_data(scale_w); + if (bias_term) { + LOG(INFO) << "scale_b data: "; + print_vector_data(scale_b); + } +} + +TEST(TestSaberFuncBM, test_func_constructor_elt) { + test_scale(2, 2, 4, 4, 1, 1, false, 2); + /* test_scale(2, 2, 4, 4, 1, 1, true, 2); */ + /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */ + /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */ + /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */ + /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */ +} + + +int main(int argc, const char** argv) { + Env::env_init(); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + From 44e1395bb7d8355ad18fe1ffa1e62d6b569eaefa Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Thu, 28 Jun 2018 20:34:39 +0800 Subject: [PATCH 105/318] Update BM batch norm test --- saber/funcs/impl/bm/vender_batch_norm.h | 6 +++--- test/saber/bm/test_saber_func_batch_norm_BM.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h index e362a256f..4f433a4a9 100644 --- a/saber/funcs/impl/bm/vender_batch_norm.h +++ b/saber/funcs/impl/bm/vender_batch_norm.h @@ -59,9 +59,9 @@ class VenderBatchNorm variance; - variance.push_back(0); + variance.push_back(0.001); float scale_in = 1; float eps_in = float(1e-5); From 609bcd81b98d1729fec370a87b12db29cd8064a2 Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Thu, 28 Jun 2018 21:15:41 +0800 Subject: [PATCH 106/318] Use vender scale for test --- test/saber/bm/test_saber_func_scale_BM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp index 
c746a67ff..6b0e309d8 100644 --- a/test/saber/bm/test_saber_func_scale_BM.cpp +++ b/test/saber/bm/test_saber_func_scale_BM.cpp @@ -86,7 +86,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te output_dev.re_alloc(output[0]->valid_shape()); // init assume output tensor has been reshpaed by user. - scale.init(input, output, param, SPECIFY, SABER_IMPL, ctx1); + scale.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); scale(input, output, param, ctx1); output_dev.sync(); From 16b6f6ec36880772c5a7b786fe9df456eaac1f3e Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Thu, 28 Jun 2018 21:21:42 +0800 Subject: [PATCH 107/318] Update BM scale --- saber/funcs/impl/bm/vender_scale.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index 13f1d6322..ce32e898e 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -70,10 +70,11 @@ class VenderScale Date: Thu, 28 Jun 2018 21:27:00 +0800 Subject: [PATCH 108/318] update BM bias input --- saber/funcs/impl/bm/vender_scale.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index ce32e898e..8ecaa1c38 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -78,7 +78,7 @@ class VenderScale Date: Thu, 28 Jun 2018 21:37:46 +0800 Subject: [PATCH 109/318] BM scale test with bias --- test/saber/bm/test_saber_func_scale_BM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp index 6b0e309d8..d6833bb9a 100644 --- a/test/saber/bm/test_saber_func_scale_BM.cpp +++ b/test/saber/bm/test_saber_func_scale_BM.cpp @@ -104,7 +104,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te TEST(TestSaberFuncBM, 
test_func_constructor_elt) { test_scale(2, 2, 4, 4, 1, 1, false, 2); - /* test_scale(2, 2, 4, 4, 1, 1, true, 2); */ + test_scale(2, 2, 4, 4, 1, 1, true, 2); /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */ /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */ /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */ From 28a35e9036cb18b65a79048fa4213cf74a4b2266 Mon Sep 17 00:00:00 2001 From: liuhong03 Date: Thu, 28 Jun 2018 21:21:39 -0400 Subject: [PATCH 110/318] fix bias in scale --- saber/funcs/impl/bm/vender_scale.h | 33 ++++++++++++++++++---- test/saber/bm/test_saber_func_scale_BM.cpp | 6 ++-- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index 8ecaa1c38..5f8b6d3bb 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -70,17 +70,40 @@ class VenderScale(param.scale_b)); + bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), bm_mem_from_device(&bias_data)); + int dim = inner_dim * scale_dim; + host_bias[0] = 1; + host_bias[1] = 2; + for (int i = 0; i < size; ++i) { + int bias_dim = (i % dim) / inner_dim; + host_extension[i] = host_bias[bias_dim]; + printf("%f, ", host_extension[i]); + } + printf("\n"); + bm_memcpy_s2d(_handle, *data_extension, bm_mem_from_system(const_cast(host_extension))); + delete [] host_bias; + delete [] host_extension; + BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, *data_extension, + outer_dim, scale_dim * inner_dim, out_data)); } + bm_free_device(_handle, *data_extension); return SaberSuccess; } private: diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp index d6833bb9a..cf0a1ad91 100644 --- a/test/saber/bm/test_saber_func_scale_BM.cpp +++ b/test/saber/bm/test_saber_func_scale_BM.cpp @@ -66,6 +66,8 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te std::vector scale_b; scale_w.resize(scale_dim); fill_vector_rand(scale_w); + scale_w[0] = 0; + 
scale_w[1] = 0; if (bias_term) { scale_b.resize(scale_dim); fill_vector_rand(scale_b); @@ -103,8 +105,8 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te } TEST(TestSaberFuncBM, test_func_constructor_elt) { - test_scale(2, 2, 4, 4, 1, 1, false, 2); - test_scale(2, 2, 4, 4, 1, 1, true, 2); +// test_scale(1, 2, 1, 2, 1, 1, false, 2); + test_scale(1, 2, 1, 2, 1, 1, true, 2); /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */ /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */ /* test_scale(2, 2, 4, 4, 0, 0, true, 1); */ From 7291e2138a1dea6dd1335cd6b748fa1c37c14e88 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Fri, 29 Jun 2018 10:19:07 +0800 Subject: [PATCH 111/318] Update BM scale ops --- saber/funcs/impl/bm/vender_scale.h | 21 ++++++++------------- saber/saber_funcs_param.h | 8 ++++---- test/saber/bm/test_saber_func_scale_BM.cpp | 4 ++-- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index 5f8b6d3bb..64c4d22a2 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -69,25 +69,19 @@ class VenderScale(param.scale_b)); - bm_memcpy_d2s(_handle, bm_mem_from_system(host_bias), bm_mem_from_device(&bias_data)); int dim = inner_dim * scale_dim; host_bias[0] = 1; host_bias[1] = 2; @@ -97,11 +91,12 @@ class VenderScale(host_extension))); - delete [] host_bias; - delete [] host_extension; - BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, *data_extension, + + BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension), outer_dim, scale_dim * inner_dim, out_data)); + + delete [] host_bias; + delete [] host_extension; } bm_free_device(_handle, *data_extension); return SaberSuccess; diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index 021928a49..c6a88cbe4 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -976,12 +976,12 @@ struct ScaleParam { template <> struct 
ScaleParam> { ScaleParam(): axis(1), num_axes(1), bias_term(false) {} - ScaleParam(bm_device_mem_t scale_w_in, bm_device_mem_t scale_b_in, + ScaleParam(std::vector scale_w_in, std::vector scale_b_in, bool bias_term_in = true, int axis_in = 1, int num_axes_in = 1) : scale_w(scale_w_in), scale_b(scale_b_in) , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) {} - ScaleParam(bm_device_mem_t scale_w_in, + ScaleParam(std::vector scale_w_in, bool bias_term_in = false, int axis_in = 1, int num_axes_in = 1) : scale_w(scale_w_in) , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) @@ -1010,8 +1010,8 @@ struct ScaleParam> { int axis; // default is 1 int num_axes; // default is 1 bool bias_term; // default false - bm_device_mem_t scale_w; - bm_device_mem_t scale_b; + std::vector scale_w; + std::vector scale_b; }; #endif template diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp index cf0a1ad91..066ba194b 100644 --- a/test/saber/bm/test_saber_func_scale_BM.cpp +++ b/test/saber/bm/test_saber_func_scale_BM.cpp @@ -73,8 +73,8 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te fill_vector_rand(scale_b); } - ScaleParam param(bm_mem_from_system(&scale_w[0]), - bm_mem_from_system(&scale_b[0]), + ScaleParam param(scale_w, + scale_b, bias_term, axis, num_axes); std::vector input; From c89d92cc18ed2273bdb0129f18909b4bd156b943 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Fri, 29 Jun 2018 10:28:54 +0800 Subject: [PATCH 112/318] cleanup --- saber/funcs/impl/bm/vender_scale.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index 64c4d22a2..e1acd3bfa 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -83,8 +83,8 @@ class VenderScale Date: Fri, 29 Jun 2018 10:58:57 +0800 Subject: [PATCH 113/318] Update BM scale test --- 
test/saber/bm/test_saber_func_scale_BM.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test/saber/bm/test_saber_func_scale_BM.cpp b/test/saber/bm/test_saber_func_scale_BM.cpp index 066ba194b..d4e40d44b 100644 --- a/test/saber/bm/test_saber_func_scale_BM.cpp +++ b/test/saber/bm/test_saber_func_scale_BM.cpp @@ -33,6 +33,11 @@ void fill_vector_rand(std::vector& vec) { vec[i] = rand() *1.0f/RAND_MAX - 0.5; } } +void fill_vector_const(std::vector& vec, float num) { + for (int i = 0; i < vec.size(); i++) { + vec[i] = num; + } +} void print_vector_data(std::vector& vec) { for (int i = 0; i < vec.size(); i++) { printf("%d, %f\n", i, vec[i]); @@ -56,7 +61,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te img_host.re_alloc(img_s); img_dev.re_alloc(img_s); - fill_tensor_host_rand(img_host, -0.5, 0.5); + fill_tensor_host_const(img_host, 1); img_dev.copy_from(img_host); TensorDf4 output_dev; @@ -65,12 +70,10 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te std::vector scale_w; std::vector scale_b; scale_w.resize(scale_dim); - fill_vector_rand(scale_w); - scale_w[0] = 0; - scale_w[1] = 0; + fill_vector_const(scale_w, 2); if (bias_term) { scale_b.resize(scale_dim); - fill_vector_rand(scale_b); + fill_vector_const(scale_b, 0); } ScaleParam param(scale_w, @@ -105,7 +108,7 @@ void test_scale(int n, int c, int h, int w, int axis, int num_axes, bool bias_te } TEST(TestSaberFuncBM, test_func_constructor_elt) { -// test_scale(1, 2, 1, 2, 1, 1, false, 2); + test_scale(1, 2, 1, 2, 1, 1, false, 2); test_scale(1, 2, 1, 2, 1, 1, true, 2); /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */ /* test_scale(2, 2, 4, 4, 0, -1, true, 64); */ From d3cef11b1c26510dd49ad2bb52de5599b39a40cd Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Fri, 29 Jun 2018 13:52:16 +0800 Subject: [PATCH 114/318] cleanup --- saber/funcs/impl/bm/vender_scale.h | 8 +- test/saber/bm/test_saber_func_fc_BM.cpp | 146 
------------------------ 2 files changed, 3 insertions(+), 151 deletions(-) delete mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index e1acd3bfa..e2e6fb900 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -81,16 +81,14 @@ class VenderScale - -using namespace anakin::saber; -typedef TargetWrapper API; -typedef Tensor TensorDf4; -typedef Tensor TensorHf4; -typedef TensorDf4::Dtype ftype; - -void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \ - const TensorHf4& bias, TensorHf4& tout) { - - int m = tin.num(); - int k = tin.valid_size() / m; - int n = weight.valid_size() / k; - bool bias_term = bias.valid_size() > 0; - - const float* din = tin.data(); - const float* w = weight.data(); - float* dout = tout.mutable_data(); - - for (int i = 0; i < m; ++i) { - float* pdout = dout + i * n; - const float* pdin = din + i * k; - - for (int j = 0; j < n; ++j) { - if (bias_term) { - pdout[j] = bias.data()[j]; - } else { - pdout[j] = 0; - } - - for (int l = 0; l < k; ++l) { - pdout[j] += pdin[l] * w[l * n + j]; - } - } - } -} - -TEST(TestSaberFuncBM, test_func_fc) { - - int test_iter = 100; - int w_in = 7; - int h_in = 7; - int ch_in = 512; - int num_in = 1; - - int num_out = 4096; - int axis = 1; - - Shape shape_in(num_in, ch_in, h_in, w_in); - Shape shape_out = {num_in, num_out, 1, 1}; - - Shape sh_w{1, 1, w_in* h_in * ch_in, num_out}; - TensorDf4 weight(sh_w); - Shape sh_b{1, 1, 1, num_out}; - TensorDf4 bias(sh_b); - fill_tensor_device_const(weight, 1.f); - fill_tensor_device_const(bias, 1.f); - - LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ - ch_in << ", height=" << h_in << ", width=" << w_in; - - std::vector input_dev_4d; - std::vector output_dev_4d; - - TensorDf4 tdin; - TensorDf4 tdout; - tdin.re_alloc(shape_in); - fill_tensor_device_const(tdin, 1.f); - input_dev_4d.push_back(&tdin); - 
output_dev_4d.push_back(&tdout); - - // start Reshape & doInfer - Context ctx_dev(0, 1, 1); - - FcParam param(&weight, &bias, num_out, axis); - - Fc fc; - - LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ - shape_out[2] << ", " << shape_out[3]; - - SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param)); - - LOG(INFO) << "re-alloc tensor buffer"; - output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape()); - Shape va_sh = tdout.valid_shape(); - LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \ - va_sh[2] << ", " << va_sh[3]; - CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error"; - - LOG(INFO) << "FC initialization"; - SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev)); - - LOG(INFO) << "FC compute"; - SaberTimer t1; - t1.clear(); - t1.start(ctx_dev); - - for (int i = 0; i < test_iter; ++i) { - SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev)); - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); - //cudaDeviceSynchronize(); - } - - t1.end(ctx_dev); - float ts = t1.get_average_ms(); - LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter; - //print_tensor_device(*output_dev_4d[0]); - - //! 
check result - TensorHf4 thin(shape_in); - TensorHf4 thout(shape_out); - TensorHf4 thw(sh_w); - TensorHf4 thb(sh_b); - thin.copy_from(tdin); - thw.copy_from(weight); - thb.copy_from(bias); - fc_compute(thin, thw, thb, thout); - //print_tensor_host(thout); - - TensorHf4 thout_d(shape_out); - thout_d.copy_from(tdout); - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result"; - -} - -int main(int argc, const char** argv) { - // initial logger - //logger::init(argv[0]); - Env::env_init(); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - From ec902952b3bf399047ef95a2c1ada5da950902ad Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Fri, 29 Jun 2018 14:57:30 +0800 Subject: [PATCH 115/318] flush before next operation --- saber/funcs/impl/bm/vender_scale.h | 4 +--- test/saber/bm/test_saber_func_scale_BM.cpp | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index e2e6fb900..4e9402a43 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -81,15 +81,13 @@ class VenderScale param(scale_w, From c38bf09e25d56c12c823abc2b6c410c7a0137d5d Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Fri, 29 Jun 2018 15:54:06 +0800 Subject: [PATCH 116/318] check BM conv bias --- saber/funcs/impl/bm/vender_conv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 220b8a14e..7243fd6a4 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -51,7 +51,6 @@ class VenderConv2D& param) { const InDataType *in_data = (const InDataType *) inputs[0]->data(); const InDataType *weight = (const InDataType *) param.weight()->data(); - 
const InDataType *bias = (const InDataType *) param.bias()->data(); OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); int input_n = inputs[0]->num(); @@ -75,6 +74,7 @@ class VenderConv2Dsize() > 0; + const InDataType *bias = with_bias? (const InDataType *) param.bias()->data() : &bm_mem_null(); bm_tensor_4d_t input_shape = { input_n, From 8dbb4b4cc22fba3fb693df72a4c831a793132b47 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Thu, 5 Jul 2018 13:47:59 +0800 Subject: [PATCH 117/318] Update BM tensor test --- test/saber/bm/test_saber_tensor_BM.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 423ffe221..2400e73c3 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -415,12 +415,12 @@ TEST(TestSaberTensorBM, test_tensor_deepcopy) { td21.copy_from(td01); print_tensor_device(td21); //cudaDeviceSynchronize(); -} +}*/ TEST(TestSaberTensorBM, test_tensor_shape) { - typedef Tensor Tensor4_0; - typedef Tensor Tensor4_1; - typedef Tensor Tensor2; + typedef Tensor Tensor4_0; + typedef Tensor Tensor4_1; + typedef Tensor Tensor2; int nin = 2; int cin = 2; @@ -562,8 +562,8 @@ TEST(TestSaberTensorBM, test_tensor_op) { Shape sh{1, 2, 2, 10}; TensorDf4 td1(sh); TensorHf4 th1(sh); - Tensor td2(sh); - Tensor th2(sh); + Tensor td2(sh); + Tensor th2(sh); LOG(INFO) << "testing host fill tensor with const 1."; fill_tensor_host_const(th1, 1.f); LOG(INFO) << "data type: float"; @@ -616,9 +616,9 @@ TEST(TestSaberTensorBM, test_tensor_op) { TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) { Shape sh{1, 1, 2, 10}; Tensor td1(sh); - Tensor th1(sh); - Tensor td2; - Tensor th2; + Tensor th1(sh); + Tensor td2; + Tensor th2; td2.set_shape(sh); th2.set_shape(sh); LOG(INFO) << "testing host fill tensor with const 1."; @@ -641,7 +641,7 @@ TEST(TestSaberTensorBM, test_tensor_share_diff_dtype) { 
TEST(TestSaberTensorBM, test_tensor_base_type) { Shape sh(1, 3, 10, 10); Tensor td1(sh); - Tensor th1(sh); + Tensor th1(sh); fill_tensor_host_rand(th1, 0.f, 255.f); td1.copy_from(th1); TensorBase* tb1; @@ -652,7 +652,7 @@ TEST(TestSaberTensorBM, test_tensor_base_type) { Shape sh11 = th1.valid_shape(); LOG(INFO) << "base tensor call set shape: " << "n=" << sh11[0] << ", c=" << sh11[1] << \ ", h=" << sh11[2] << ", w=" << sh11[3]; -}*/ +} int main(int argc, const char** argv) { // initial logger From a19e6fae278dfd5374d8c25b4aec210c032481b4 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 10 Jul 2018 11:13:29 +0800 Subject: [PATCH 118/318] Implement fc for BM --- saber/funcs/fc.h | 4 + saber/funcs/impl/bm/vender_fc.h | 22 ++-- test/saber/bm/test_saber_func_fc_BM.cpp | 147 ++++++++++++++++++++++++ 3 files changed, 164 insertions(+), 9 deletions(-) create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp diff --git a/saber/funcs/fc.h b/saber/funcs/fc.h index 06dc8695a..8b1d553be 100644 --- a/saber/funcs/fc.h +++ b/saber/funcs/fc.h @@ -26,6 +26,10 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/vender_fc.h" #endif + +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_fc.h" +#endif namespace anakin{ diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h index 82dd6000c..c0cd7ea66 100644 --- a/saber/funcs/impl/bm/vender_fc.h +++ b/saber/funcs/impl/bm/vender_fc.h @@ -34,6 +34,7 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param, Context& ctx){ + _handle = get_bm_handle(); return create(inputs, outputs, param, ctx); } @@ -47,16 +48,20 @@ class VenderFc& outputs, FcParam& param){ const InDataType *in_data = (const InDataType *) inputs[0]->data(); - const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data(); - const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data(); + const InDataType *weights = (const InDataType *) param.weights->data(); + const InDataType *bias = (const 
InDataType *) param.bias->data(); OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); - int batch_size = inputs[0]->num(); - int input_len = inputs[0]->channel(); + int batch_size = inputs[0]->count_valid(0, param.axis); + int input_len = inputs[0]->count_valid(param.axis, inputs[0]->dims()); int output_len = param.num_output; - int is_transpose = param.is_transpose_weights ? 1 : 0; - BMDNN_CHECK(bmdnn_fc_forward(_handle, in_data, weights, bias, - batch_size, output_len, input_len, is_transpose, 1, 0, - out_data)); + if (output_len <= 0) { + int weight_size = param.weights->valid_size(); + output_len = weight_size / input_len; + } + + BMDNN_CHECK(bmdnn_fc_forward(_handle, *in_data, *weights, *bias, + batch_size, output_len, input_len, param.is_transpose_weights, 1, 0, + *out_data)); return SaberSuccess; }; @@ -64,7 +69,6 @@ class VenderFc; } //namespace saber } //namespace anakin diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp new file mode 100644 index 000000000..7b56033e6 --- /dev/null +++ b/test/saber/bm/test_saber_func_fc_BM.cpp @@ -0,0 +1,147 @@ +#include "core/context.h" +#include "funcs/fc.h" +#include "test_saber_func_BM.h" +#include "tensor_op.h" +#include "saber_types.h" +#include + +using namespace anakin::saber; +typedef TargetWrapper API; +typedef Tensor TensorDf4; +typedef Tensor TensorHf4; +typedef TensorDf4::Dtype ftype; + +void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \ + const TensorHf4& bias, TensorHf4& tout) { + + int m = tin.num(); + int k = tin.valid_size() / m; + int n = weight.valid_size() / k; + bool bias_term = bias.valid_size() > 0; + + const float* din = tin.data(); + const float* w = weight.data(); + float* dout = tout.mutable_data(); + + for (int i = 0; i < m; ++i) { + float* pdout = dout + i * n; + const float* pdin = din + i * k; + + for (int j = 0; j < n; ++j) { + if (bias_term) { + pdout[j] = bias.data()[j]; + } else { + pdout[j] = 0; + } + + for (int l 
= 0; l < k; ++l) { + pdout[j] += pdin[l] * w[l * n + j]; + } + } + } +} + +TEST(TestSaberFuncBM, test_func_fc) { + + int test_iter = 10; + int w_in = 7; + int h_in = 7; + int ch_in = 1024; + int num_in = 4; + + int num_out = 4096; + int axis = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_out = {num_in, num_out, 1, 1}; + + Shape sh_w{1, 1, w_in* h_in * ch_in, num_out}; + TensorDf4 weight(sh_w); + Shape sh_b{1, 1, 1, num_out}; + TensorDf4 bias(sh_b); + fill_tensor_device_const(weight, 1.f); + fill_tensor_device_const(bias, 1.f); + + LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ + ch_in << ", height=" << h_in << ", width=" << w_in; + + std::vector input_dev_4d; + std::vector output_dev_4d; + + TensorDf4 tdin; + TensorDf4 tdout; + tdin.re_alloc(shape_in); + fill_tensor_device_const(tdin, 1.f); + input_dev_4d.push_back(&tdin); + output_dev_4d.push_back(&tdout); + + // start Reshape & doInfer + Context ctx_dev(0, 1, 1); + + FcParam param(&weight, &bias, num_out, axis); + + Fc fc; + + LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ + shape_out[2] << ", " << shape_out[3]; + + SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param)); + + LOG(INFO) << "re-alloc tensor buffer"; + output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape()); + Shape va_sh = tdout.valid_shape(); + LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \ + va_sh[2] << ", " << va_sh[3]; + CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error"; + + LOG(INFO) << "FC initialization"; + SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev)); + + LOG(INFO) << "FC compute"; + SaberTimer t1; + t1.clear(); + t1.start(ctx_dev); + + for (int i = 0; i < test_iter; ++i) { + SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev)); + bm_flush(get_bm_handle()); + //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + 
//output_dev_4d[0]->sync(); + //cudaDeviceSynchronize(); + } + + t1.end(ctx_dev); + float ts = t1.get_average_ms(); + LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter; + //print_tensor_device(*output_dev_4d[0]); + + //! check result + TensorHf4 thin(shape_in); + TensorHf4 thout(shape_out); + TensorHf4 thw(sh_w); + TensorHf4 thb(sh_b); + thin.copy_from(tdin); + thw.copy_from(weight); + thb.copy_from(bias); + fc_compute(thin, thw, thb, thout); + //print_tensor_host(thout); + + TensorHf4 thout_d(shape_out); + thout_d.copy_from(tdout); + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result"; + +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + Env::env_init(); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + From e340b1b38d150eb6a07389a40d6b523a0fd180b1 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 11 Jul 2018 10:08:45 +0800 Subject: [PATCH 119/318] Implement eltwise for BM --- saber/funcs/eltwise.h | 4 + saber/funcs/impl/bm/vender_eltwise.h | 118 +++++++++++++++++++++++++++ saber/funcs/impl/bm/vender_scale.h | 2 +- saber/saber_funcs_param.h | 48 +++++++++++ 4 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 saber/funcs/impl/bm/vender_eltwise.h diff --git a/saber/funcs/eltwise.h b/saber/funcs/eltwise.h index 7d3a4860c..490f9b6bf 100644 --- a/saber/funcs/eltwise.h +++ b/saber/funcs/eltwise.h @@ -26,6 +26,10 @@ #include "saber/funcs/impl/x86/saber_eltwise.h" #endif +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_eltwise.h" +#endif + namespace anakin { namespace saber { diff --git a/saber/funcs/impl/bm/vender_eltwise.h b/saber/funcs/impl/bm/vender_eltwise.h new file mode 100644 index 000000000..62ac2c436 --- /dev/null +++ 
b/saber/funcs/impl/bm/vender_eltwise.h @@ -0,0 +1,118 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H +#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H + +#include "saber/funcs/impl/impl_eltwise.h" + +namespace anakin { + +namespace saber { + +template +class SaberEltwise:\ +public ImplBase< + Tensor, + Tensor, + Tensor, + EltwiseParam>> { +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + SaberEltwise() {} + + ~SaberEltwise() {} + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + EltwiseParam ¶m, + Context &ctx) { + _handle = get_bm_handle(); + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + EltwiseParam ¶m, + Context &ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + EltwiseParam ¶m) { + + int op_ = 0; + switch (param.operation) { + case Eltwise_prod: + op_ = 0; + break; + case Eltwise_sum: + op_ = 1; + break; + case Eltwise_max: + op_ = 2; + break; + default: + return SaberUnImplError; + } + + //int input_size = inputs.size(); + //CHECK_GE(input_size, 2) << "Input size should >= 2!"; + + OutDataType out_data = *(outputs[0]->mutable_data()); + int input_n = inputs[0]->num(); + int input_c = inputs[0]->channel(); + int input_h = inputs[0]->height(); + int input_w = inputs[0]->width(); + + std::vector coeff_ = param.coeff; + if (coeff_.size() != inputs.size()) { + for (int j=0; j<(inputs.size() - coeff_.size()); j++) { + coeff_.push_back(1); + } + } + + bm_device_mem_t* mask_data = new bm_device_mem_t(); + + int flag_first = 1; + for (int i=0; idata()); + bmdnn_eltwise_forward( + _handle, + op_, + flag_first, + coeff_[i], + i, + in_data, + out_data, + input_n, + input_c * 
input_h * input_w, + *mask_data, + out_data); + + bm_flush(_handle); + flag_first = 0; + } + bm_free_device(_handle, *mask_data); + return SaberSuccess; + } + +private: + bm_handle_t _handle; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_ELTWISE_H \ No newline at end of file diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h index 4e9402a43..2876e8005 100644 --- a/saber/funcs/impl/bm/vender_scale.h +++ b/saber/funcs/impl/bm/vender_scale.h @@ -87,7 +87,7 @@ class VenderScale coeff; }; +#ifdef USE_BM +template <> +struct EltwiseParam> { + EltwiseParam() + : operation(Eltwise_unknow) + , coeff() + {} + EltwiseParam(EltwiseType operation_in + , std::vector coeff_in = std::vector({1,1})) + : operation(operation_in) + , coeff(coeff_in) + { + if ((operation == Eltwise_sum) && (coeff.size() == 0)) { + coeff.push_back(1); + coeff.push_back(1); + } + } + + EltwiseParam(const EltwiseParam>& right) + : operation(right.operation) + , coeff(right.coeff) + {} + + EltwiseParam>& operator=(const EltwiseParam>& right) { + operation = right.operation; + coeff.resize(right.coeff.size()); + for (int i = 0; i < coeff.size(); ++i) { + coeff[i] = right.coeff[i]; + } + return *this; + } + + bool operator==(const EltwiseParam>& right) { + bool comp_eq = true; + comp_eq = comp_eq && (operation == right.operation); + comp_eq = comp_eq && (coeff.size() == right.coeff.size()); + if (!comp_eq) { + return comp_eq; + } + for (int i = 0; i < coeff.size(); ++i) { + comp_eq = comp_eq && (coeff[i] == right.coeff[i]); + } + } + EltwiseType operation; + std::vector coeff; +}; +#endif + template struct EltwiseActiveParam { EltwiseActiveParam() From 2fe6ca07bb78555c420ffd0ca3c0b4db07f12be5 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 11 Jul 2018 11:27:17 +0800 Subject: [PATCH 120/318] Add test for BM eltwise --- saber/funcs/impl/bm/vender_eltwise.h | 13 +- test/saber/bm/test_saber_func_eltwise_BM.cpp | 627 
+++++++++++++++++++ 2 files changed, 634 insertions(+), 6 deletions(-) create mode 100644 test/saber/bm/test_saber_func_eltwise_BM.cpp diff --git a/saber/funcs/impl/bm/vender_eltwise.h b/saber/funcs/impl/bm/vender_eltwise.h index 62ac2c436..050fc4d43 100644 --- a/saber/funcs/impl/bm/vender_eltwise.h +++ b/saber/funcs/impl/bm/vender_eltwise.h @@ -13,7 +13,7 @@ template -class SaberEltwise:\ +class VenderEltwise:\ public ImplBase< Tensor, Tensor, @@ -28,9 +28,9 @@ public ImplBase< typedef typename DataTensor_out::Dtype OutDataType; typedef typename OpTensor::Dtype OpDataType; - SaberEltwise() {} + VenderEltwise() {} - ~SaberEltwise() {} + ~VenderEltwise() {} virtual SaberStatus init(const std::vector& inputs, std::vector& outputs, @@ -44,7 +44,6 @@ public ImplBase< std::vector& outputs, EltwiseParam ¶m, Context &ctx) { - return SaberSuccess; } virtual SaberStatus dispatch(const std::vector& inputs, @@ -77,7 +76,8 @@ public ImplBase< std::vector coeff_ = param.coeff; if (coeff_.size() != inputs.size()) { - for (int j=0; j<(inputs.size() - coeff_.size()); j++) { + int diff = inputs.size() - coeff_.size(); + for (int j=0; j + +using namespace anakin::saber; + +/* +TEST(TestSaberFuncBM, test_func_prod) { + + Env::env_init(); + typedef TargetWrapper API; + + typedef TargetWrapper BM_API; + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + EltwiseType elt_type = Eltwise_prod; + + EltwiseParam param(elt_type); + + int w_in = 10; + int h_in = 2; + int ch_in = 2; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_out = shape_in; + + // Host Tensor + Tensor thin0(shape_in); + Tensor thin1(shape_in); + Tensor thin2(shape_in); + for (int i = 0; i < thin0.size(); ++i) { + thin0.mutable_data()[i] = i; + } + for (int i = 0; i < thin1.size(); ++i) { + thin1.mutable_data()[i] = i + 1; + } + for (int i = 0; i < thin2.size(); ++i) { + thin2.mutable_data()[i] = 1; + } + + // Device Tensor + TensorDf4 tdin0, tdin1, tdin2, tdout; + 
tdin0.re_alloc(shape_in); + tdin1.re_alloc(shape_in); + tdin2.re_alloc(shape_in); + tdin0.copy_from(thin0); + tdin1.copy_from(thin1); + tdin2.copy_from(thin2); + tdout.re_alloc(shape_out); + + // Device Vector of Tensor + std::vector input_dev_4d; + std::vector output_dev_4d; + input_dev_4d.push_back(&tdin0); + input_dev_4d.push_back(&tdin1); + input_dev_4d.push_back(&tdin2); + output_dev_4d.push_back(&tdout); + + + Context ctx_dev(0, 1, 1); + Eltwise eltwise_dev; + + LOG(INFO) << "eltwise compute output shape"; + eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param); + + //SABER_CHECK(eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param)); + + // Verify output shape + Shape sh = output_dev_4d[0]->valid_shape(); + LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \ + ", " << sh[2] << ", " << sh[3]; + Shape shout{num_in, ch_in, h_in, w_in}; + CHECK_EQ(shout == sh, true) << "compute shape error"; + + output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "eltwise initialization"; + eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + + LOG(INFO) << "eltwise compute"; + eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + + + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + print_tensor_device(*output_dev_4d[0]); + cudaDeviceSynchronize(); + + + TensorHf4 th_for_print; + th_for_print.re_alloc(output_dev_4d[0]->valid_shape()); + th_for_print.copy_from(*output_dev_4d[0]); + print_tensor_host(th_for_print); + + CUDA_CHECK(cudaPeekAtLastError()); +} + +*/ + +TEST(TestSaberFuncBM, test_func_sum) { + + Env::env_init(); + + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + EltwiseType elt_type = Eltwise_sum; + + int w_in = 10; + int h_in = 2; + int ch_in = 2; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_out = shape_in; + + // Host Tensor + TensorHf4 thin1(shape_in); + TensorHf4 thin2(shape_in); 
+ + for (int i = 0; i < thin1.size(); ++i) { + thin1.mutable_data()[i] = 1.0; + } + + for (int i = 0; i < thin2.size(); ++i) { + thin2.mutable_data()[i] = 2.0; + } + + // Device Tensor + TensorDf4 tdin0, tdin1, tdout; + tdin0.re_alloc(shape_in); + tdin1.re_alloc(shape_in); + tdin0.copy_from(thin1); + tdin1.copy_from(thin2); + tdout.re_alloc(shape_out); + + // Device Vector of Tensor + std::vector input_dev_4d; + std::vector output_dev_4d; + input_dev_4d.push_back(&tdin0); + input_dev_4d.push_back(&tdin1); + input_dev_4d.push_back(&tdin1); + output_dev_4d.push_back(&tdout); + + EltwiseParam param(elt_type); + + Context ctx_dev(0, 1, 1); + Eltwise eltwise_dev; + + LOG(INFO) << "eltwise compute output shape"; + eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + + // Verify output shape + Shape sh = output_dev_4d[0]->valid_shape(); + LOG(INFO) << "output shape: " << sh[0] << ", " << sh[1] << \ + ", " << sh[2] << ", " << sh[3]; + Shape shout{num_in, ch_in, h_in, w_in}; + CHECK_EQ(shout == sh, true) << "compute shape error"; + + output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "eltwise initialization"; + eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + LOG(INFO) << "eltwise compute"; + eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + print_tensor_device(*output_dev_4d[0]); +} + +TEST(TestSaberFuncBM, test_func_max) { + + Env::env_init(); + + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + EltwiseType elt_type = Eltwise_max; + + EltwiseParam param(elt_type); + + int w_in = 10; + int h_in = 2; + int ch_in = 2; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_out = shape_in; + + // Host Tensor + Tensor thin0(shape_in); + Tensor thin1(shape_in); + Tensor thin2(shape_in); + for (int i = 0; i < thin0.size(); ++i) { + thin0.mutable_data()[i] = i; + } + for (int i = 0; i < thin1.size(); ++i) { + thin1.mutable_data()[i] = i + 2; + } + for (int i 
= 0; i < thin2.size(); ++i) { + thin2.mutable_data()[i] = i + 1; + } + + // Device Tensor + TensorDf4 tdin0, tdin1, tdin2, tdout; + tdin0.re_alloc(shape_in); + tdin1.re_alloc(shape_in); + tdin2.re_alloc(shape_in); + tdin0.copy_from(thin0); + tdin1.copy_from(thin1); + tdin2.copy_from(thin2); + tdout.re_alloc(shape_out); + + // Device Vector of Tensor + std::vector input_dev_4d; + std::vector output_dev_4d; + input_dev_4d.push_back(&tdin0); + input_dev_4d.push_back(&tdin1); + input_dev_4d.push_back(&tdin2); + output_dev_4d.push_back(&tdout); + + Context ctx_dev(0, 1, 1); + Eltwise eltwise_dev; + + LOG(INFO) << "eltwise compute output shape"; + eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + // Verify output shape + Shape sh = output_dev_4d[0]->valid_shape(); + LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \ + ", " << sh[2] << ", " << sh[3]; + Shape shout{num_in, ch_in, h_in, w_in}; + CHECK_EQ(shout == sh, true) << "compute shape error"; + + output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "eltwise initialization"; + eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + LOG(INFO) << "eltwise compute"; + eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + print_tensor_device(*output_dev_4d[0]); + +} + +/* 0 1 2 3 4 + * 10 11 12 13 14 (tdin_roi1, c=0) + * (tdin_roi0, c=0) 25 26 27 28 29 + * 35 36 37 38 39 + * ======================================= + * 40 41 42 43 44 + * 50 51 52 53 54 (tdin_roi1, c=1) + * (tdin_roi0, c=1) 65 66 67 68 69 + * 75 76 77 78 79 + */ +/* +TEST(TestSaberFuncBM, test_func_prod_roi) { + + Env::env_init(); + typedef TargetWrapper API; + + typedef TargetWrapper BM_API; + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + EltwiseType elt_type = Eltwise_prod; + + EltwiseParam param(elt_type); + + int w_in = 10; + int h_in = 4; + int ch_in = 2; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_in_roi{num_in, 
ch_in, h_in / 2, w_in / 2}; + Shape off0{0, 0, 0, 0}; + Shape off1{0, 0, 2, 5}; + Shape shape_out = shape_in_roi; + + // Host Tensor + Tensor thin(shape_in); + for (int i = 0; i < thin.size(); ++i) { + thin.mutable_data()[i] = i; + } + + // Device Tensor + TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout; + tdin.re_alloc(shape_in); + tdin.copy_from(thin); + tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0); + tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1); + tdout.re_alloc(shape_out); + + + // Device Vector of Tensor + std::vector input_dev_4d; + std::vector output_dev_4d; + input_dev_4d.push_back(&tdin_roi0); + input_dev_4d.push_back(&tdin_roi1); + input_dev_4d.push_back(&tdin_roi1); + output_dev_4d.push_back(&tdout); + + + Context ctx_dev(0, 1, 1); + Eltwise eltwise_dev; + + LOG(INFO) << "eltwise compute output shape"; + eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + + // Verify output shape + Shape sh = output_dev_4d[0]->valid_shape(); + LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \ + ", " << sh[2] << ", " << sh[3]; + Shape shout(shape_in_roi); + CHECK_EQ(shout == sh, true) << "compute shape error"; + + output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "eltwise initialization"; + eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + + LOG(INFO) << "eltwise compute"; + eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + + + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + print_tensor_device(*output_dev_4d[0]); + cudaDeviceSynchronize(); + + + TensorHf4 th_for_print; + th_for_print.re_alloc(output_dev_4d[0]->valid_shape()); + th_for_print.copy_from(*output_dev_4d[0]); + print_tensor_host(th_for_print); + + CUDA_CHECK(cudaPeekAtLastError()); +} + +*/ + +/* 0 1 2 3 4 + * 10 11 12 13 14 (tdin_roi1, c=0) + * (tdin_roi0, c=0) 25 26 27 28 29 + * 35 36 37 38 39 + * ======================================= + * 40 41 42 43 
44 + * 50 51 52 53 54 (tdin_roi1, c=1) + * (tdin_roi0, c=1) 65 66 67 68 69 + * 75 76 77 78 79 + */ +/* +TEST(TestSaberFuncBM, test_func_sum_roi_new) { + + Env::env_init(); + typedef TargetWrapper API; + + typedef TargetWrapper BM_API; + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + EltwiseType elt_type = Eltwise_sum; + + int w_in = 10; + int h_in = 4; + int ch_in = 2; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2}; + + Shape off0{0, 0, 0, 0}; + Shape off1{0, 0, 2, 5}; + Shape shape_out = shape_in_roi; + + // Host Tensor + Tensor thin(shape_in); + for (int i = 0; i < thin.size(); ++i) { + thin.mutable_data()[i] = i; + } + + // Device Tensor + TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout; + tdin.re_alloc(shape_in); + tdin.copy_from(thin); + tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0); + tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1); + tdout.re_alloc(shape_out); + + // Device Vector of Tensor + std::vector input_dev_4d; + std::vector output_dev_4d; + input_dev_4d.push_back(&tdin_roi0); + input_dev_4d.push_back(&tdin_roi1); +// input_dev_4d.push_back(&tdin_roi1); +// input_dev_4d.push_back(&tdin_roi1); + output_dev_4d.push_back(&tdout); + +// Shape shape_coeff(1, 1, 1, input_dev_4d.size()); +// TensorHf4 thcoeff(shape_coeff); +// for (int i = 0; i < thcoeff.size(); ++i) { +// thcoeff.mutable_data()[i] = 1; +// } + + EltwiseParam param(elt_type); + + Context ctx_dev(0, 1, 1); + Eltwise eltwise_dev; + + LOG(INFO) << "eltwise compute output shape"; + eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + // Verify output shape + Shape sh = output_dev_4d[0]->valid_shape(); + LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \ + ", " << sh[2] << ", " << sh[3]; + Shape shout(shape_in_roi); + CHECK_EQ(shout == sh, true) << "compute shape error"; + + output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "eltwise initialization"; + 
eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + print_tensor_device(*input_dev_4d[0]); + print_tensor_device(*input_dev_4d[1]); + cudaDeviceSynchronize(); + LOG(INFO) << "eltwise compute"; + eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + print_tensor_device(*output_dev_4d[0]); + cudaDeviceSynchronize(); + CUDA_CHECK(cudaPeekAtLastError()); +} +*/ +/* +TEST(TestSaberFuncBM, test_func_sum_roi) { + + Env::env_init(); + typedef TargetWrapper API; + + typedef TargetWrapper BM_API; + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + EltwiseType elt_type = Eltwise_sum; + + int w_in = 10; + int h_in = 4; + int ch_in = 2; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2}; + Shape off0{0, 0, 0, 0}; + Shape off1{0, 0, 2, 5}; + Shape shape_out = shape_in_roi; + + // Host Tensor + Tensor thin(shape_in); + for (int i = 0; i < thin.size(); ++i) { + thin.mutable_data()[i] = i; + } + + // Device Tensor + TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout; + tdin.re_alloc(shape_in); + tdin.copy_from(thin); + tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0); + tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1); + tdout.re_alloc(shape_out); + + // Device Vector of Tensor + std::vector input_dev_4d; + std::vector output_dev_4d; + input_dev_4d.push_back(&tdin_roi0); + input_dev_4d.push_back(&tdin_roi1); + output_dev_4d.push_back(&tdout); + + //Shape shape_coeff(1, 1, 1, 3); + Shape shape_coeff(1, 1, 1, input_dev_4d.size()); + TensorHf4 thcoeff(shape_coeff); + + for (int i = 0; i < thcoeff.size(); ++i) { + thcoeff.mutable_data()[i] = i; + } + TensorDf4 tdcoeff; + tdcoeff.re_alloc(shape_coeff); + tdcoeff.copy_from(thcoeff); + + EltwiseParam param(elt_type, &tdcoeff); + + Context ctx_dev(0, 1, 1); + Eltwise eltwise_dev; + + LOG(INFO) << "eltwise compute output shape"; 
+ eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + // Verify output shape + Shape sh = output_dev_4d[0]->valid_shape(); + LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \ + ", " << sh[2] << ", " << sh[3]; + Shape shout(shape_in_roi); + CHECK_EQ(shout == sh, true) << "compute shape error"; + + output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "eltwise initialization"; + eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + LOG(INFO) << "eltwise compute"; + eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + print_tensor_device(*output_dev_4d[0]); + cudaDeviceSynchronize(); + + TensorHf4 th_for_print; + th_for_print.re_alloc(output_dev_4d[0]->valid_shape()); + th_for_print.copy_from(*output_dev_4d[0]); + print_tensor_host(th_for_print); + + CUDA_CHECK(cudaPeekAtLastError()); +} +*/ + +/* +TEST(TestSaberFuncBM, test_func_max_roi) { + + Env::env_init(); + typedef TargetWrapper API; + + typedef TargetWrapper BM_API; + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + EltwiseType elt_type = Eltwise_max; + + int w_in = 10; + int h_in = 4; + int ch_in = 2; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_in_roi{num_in, ch_in, h_in / 2, w_in / 2}; + Shape off0{0, 0, 0, 0}; + Shape off1{0, 0, 2, 5}; + Shape shape_out = shape_in_roi; + + // Host Tensor + Tensor thin(shape_in); + for (int i = 0; i < thin.size(); ++i) { + thin.mutable_data()[i] = i; + } + + // Device Tensor + TensorDf4 tdin, tdin_roi0, tdin_roi1, tdout; + tdin.re_alloc(shape_in); + tdin.copy_from(thin); + tdin_roi0.share_sub_buffer(tdin, shape_in_roi, off0); + tdin_roi1.share_sub_buffer(tdin, shape_in_roi, off1); + tdout.re_alloc(shape_out); + + // Device Vector of Tensor + std::vector input_dev_4d; + std::vector output_dev_4d; + input_dev_4d.push_back(&tdin_roi0); + 
input_dev_4d.push_back(&tdin_roi1); + output_dev_4d.push_back(&tdout); + + EltwiseParam param(elt_type); + + Context ctx_dev(0, 1, 1); + Eltwise eltwise_dev; + + LOG(INFO) << "eltwise compute output shape"; + eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + // Verify output shape + Shape sh = output_dev_4d[0]->valid_shape(); + LOG(INFO) << "output shape: "<< sh[0] << ", " << sh[1] << \ + ", " << sh[2] << ", " << sh[3]; + Shape shout(shape_in_roi); + CHECK_EQ(shout == sh, true) << "compute shape error"; + + output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "eltwise initialization"; + eltwise_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + LOG(INFO) << "eltwise compute"; + eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + print_tensor_device(*output_dev_4d[0]); + cudaDeviceSynchronize(); + + TensorHf4 th_for_print; + th_for_print.re_alloc(output_dev_4d[0]->valid_shape()); + th_for_print.copy_from(*output_dev_4d[0]); + print_tensor_host(th_for_print); + + CUDA_CHECK(cudaPeekAtLastError()); +} + +*/ + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} From 8125c944bec0486c34b1d44054a17dbe035e22e8 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 11 Jul 2018 11:40:43 +0800 Subject: [PATCH 121/318] test eltwise PROD for BM --- test/saber/bm/test_saber_func_eltwise_BM.cpp | 21 +++----------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/test/saber/bm/test_saber_func_eltwise_BM.cpp b/test/saber/bm/test_saber_func_eltwise_BM.cpp index da931510b..643f4e026 100644 --- a/test/saber/bm/test_saber_func_eltwise_BM.cpp +++ b/test/saber/bm/test_saber_func_eltwise_BM.cpp @@ -7,7 +7,7 @@ using namespace anakin::saber; -/* + TEST(TestSaberFuncBM, test_func_prod) { Env::env_init(); @@ 
-63,12 +63,10 @@ TEST(TestSaberFuncBM, test_func_prod) { Context ctx_dev(0, 1, 1); - Eltwise eltwise_dev; + Eltwise eltwise_dev; LOG(INFO) << "eltwise compute output shape"; - eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param); - - //SABER_CHECK(eltwise_dev.compute_output_shape(output_dev_4d, input_dev_4d, param)); + eltwise_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); // Verify output shape Shape sh = output_dev_4d[0]->valid_shape(); @@ -86,22 +84,9 @@ TEST(TestSaberFuncBM, test_func_prod) { LOG(INFO) << "eltwise compute"; eltwise_dev(input_dev_4d, output_dev_4d, param, ctx_dev); - - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); print_tensor_device(*output_dev_4d[0]); - cudaDeviceSynchronize(); - - - TensorHf4 th_for_print; - th_for_print.re_alloc(output_dev_4d[0]->valid_shape()); - th_for_print.copy_from(*output_dev_4d[0]); - print_tensor_host(th_for_print); - - CUDA_CHECK(cudaPeekAtLastError()); } -*/ TEST(TestSaberFuncBM, test_func_sum) { From 0596def27ff050ff804522d176ecc490ddd9be08 Mon Sep 17 00:00:00 2001 From: "frank.xie" Date: Mon, 18 Jun 2018 13:39:29 +0800 Subject: [PATCH 122/318] Initial checkin for BM device support --- .idea/workspace.xml | 393 +++++++++ CMakeLists.txt | 24 +- cmake/compiler_options.cmake | 18 + cmake/config/anakin_config.h.in | 2 + cmake/gather.cmake | 6 + framework/core/data_types.h | 9 + saber/CMakeLists.txt | 38 +- saber/core/common.h | 14 + saber/core/impl/bm/bm_device.cpp | 24 + saber/core/impl/bm/bm_impl.cpp | 89 ++ saber/core/target_traits.h | 7 + saber/core/target_wrapper.h | 60 +- saber/core/tensor_op.cpp | 6 +- saber/funcs/CMakeLists.txt | 12 + saber/funcs/impl/bm/base/CMakeLists.txt | 20 + .../impl/bm/base/include/bmdnn/bmdnn_api.h | 814 ++++++++++++++++++ .../bm/base/include/bmdnn/bmdnn_ext_api.h | 438 ++++++++++ .../bm/base/include/bmdnn/bmdnn_runtime.h | 20 + .../impl/bm/base/include/bmdnn/op_code.h | 62 ++ 
.../bm/base/include/bmlib/bmlib_runtime.h | 229 +++++ .../impl/bm/base/include/bmlib/bmlib_utils.h | 72 ++ .../impl/bm/base/include/bmruntime/bmblob.h | 97 +++ .../impl/bm/base/include/bmruntime/bmcnnctx.h | 58 ++ .../impl/bm/base/include/bmruntime/bmnet.h | 78 ++ .../bm/base/include/bmruntime/bmruntime.h | 154 ++++ .../base/include/bmruntime/bmruntime_common.h | 65 ++ .../include/bmruntime/bmruntime_interface.h | 11 + saber/funcs/impl/bm/vender_activation.h | 96 +++ saber/funcs/impl/bm/vender_conv.h | 195 +++++ saber/funcs/impl/bm/vender_conv_act.h | 198 +++++ saber/funcs/impl/bm/vender_conv_act_pooling.h | 176 ++++ saber/funcs/impl/bm/vender_fc.h | 114 +++ saber/funcs/impl/bm/vender_pooling.h | 151 ++++ saber/saber_funcs_param.h | 12 +- saber/saber_types.h | 10 +- test/saber/bm/test_TargetWrapper_BM.cpp | 16 + test/saber/bm/test_saber_buffer_BM.cpp | 116 +++ test/saber/bm/test_saber_buffer_BM.h | 20 + test/saber/bm/test_saber_context_BM.cpp | 31 + test/saber/bm/test_saber_context_BM.h | 21 + test/saber/bm/test_saber_device_BM.cpp | 20 + test/saber/bm/test_saber_device_BM.h | 21 + test/saber/bm/test_saber_func_BM.h | 38 + .../bm/test_saber_func_activation_BM.cpp | 183 ++++ test/saber/bm/test_saber_func_conv_BM.cpp | 725 ++++++++++++++++ test/saber/bm/test_saber_func_fc_BM.cpp | 148 ++++ test/saber/bm/test_saber_func_pooling_BM.cpp | 311 +++++++ test/saber/bm/test_saber_shape_BM.cpp | 126 +++ test/saber/bm/test_saber_shape_BM.h | 25 + test/saber/bm/test_saber_tensor_BM.cpp | 642 ++++++++++++++ test/saber/bm/test_saber_tensor_BM.h | 21 + 51 files changed, 6218 insertions(+), 18 deletions(-) create mode 100644 .idea/workspace.xml create mode 100644 saber/core/impl/bm/bm_device.cpp create mode 100644 saber/core/impl/bm/bm_impl.cpp create mode 100644 saber/funcs/impl/bm/base/CMakeLists.txt create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h create mode 100644 
saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h create mode 100644 saber/funcs/impl/bm/base/include/bmdnn/op_code.h create mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h create mode 100644 saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmblob.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmcnnctx.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmnet.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h create mode 100644 saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h create mode 100644 saber/funcs/impl/bm/vender_activation.h create mode 100644 saber/funcs/impl/bm/vender_conv.h create mode 100644 saber/funcs/impl/bm/vender_conv_act.h create mode 100644 saber/funcs/impl/bm/vender_conv_act_pooling.h create mode 100644 saber/funcs/impl/bm/vender_fc.h create mode 100644 saber/funcs/impl/bm/vender_pooling.h create mode 100644 test/saber/bm/test_TargetWrapper_BM.cpp create mode 100644 test/saber/bm/test_saber_buffer_BM.cpp create mode 100644 test/saber/bm/test_saber_buffer_BM.h create mode 100644 test/saber/bm/test_saber_context_BM.cpp create mode 100644 test/saber/bm/test_saber_context_BM.h create mode 100644 test/saber/bm/test_saber_device_BM.cpp create mode 100644 test/saber/bm/test_saber_device_BM.h create mode 100644 test/saber/bm/test_saber_func_BM.h create mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp create mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp create mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp create mode 100644 test/saber/bm/test_saber_func_pooling_BM.cpp create mode 100644 test/saber/bm/test_saber_shape_BM.cpp create mode 100644 test/saber/bm/test_saber_shape_BM.h create mode 100644 test/saber/bm/test_saber_tensor_BM.cpp create mode 100644 test/saber/bm/test_saber_tensor_BM.h 
diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 000000000..48b584478 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,393 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MvnParam + ConvParam + TargetType + mem_set + & + BM + print + print_tensor_host + + + + + + + + + + + true + DEFINITION_ORDER + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -252,24 +203,23 @@ - - - + - + @@ -350,19 +300,15 @@ - - - - - - + + - + @@ -371,20 +317,34 @@ - - + + - - + + - - - - + + + + + + + + + + + + + + + + + + diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h index 0da1a1106..b857eacdd 100644 --- a/saber/funcs/impl/bm/vender_pooling.h +++ b/saber/funcs/impl/bm/vender_pooling.h @@ -1,23 +1,7 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H #include "saber/funcs/impl/impl_pooling.h" -#include "saber/funcs/impl/cuda/cudnn_helper.h" namespace anakin{ @@ -29,12 +13,12 @@ template -class VenderPooling:\ +class VenderPooling:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PoolingParam>> { + Tensor, + Tensor, + Tensor, + PoolingParam>> { public: typedef Tensor DataTensor_in; typedef Tensor DataTensor_out; @@ -62,8 +46,8 @@ class VenderPooling& inputs, std::vector& outputs, PoolingParam ¶m) { - const InDataType *in_data = inputs[0]->data(); - OutDataType *out_data = outputs[0]->mutable_data(); + const InDataType in_data = *(inputs[0]->data()); + OutDataType out_data = *(outputs[0]->mutable_data()); int input_n = inputs[0]->num(); int input_c = inputs[0]->channel(); int input_h = inputs[0]->height(); @@ -74,27 +58,29 @@ class VenderPooling; +template class VenderPooling; } //namespace saber } // namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H +#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index ce8e7f8f5..2a490c588 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -12,8 +12,6 @@ TEST(TestSaberFuncBM, test_func_pooling) { Env::env_init(); typedef TargetWrapper API; - typename API::event_t event; - API::create_event(event); typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; @@ -104,8 +102,6 @@ TEST(TestSaberFuncBM, test_pooling_result) { Env::env_init(); typedef TargetWrapper API; - typename API::event_t event; - API::create_event(event); typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; @@ -180,8 +176,6 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { Env::env_init(); typedef TargetWrapper API; - 
typename API::event_t event; - API::create_event(event); typedef TargetWrapper X86_API; typedef TargetWrapper BM_API; From ccfa11b050820f5f56ea3a863070b6b9d77214ed Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 13:18:23 +0800 Subject: [PATCH 180/318] Implement print_tensor_device for BM --- saber/core/tensor_op.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 6de80bce4..06ee5bd79 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -312,6 +312,42 @@ void fill_tensor_device_const(Tensor& tensor, float value, \ delete [] host_mem_input; } +template <> +void print_tensor_device>(Tensor& tensor, \ + typename Tensor::API::stream_t stream) { + + LOG(INFO) << "BM device tensor data:" << tensor.size(); + + /* + const bm_device_mem_t* device_data_ptr = tensor.data(); + unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr); + bm_flush(get_bm_handle()); + float* device_data = (float*)bm_get_global_addr(gaddr); + + for (int i = 0; i < tensor.size(); ++i) { + printf("%.2f ", device_data[i]); + + if ((i + 1) % (4 * tensor.width()) == 0) { + printf("\n"); + } + }*/ + + float *host_mem = new float[tensor.size()]; + auto* device_data_ptr = const_cast(tensor.data()); + bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr); + + for (int i = 0; i < tensor.size(); ++i) { + printf("%.2f ", host_mem[i]); + + if ((i + 1) % (4 * tensor.width()) == 0) { + printf("\n"); + } + } + printf("\n"); + + delete [] host_mem; +} + #endif } //namespace saber From 00384141404cb4022c6399e2e4454a26f66c6d30 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 13:25:48 +0800 Subject: [PATCH 181/318] Update BM tensor test --- saber/core/tensor_op.cpp | 2 ++ test/saber/bm/test_saber_tensor_BM.cpp | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/saber/core/tensor_op.cpp 
b/saber/core/tensor_op.cpp index 06ee5bd79..219a41fd8 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -333,6 +333,8 @@ void print_tensor_device>(Tensor& tenso }*/ float *host_mem = new float[tensor.size()]; + bm_flush(get_bm_handle()); + auto* device_data_ptr = const_cast(tensor.data()); bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr); diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index d42665528..dfd8d90c9 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -55,7 +55,6 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorHf4 thost1(sh1); TensorDf4 tdev1(sh1); - //! test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; @@ -65,17 +64,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { // host to device tdev1.copy_from(thost0); - //TODO: print tensor for BM device - //print_tensor_host(tdev1); + print_tensor_device(tdev1); // device to host thost1.copy_from(tdev1); print_tensor_host(thost1); - /* - // device to device + //device to device tdev1.copy_from(tdev0); + print_tensor_device(tdev1); + /* //! test tensor constructor with shape and real_shape LOG(INFO) << "test tensor constructor with shape and real_shape"; //! 
constructor with 3 shapes is removed From 9ca8735409b7760ce4a3032b8c533f7dd3f0402d Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Tue, 26 Jun 2018 13:38:28 +0800 Subject: [PATCH 182/318] fix pooling api error --- saber/funcs/impl/bm/vender_pooling.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h index b857eacdd..108a70708 100644 --- a/saber/funcs/impl/bm/vender_pooling.h +++ b/saber/funcs/impl/bm/vender_pooling.h @@ -67,8 +67,7 @@ class VenderPooling Date: Tue, 26 Jun 2018 14:50:10 +0800 Subject: [PATCH 183/318] Update pooling test --- test/saber/bm/test_saber_func_pooling_BM.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index 2a490c588..944ab6a18 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -9,8 +9,6 @@ using namespace anakin::saber; TEST(TestSaberFuncBM, test_func_pooling) { - - Env::env_init(); typedef TargetWrapper API; typedef TargetWrapper X86_API; @@ -42,6 +40,8 @@ TEST(TestSaberFuncBM, test_func_pooling) { // start Reshape & doInfer + LOG(INFO) << "init env..."; + Env::env_init(); Context ctx1(0, 1, 1); int window_h = 2; int window_w = 2; @@ -279,6 +279,9 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { } int main(int argc, const char** argv) { + //TODO: init in another place + static bm_handle_t handle; + bmdnn_init(&handle); // initial logger //logger::init(argv[0]); InitTest(); From 7f1a4f3dd3deeb08abcfb711240ed59166795aba Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 15:01:16 +0800 Subject: [PATCH 184/318] Skip context init for BM --- saber/core/context.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/saber/core/context.h b/saber/core/context.h index 292f38449..2147033f0 100644 --- a/saber/core/context.h +++ 
b/saber/core/context.h @@ -18,6 +18,7 @@ #include "core/env.h" #include "saber/saber_types.h" +#include #ifdef USE_BM #include "bmlib_runtime.h" @@ -41,6 +42,11 @@ class Context final{ * @param compute_stream_id */ Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){ + if(std::is_same::value){ + LOG(INFO) << "context init for BM"; + return; + } + CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!"; if (device_id >= devs.size()){ LOG(WARNING) << "device index exceeds the number of devices, set to default device(0)!"; @@ -64,6 +70,11 @@ class Context final{ } Context(const Context& ctx){ + if(std::is_same::value){ + LOG(INFO) << "context init for BM"; + return; + } + _device_id = ctx._device_id; _data_stream_id = ctx._data_stream_id; _compute_stream_id = ctx._compute_stream_id; From 154d5ad1cf35693ef800953893447550e51363ce Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 15:09:02 +0800 Subject: [PATCH 185/318] remove flush action in print --- saber/core/tensor_op.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 219a41fd8..06ee5bd79 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -333,8 +333,6 @@ void print_tensor_device>(Tensor& tenso }*/ float *host_mem = new float[tensor.size()]; - bm_flush(get_bm_handle()); - auto* device_data_ptr = const_cast(tensor.data()); bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr); From d84c51ef97d612094a7ed2948baa49e8e96e9760 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 16:09:58 +0800 Subject: [PATCH 186/318] ignore set_device for BM for now --- saber/core/impl/bm/bm_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index d2790d0a9..fa51bf2d7 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -47,7 +47,7 @@ 
void BM_API::get_device_count(int &count) { void BM_API::set_device(int id){ //(bm_handle_t &handle, bool bmkernel_used, int id){ - BMDNN_CHECK(bm_dev_request(&handle, 0, id)); + //BMDNN_CHECK(bm_dev_request(&handle, 0, id)); } //TODO: Do we have this functionality? From fe303220195d1a223bcb0299ee2c402f587c57b6 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 16:26:19 +0800 Subject: [PATCH 187/318] Update logs for copy_from --- saber/core/tensor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/saber/core/tensor.h b/saber/core/tensor.h index ff4728aa9..244b2a1c7 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -1012,7 +1012,7 @@ size_t Tensor::_type_len(){ template<> template<> inline SaberStatus Tensor::copy_from(const Tensor& tensor) { - LOG(INFO) << "BM copy_from"; + LOG(INFO) << "BM copy_from X86"; CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; auto* device_data_ptr = mutable_data(); @@ -1023,7 +1023,7 @@ SaberStatus Tensor::copy_from(const Tensor template<> template<> inline SaberStatus Tensor::copy_from(const Tensor& tensor) { - LOG(INFO) << "X86 copy_from"; + LOG(INFO) << "X86 copy_from BM"; CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; auto* device_data_ptr = const_cast(tensor.data()); From a6088e39d62c38c301df1027ae58f988a3fa9487 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Tue, 26 Jun 2018 17:42:26 +0800 Subject: [PATCH 188/318] Initialize bm handle only in one place --- saber/core/impl/bm/bm_impl.cpp | 4 ++-- test/saber/bm/test_TargetWrapper_BM.cpp | 6 +++--- test/saber/bm/test_saber_buffer_BM.cpp | 4 ---- test/saber/bm/test_saber_func_pooling_BM.cpp | 3 --- test/saber/bm/test_saber_tensor_BM.cpp | 4 ---- 5 files changed, 5 insertions(+), 16 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index fa51bf2d7..60e52088e 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ 
b/saber/core/impl/bm/bm_impl.cpp @@ -37,9 +37,9 @@ namespace saber{ typedef TargetWrapper BM_API; -//TODO: check exception -//static bm_handle_t handle = get_bm_handle(); +// Init handle only once in the lifetime static bm_handle_t handle; +static bm_status_t init_handle{bmdnn_init(&handle)}; void BM_API::get_device_count(int &count) { BMDNN_CHECK(bm_dev_getcount(&count)); diff --git a/test/saber/bm/test_TargetWrapper_BM.cpp b/test/saber/bm/test_TargetWrapper_BM.cpp index b893183a2..8de77498a 100644 --- a/test/saber/bm/test_TargetWrapper_BM.cpp +++ b/test/saber/bm/test_TargetWrapper_BM.cpp @@ -4,9 +4,9 @@ #ifdef USE_BM using namespace anakin::saber; -static bm_handle_t handle; +//static bm_handle_t handle; int main() { - bmdnn_init(&handle); + //bmdnn_init(&handle); typedef TargetWrapper API; //int dev_count = 0; //API::get_device_count(dev_count); @@ -20,7 +20,7 @@ int main() { std::cout << "Start mem_free test." << std::endl; API::mem_free(pmem); std::cout << "End mem_free test." << std::endl; - bmdnn_deinit(handle); + //bmdnn_deinit(handle); } #endif diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index 9910638fb..dce1fae15 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -132,10 +132,6 @@ TEST(TestSaberBufferBM, test_buffer_memcpy) { } int main(int argc, const char** argv) { - //TODO: init in another place - static bm_handle_t handle; - bmdnn_init(&handle); - // initial logger logger::init(argv[0]); InitTest(); diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index 944ab6a18..e988bc573 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -279,9 +279,6 @@ TEST(TestSaberFuncBM, test_pooling_shared_buffer) { } int main(int argc, const char** argv) { - //TODO: init in another place - static bm_handle_t handle; - bmdnn_init(&handle); // initial logger 
//logger::init(argv[0]); InitTest(); diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index dfd8d90c9..2dcd61c41 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -624,10 +624,6 @@ TEST(TestSaberTensorBM, test_tensor_base_type) { }*/ int main(int argc, const char** argv) { - //TODO: init in another place - static bm_handle_t handle; - bmdnn_init(&handle); - // initial logger logger::init(argv[0]); InitTest(); From 42d7ee0bfc9ad96ab8658b2db95d1542036787e0 Mon Sep 17 00:00:00 2001 From: lian <327842846@qq.com> Date: Tue, 26 Jun 2018 10:46:30 +0000 Subject: [PATCH 189/318] chage tensor type_len --- .idea/workspace.xml | 70 +- saber/core/target_wrapper.h | 6 +- saber/core/tensor.h | 7 +- test/framework/core/base_types_test.cpp | 143 ---- test/framework/graph/graph_base_test.cpp | 82 -- test/saber/bm/test_saber_buffer_BM.h | 20 - test/saber/bm/test_saber_context_BM.h | 21 - test/saber/bm/test_saber_device_BM.cpp | 20 - test/saber/bm/test_saber_device_BM.h | 21 - test/saber/bm/test_saber_func_BM.h | 38 - .../bm/test_saber_func_activation_BM.cpp | 88 --- test/saber/bm/test_saber_func_conv_BM.cpp | 725 ------------------ test/saber/bm/test_saber_func_fc_BM.cpp | 146 ---- test/saber/bm/test_saber_shape_BM.cpp | 126 --- test/saber/bm/test_saber_shape_BM.h | 25 - test/saber/bm/test_saber_tensor_BM.cpp | 47 +- 16 files changed, 83 insertions(+), 1502 deletions(-) delete mode 100644 test/framework/core/base_types_test.cpp delete mode 100644 test/framework/graph/graph_base_test.cpp delete mode 100644 test/saber/bm/test_saber_buffer_BM.h delete mode 100644 test/saber/bm/test_saber_context_BM.h delete mode 100644 test/saber/bm/test_saber_device_BM.cpp delete mode 100644 test/saber/bm/test_saber_device_BM.h delete mode 100644 test/saber/bm/test_saber_func_BM.h delete mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp delete mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp 
delete mode 100644 test/saber/bm/test_saber_func_fc_BM.cpp delete mode 100644 test/saber/bm/test_saber_shape_BM.cpp delete mode 100644 test/saber/bm/test_saber_shape_BM.h diff --git a/.idea/workspace.xml b/.idea/workspace.xml index aec21f6ee..718ee2682 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -10,8 +10,21 @@ - - + + + + + + + + + + + + + + + @@ -119,8 +134,8 @@ @@ -203,23 +218,24 @@ - + - + - + @@ -305,16 +321,6 @@ - - - - - - - - - - @@ -344,10 +350,20 @@ - + + + + + + + + + + + \ No newline at end of file diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 648c85ed4..49a6e9364 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -521,7 +521,7 @@ struct TargetWrapper { static void mem_alloc(void** ptr, size_t n); //template - static void mem_free(void * ptr); + static void mem_free(void * ptr); //template static void mem_set(void* ptr, int value, size_t n); @@ -546,7 +546,7 @@ struct TargetWrapper { size_t count, __HtoD) {}; static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoH); + size_t count, __DtoH) {}; static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ int src_dev, size_t count) {}; @@ -556,6 +556,8 @@ struct TargetWrapper { * @return currently activated device id */ static int get_device_id(); + + static bm_handle_t get_handler(); }; #endif //USE_BM diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 244b2a1c7..93af6822f 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -1003,10 +1003,11 @@ class Tensor { }; #ifdef USE_BM - +#ifndef BM_TENSOR_COPY +#define BM_TENSOR_COPY template<> inline size_t Tensor::_type_len(){ - return 1; + return 4; } template<> @@ -1030,7 +1031,7 @@ SaberStatus Tensor::copy_from(const Tensor BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr)); return SaberSuccess; } - +#endif #endif } //namespace saber diff --git 
a/test/framework/core/base_types_test.cpp b/test/framework/core/base_types_test.cpp deleted file mode 100644 index 0109493bf..000000000 --- a/test/framework/core/base_types_test.cpp +++ /dev/null @@ -1,143 +0,0 @@ -#include "core_test.h" -#include "any.h" -#include "singleton.h" -#include "tls.h" -#include "parameter.h" -#include "thread_pool.h" - -#ifdef USE_CUDA -#include "cuda_funcs.h" -#include "sass_funcs.h" -#endif - -#include "tensor.h" - -#ifdef USE_CUDA -TEST(CoreComponentsTest, sass_test) { - LOG(INFO) << "test for cuda code function"; - //anakin::saber::Tensor<3, RTCUDA, float, NCHW> ts; - //LOG(WARNING) << " tensor num " << ts.num(); - //ts.set_offset(8); - //my_print(); - LOG(INFO) << "test for sass code function 1"; - invoke_test(); - LOG(INFO) << "test for sass code function 2"; - invoke_test_2(); -} -#endif - -TEST(CoreComponentsTest, core_base_types_any_test) { - LOG(INFO) << "test for any class ."; - LOG(WARNING) << " level 1 : base type int (set 42 to any)"; - const int a = 42; - any any_a(42); - int result_a = any_cast(any_a); - - LOG(INFO) << "casted result : " << result_a; - LOG(WARNING) << " level 2 : base type float (set 42.8 to any)"; - float b = 42.8; - any any_b = b; - float result_b = any_cast(any_b); - LOG(INFO) << "casted result : " << result_b << " decide: "; - - LOG(WARNING) << " level 3 : ptuple type (set PTuple to any)"; - PTuple p_tuple_float(3.2f, 3.3f, 3.5f); - p_tuple_float.push_back(4.3); // push_back - - any p_tuple_float_any = p_tuple_float; - auto result_p_tuple_float_any = any_cast>(p_tuple_float_any); - - for (int i = 0; i < result_p_tuple_float_any.size(); i++) { - LOG(INFO) << " any casted PTuple[" << i << "]: " << result_p_tuple_float_any[i]; - } - - struct target { - void print() { - LOG(INFO) << " target struct Successfully recovered."; - } - }; - - LOG(WARNING) << " level 5 : struct type"; - - target tg; - - any any_tg = tg; - - target result_tg = any_cast(any_tg); - - result_tg.print(); - - LOG(WARNING) << " level 
other : struct type"; - - any any_tg_copy = any_tg; - - target result_tg_copy = any_cast(any_tg); - - result_tg_copy.print(); -} - -void at_exit_in_test() { - LOG(WARNING) << "core_base_types_singleton_test exit successfully!"; -} - -TEST(CoreComponentsTest, core_base_types_singleton_test) { - struct target { - target() { - LOG(INFO) << " singleton target constructed"; - } - }; - typedef Singleton sg_target; - sg_target::Global(); -} - -typedef AnakinThreadLocalVar sg_tls; -void thread_func_0() { - int* tmp = sg_tls::value(); - *tmp = 3; - LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value()); -} -void thread_func_1() { - int* tmp = sg_tls::value(); - *tmp = 4; - - LOG(INFO) << " thread_func_0 value: " << *(sg_tls::value()); -} -TEST(CoreComponentsTest, core_base_types_tls_test) { - LOG(INFO) << " Create tls var 0 , check in two thread."; - std::thread first(thread_func_0); - std::thread sec(thread_func_1); - first.join(); - sec.join(); - LOG(INFO) << " main thread var: " << *(sg_tls::value()); -} - -int thread_pool_func(int i) { - LOG(INFO) << " thread_pool_func input : " << i; - //std::this_thread::sleep_for(std::chrono::seconds(0)); - return i; -} - -TEST(CoreComponentsTest, core_base_types_thread_pool_test) { - LOG(INFO) << " Create thread pool with thread num = 12 "; - ThreadPool thread_pool_test(100); - thread_pool_test.launch(); - std::function test = thread_pool_func; - - for (int i = 0; i < 50; i++) { - // run async - auto ret = thread_pool_test.RunAsync(test, i); - LOG(INFO) << " return : " << ret.get(); - - // run sync - //auto sync_ret = thread_pool_test.RunSync(test, i); - } -} - - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/framework/graph/graph_base_test.cpp b/test/framework/graph/graph_base_test.cpp deleted file mode 100644 index d42e86c02..000000000 --- a/test/framework/graph/graph_base_test.cpp +++ /dev/null @@ -1,82 +0,0 @@ 
-#include -#include "graph_test.h" -#include "graph_base.h" - -using namespace anakin; -using namespace anakin::graph; - -//! Usage sample -class GraphTestClass : public GraphBase { -public: - GraphTestClass() {} - ~GraphTestClass() {} - virtual bool directed() { - return true; - }; -}; -class edge : public Arc { -public: - edge(std::string btm, std::string top, int weight): Arc(btm, top, weight) {} - ~edge() {} -}; - -TEST(GraphTest, graph_base_test) { - LOG(INFO) << "test for graph base ."; - - GraphTestClass graph; - graph.add_vertex("a", 42); - graph.add_vertex("b", 43); - graph.add_vertex("c", 44); - graph.add_vertex("d", 45); - graph.add_vertex("e", 46); - graph.add_vertex("f", 47); - - edge arc0("a", "b", 0); - edge arc1("b", "c", 1); - edge arc2("c", "d", 2); - edge arc3("d", "e", 3); - edge arc4("e", "f", 4); - edge arc5("f", "a", 5); - - graph.add_in_arc(arc0); - graph.add_in_arc(arc1); - graph.add_in_arc(arc2); - graph.add_in_arc(arc3); - graph.add_in_arc(arc4); - graph.add_in_arc(arc5); - graph.add_out_arc(arc0); - graph.add_out_arc(arc1); - graph.add_out_arc(arc2); - graph.add_out_arc(arc3); - graph.add_out_arc(arc4); - graph.add_out_arc(arc5); - - LOG(WARNING) << "Construction of graph."; - LOG(INFO) << graph.to_string(); - - LOG(WARNING) << "Remove a from graph."; - graph.remove("a"); - LOG(INFO) << graph.to_string(); - - LOG(WARNING) << "Add arc: f->b to graph."; - edge arc_f_b("f", "b", 10); - graph.add_in_arc(arc_f_b); - graph.add_out_arc(arc_f_b); - LOG(INFO) << graph.to_string(); - - LOG(WARNING) << "Add vertex:a and arc: a->e to graph."; - graph.add_vertex("a", 47); - edge arc_a_e("a", "e", 10); - graph.add_out_arc(arc_a_e); - graph.add_in_arc(arc_a_e); - LOG(INFO) << graph.to_string(); -} - - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/saber/bm/test_saber_buffer_BM.h b/test/saber/bm/test_saber_buffer_BM.h deleted file mode 100644 
index 8bbbe4511..000000000 --- a/test/saber/bm/test_saber_buffer_BM.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H -#define ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" - -using namespace anakin::test; - -class TestSaberBufferBM : public Test { -public: - TestSaberBufferBM() {} - ~TestSaberBufferBM() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //ANAKIN_TEST_SABER_TEST_SABER_BUFFER_BM_H diff --git a/test/saber/bm/test_saber_context_BM.h b/test/saber/bm/test_saber_context_BM.h deleted file mode 100644 index 653ee11fd..000000000 --- a/test/saber/bm/test_saber_context_BM.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef SABER_TEST_SABER_CONTEXT_BM_H -#define SABER_TEST_SABER_CONTEXT_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/context.h" - -using namespace anakin::test; - -class TestSaberContextBM : public Test { -public: - TestSaberContextBM() {} - ~TestSaberContextBM() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //SABER_TEST_SABER_CONTEXT_BM_H diff --git a/test/saber/bm/test_saber_device_BM.cpp b/test/saber/bm/test_saber_device_BM.cpp deleted file mode 100644 index 1c7086cf1..000000000 --- a/test/saber/bm/test_saber_device_BM.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include "test_saber_device_BM.h" - -#ifdef USE_BM - -using namespace anakin::saber; - -TEST(TestSaberDeviceBM, test_BM_device) { - Device dev_BM; -} - -#endif - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_device_BM.h b/test/saber/bm/test_saber_device_BM.h deleted file mode 100644 index 3a6d61236..000000000 --- a/test/saber/bm/test_saber_device_BM.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef SABER_TEST_SABER_DEVICE_BM_H -#define 
SABER_TEST_SABER_DEVICE_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/device.h" - -using namespace anakin::test; - -class TestSaberDeviceBM : public Test { -public: - TestSaberDeviceBM() {} - ~TestSaberDeviceBM() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //SABER_TEST_SABER_DEVICE_BM_H diff --git a/test/saber/bm/test_saber_func_BM.h b/test/saber/bm/test_saber_func_BM.h deleted file mode 100644 index 61d27d6f9..000000000 --- a/test/saber/bm/test_saber_func_BM.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H -#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/tensor.h" -#include -#include - -using namespace anakin::test; - -int read_file(std::vector &results, const char* file_name) { - - std::ifstream infile(file_name); - if (!infile.good()) { - std::cout << "Cannot open " << std::endl; - return false; - } - LOG(INFO)<<"found filename: "< - -using namespace anakin::saber; - -template -void print_tensor_shape(std::string name, Tensor& t0) { - - LOG(INFO) << name << " valid shape is [" - << t0.valid_shape()[0] << ", " - << t0.valid_shape()[1] << ", " - << t0.valid_shape()[2] << ", " - << t0.valid_shape()[3] << "]."; - - LOG(INFO) << name << " real shape is [" - << t0.shape()[0] << ", " - << t0.shape()[1] << ", " - << t0.shape()[2] << ", " - << t0.shape()[3] << "]."; - - LOG(INFO) << name << " offset is [" - << t0.offset()[0] << ", " - << t0.offset()[1] << ", " - << t0.offset()[2] << ", " - << t0.offset()[3] << "]."; -} - -TEST(TestSaberFuncBM, test_func_constructor) { - - typedef Tensor TensorHf4; - typedef Tensor TensorDf4; - - int img_num = 1; - int in_channels = 2; - int img_h = 8; - int img_w = 8; - - Shape img_s(img_num, in_channels, img_h, img_w); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for 
(int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * -1); - } - - img_dev.copy_from(img_host); - TensorDf4 output_dev; - - // start Reshape & doInfer - - Context ctx1(0, 1, 1); - - ActivationParam param(Active_relu, 0.1f, 0.1f); - - std::vector input; - std::vector output; - - input.push_back(&img_dev); - output.push_back(&output_dev); - - Activation act; - act.compute_output_shape(input, output, param); - output_dev.re_alloc(output[0]->shape()); - - // init assume output tensor has been reshpaed by user. - act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); - act(input, output, param, ctx1); - - print_tensor_device(output_dev); -} - -int main(int argc, const char** argv) { - Env::env_init(); - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp deleted file mode 100644 index 7881cdb97..000000000 --- a/test/saber/bm/test_saber_func_conv_BM.cpp +++ /dev/null @@ -1,725 +0,0 @@ -#include "core/context.h" -#include "funcs/conv.h" -#include "test_saber_func_BM.h" -#include "tensor_op.h" -#include "saber_types.h" -#include -//#include "cublas.h" - -using namespace anakin::saber; - -typedef Tensor TensorHf4; -typedef Tensor TensorDf4; - -template -void print_tensor_shape(std::string name, Tensor &t0) { - - LOG(INFO) << name << " valid shape is [" - << t0.valid_shape()[0] << ", " - << t0.valid_shape()[1] << ", " - << t0.valid_shape()[2] << ", " - << t0.valid_shape()[3] << "]."; - - LOG(INFO) << name << " real shape is [" - << t0.shape()[0] << ", " - << t0.shape()[1] << ", " - << t0.shape()[2] << ", " - << t0.shape()[3] << "]."; - - LOG(INFO) << name << " offset is [" - << t0.offset()[0] << ", " - << t0.offset()[1] << ", " - << t0.offset()[2] << ", " - << t0.offset()[3] << "]."; -} - - - -#if 1 -TEST(TestSaberFuncBM, test_depthwise_conv) { - - int group = 2; - int pad_h = 1; 
- int pad_w = 1; - int stride_h = 1; - int stride_w = 1; - int dilation_h = 1; - int dilation_w = 1; - - int kernel_h = 3; - int kernel_w = 3; - int out_channels = 2; - - int img_num = 1; - int in_channels = 2; - int img_h = 8; - int img_w = 8; - - bool bias_term = true; - - LOG(INFO) << "conv param: "; - LOG(INFO) << " img_num = " << img_num; - LOG(INFO) << " in_channels = " << in_channels; - LOG(INFO) << " img_h = " << img_h; - LOG(INFO) << " img_w = " << img_w; - LOG(INFO) << " group = " << group; - LOG(INFO) << " pad_h = " << pad_h; - LOG(INFO) << " pad_w = " << pad_w; - LOG(INFO) << " stride_h = " << stride_h; - LOG(INFO) << " stride_w = " << stride_w; - LOG(INFO) << " dilation_h = " << dilation_h; - LOG(INFO) << " dilation_w = " << dilation_w; - LOG(INFO) << " kernel_h = " << kernel_h; - LOG(INFO) << " kernel_w = " << kernel_w; - LOG(INFO) << " out_channels = " << out_channels; - - Shape img_s(img_num, in_channels, img_h, img_w); - Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); - Shape bias_s(1, out_channels, 1, 1); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 63 & i; - } - - img_dev.copy_from(img_host); - - TensorHf4 weights_host; - TensorDf4 weights_dev; - - weights_host.re_alloc(weights_s); - weights_dev.re_alloc(weights_s); - - fill_tensor_host_const(weights_host, 1.f); - weights_dev.copy_from(weights_host); - - TensorHf4 bias_host; - TensorDf4 bias_dev; - - if (bias_term) { - bias_host.re_alloc(bias_s); - bias_dev.re_alloc(bias_s); - - fill_tensor_host_const(bias_host, 1.f); - bias_dev.copy_from(bias_host); - } - - TensorHf4 output_host; - TensorDf4 output_dev; - - // start Reshape & doInfer - Context ctx1(0, 1, 1); - - ConvParam param(group, pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - &weights_dev, &bias_dev); - - std::vector input; - std::vector output; - - 
input.push_back(&img_dev); - output.push_back(&output_dev); - - Conv conv; - conv.compute_output_shape(input, output, param); - - output_dev.re_alloc(output[0]->shape()); - output_host.re_alloc(output[0]->shape()); - - LOG(INFO) << "regular start with group = " << group; - // init assume output tensor has been reshpaed by user. - conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); - - conv(input, output, param, ctx1); - - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); - - output_dev.sync(); - print_tensor_device(output_dev); - -// param.group = 1; -// param.pad_h = 1; -// param.pad_w = 1; -// -// LOG(INFO) << " param changed start with group = "< ctx1(0, 1, 1); - Context ctx2(0, 2, 2); - - TensorDf4 out0; - TensorDf4 out1; - - ConvParam param0(group, pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - &weights_dev, &bias_dev); - - ConvParam param1(group, pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - &weights_dev, &bias_dev); - - std::vector input0, input1; - std::vector output0, output1; - - input0.push_back(&t0); - input1.push_back(&t1); - - output0.push_back(&out0); - output1.push_back(&out1); - - // FIXME ? 
where do i get output shape - output_dev.re_alloc(img_s); - - Conv conv0; - Conv conv1; - - conv0.compute_output_shape(input0, output0, param0); - conv1.compute_output_shape(input1, output1, param1); - - out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0}); - out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4}); - - conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1); - conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2); - - conv0(input0, output0, param0, ctx1); - conv1(input1, output1, param1, ctx2); - - cudaStream_t cuda_stream1 = ctx1.get_compute_stream(); - output0[0]->record_event(cuda_stream1); - - cudaStream_t cuda_stream2 = ctx2.get_compute_stream(); - output1[0]->record_event(cuda_stream2); - - out0.sync(); - out1.sync(); - - print_tensor_device(output_dev); - -// print_tensor_device(output_dev); - - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); -} -#endif - -TEST(TestSaberFuncBM, test_conv_fp32_speed_test) { - - int group = 1; - int pad_h = 1; - int pad_w = 1; - int stride_h = 1; - int stride_w = 1; - int dilation_h = 1; - int dilation_w = 1; - - int kernel_h = 1; - int kernel_w = 1; - int out_channels = 128; - - int img_num = 7; - int in_channels = 13; - int img_h = 32; - int img_w = 32; - - bool bias_term = false; - - LOG(INFO) << "conv param: "; - LOG(INFO) << " img_num = " << img_num; - LOG(INFO) << " in_channels = " << in_channels; - LOG(INFO) << " img_h = " << img_h; - LOG(INFO) << " img_w = " << img_w; - LOG(INFO) << " group = " << group; - LOG(INFO) << " pad_h = " << pad_h; - LOG(INFO) << " pad_w = " << pad_w; - LOG(INFO) << " stride_h = " << stride_h; - LOG(INFO) << " stride_w = " << stride_w; - LOG(INFO) << " dilation_h = " << dilation_h; - LOG(INFO) << " dilation_w = " << dilation_w; - LOG(INFO) << " kernel_h = " << kernel_h; - LOG(INFO) << " kernel_w = " << kernel_w; - LOG(INFO) << " out_channels = " << out_channels; - Shape img_s(img_num, in_channels, img_h, img_w); 
- Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); - Shape bias_s(1, out_channels, 1, 1); - - TensorHf4 img_host; - TensorDf4 img_dev; - - img_host.re_alloc(img_s); - img_dev.re_alloc(img_s); - - for (int i = 0; i < img_host.size(); ++i) { - img_host.mutable_data()[i] = 1; - } - - img_dev.copy_from(img_host); - - TensorHf4 weights_host; - TensorDf4 weights_dev; - - weights_host.re_alloc(weights_s); - weights_dev.re_alloc(weights_s); - - fill_tensor_host_const(weights_host, 1.f); - weights_dev.copy_from(weights_host); - - TensorHf4 bias_host; - TensorDf4 bias_dev; - - if (bias_term) { - bias_host.re_alloc(bias_s); - bias_dev.re_alloc(bias_s); - - fill_tensor_host_const(bias_host, 1.f); - bias_dev.copy_from(bias_host); - } - - TensorDf4 output_dev; - - // start Reshape & doInfer - Context ctx1(0, 1, 1); - - ConvParam param(group, pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - &weights_dev, &bias_dev); - - std::vector input; - std::vector output; - - input.push_back(&img_dev); - output.push_back(&output_dev); - - Conv conv; - conv.compute_output_shape(input, output, param); - - output_dev.re_alloc(output[0]->shape()); - LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \ - << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]"; - //LOG(INFO) << " blocks = [ " << i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; - //选择k最小的那一组,如果一样,则选128*N,N最大的那一组 - int k0 = i_div_up(out_channels, 128) * 128 - out_channels; - int k1 = i_div_up(out_channels, 64) * 64 - out_channels; - int k2 = i_div_up(out_channels, 32) * 32 - out_channels; - int kk = std::min(std::min(k0,k1),k2); - LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk; - if (kk == k0) - LOG(INFO) << "thread = [256,1,1] 128*128" ; - if (kk == k1) - LOG(INFO) << "thread = [128,1,1] 128*64" ; - if (kk == k2) - LOG(INFO) << "thread = 
[128,1,1] 128*32" ; - - LOG(INFO) << "saber conv init"; - conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1); - - LOG(INFO) << "saber conv dispatch"; - conv(input, output, param, ctx1); - - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); - - output_dev.sync(); - - SaberTimer t1; - int ts = 1; - - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - conv(input, output, param, ctx1); - output_dev.sync(); - t1.end(ctx1); - } - - LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms"; - - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); -} - -void test_conv_fp32_speed(std::vector &inputs, std::vector &outputs, - TensorDf4 &weights, int kernel_size, int stride, int pad, - int in_channel, int out_channel, TensorDf4 &bias, - anakin::saber::ImplEnum impl) { - - ConvParam conv_param(1, pad, pad, - stride, stride, - 1, 1, - &weights, &bias); - Conv conv; - conv.compute_output_shape(inputs, outputs, conv_param); - outputs[0]->re_alloc(outputs[0]->shape()); - Context ctx1(0, 1, 1); - - SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1)); - - conv(inputs, outputs, conv_param, ctx1); - outputs[0]->record_event(ctx1.get_compute_stream()); - outputs[0]->sync(); - - cudaDeviceSynchronize(); - - SaberTimer t1; - int ts = 100; - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - conv(inputs, outputs, conv_param, ctx1); - outputs[0]->record_event(ctx1.get_compute_stream()); - outputs[0]->sync(); - t1.end(ctx1); - } - LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; - - cudaDeviceSynchronize(); -} - - -cublasHandle_t cublas_handle; - -void caffe_gemm(const int M, const int N, const int K,\ - const float alpha, const float* A,\ - const float* B, const float beta, float* C) { - int lda = K; - int ldb = N; - CUBLAS_CHECK(cublasSgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - N, M, K, - &alpha, B, - ldb, A, - lda, &beta, - C, N)); -} - -TEST(TestSaberFuncBM, 
test_conv_fp32_1x1_speed) { - int img_num = 1; - int kernel = 1; - -// int out_channels = 32; -// int in_channels = 128; -// int img_h = 52; -// int img_w = 112; -// int out_channels = 64; -// int in_channels = 256; -// int img_h = 26; -// int img_w = 56; - int out_channels = 128; - int in_channels = 512; - int img_h = 13; - int img_w = 28; - -// int out_channels = 512; -// int in_channels = 128; -// int img_h = 13; -// int img_w = 28; - - int pad = 0; - int stride = 1; - Context ctx1(0, 1, 1); - - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream())); - - TensorDf4 weights; - weights.re_alloc({out_channels, in_channels, 1, 1}); - - TensorDf4 img; - img.re_alloc({1, in_channels, img_h, img_w}); - - TensorDf4 out; - out.re_alloc({1, out_channels, img_h, img_w}); - TensorDf4 out_gemm; - out_gemm.re_alloc({1, out_channels, img_h, img_w}); - - fill_tensor_device_rand(weights, -1.f, 1.f); - fill_tensor_device_rand(img, -1.f, 1.f); - - LOG(INFO) << "img_num: " << img_num; - LOG(INFO) << "kernel: " << kernel; - LOG(INFO) << "out_channels: " << out_channels; - LOG(INFO) << "in_channels: " << in_channels; - LOG(INFO) << "img_h: " << img_h; - LOG(INFO) << "img_w: " << img_w; - LOG(INFO) << "pad: " << pad; - LOG(INFO) << "stride: " << stride; - - TensorDf4 bias; - - std::vector input_v; - std::vector output_gemm_v, output_v; - - input_v.push_back(&img); - output_v.push_back(&out); - output_gemm_v.push_back(&out_gemm); - cudaDeviceSynchronize(); - test_conv_fp32_speed(input_v, output_v, - weights, kernel, stride, pad, - in_channels, out_channels, bias, - SABER_IMPL); - cudaDeviceSynchronize(); - caffe_gemm(out_channels, img_h * img_w, in_channels,\ - 1.f, weights.data(),\ - img.data(), 0.f, out_gemm.mutable_data()); - cudaDeviceSynchronize(); - SaberTimer t1; - int ts = 100; - - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - caffe_gemm(out_channels, img_h * img_w, in_channels,\ - 1.f, weights.data(),\ - 
img.data(), 0.f, out_gemm.mutable_data()); - out_gemm.record_event(ctx1.get_compute_stream()); - out_gemm.sync(); - t1.end(ctx1); - } - LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; - - cudaDeviceSynchronize(); -// print_tensor_device(out); -// print_tensor_device(out_gemm); - TensorHf4 out_host; - TensorHf4 out_gemm_host; - out_host.re_alloc(out.shape()); - out_host.copy_from(out); - - out_gemm_host.re_alloc(out_gemm.shape()); - out_gemm_host.copy_from(out_gemm); - double max_r, max_d; - tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d); - LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d; -} - -int main(int argc, const char** argv){ - anakin::saber::Env::env_init(); - - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_func_fc_BM.cpp b/test/saber/bm/test_saber_func_fc_BM.cpp deleted file mode 100644 index 869ff1bfd..000000000 --- a/test/saber/bm/test_saber_func_fc_BM.cpp +++ /dev/null @@ -1,146 +0,0 @@ -#include "core/context.h" -#include "funcs/fc.h" -#include "test_saber_func_BM.h" -#include "tensor_op.h" -#include "saber_types.h" -#include - -using namespace anakin::saber; -typedef TargetWrapper API; -typedef Tensor TensorDf4; -typedef Tensor TensorHf4; -typedef TensorDf4::Dtype ftype; - -void fc_compute(const TensorHf4& tin, const TensorHf4& weight, \ - const TensorHf4& bias, TensorHf4& tout) { - - int m = tin.num(); - int k = tin.valid_size() / m; - int n = weight.valid_size() / k; - bool bias_term = bias.valid_size() > 0; - - const float* din = tin.data(); - const float* w = weight.data(); - float* dout = tout.mutable_data(); - - for (int i = 0; i < m; ++i) { - float* pdout = dout + i * n; - const float* pdin = din + i * k; - - for (int j = 0; j < n; ++j) { - if (bias_term) { - pdout[j] = bias.data()[j]; - } else { - pdout[j] = 0; - } - - for (int l = 0; l < k; ++l) { - pdout[j] += pdin[l] * w[l * n 
+ j]; - } - } - } -} - -TEST(TestSaberFuncBM, test_func_fc) { - - int test_iter = 100; - int w_in = 7; - int h_in = 7; - int ch_in = 512; - int num_in = 1; - - int num_out = 4096; - int axis = 1; - - Shape shape_in(num_in, ch_in, h_in, w_in); - Shape shape_out = {num_in, num_out, 1, 1}; - - Shape sh_w{1, 1, w_in* h_in * ch_in, num_out}; - TensorDf4 weight(sh_w); - Shape sh_b{1, 1, 1, num_out}; - TensorDf4 bias(sh_b); - fill_tensor_device_const(weight, 1.f); - fill_tensor_device_const(bias, 1.f); - - LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ - ch_in << ", height=" << h_in << ", width=" << w_in; - - std::vector input_dev_4d; - std::vector output_dev_4d; - - TensorDf4 tdin; - TensorDf4 tdout; - tdin.re_alloc(shape_in); - fill_tensor_device_const(tdin, 1.f); - input_dev_4d.push_back(&tdin); - output_dev_4d.push_back(&tdout); - - // start Reshape & doInfer - Context ctx_dev(0, 1, 1); - - FcParam param(&weight, &bias, num_out, axis); - - Fc fc; - - LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ - shape_out[2] << ", " << shape_out[3]; - - SABER_CHECK(fc.compute_output_shape(input_dev_4d, output_dev_4d, param)); - - LOG(INFO) << "re-alloc tensor buffer"; - output_dev_4d[0]->re_alloc(output_dev_4d[0]->valid_shape()); - Shape va_sh = tdout.valid_shape(); - LOG(INFO) << "shape out 4d: " << va_sh[0] << ", " << va_sh[1] << ", " << \ - va_sh[2] << ", " << va_sh[3]; - CHECK_EQ(tdout.valid_shape() == shape_out, true) << "compute output shape error"; - - LOG(INFO) << "FC initialization"; - SABER_CHECK(fc.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev)); - - LOG(INFO) << "FC compute"; - SaberTimer t1; - t1.clear(); - t1.start(ctx_dev); - - for (int i = 0; i < test_iter; ++i) { - SABER_CHECK(fc(input_dev_4d, output_dev_4d, param, ctx_dev)); - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); - //cudaDeviceSynchronize(); - } - - t1.end(ctx_dev); - float ts = 
t1.get_average_ms(); - LOG(INFO) << "total time: " << ts << "avg time: " << ts / test_iter; - //print_tensor_device(*output_dev_4d[0]); - - //! check result - TensorHf4 thin(shape_in); - TensorHf4 thout(shape_out); - TensorHf4 thw(sh_w); - TensorHf4 thb(sh_b); - thin.copy_from(tdin); - thw.copy_from(weight); - thb.copy_from(bias); - fc_compute(thin, thw, thb, thout); - //print_tensor_host(thout); - - TensorHf4 thout_d(shape_out); - thout_d.copy_from(tdout); - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(thout.data(), thout_d.data(), thout.valid_size(), max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_LE(fabs(max_ratio), 1.0e-6) << "error result"; - -} - -int main(int argc, const char** argv) { - // initial logger - //logger::init(argv[0]); - Env::env_init(); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/saber/bm/test_saber_shape_BM.cpp b/test/saber/bm/test_saber_shape_BM.cpp deleted file mode 100644 index 18479cd18..000000000 --- a/test/saber/bm/test_saber_shape_BM.cpp +++ /dev/null @@ -1,126 +0,0 @@ -#include "test_saber_shape_BM.h" -#include "shape.h" -#include "anakin_config.h" - -#ifdef USE_OPENMP -#include -#include -#endif - -using namespace anakin; -using namespace saber; - - -TEST(TestSaberShapeBM, test_saber_shape) { - - int dim = 4; - Shape sh4d0{0, 0, 0, 0}; - CHECK_EQ(sh4d0.dims(), 4) << "check shape dim error"; - - for (int i = 0; i < dim; ++i) { - CHECK_EQ(sh4d0[i], 0) << "check default constructor, dim size error"; - } - - CHECK_EQ(sh4d0.count(), 0) << "check shape count error"; - - int N = 1; - int C = 3; - int H = 11; - int W = 11; - std::vector sh_size = {N, C, H, W}; - //Shape sh4d1(sh_size); - Shape sh4d1(N, C, H, W); - LOG(INFO) << "Test Saber Shape, size of shape: " << sh4d1.size(); - CHECK_EQ(sh4d1.count(), N * C * H * W) << "size error with vector constructor!"; - //CHECK_EQ(sh4d2.size(), N * C * H * W) << "size error with 
args constructor!"; - - CHECK_EQ(sh4d1[0], N) << "get shape size error"; - CHECK_EQ(sh4d1[1], C) << "get shape size error"; - CHECK_EQ(sh4d1[2], H) << "get shape size error"; - CHECK_EQ(sh4d1[3], W) << "get shape size error"; - - //CHECK_EQ(sh4d2[0], N) << "get shape size error"; - //CHECK_EQ(sh4d2[1], C) << "get shape size error"; - //CHECK_EQ(sh4d2[2], H) << "get shape size error"; - //CHECK_EQ(sh4d2[3], W) << "get shape size error"; - - CHECK_EQ(sh4d1.count(0), N * C * H * W) << "calculate count failed"; - - C = 10; - sh4d1[1] = C; - CHECK_EQ(sh4d1[1], C) << "set shape size error"; - - bool is_equal = (sh4d0 == sh4d1); - CHECK_EQ(is_equal, false) << "check shape is_equal failed"; - - sh4d0 = sh4d1; - CHECK_EQ(sh4d1[0], N) << "constructor failed"; - CHECK_EQ(sh4d1[1], C) << "get shape size error"; - CHECK_EQ(sh4d1[2], H) << "get shape size error"; - CHECK_EQ(sh4d1[3], W) << "get shape size error"; - - Shape sh4d3 = sh4d1; - CHECK_EQ((sh4d3 == sh4d1), true) << "constructor error"; - - Shape sh4d4(sh4d1); - CHECK_EQ((sh4d4 == sh4d1), true) << "constructor error"; - - Shape sh1d0{0}; - //std::vector sh1d_size = {W}; - - //Shape sh1d1(sh1d_size); - //Shape sh1d0{W}; - Shape sh1d1(W); - - Shape sh1d3 = sh1d1; - Shape sh1d4(sh1d1); - - CHECK_EQ(sh1d0.dims(), 1) << "shape dim error"; - - CHECK_EQ(sh1d0.count(), 0) << "shape size error"; - - CHECK_EQ(sh1d0.count(0), 0) << "shape1d count error"; - - CHECK_EQ(sh1d1[0], W) << "get shape size error"; - - //CHECK_EQ(sh1d2.count(0), W) << "shape dim error"; - - CHECK_EQ((sh1d0 != sh1d1), true) << "compare shape error"; - - CHECK_EQ((sh1d3 == sh1d1), true) << "compare shape error"; - - CHECK_EQ((sh1d4 == sh1d1), true) << "compare shape error"; - - Shape sh0{2, 2, 3, 4}; - Shape sh1{2, 1, 1, 24}; - Shape sh2{2, 2, 3, 4}; - Shape sh3{1, 1, 2, 3}; - - CHECK_EQ(sh0 == sh2, true) << "error =="; - CHECK_EQ(sh3 < sh0, true) << "error <"; - CHECK_EQ(sh3 >= sh0, false) << "error >="; - CHECK_EQ(sh3 > sh0, false) << "error >"; - 
CHECK_EQ(sh0 > sh3, true) << "error >"; - CHECK_EQ(sh0 < sh1, false) << "error <"; - CHECK_EQ(sh0 <= sh2, true) << "error <="; - CHECK_EQ(sh0 >= sh2, true) << "error >="; - - Shape sh001 = Shape::zero(2); - Shape sh002 = Shape::zero(3); - - if (sh001 > sh002) { - LOG(ERROR) << "error <"; - } - -} - - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - - diff --git a/test/saber/bm/test_saber_shape_BM.h b/test/saber/bm/test_saber_shape_BM.h deleted file mode 100644 index a2ca02c9b..000000000 --- a/test/saber/bm/test_saber_shape_BM.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H -#define ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "saber/core/shape.h" - -using namespace anakin::test; - -class TestSaberShapeBM : public Test { -public: - TestSaberShapeBM() {} - ~TestSaberShapeBM() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -protected: - std::string name; - std::string _test; -}; - -#endif //ANAKIN_TEST_SABER_TEST_SABER_SHAPE_BM_H - diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 2dcd61c41..69b1ccbfc 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -9,7 +9,9 @@ typedef Tensor TensorHf4; typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; +static bm_handle_t handle; TEST(TestSaberTensorBM, test_tensor_constructor) { + bmdnn_init(&handle); //! test empty constructor LOG(INFO) << "test default (empty) constructor"; @@ -28,13 +30,13 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { //! 
test tensor re_alloc function on tensor with data LOG(INFO) << "|--test tensor re_alloc function on tensor with data"; - Shape sh1(1, 2, 4, 4); + Shape sh1(2, 4, 4, 2); thost0.re_alloc(sh1); tdev0.re_alloc(sh1); LOG(INFO) << "|--tensor size of host: " << thost0.size(); LOG(INFO) << "|--tensor size of device: " << tdev0.size(); - CHECK_EQ(thost0.size(), 32) << "error with tensor size"; - CHECK_EQ(tdev0.size(), 32) << "error with tensor size"; + CHECK_EQ(thost0.size(), 64) << "error with tensor size"; + CHECK_EQ(tdev0.size(), 64) << "error with tensor size"; //! test tensor shape() function LOG(INFO) << "|--test tensor shape() function"; @@ -45,9 +47,9 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { << thost0.height() << ", width = " << thost0.width(); //! test tensor mutable_data() function - LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 1.f"; - fill_tensor_host_const(thost0, 1.f); - LOG(INFO) << "|--test tensor data() function, show the const data, 1.f"; + LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 2.f"; + fill_tensor_host_const(thost0, 2.f); + LOG(INFO) << "|--test tensor data() function, show the const data, 2.f"; print_tensor_host(thost0); //! test tensor constructor with shape @@ -55,6 +57,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorHf4 thost1(sh1); TensorDf4 tdev1(sh1); + //! test tensor copy_from() function LOG(INFO) << "test copy_from() function, input tensor could be any target"; @@ -64,17 +67,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { // host to device tdev1.copy_from(thost0); - print_tensor_device(tdev1); + //TODO: print tensor for BM device + //print_tensor_host(tdev1); // device to host thost1.copy_from(tdev1); print_tensor_host(thost1); - //device to device + + // device to device tdev1.copy_from(tdev0); - print_tensor_device(tdev1); - /* //! 
test tensor constructor with shape and real_shape LOG(INFO) << "test tensor constructor with shape and real_shape"; //! constructor with 3 shapes is removed @@ -97,22 +100,35 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count()); dev_data_ptr = static_cast(tmp_pt_dev); - cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); +// bm_memcpy_d2s(handle,host_data_ptr,dev_data_ptr) +// cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); LOG(INFO) << "|--construct host tensor from host data ptr"; TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); LOG(INFO) << "|--constructor device tensor from host data ptr"; - TensorDf4 tdev3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); + +// TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); + + TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); + print_tensor_host(thost3); - print_tensor_device(tdev3); - //cudaDeviceSynchronize(); + TensorHf4 thost_lian(sh1); + thost_lian.copy_from(tdev3); + print_tensor_host(thost_lian); + + thost_lian.copy_from(thost3); + print_tensor_host(thost_lian); + + //cudaDeviceSynchronize(); + // +/* LOG(INFO) << "|--construct host tensor from device data ptr"; TensorHf4 thost4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); LOG(INFO) << "|--constructor device tensor from device data ptr"; TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); print_tensor_host(thost4); print_tensor_device(tdev4); - +/* //BM_API::stream_t dev_stream0; //BM_API::create_stream_with_flag(dev_stream0, 1); //cudaDeviceSynchronize(); @@ -202,6 +218,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "|--show root tensor while data is changed by shared tensor"; print_tensor_host(thost4); */ +// bmdnn_deinit(handle); } /* From 
e048078e03e8b7b8858159019ff0d2684dca4249 Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Tue, 26 Jun 2018 21:12:57 +0800 Subject: [PATCH 190/318] Implement conv for BM --- saber/funcs/impl/bm/vender_conv.h | 41 +- test/saber/bm/test_saber_func_conv_BM.cpp | 730 ++++++++++++++++++++++ 2 files changed, 767 insertions(+), 4 deletions(-) create mode 100644 test/saber/bm/test_saber_func_conv_BM.cpp diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index a0a3b3fb5..778094886 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -62,10 +62,43 @@ class VenderConv2D +//#include "cublas.h" + +using namespace anakin::saber; + +typedef Tensor TensorHf4; +typedef Tensor TensorDf4; + +template +void print_tensor_shape(std::string name, Tensor &t0) { + + LOG(INFO) << name << " valid shape is [" + << t0.valid_shape()[0] << ", " + << t0.valid_shape()[1] << ", " + << t0.valid_shape()[2] << ", " + << t0.valid_shape()[3] << "]."; + + LOG(INFO) << name << " real shape is [" + << t0.shape()[0] << ", " + << t0.shape()[1] << ", " + << t0.shape()[2] << ", " + << t0.shape()[3] << "]."; + + LOG(INFO) << name << " offset is [" + << t0.offset()[0] << ", " + << t0.offset()[1] << ", " + << t0.offset()[2] << ", " + << t0.offset()[3] << "]."; +} + +//Round a / b to nearest higher integer value +inline int i_div_up(int a, int b) +{ + return (a % b != 0) ? 
(a / b + 1) : (a / b); +} + +#if 1 +TEST(TestSaberFuncBM, test_depthwise_conv) { + + int group = 2; + int pad_h = 1; + int pad_w = 1; + int stride_h = 1; + int stride_w = 1; + int dilation_h = 1; + int dilation_w = 1; + + int kernel_h = 3; + int kernel_w = 3; + int out_channels = 2; + + int img_num = 1; + int in_channels = 2; + int img_h = 8; + int img_w = 8; + + bool bias_term = true; + + LOG(INFO) << "conv param: "; + LOG(INFO) << " img_num = " << img_num; + LOG(INFO) << " in_channels = " << in_channels; + LOG(INFO) << " img_h = " << img_h; + LOG(INFO) << " img_w = " << img_w; + LOG(INFO) << " group = " << group; + LOG(INFO) << " pad_h = " << pad_h; + LOG(INFO) << " pad_w = " << pad_w; + LOG(INFO) << " stride_h = " << stride_h; + LOG(INFO) << " stride_w = " << stride_w; + LOG(INFO) << " dilation_h = " << dilation_h; + LOG(INFO) << " dilation_w = " << dilation_w; + LOG(INFO) << " kernel_h = " << kernel_h; + LOG(INFO) << " kernel_w = " << kernel_w; + LOG(INFO) << " out_channels = " << out_channels; + + Shape img_s(img_num, in_channels, img_h, img_w); + Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); + Shape bias_s(1, out_channels, 1, 1); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + for (int i = 0; i < img_host.size(); ++i) { + img_host.mutable_data()[i] = 63 & i; + } + + img_dev.copy_from(img_host); + + TensorHf4 weights_host; + TensorDf4 weights_dev; + + weights_host.re_alloc(weights_s); + weights_dev.re_alloc(weights_s); + + fill_tensor_host_const(weights_host, 1.f); + weights_dev.copy_from(weights_host); + + TensorHf4 bias_host; + TensorDf4 bias_dev; + + if (bias_term) { + bias_host.re_alloc(bias_s); + bias_dev.re_alloc(bias_s); + + fill_tensor_host_const(bias_host, 1.f); + bias_dev.copy_from(bias_host); + } + + TensorHf4 output_host; + TensorDf4 output_dev; + + // start Reshape & doInfer + Context ctx1(0, 1, 1); + + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + 
dilation_h, dilation_w, + &weights_dev, &bias_dev); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Conv conv; + conv.compute_output_shape(input, output, param); + + output_dev.re_alloc(output[0]->shape()); + output_host.re_alloc(output[0]->shape()); + + LOG(INFO) << "regular start with group = " << group; + // init assume output tensor has been reshpaed by user. + conv.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); + + conv(input, output, param, ctx1); + + //cudaStream_t cuda_stream = ctx1.get_compute_stream(); + //output[0]->record_event(cuda_stream); + + //output_dev.sync(); + print_tensor_device(output_dev); + +// param.group = 1; +// param.pad_h = 1; +// param.pad_w = 1; +// +// LOG(INFO) << " param changed start with group = "< ctx1(0, 1, 1); + Context ctx2(0, 2, 2); + + TensorDf4 out0; + TensorDf4 out1; + + ConvParam param0(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + ConvParam param1(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + std::vector input0, input1; + std::vector output0, output1; + + input0.push_back(&t0); + input1.push_back(&t1); + + output0.push_back(&out0); + output1.push_back(&out1); + + // FIXME ? 
where do i get output shape + output_dev.re_alloc(img_s); + + Conv conv0; + Conv conv1; + + conv0.compute_output_shape(input0, output0, param0); + conv1.compute_output_shape(input1, output1, param1); + + out0.share_sub_buffer(output_dev, output0[0]->valid_shape(),{0,0,0,0}); + out1.share_sub_buffer(output_dev, output1[0]->valid_shape(),{0,0,4,4}); + + conv0.init(input0, output0, param0, SPECIFY, VENDER_IMPL, ctx1); + conv1.init(input1, output1, param1, SPECIFY, VENDER_IMPL, ctx2); + + conv0(input0, output0, param0, ctx1); + conv1(input1, output1, param1, ctx2); + + /* + cudaStream_t cuda_stream1 = ctx1.get_compute_stream(); + output0[0]->record_event(cuda_stream1); + + cudaStream_t cuda_stream2 = ctx2.get_compute_stream(); + output1[0]->record_event(cuda_stream2); + + out0.sync(); + out1.sync(); + */ + print_tensor_device(output_dev); + +// print_tensor_device(output_dev); + + //cudaDeviceSynchronize(); + //CUDA_CHECK(cudaPeekAtLastError()); +} +#endif + +TEST(TestSaberFuncBM, test_conv_fp32_speed_test) { + + int group = 1; + int pad_h = 1; + int pad_w = 1; + int stride_h = 1; + int stride_w = 1; + int dilation_h = 1; + int dilation_w = 1; + + int kernel_h = 1; + int kernel_w = 1; + int out_channels = 128; + + int img_num = 7; + int in_channels = 13; + int img_h = 32; + int img_w = 32; + + bool bias_term = false; + + LOG(INFO) << "conv param: "; + LOG(INFO) << " img_num = " << img_num; + LOG(INFO) << " in_channels = " << in_channels; + LOG(INFO) << " img_h = " << img_h; + LOG(INFO) << " img_w = " << img_w; + LOG(INFO) << " group = " << group; + LOG(INFO) << " pad_h = " << pad_h; + LOG(INFO) << " pad_w = " << pad_w; + LOG(INFO) << " stride_h = " << stride_h; + LOG(INFO) << " stride_w = " << stride_w; + LOG(INFO) << " dilation_h = " << dilation_h; + LOG(INFO) << " dilation_w = " << dilation_w; + LOG(INFO) << " kernel_h = " << kernel_h; + LOG(INFO) << " kernel_w = " << kernel_w; + LOG(INFO) << " out_channels = " << out_channels; + Shape img_s(img_num, in_channels, 
img_h, img_w); + Shape weights_s(out_channels, in_channels, kernel_h, kernel_w); + Shape bias_s(1, out_channels, 1, 1); + + TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + for (int i = 0; i < img_host.size(); ++i) { + img_host.mutable_data()[i] = 1; + } + + img_dev.copy_from(img_host); + + TensorHf4 weights_host; + TensorDf4 weights_dev; + + weights_host.re_alloc(weights_s); + weights_dev.re_alloc(weights_s); + + fill_tensor_host_const(weights_host, 1.f); + weights_dev.copy_from(weights_host); + + TensorHf4 bias_host; + TensorDf4 bias_dev; + + if (bias_term) { + bias_host.re_alloc(bias_s); + bias_dev.re_alloc(bias_s); + + fill_tensor_host_const(bias_host, 1.f); + bias_dev.copy_from(bias_host); + } + + TensorDf4 output_dev; + + // start Reshape & doInfer + Context ctx1(0, 1, 1); + + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Conv conv; + conv.compute_output_shape(input, output, param); + + output_dev.re_alloc(output[0]->shape()); + LOG(INFO) << "Output shape = [ " << output[0]->shape()[0] << " " << output[0]->shape()[1] << " " \ + << output[0]->shape()[2] << " " << output[0]->shape()[3] << "]"; + //LOG(INFO) << " blocks = [ " << i_div_up(img_num*output[0]->shape()[2]*output[0]->shape()[3],128) << " " << i_div_up(out_channels*kernel_h, 128) << " 1 ]" ; + //选择k最小的那一组,如果一样,则选128*N,N最大的那一组 + int k0 = i_div_up(out_channels, 128) * 128 - out_channels; + int k1 = i_div_up(out_channels, 64) * 64 - out_channels; + int k2 = i_div_up(out_channels, 32) * 32 - out_channels; + int kk = std::min(std::min(k0,k1),k2); + LOG(INFO) << "k0 = " << k0 << " k1 = " << k1 << " k2 = " << k2 << " kk = " << kk; + if (kk == k0) + LOG(INFO) << "thread = [256,1,1] 128*128" ; + if (kk == k1) + LOG(INFO) << "thread = [128,1,1] 128*64" ; + if (kk == k2) + LOG(INFO) 
<< "thread = [128,1,1] 128*32" ; + + LOG(INFO) << "saber conv init"; + conv.init(input, output, param, SPECIFY, SABER_IMPL, ctx1); + + LOG(INFO) << "saber conv dispatch"; + conv(input, output, param, ctx1); + + //cudaStream_t cuda_stream = ctx1.get_compute_stream(); + //output[0]->record_event(cuda_stream); + + //output_dev.sync(); + + SaberTimer t1; + int ts = 1; + + for (int i = 0; i < ts; ++i) { + t1.start(ctx1); + conv(input, output, param, ctx1); + output_dev.sync(); + t1.end(ctx1); + } + + LOG(INFO) << "fp32 average time: " << t1.get_average_ms() << " ms"; + + //cudaDeviceSynchronize(); + //CUDA_CHECK(cudaPeekAtLastError()); +} + +void test_conv_fp32_speed(std::vector &inputs, std::vector &outputs, + TensorDf4 &weights, int kernel_size, int stride, int pad, + int in_channel, int out_channel, TensorDf4 &bias, + anakin::saber::ImplEnum impl) { + + ConvParam conv_param(1, pad, pad, + stride, stride, + 1, 1, + &weights, &bias); + Conv conv; + conv.compute_output_shape(inputs, outputs, conv_param); + outputs[0]->re_alloc(outputs[0]->shape()); + Context ctx1(0, 1, 1); + + SABER_CHECK(conv.init(inputs, outputs, conv_param, SPECIFY, impl, ctx1)); + + conv(inputs, outputs, conv_param, ctx1); + outputs[0]->record_event(ctx1.get_compute_stream()); + outputs[0]->sync(); + + //cudaDeviceSynchronize(); + + SaberTimer t1; + int ts = 100; + for (int i = 0; i < ts; ++i) { + t1.start(ctx1); + conv(inputs, outputs, conv_param, ctx1); + outputs[0]->record_event(ctx1.get_compute_stream()); + outputs[0]->sync(); + t1.end(ctx1); + } + LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; + + //cudaDeviceSynchronize(); +} + + +cublasHandle_t cublas_handle; + +void caffe_gemm(const int M, const int N, const int K,\ + const float alpha, const float* A,\ + const float* B, const float beta, float* C) { + int lda = K; + int ldb = N; + CUBLAS_CHECK(cublasSgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + N, M, K, + &alpha, B, + ldb, A, + lda, &beta, + C, N)); +} + 
+TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) { + int img_num = 1; + int kernel = 1; + +// int out_channels = 32; +// int in_channels = 128; +// int img_h = 52; +// int img_w = 112; +// int out_channels = 64; +// int in_channels = 256; +// int img_h = 26; +// int img_w = 56; + int out_channels = 128; + int in_channels = 512; + int img_h = 13; + int img_w = 28; + +// int out_channels = 512; +// int in_channels = 128; +// int img_h = 13; +// int img_w = 28; + + int pad = 0; + int stride = 1; + Context ctx1(0, 1, 1); + + CUBLAS_CHECK(cublasCreate(&cublas_handle)); + CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream())); + + TensorDf4 weights; + weights.re_alloc({out_channels, in_channels, 1, 1}); + + TensorDf4 img; + img.re_alloc({1, in_channels, img_h, img_w}); + + TensorDf4 out; + out.re_alloc({1, out_channels, img_h, img_w}); + TensorDf4 out_gemm; + out_gemm.re_alloc({1, out_channels, img_h, img_w}); + + fill_tensor_device_rand(weights, -1.f, 1.f); + fill_tensor_device_rand(img, -1.f, 1.f); + + LOG(INFO) << "img_num: " << img_num; + LOG(INFO) << "kernel: " << kernel; + LOG(INFO) << "out_channels: " << out_channels; + LOG(INFO) << "in_channels: " << in_channels; + LOG(INFO) << "img_h: " << img_h; + LOG(INFO) << "img_w: " << img_w; + LOG(INFO) << "pad: " << pad; + LOG(INFO) << "stride: " << stride; + + TensorDf4 bias; + + std::vector input_v; + std::vector output_gemm_v, output_v; + + input_v.push_back(&img); + output_v.push_back(&out); + output_gemm_v.push_back(&out_gemm); + //cudaDeviceSynchronize(); + test_conv_fp32_speed(input_v, output_v, + weights, kernel, stride, pad, + in_channels, out_channels, bias, + SABER_IMPL); + //cudaDeviceSynchronize(); + caffe_gemm(out_channels, img_h * img_w, in_channels,\ + 1.f, weights.data(),\ + img.data(), 0.f, out_gemm.mutable_data()); + //cudaDeviceSynchronize(); + SaberTimer t1; + int ts = 100; + + for (int i = 0; i < ts; ++i) { + t1.start(ctx1); + caffe_gemm(out_channels, img_h * img_w, in_channels,\ + 
1.f, weights.data(),\ + img.data(), 0.f, out_gemm.mutable_data()); + out_gemm.record_event(ctx1.get_compute_stream()); + out_gemm.sync(); + t1.end(ctx1); + } + LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; + + //cudaDeviceSynchronize(); +// print_tensor_device(out); +// print_tensor_device(out_gemm); + TensorHf4 out_host; + TensorHf4 out_gemm_host; + out_host.re_alloc(out.shape()); + out_host.copy_from(out); + + out_gemm_host.re_alloc(out_gemm.shape()); + out_gemm_host.copy_from(out_gemm); + double max_r, max_d; + tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d); + LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d; +} + +int main(int argc, const char** argv){ + anakin::saber::Env::env_init(); + + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + From a394d60ba35ce000ab105ae80e0f92c0b7bce5aa Mon Sep 17 00:00:00 2001 From: "Guangzhi (Frank) Xie" Date: Tue, 26 Jun 2018 21:21:24 +0800 Subject: [PATCH 191/318] Comment out last conv test for now --- test/saber/bm/test_saber_func_conv_BM.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/saber/bm/test_saber_func_conv_BM.cpp b/test/saber/bm/test_saber_func_conv_BM.cpp index 025a1074c..9a25d00b3 100644 --- a/test/saber/bm/test_saber_func_conv_BM.cpp +++ b/test/saber/bm/test_saber_func_conv_BM.cpp @@ -601,7 +601,7 @@ void test_conv_fp32_speed(std::vector &inputs, std::vector Date: Tue, 26 Jun 2018 13:42:52 +0000 Subject: [PATCH 192/318] Modify sync_memcpy & add bm_mem_from_device --- saber/core/impl/bm/bm_impl.cpp | 16 ++++++++++------ saber/core/target_wrapper.h | 2 +- .../impl/bm/base/include/bmlib/bmlib_runtime.h | 3 +++ test/saber/bm/test_saber_buffer_BM.cpp | 10 ++++++---- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index 60e52088e..ef26884b2 100644 --- a/saber/core/impl/bm/bm_impl.cpp 
+++ b/saber/core/impl/bm/bm_impl.cpp @@ -81,16 +81,20 @@ void BM_API::mem_set(void* ptr, int value, size_t n){ //static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ // size_t count, __DtoD) {}; -//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ -// size_t count, __HtoD) {}; +void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __HtoD) { + handle = get_bm_handle(); + BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), bm_mem_from_system(src))); + LOG(INFO) << "BM sync_memcpy: host to device, finished"; +}; void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __DtoH) { handle = get_bm_handle(); - //auto* dev_ptr = const_cast(src); - BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); - //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *src)); - LOG(INFO) << "End sync_memcpy process"; + BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), bm_mem_from_device(src))); + //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); + //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(reinterpret_cast(src)))); + LOG(INFO) << "BM sync_memcpy: device to host, finished"; }; //static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 49a6e9364..475fbba84 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -543,7 +543,7 @@ struct TargetWrapper { size_t count, __DtoD) {}; static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __HtoD) {}; + size_t count, __HtoD); static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __DtoH) {}; diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h index 
932b17138..7d537401c 100644 --- a/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h +++ b/saber/funcs/impl/bm/base/include/bmlib/bmlib_runtime.h @@ -148,6 +148,9 @@ bm_status_t bm_memset_device( bm_device_mem_t bm_mem_from_system( void * system_addr); +bm_device_mem_t bm_mem_from_device( + void * device_addr); + /* *brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to device mem if need_copy is true diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index dce1fae15..555e22675 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -5,7 +5,7 @@ using namespace anakin::saber; int get_bm_size() { - return 1; + return 4; } template @@ -27,7 +27,7 @@ void test_buffer() { x86_ptr = static_cast(tmp_x86); for (int i = 0; i < n0; i++) { - x86_ptr[i] = static_cast(i); + x86_ptr[i] = static_cast(100); } void* tmp_bm; @@ -105,6 +105,7 @@ void test_buffer() { for (int i = 0; i < 10; i++) { std::cout << "x86: " << x86_buf2_ptr[i] << std::endl; } + */ const Hdtype* bm_buf1_ptr = static_cast(bm_buf1.get_data()); for (int i = 0; i < 10; i++) { @@ -115,16 +116,17 @@ void test_buffer() { LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype); LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype); - */ + x86_buf1.re_alloc(bm_buf1.get_capacity()); x86_buf1.sync_copy_from(bm_buf1); LOG(INFO) << "deep copy from device buffer to host buffer: "; ptr1 = static_cast(x86_buf1.get_data()); - for (int i = 0; i < 30; i++) { + for (int i = 0; i < 10; i++) { std::cout << ptr1[i] << std::endl; } + } TEST(TestSaberBufferBM, test_buffer_memcpy) { From 8925da303d5a923b078b9b4624f642808c9f7468 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 09:39:38 +0800 Subject: [PATCH 193/318] Update BM conv params --- saber/funcs/impl/bm/vender_conv.h | 25 +++++++++++++++++-------- 1 file changed, 
17 insertions(+), 8 deletions(-) diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 778094886..530eef528 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -36,6 +36,8 @@ class VenderConv2D& inputs, std::vector& outputs, ConvParam& param, Context& ctx) { + + _handle = get_bm_handle(); return create(inputs, outputs, param, ctx); } @@ -50,18 +52,26 @@ class VenderConv2Ddata(); const InDataType *bias = (const InDataType *) param.bias()->data(); OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + int input_n = inputs[0]->num(); int input_c = inputs[0]->channel(); int input_h = inputs[0]->height(); int input_w = inputs[0]->width(); - int group = param.group; + + int output_n = outputs[0]->num(); int output_c = outputs[0]->channel(); + int output_h = outputs[0]->height(); + int output_w = outputs[0]->width(); + + int group = param.group; int kh = param.weight()->height(); int kw = param.weight()->width(); int pad_h = param.pad_h; int pad_w = param.pad_w; int stride_h = param.stride_h; int stride_w = param.stride_w; + int dilation_h = param.dilation_h; + int dilation_w = param.dilation_w; bm_tensor_4d_t input_shape = { input_n, @@ -71,10 +81,10 @@ class VenderConv2D Date: Wed, 27 Jun 2018 09:41:52 +0800 Subject: [PATCH 194/318] Init handle in init function --- saber/funcs/impl/bm/vender_pooling.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h index 108a70708..6e5de79a4 100644 --- a/saber/funcs/impl/bm/vender_pooling.h +++ b/saber/funcs/impl/bm/vender_pooling.h @@ -35,6 +35,8 @@ class VenderPooling& inputs, std::vector& outputs, PoolingParam &pooling_param, Context &ctx) { + + _handle = get_bm_handle(); return create(inputs, outputs, pooling_param, ctx); } @@ -64,7 +66,7 @@ class VenderPooling Date: Wed, 27 Jun 2018 10:07:03 +0800 Subject: [PATCH 195/318] Include BM conv 
implementation --- saber/funcs/conv.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h index 5a58bb01c..1626d38a9 100644 --- a/saber/funcs/conv.h +++ b/saber/funcs/conv.h @@ -29,6 +29,10 @@ #include "saber/funcs/impl/impl_conv.h" #endif +#ifdef USE_BM +#include "saber/funcs/impl/bm/vender_conv.h" +#endif + #ifdef USE_ARM_PLACE //#include "saber/funcs/impl/arm/saber_conv.h" #endif From 27ba06b87ed8dbca624e06ed5f6c3b9b99a1f2c1 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 10:12:18 +0800 Subject: [PATCH 196/318] remove unecessary include --- saber/funcs/impl/bm/vender_conv.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 530eef528..924bf736c 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -1,8 +1,7 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H -#include "saber/funcs/impl/impl_conv.h" -#include "saber/funcs/impl/bm/bmdnn_api.h" +#include "saber/funcs/impl/impl_conv.h" namespace anakin{ From 88d7ced5fbd1ee9ee0f1289b2841f7a74ea9313c Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 10:26:09 +0800 Subject: [PATCH 197/318] empty create function --- saber/funcs/impl/bm/vender_conv.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 924bf736c..14e52af8e 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -42,7 +42,9 @@ class VenderConv2D& inputs, std::vector& outputs, - ConvParam& param, Context& ctx); + ConvParam& param, Context& ctx) { + + } virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, From 7d9bc02dd45bb1ba86cb816a0c4ca382c6446cd1 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 11:18:00 +0800 Subject: [PATCH 198/318] unit test 
for BM conv --- saber/funcs/impl/bm/vender_conv.h | 6 +- test/saber/bm/test_saber_func_conv_BM.cpp | 88 ++--------------------- 2 files changed, 8 insertions(+), 86 deletions(-) diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 14e52af8e..220b8a14e 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -1,7 +1,7 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H #define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H -#include "saber/funcs/impl/impl_conv.h" +#include "saber/funcs/impl/impl_conv.h" namespace anakin{ @@ -74,6 +74,8 @@ class VenderConv2Dsize() > 0; + bm_tensor_4d_t input_shape = { input_n, input_c, @@ -107,7 +109,7 @@ class VenderConv2D &inputs, std::vector &outputs, @@ -601,23 +573,6 @@ void test_conv_fp32_speed(std::vector &inputs, std::vector ctx1(0, 1, 1); - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - CUBLAS_CHECK(cublasSetStream(cublas_handle, ctx1.get_compute_stream())); - TensorDf4 weights; weights.re_alloc({out_channels, in_channels, 1, 1}); @@ -684,40 +636,8 @@ TEST(TestSaberFuncBM, test_conv_fp32_1x1_speed) { weights, kernel, stride, pad, in_channels, out_channels, bias, SABER_IMPL); - //cudaDeviceSynchronize(); - caffe_gemm(out_channels, img_h * img_w, in_channels,\ - 1.f, weights.data(),\ - img.data(), 0.f, out_gemm.mutable_data()); - //cudaDeviceSynchronize(); - SaberTimer t1; - int ts = 100; - - for (int i = 0; i < ts; ++i) { - t1.start(ctx1); - caffe_gemm(out_channels, img_h * img_w, in_channels,\ - 1.f, weights.data(),\ - img.data(), 0.f, out_gemm.mutable_data()); - out_gemm.record_event(ctx1.get_compute_stream()); - out_gemm.sync(); - t1.end(ctx1); - } - LOG(INFO) << "elapse time: " << t1.get_average_ms() << " ms"; - - //cudaDeviceSynchronize(); -// print_tensor_device(out); -// print_tensor_device(out_gemm); - TensorHf4 out_host; - TensorHf4 out_gemm_host; - out_host.re_alloc(out.shape()); - out_host.copy_from(out); - - out_gemm_host.re_alloc(out_gemm.shape()); - 
out_gemm_host.copy_from(out_gemm); - double max_r, max_d; - tensor_cmp_host(out_host.data(), out_gemm_host.data(), out_host.size(), max_r, max_d); - LOG(INFO) << "cmp result: max_r = " << max_r << " max_d = " << max_d; } -*/ + int main(int argc, const char** argv){ anakin::saber::Env::env_init(); From 5ce905615a9cb22b34fb63a9e71ec51d18f523c3 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 11:26:06 +0800 Subject: [PATCH 199/318] Update BM tensor print function --- saber/core/tensor_op.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 06ee5bd79..6a5d58f03 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -339,7 +339,7 @@ void print_tensor_device>(Tensor& tenso for (int i = 0; i < tensor.size(); ++i) { printf("%.2f ", host_mem[i]); - if ((i + 1) % (4 * tensor.width()) == 0) { + if ((i + 1) % tensor.width() == 0){ printf("\n"); } } From 838a2856dfd32a5cc8951aa6cfb1a6cf6f91536c Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Wed, 27 Jun 2018 05:17:48 +0000 Subject: [PATCH 200/318] modify activation op, test pass --- saber/funcs/impl/bm/vender_activation.h | 15 ++- .../bm/test_saber_func_activation_BM.cpp | 91 +++++++++++++++++++ test/saber/bm/test_saber_func_pooling_BM.cpp | 2 +- 3 files changed, 99 insertions(+), 9 deletions(-) create mode 100644 test/saber/bm/test_saber_func_activation_BM.cpp diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h index c4baf8365..ec27ac054 100644 --- a/saber/funcs/impl/bm/vender_activation.h +++ b/saber/funcs/impl/bm/vender_activation.h @@ -27,7 +27,7 @@ class VenderActivation& outputs, ActivationParam& param, Context& ctx) { // not sure + _handle = get_bm_handle(); return create(inputs, outputs, param, ctx); } @@ -49,14 +50,15 @@ class VenderActivation& inputs, std::vector& outputs, ActivationParam& param) { - const InDataType *in_data = (const InDataType *) inputs[0]->data(); - 
OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + const InDataType in_data = *(inputs[0]->data()); + OutDataType out_data = *(outputs[0]->mutable_data()); int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width(); int input_n = inputs[0]->num(); + _active_type = param.active; switch (_active_type) { case Active_relu: - BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, input_n, input_dim, out_data)); + BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, 0.0, input_n, input_dim, out_data)); break; case Active_sigmoid: BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data)); @@ -64,9 +66,6 @@ class VenderActivation; +template class VenderActivation; } // namespace saber } // namespace anakin diff --git a/test/saber/bm/test_saber_func_activation_BM.cpp b/test/saber/bm/test_saber_func_activation_BM.cpp new file mode 100644 index 000000000..42f33e58d --- /dev/null +++ b/test/saber/bm/test_saber_func_activation_BM.cpp @@ -0,0 +1,91 @@ +#include "core/context.h" +#include "funcs/activation.h" +#include "test_saber_func_BM.h" +#include "tensor_op.h" +#include "saber_types.h" +#include + +using namespace anakin::saber; + +template +void print_tensor_shape(std::string name, Tensor& t0) { + + LOG(INFO) << name << " valid shape is [" + << t0.valid_shape()[0] << ", " + << t0.valid_shape()[1] << ", " + << t0.valid_shape()[2] << ", " + << t0.valid_shape()[3] << "]."; + + LOG(INFO) << name << " real shape is [" + << t0.shape()[0] << ", " + << t0.shape()[1] << ", " + << t0.shape()[2] << ", " + << t0.shape()[3] << "]."; + + LOG(INFO) << name << " offset is [" + << t0.offset()[0] << ", " + << t0.offset()[1] << ", " + << t0.offset()[2] << ", " + << t0.offset()[3] << "]."; +} + +TEST(TestSaberFuncBM, test_func_constructor) { + + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + + int img_num = 1; + int in_channels = 1; + int img_h = 8; + int img_w = 8; + + Shape img_s(img_num, in_channels, img_h, img_w); + + 
TensorHf4 img_host; + TensorDf4 img_dev; + + img_host.re_alloc(img_s); + img_dev.re_alloc(img_s); + + int sign = -1; + for (int i = 0; i < img_host.size(); ++i) { + sign = i % 2 ? -1 : 1; + img_host.mutable_data()[i] = (float)(0.05 * (i & 0x1f) * sign); + } + + img_dev.copy_from(img_host); + TensorDf4 output_dev; + print_tensor_device(img_dev); + + // start Reshape & doInfer + + Context ctx1(0, 1, 1); + + ActivationParam param(Active_relu); + + std::vector input; + std::vector output; + + input.push_back(&img_dev); + output.push_back(&output_dev); + + Activation act; + act.compute_output_shape(input, output, param); + output_dev.re_alloc(output[0]->shape()); + + // init assume output tensor has been reshpaed by user. + act.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); + act(input, output, param, ctx1); + + print_tensor_device(output_dev); +} + +int main(int argc, const char** argv) { + Env::env_init(); + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_func_pooling_BM.cpp b/test/saber/bm/test_saber_func_pooling_BM.cpp index e988bc573..fb1a7398d 100644 --- a/test/saber/bm/test_saber_func_pooling_BM.cpp +++ b/test/saber/bm/test_saber_func_pooling_BM.cpp @@ -80,7 +80,7 @@ TEST(TestSaberFuncBM, test_func_pooling) { pooling(input, output, param, ctx1); SaberTimer t1; - int ts = 1000; + int ts = 100; for (int i = 0; i < ts; ++i) { t1.start(ctx1); From 272ef52bc9ea245097be3f8690f5a1dfd4695f02 Mon Sep 17 00:00:00 2001 From: hlzy <327842846@qq.com> Date: Wed, 27 Jun 2018 01:28:34 -0400 Subject: [PATCH 201/318] tensor_test --- test/saber/bm/test_saber_tensor_BM.cpp | 49 ++++++++++++++------------ 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/test/saber/bm/test_saber_tensor_BM.cpp b/test/saber/bm/test_saber_tensor_BM.cpp index 69b1ccbfc..de787908b 100644 --- a/test/saber/bm/test_saber_tensor_BM.cpp +++ b/test/saber/bm/test_saber_tensor_BM.cpp @@ -8,6 +8,8 @@ 
typedef TargetWrapper BM_API; typedef Tensor TensorHf4; typedef Tensor TensorDf4; typedef TensorHf4::Dtype dtype; +typedef TensorDf4::Dtype dtype2; + static bm_handle_t handle; TEST(TestSaberTensorBM, test_tensor_constructor) { @@ -47,7 +49,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { << thost0.height() << ", width = " << thost0.width(); //! test tensor mutable_data() function - LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 2.f"; + LOG(INFO) << "|--xxxxxxxxtest tensor mutable_data() function, write tensor data buffer with 2.f"; fill_tensor_host_const(thost0, 2.f); LOG(INFO) << "|--test tensor data() function, show the const data, 2.f"; print_tensor_host(thost0); @@ -88,7 +90,7 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { LOG(INFO) << "test tensor constructor with data, if target is different, create buffer, and copy the data"; dtype* host_data_ptr; - dtype* dev_data_ptr; +// dtype2* dev_data_ptr; void* tmp_pt_host; void* tmp_pt_dev; X86_API::mem_alloc(&tmp_pt_host, sizeof(dtype) * sh1.count()); @@ -98,26 +100,28 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { host_data_ptr[i] = i; } - BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype) * sh1.count()); - dev_data_ptr = static_cast(tmp_pt_dev); -// bm_memcpy_d2s(handle,host_data_ptr,dev_data_ptr) -// cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); + BM_API::mem_alloc(&tmp_pt_dev, sizeof(dtype2) * sh1.count()); +// dev_data_ptr = static_cast(tmp_pt_dev); +// bm_memcpy_d2s(handle,*dev_data_ptr,bm_mem_from_system(const_cast(host_data_ptr))); + +//--- cudaMemcpy(dev_data_ptr, host_data_ptr, sizeof(dtype) * sh1.count(), cudaMemcpyHostToDevice); + LOG(INFO) << "|--construct host tensor from host data ptr"; TensorHf4 thost3(host_data_ptr, X86(), X86_API::get_device_id(), sh1); LOG(INFO) << "|--constructor device tensor from host data ptr"; -// TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), 
X86_API::get_device_id(), sh1); - TensorDf4 tdev3(&bm_mem_from_system(const_cast(host_data_ptr)), X86(), X86_API::get_device_id(), sh1); print_tensor_host(thost3); - TensorHf4 thost_lian(sh1); - thost_lian.copy_from(tdev3); - print_tensor_host(thost_lian); + print_tensor_device(tdev3); - thost_lian.copy_from(thost3); - print_tensor_host(thost_lian); +// TensorHf4 thost_lian(sh1); +// thost_lian.copy_from(tdev3); +// print_tensor_host(thost_lian); +// +// thost_lian.copy_from(thost3); +// print_tensor_host(thost_lian); //cudaDeviceSynchronize(); // @@ -128,16 +132,17 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { TensorDf4 tdev4(dev_data_ptr, BM(), BM_API::get_device_id(), sh1); print_tensor_host(thost4); print_tensor_device(tdev4); -/* +*/ + //BM_API::stream_t dev_stream0; //BM_API::create_stream_with_flag(dev_stream0, 1); //cudaDeviceSynchronize(); - +/* //! test tensor copy constructor LOG(INFO) << "test tensor copy constructor"; LOG(INFO) << "|--normal copy constructor"; - TensorHf4 thost5(thost4); - TensorDf4 tdev5(tdev4); +// TensorHf4 thost5(thost4); +// TensorDf4 tdev5(tdev4); LOG(INFO) << "|--push back to vector"; std::vector vthost; @@ -146,18 +151,18 @@ TEST(TestSaberTensorBM, test_tensor_constructor) { vthost.push_back(thost1); vthost.push_back(thost2); vthost.push_back(thost3); - vthost.push_back(thost4); - vthost.push_back(thost5); +// vthost.push_back(thost4); +// vthost.push_back(thost5); vtdev.push_back(tdev0); vtdev.push_back(tdev1); vtdev.push_back(tdev2); vtdev.push_back(tdev3); - vtdev.push_back(tdev4); - vtdev.push_back(tdev5); +// vtdev.push_back(tdev4); +// vtdev.push_back(tdev5); print_tensor_host(vthost[5]); print_tensor_device(vtdev[5]); //cudaDeviceSynchronize(); - +/* //! 
test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied LOG(INFO) << "test share_from function"; TensorHf4 thost6, thost7; From 033a6ab2122e9cbae215f87d9374cafab1f3893d Mon Sep 17 00:00:00 2001 From: "weihao.huang" Date: Wed, 27 Jun 2018 06:14:17 +0000 Subject: [PATCH 202/318] Fix sync_memcpy functions & test_saber_buffer_BM all passes --- saber/core/impl/bm/bm_impl.cpp | 28 ++++++++++++++++++-------- saber/core/target_wrapper.h | 4 ++-- test/saber/bm/test_saber_buffer_BM.cpp | 24 ++++++---------------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index ef26884b2..a50994a60 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -78,27 +78,39 @@ void BM_API::mem_set(void* ptr, int value, size_t n){ //BMDNN_CHECK(bm_memset_device(handle, value, *pmem)); } -//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ -// size_t count, __DtoD) {}; +void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoD) { + handle = get_bm_handle(); + //BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count)); + BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count)); + LOG(INFO) << "BM sync_memcpy: device to device, finished"; +}; void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __HtoD) { handle = get_bm_handle(); - BMDNN_CHECK(bm_memcpy_s2d(handle, bm_mem_from_device(dst), bm_mem_from_system(src))); + BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src))); + for(int i=0; i<10; i++) + std::cout << "HtoD src: " << *((float *)(src)+i) << std::endl; + LOG(INFO) << "BM sync_memcpy: host to device, finished"; }; void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ 
size_t count, __DtoH) { handle = get_bm_handle(); - BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), bm_mem_from_device(src))); - //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); - //BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(reinterpret_cast(src)))); + BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); + for(int i=0; i<10; i++) + std::cout << "DtoH dst: " << *((float *)(dst)+i) << std::endl; + LOG(INFO) << "BM sync_memcpy: device to host, finished"; }; -//static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ -// int src_dev, size_t count) {}; +void BM_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ + int src_dev, size_t count) { + + LOG(INFO) << "BM sync_memcpy_p2p: temporarily no used"; +}; //! target wrapper diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 475fbba84..5c802fa9e 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -540,7 +540,7 @@ struct TargetWrapper { // brief create event, empty function for bitmain target static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoD) {}; + size_t count, __DtoD); static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __HtoD); @@ -549,7 +549,7 @@ struct TargetWrapper { size_t count, __DtoH) {}; static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count) {}; + int src_dev, size_t count); /** * \brief device target return currently used device id diff --git a/test/saber/bm/test_saber_buffer_BM.cpp b/test/saber/bm/test_saber_buffer_BM.cpp index 555e22675..f8c8f46bb 100644 --- a/test/saber/bm/test_saber_buffer_BM.cpp +++ b/test/saber/bm/test_saber_buffer_BM.cpp @@ -27,7 +27,7 @@ void test_buffer() { x86_ptr = static_cast(tmp_x86); for (int i = 0; i < n0; i++) { - x86_ptr[i] = static_cast(100); + x86_ptr[i] = 
static_cast(i); } void* tmp_bm; @@ -97,25 +97,13 @@ void test_buffer() { } CHECK_EQ(ptr1[n0 / 2], ptr2[n0 / 2]) << "deep copy between host is incorrect"; + bm_buf1.sync_copy_from(x86_buf2); LOG(INFO) << "deep copy from host buffer to device buffer"; - bm_buf1.sync_copy_from(x86_buf2); - - /* - const Hdtype* x86_buf2_ptr = static_cast(x86_buf2.get_data()); - for (int i = 0; i < 10; i++) { - std::cout << "x86: " << x86_buf2_ptr[i] << std::endl; - } - */ - - const Hdtype* bm_buf1_ptr = static_cast(bm_buf1.get_data()); - for (int i = 0; i < 10; i++) { - std::cout << "bm: " << bm_buf1_ptr[i] << std::endl; - } - LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count(); - LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); - LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype); - LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype); + //LOG(INFO) << "bm_buf1 cap & cnt: " << bm_buf1.get_capacity() << " " << bm_buf1.get_count(); + //LOG(INFO) << "x86_buf1 cap & cnt: " << x86_buf1.get_capacity() << " " << x86_buf1.get_count(); + //LOG(INFO) << "size of Hdtype: " << sizeof(Hdtype); + //LOG(INFO) << "size of Ddtype: " << sizeof(Ddtype); x86_buf1.re_alloc(bm_buf1.get_capacity()); From 9bba50ebdd0e98a11f0835b9f18a33f7d3c38a6d Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 14:22:38 +0800 Subject: [PATCH 203/318] Implement BM softmax --- saber/funcs/impl/bm/vender_softmax.h | 106 ++++++++++ test/saber/bm/test_saber_func_softmax_BM.cpp | 194 +++++++++++++++++++ test/saber/bm/test_saber_func_softmax_BM.h | 21 ++ 3 files changed, 321 insertions(+) create mode 100644 saber/funcs/impl/bm/vender_softmax.h create mode 100644 test/saber/bm/test_saber_func_softmax_BM.cpp create mode 100644 test/saber/bm/test_saber_func_softmax_BM.h diff --git a/saber/funcs/impl/bm/vender_softmax.h b/saber/funcs/impl/bm/vender_softmax.h new file mode 100644 index 000000000..fb2595e87 --- /dev/null +++ 
b/saber/funcs/impl/bm/vender_softmax.h @@ -0,0 +1,106 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H +#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H + +#include "saber/funcs/impl/impl_softmax.h" +#include "saber/saber_funcs_param.h" +#include "saber/saber_types.h" + +namespace anakin{ + +namespace saber{ + +template +class VenderSoftmax : \ + public ImplBase< + Tensor, + Tensor, + Tensor, + SoftmaxParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + + VenderSoftmax(): _handle(NULL) {} + ~VenderSoftmax() {} + + /** + * \brief initial all bmdnn resources here + * @param inputs + * @param outputs + * @param param + * @param ctx + */ + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + SoftmaxParam& param, Context& ctx) { + + _handle = get_bm_handle(); + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + SoftmaxParam& param, Context &ctx) { + + } + + //call cudnnConvolutionForward here + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + SoftmaxParam ¶m){ + + const InDataType *in_data = (const InDataType *) inputs[0]->data(); + OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + + int input_n = inputs[0]->num(); + int input_c = inputs[0]->channel(); + int input_h = inputs[0]->height(); + int input_w = inputs[0]->width(); + + /* + int outer_num = inputs[0]->count(0, param.axis); + int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims()); + + int N = outer_num; + int K = inputs[0]->valid_shape()[param.axis]; + int H = inner_num; + int W = 1; + + const int stride_w = 1; + const int stride_h = W * stride_w; + const int stride_c = H * stride_h; + const int stride_n = K * stride_c; + */ + + 
bmdnn_softmax_forward( + _handle, + *in_data, + input_n, + input_c, + input_h * input_w, + *out_data + ); + + return SaberSuccess; + } + +private: + bm_handle_t _handle; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp new file mode 100644 index 000000000..2da0d2e62 --- /dev/null +++ b/test/saber/bm/test_saber_func_softmax_BM.cpp @@ -0,0 +1,194 @@ +#include "core/context.h" +#include "funcs/softmax.h" +#include "test_saber_func_softmax_BM.h" +#include "tensor_op.h" +#include "saber_types.h" +#include + +using namespace anakin::saber; + +TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { + + Env::env_init(); + typedef TargetWrapper API; + + typedef Tensor TensorDf4; + + typedef TensorDf4::Dtype dtype; + + int test_iter = 1000; + + int softmax_axis = 3; // channel + int w_in = 3; + int h_in = 225; + int ch_in = 40; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_out = shape_in; + + SoftmaxParam param(softmax_axis); + + LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ + ch_in << ", height=" << h_in << ", width=" << w_in; + + LOG(INFO) << "softmax axis= " << param.axis; + + std::vector input_dev_4d; + std::vector output_dev_4d; + + Tensor thin(shape_in); + + for (int i = 0; i < thin.size(); ++i) { + thin.mutable_data()[i] = i % 4; + } + + TensorDf4 tdin, tdout; + tdin.re_alloc(shape_in); + tdin.copy_from(thin); + input_dev_4d.push_back(&tdin); + + // start Reshape & doInfer + Context ctx_dev(0, 1, 1); + + Softmax softmax_dev; + + typedef std::vector Shape_v; + + LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ + shape_out[2] << ", " << shape_out[3]; + + output_dev_4d.push_back(&tdout); + softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + LOG(INFO) << "re-alloc tensor buffer"; + 
output_dev_4d[0]->re_alloc(output_dev_4d[0]->shape()); + + LOG(INFO) << "softmax initialized to cudnn impl"; + softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, VENDER_IMPL, ctx_dev); + + LOG(INFO) << "cudnn softmax compute"; + SaberTimer t1; + t1.clear(); + t1.start(ctx_dev); + + for (int i = 0; i < test_iter; ++i) { + softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + } + + t1.end(ctx_dev); + float ts = t1.get_average_ms(); + printf("cudnn softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter); + + LOG(INFO) << "softmax initialized to saber impl"; + softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev); + + LOG(INFO) << "saber softmax compute"; + t1.clear(); + t1.start(ctx_dev); + + for (int i = 0; i < test_iter; ++i) { + softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + } + + t1.end(ctx_dev); + ts = t1.get_average_ms(); + printf("saber softmax total time : %.4f, avg time : %.4f\n", ts, ts / test_iter); + //print_tensor_device(*output_dev_4d[0]); +} + +TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) { + + Env::env_init(); + typedef TargetWrapper API; + + typedef Tensor TensorDf4; + + typedef TensorDf4::Dtype dtype; + + int test_iter = 1; + + int softmax_axis = 3; // channel + int w_in = 3; + int h_in = 10; + int ch_in = 10; + int num_in = 1; + + Shape shape_in(num_in, ch_in, h_in, w_in); + Shape shape_in_roi{num_in, ch_in / 2, h_in / 2, w_in}; + Shape shape_out = shape_in_roi; + + SoftmaxParam param(softmax_axis); + + LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ + ch_in << ", height=" << h_in << ", width=" << w_in; + + LOG(INFO) << "softmax axis= " << param.axis; + + std::vector input_dev_4d; + std::vector output_dev_4d; + + Tensor thin(shape_in); + + for (int i = 0; i < 
thin.size(); ++i) { + thin.mutable_data()[i] = (i % 3); + } + + TensorDf4 tdin, tdin_roi, tdout, tdout_roi; + tdin.re_alloc(shape_in); + tdout.re_alloc(shape_in); + tdin.copy_from(thin); + tdin_roi.share_sub_buffer(tdin, shape_in_roi, Shape(0, 0, 0, 0)); + input_dev_4d.push_back(&tdin_roi); + output_dev_4d.push_back(&tdout_roi); + + // start Reshape & doInfer + Context ctx_dev(0, 1, 1); + + Softmax softmax_dev; + + LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ + shape_out[2] << ", " << shape_out[3]; + + softmax_dev.compute_output_shape(input_dev_4d, output_dev_4d, param); + + LOG(INFO) << "re-alloc tensor buffer"; + output_dev_4d[0]->share_sub_buffer(tdout, shape_in_roi, Shape(0, 0, 0, 0)); + //output_dev_4d[0]->reshape(output_dev_4d[0]->valid_shape()); + + LOG(INFO) << "softmax initialization"; + softmax_dev.init(input_dev_4d, output_dev_4d, param, SPECIFY, SABER_IMPL, ctx_dev); + + LOG(INFO) << "softmax compute"; + SaberTimer t1; + t1.clear(); + t1.start(ctx_dev); + + for (int i = 0; i < test_iter; ++i) { + softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); + output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + output_dev_4d[0]->sync(); + } + + t1.end(ctx_dev); + float ts = t1.get_average_ms(); + printf("total time : %.4f, avg time : %.4f\n", ts, ts / test_iter); + print_tensor_device(*output_dev_4d[0]); + + TensorDf4 troi(output_dev_4d[0]->valid_shape()); + troi.copy_from(*output_dev_4d[0]); + print_tensor_device(troi); +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/bm/test_saber_func_softmax_BM.h b/test/saber/bm/test_saber_func_softmax_BM.h new file mode 100644 index 000000000..d5c5b6986 --- /dev/null +++ b/test/saber/bm/test_saber_func_softmax_BM.h @@ -0,0 +1,21 @@ +#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H +#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H + 
+#include "utils/unit_test/aktest.h" +#include "utils/logger/logger.h" +#include "core/tensor.h" + +using namespace anakin::test; + +class TestSaberFuncSoftmaxBM : public Test { +public: + TestSaberFuncSoftmaxBM() {} + ~TestSaberFuncSoftmaxBM() {} + +protected: + virtual void setup() {} + virtual void teardown() {} + +}; + +#endif //ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H From 1a8861bc993a096e922313a3c01bfecba37b29a2 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 14:53:56 +0800 Subject: [PATCH 204/318] only print in DEBUG --- saber/core/impl/bm/bm_impl.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index a50994a60..4d24dedf0 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -90,8 +90,11 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __HtoD) { handle = get_bm_handle(); BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src))); + + #ifdef DEBUG for(int i=0; i<10; i++) - std::cout << "HtoD src: " << *((float *)(src)+i) << std::endl; + LOG(INFO) << "HtoD src: " << *((float *)(src)+i); + #endif LOG(INFO) << "BM sync_memcpy: host to device, finished"; }; @@ -100,8 +103,11 @@ void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ size_t count, __DtoH) { handle = get_bm_handle(); BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); + + #ifdef DEBUG for(int i=0; i<10; i++) - std::cout << "DtoH dst: " << *((float *)(dst)+i) << std::endl; + LOG(INFO) << "DtoH dst: " << *((float *)(dst)+i); + #endif LOG(INFO) << "BM sync_memcpy: device to host, finished"; }; From 2103811c6bb83874eb2ebb56c997bb98d087663b Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 15:05:37 +0800 Subject: [PATCH 205/318] reduce iteration --- test/saber/bm/test_saber_func_softmax_BM.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp index 2da0d2e62..8176a9e51 100644 --- a/test/saber/bm/test_saber_func_softmax_BM.cpp +++ b/test/saber/bm/test_saber_func_softmax_BM.cpp @@ -16,7 +16,7 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { typedef TensorDf4::Dtype dtype; - int test_iter = 1000; + int test_iter = 10; int softmax_axis = 3; // channel int w_in = 3; From 67e9bbd1702271b7d09e9b5006b2ca5190fe32e0 Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 15:11:57 +0800 Subject: [PATCH 206/318] Revert "reduce iteration" This reverts commit 635ff4260496f98657440461c7f251c2b6a4c907. --- test/saber/bm/test_saber_func_softmax_BM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/saber/bm/test_saber_func_softmax_BM.cpp b/test/saber/bm/test_saber_func_softmax_BM.cpp index 8176a9e51..2da0d2e62 100644 --- a/test/saber/bm/test_saber_func_softmax_BM.cpp +++ b/test/saber/bm/test_saber_func_softmax_BM.cpp @@ -16,7 +16,7 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { typedef TensorDf4::Dtype dtype; - int test_iter = 10; + int test_iter = 1000; int softmax_axis = 3; // channel int w_in = 3; From ceccee48718582feaa580453dfcbba7221f1bd33 Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Wed, 27 Jun 2018 08:19:32 +0000 Subject: [PATCH 207/318] modify fc op, compile error --- saber/funcs/impl/bm/vender_fc.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h index 82dd6000c..5004ad349 100644 --- a/saber/funcs/impl/bm/vender_fc.h +++ b/saber/funcs/impl/bm/vender_fc.h @@ -1,6 +1,5 @@ #ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H #define ANAKIN_SABER_FUNCS_BMDNN_FC_H - #include "saber/funcs/impl/impl_fc.h" namespace anakin{ @@ -34,6 +33,7 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param, Context& ctx){ + _handle = get_bm_handle(); 
return create(inputs, outputs, param, ctx); } @@ -46,10 +46,10 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param){ - const InDataType *in_data = (const InDataType *) inputs[0]->data(); - const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data(); - const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data(); - OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + const InDataType in_data = *(inputs[0]->data()); + const InDataType weights = *(InDataType*)(param.weights->get_buf()->get_data()); + const InDataType bias = *(InDataType*)(param.bias->get_buf()->get_data()); + OutDataType out_data = *(outputs[0]->mutable_data()); int batch_size = inputs[0]->num(); int input_len = inputs[0]->channel(); int output_len = param.num_output; @@ -64,7 +64,7 @@ class VenderFc; +template class VenderFc; } //namespace saber } //namespace anakin From 944214d6cdf62ff1399b1b1e5d86a05093f3bc7d Mon Sep 17 00:00:00 2001 From: guangzhixie Date: Wed, 27 Jun 2018 16:53:20 +0800 Subject: [PATCH 208/318] Update for BM softmax --- saber/funcs/impl/bm/vender_softmax.h | 14 +++++++----- test/saber/bm/test_saber_func_softmax_BM.cpp | 23 ++++++++++---------- test/saber/bm/test_saber_func_softmax_BM.h | 21 ------------------ 3 files changed, 20 insertions(+), 38 deletions(-) delete mode 100644 test/saber/bm/test_saber_func_softmax_BM.h diff --git a/saber/funcs/impl/bm/vender_softmax.h b/saber/funcs/impl/bm/vender_softmax.h index fb2595e87..55612f66a 100644 --- a/saber/funcs/impl/bm/vender_softmax.h +++ b/saber/funcs/impl/bm/vender_softmax.h @@ -63,12 +63,13 @@ class VenderSoftmaxdata(); OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); + /* int input_n = inputs[0]->num(); int input_c = inputs[0]->channel(); int input_h = inputs[0]->height(); int input_w = inputs[0]->width(); + */ - /* int outer_num = inputs[0]->count(0, param.axis); int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims()); @@ 
-77,18 +78,19 @@ class VenderSoftmax using namespace anakin::saber; -TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { - Env::env_init(); +TEST(TestSaberFuncBM, test_func_softmax_BM) { + + //Env::env_init(); typedef TargetWrapper API; typedef Tensor TensorDf4; @@ -74,8 +75,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { for (int i = 0; i < test_iter; ++i) { softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); + //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + //output_dev_4d[0]->sync(); } t1.end(ctx_dev); @@ -91,8 +92,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { for (int i = 0; i < test_iter; ++i) { softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); + //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + //output_dev_4d[0]->sync(); } t1.end(ctx_dev); @@ -101,9 +102,9 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_BM) { //print_tensor_device(*output_dev_4d[0]); } -TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) { +TEST(TestSaberFuncBM, test_func_softmax_ROI_BM) { - Env::env_init(); + //Env::env_init(); typedef TargetWrapper API; typedef Tensor TensorDf4; @@ -170,8 +171,8 @@ TEST(TestSaberFuncSoftmaxBM, test_func_softmax_ROI_BM) { for (int i = 0; i < test_iter; ++i) { softmax_dev(input_dev_4d, output_dev_4d, param, ctx_dev); - output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); - output_dev_4d[0]->sync(); + //output_dev_4d[0]->record_event(ctx_dev.get_compute_stream()); + //output_dev_4d[0]->sync(); } t1.end(ctx_dev); diff --git a/test/saber/bm/test_saber_func_softmax_BM.h b/test/saber/bm/test_saber_func_softmax_BM.h deleted file mode 100644 index d5c5b6986..000000000 --- a/test/saber/bm/test_saber_func_softmax_BM.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H 
-#define ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/tensor.h" - -using namespace anakin::test; - -class TestSaberFuncSoftmaxBM : public Test { -public: - TestSaberFuncSoftmaxBM() {} - ~TestSaberFuncSoftmaxBM() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //ANAKIN_TEST_SABER_TEST_SABER_FUNC_SOFTMAX_BM_H From 6d5c486ca7fd214a30fa7d354b25d252a24f9322 Mon Sep 17 00:00:00 2001 From: "tong.liu" Date: Wed, 27 Jun 2018 17:39:42 +0800 Subject: [PATCH 209/318] xRevert "modify fc op, compile error" This reverts commit 2997faf062e8ef4bf6310c425ab369059fec335d. --- saber/funcs/impl/bm/vender_fc.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h index 5004ad349..82dd6000c 100644 --- a/saber/funcs/impl/bm/vender_fc.h +++ b/saber/funcs/impl/bm/vender_fc.h @@ -1,5 +1,6 @@ #ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H #define ANAKIN_SABER_FUNCS_BMDNN_FC_H + #include "saber/funcs/impl/impl_fc.h" namespace anakin{ @@ -33,7 +34,6 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param, Context& ctx){ - _handle = get_bm_handle(); return create(inputs, outputs, param, ctx); } @@ -46,10 +46,10 @@ class VenderFc& inputs, std::vector& outputs, FcParam& param){ - const InDataType in_data = *(inputs[0]->data()); - const InDataType weights = *(InDataType*)(param.weights->get_buf()->get_data()); - const InDataType bias = *(InDataType*)(param.bias->get_buf()->get_data()); - OutDataType out_data = *(outputs[0]->mutable_data()); + const InDataType *in_data = (const InDataType *) inputs[0]->data(); + const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data(); + const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data(); + OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); int batch_size = inputs[0]->num(); 
int input_len = inputs[0]->channel(); int output_len = param.num_output; @@ -64,7 +64,7 @@ class VenderFc; +template class VenderFc; } //namespace saber } //namespace anakin From 8a7a8d713100a6867d2cec13bc78d49cd88320af Mon Sep 17 00:00:00 2001 From: hlzy <327842846@qq.com> Date: Wed, 27 Jun 2018 07:46:39 -0400 Subject: [PATCH 210/318] change tensor_test_bm --- .idea/workspace.xml | 86 ++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 32 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 718ee2682..210061337 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -9,23 +9,7 @@ - - - - - - - - - - - - - - - - - + @@ -176,14 +180,13 @@ - - + + - @@ -221,23 +257,24 @@