From f8507272c0b63bbaaec3f306843e885e0c746766 Mon Sep 17 00:00:00 2001 From: chenjiaoAngel Date: Thu, 23 Aug 2018 17:56:30 +0800 Subject: [PATCH 01/10] fix bug in ios build --- saber/lite/funcs/neon/saber_eltwise_act.cpp | 38 ++++++++++----------- saber/lite/funcs/saber_eltwise_act.h | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/saber/lite/funcs/neon/saber_eltwise_act.cpp b/saber/lite/funcs/neon/saber_eltwise_act.cpp index 2b301cecc..3d3435ce7 100644 --- a/saber/lite/funcs/neon/saber_eltwise_act.cpp +++ b/saber/lite/funcs/neon/saber_eltwise_act.cpp @@ -9,32 +9,32 @@ namespace lite{ template void eltwise_prod_relu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \ - int channel, int channel_size, std::vector coef, bool channel_shared, float* slop_ptr); + int channel, int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr); template void eltwise_sum_relu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \ - int channel, int channel_size, std::vector coef, bool channel_shared, float* slop_ptr); + int channel, int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr); template void eltwise_max_relu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \ - int channel, int channel_size, std::vector coef, bool channel_shared, float* slop_ptr); + int channel, int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr); template void eltwise_prod_prelu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \ - int channel, int channel_size, std::vector coef, bool channel_shared, float* slop_ptr); + int channel, int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr); template void eltwise_sum_prelu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \ - int channel, int channel_size, std::vector coef, bool channel_shared, float* slop_ptr); + int channel, int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr); template void eltwise_max_prelu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \ - int channel, int channel_size, std::vector coef, bool channel_shared, float* slop_ptr); + int channel, int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr); template <> void eltwise_prod_relu(const float* din_a, const float* din_b, float* dout, const int num, \ - int channel, int channel_size, std::vector coef, bool channel_shared, float* slop_ptr) { + int channel, int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr) { int cnt = channel_size >> 3; int remain = channel_size & 7; @@ -95,7 +95,7 @@ void eltwise_prod_relu(const float* din_a, const float* din_b, float* dout, cons } void eltwise_sum_relu(const float* din_a, const float* din_b, float* dout, const int num, \ - int channel, int channel_size, std::vector coef, bool channel_shared, float* slop_ptr) { + int channel, int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr) { int cnt = channel_size >> 3; int remain = channel_size & 7; @@ -156,7 +156,7 @@ void eltwise_sum_relu(const float* din_a, const float* din_b, float* dout, const } void eltwise_max_relu(const float* din_a, const float* din_b, float* dout, const int num, \ - int channel, int channel_size, std::vector coef, bool channel_shared, float* slop_ptr) { + int channel, int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr) { int cnt = channel_size >> 3; int remain = channel_size & 7; @@ -215,7 +215,7 @@ void eltwise_max_relu(const float* din_a, const float* din_b, float* dout, const //prelu void eltwise_prod_prelu(const float* din_a, const float* din_b, float* dout, const int num, \ - int channel, int channel_size, std::vector coeff, bool channel_shared, float* slop_ptr) { + int channel, int channel_size, std::vector coeff, bool channel_shared, const float* slop_ptr) { int cnt = channel_size >> 3; int remain = channel_size & 7; @@ -250,7 +250,7 @@ void eltwise_prod_prelu(const float* din_a, const float* din_b, float* dout, con float32x4_t vout_sel0 = vbslq_f32(vmask0, vout0, vsum0); float32x4_t vout_sel1 = vbslq_f32(vmask1, vout1, vsum1); - + a_ptr += 8; b_ptr += 8; @@ -259,7 +259,7 @@ void eltwise_prod_prelu(const float* din_a, const float* din_b, float* dout, con out_ptr += 8; } - + #else int loop_cnt = cnt; if (loop_cnt > 0) { @@ -304,7 +304,7 @@ void eltwise_prod_prelu(const float* din_a, const float* din_b, float* dout, con } void eltwise_sum_prelu(const float* din_a, const float* din_b, float* dout, const int num, \ - int channel, int channel_size, std::vector coeff, bool channel_shared, float* slop_ptr) { + int channel, int channel_size, std::vector coeff, bool channel_shared, const float* slop_ptr) { int cnt = channel_size >> 3; int remain = channel_size & 7; @@ -339,7 +339,7 @@ void eltwise_sum_prelu(const float* din_a, const float* din_b, float* dout, cons float32x4_t vout_sel0 = vbslq_f32(vmask0, vout0, vsum0); float32x4_t vout_sel1 = vbslq_f32(vmask1, vout1, vsum1); - + a_ptr += 8; b_ptr += 8; @@ -348,7 +348,7 @@ void eltwise_sum_prelu(const float* din_a, const float* din_b, float* dout, cons out_ptr += 8; } - + #else int loop_cnt = cnt; if (loop_cnt > 0) { @@ -393,7 +393,7 @@ void eltwise_sum_prelu(const float* din_a, const float* din_b, float* dout, cons } void eltwise_max_prelu(const float* din_a, const float* din_b, float* dout, const int num, \ - int channel, int channel_size, std::vector coeff, bool channel_shared, float* slop_ptr) { + int channel, int channel_size, std::vector coeff, bool channel_shared, const float* slop_ptr) { int cnt = channel_size >> 3; int remain = channel_size & 7; @@ -428,7 +428,7 @@ void eltwise_max_prelu(const float* din_a, const float* din_b, float* dout, cons float32x4_t vout_sel0 = vbslq_f32(vmask0, vout0, vsum0); float32x4_t vout_sel1 = vbslq_f32(vmask1, vout1, vsum1); - + a_ptr += 8; b_ptr += 8; @@ -437,7 +437,7 @@ void eltwise_max_prelu(const float* din_a, const float* din_b, float* dout, cons out_ptr += 8; } - + #else int loop_cnt = cnt; if (loop_cnt > 0) { @@ -668,4 +668,4 @@ REGISTER_LAYER_CLASS(SaberEltwiseAct); } // namespace anakin -#endif //USE_ARM_PLACE \ No newline at end of file +#endif //USE_ARM_PLACE diff --git a/saber/lite/funcs/saber_eltwise_act.h b/saber/lite/funcs/saber_eltwise_act.h index 685e29348..43da72b5e 100644 --- a/saber/lite/funcs/saber_eltwise_act.h +++ b/saber/lite/funcs/saber_eltwise_act.h @@ -26,7 +26,7 @@ namespace lite{ typedef void (*eltwise_act_func)(const float* din_a, \ const float* din_b, float* dout, const int num, const int channel, \ - const int channel_size, std::vector coef, bool channel_shared, float* slop_ptr); + const int channel_size, std::vector coef, bool channel_shared, const float* slop_ptr); //template class SaberEltwiseAct : public OpBase { From 5596a5e8097f2434f555ac7f9e86f664c9d1925b Mon Sep 17 00:00:00 2001 From: Zhang Date: Thu, 23 Aug 2018 18:58:17 +0800 Subject: [PATCH 02/10] some change for resize --- saber/lite/funcs/neon/saber_resize.cpp | 162 ++++++++++++++----------- saber/lite/funcs/saber_resize.h | 4 + test/lite/test_resize_lite.cpp | 33 ++--- 3 files changed, 111 insertions(+), 88 deletions(-) diff --git a/saber/lite/funcs/neon/saber_resize.cpp b/saber/lite/funcs/neon/saber_resize.cpp index b38b45a38..2b09d2286 100644 --- a/saber/lite/funcs/neon/saber_resize.cpp +++ b/saber/lite/funcs/neon/saber_resize.cpp @@ -12,63 +12,25 @@ namespace saber{ namespace lite{ -static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int w_out, int h_out, float scale_x, float scale_y) -{ +static void resize_spatial(const float* src, int w_in, int h_in, float* dst, \ + int w_out, int h_out, float* coor_buf, std::vector> rows_buf){ - int* buf = new int[w_out + h_out + w_out * 2 + h_out * 2]; + int* xofs = (int*)coor_buf; + int* yofs = xofs + w_out; - int* xofs = buf; - int* yofs = buf + w_out; + float* alpha = (float*)yofs + h_out; + float* beta = alpha + w_out * 2; - float* alpha = (float*)(buf + w_out + h_out); - float* beta = (float*)(buf + w_out + h_out + w_out * 2); - - float fx; - float fy; - int sx; - int sy; - - for (int dx = 0; dx < w_out; dx++){ - fx = dx * scale_x; - sx = int(fx); - fx -= sx; - - if (sx >= w_in - 1){ - sx = w_in - 2; - fx = 1.f; - } - - xofs[dx] = sx; - - alpha[dx*2 ] = 1.f - fx; - alpha[dx*2 + 1] = fx; - } - - for (int dy = 0; dy < h_out; dy++){ - fy = dy * scale_y; - sy = int(fy); - fy -= sy; - - if (sy >= h_in - 1) - { - sy = h_in - 2; - fy = 1.f; - } - - yofs[dy] = sy; - - beta[dy*2 ] = 1.f - fy; - beta[dy*2 + 1] = fy; - } - - // loop body - float* rowsbuf0 = new float[w_out + 1]; - float* rowsbuf1 = new float[w_out + 1]; - float* rows0 = rowsbuf0; - float* rows1 = rowsbuf1; +#ifdef USE_OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + float* rows0 = rows_buf[thread_id * 2].mutable_data(); + float* rows1 = rows_buf[thread_id * 2 + 1].mutable_data(); int prev_sy1 = -1; - + //main loop for (int dy = 0; dy < h_out; dy++ ){ int sy = yofs[dy]; @@ -83,7 +45,6 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int float* rows1p = rows1; int dx = 0; - float* rows1pt = rows1; for ( ; dx+1 < w_out; dx += 2 ){ int sx = xofs[dx]; int sxn = xofs[dx+1]; @@ -103,6 +64,7 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int vst1_f32(rows1p + dx, _rows1); alphap += 4; #else + float* rows1pt = rows1; asm volatile( "vld1.32 {d0-d1}, [%[alpha]]! @load alpha to q0\n" "vld1.32 d2, [%[s1p]] @load s1p to d2 \n" @@ -176,22 +138,21 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int /* asm volatile( - "pld [%[alpha]] \n" - "vld1.32 {d0-d1}, [%[alpha]]! \n" - "vld1.32 d2, [%[s0p]] \n" + "pld [%[alpha]] \n" + "vld1.32 {d0-d1}, [%[alpha]]! \n" + "vld1.32 d2, [%[s0p]] \n" "vld1.32 d3, [%[s0np]] \n" "vld1.32 d4, [%[s1p]] \n" - "vld1.32 d5, [%[s1np]] \n" + "vld1.32 d5, [%[s1np]] \n" - "vmul.f32 q3, q1, q0 \n" - "vmul.f32 q4, q2, q0 \n" + "vmul.f32 q3, q1, q0 \n" + "vmul.f32 q4, q2, q0 \n" - "vpadd.f32 d10, d6, d7 \n" - "vpadd.f32 d11, d8, d9 \n" + "vpadd.f32 d10, d6, d7 \n" + "vpadd.f32 d11, d8, d9 \n" - "vst1.32 d10, [%[out1]]! \n" - - "vst1.32 d11, [%[out2]]! \n" + "vst1.32 d10, [%[out1]]! \n" + "vst1.32 d11, [%[out2]]! \n" :[out1]"+r"(rows0pt), [out2]"+r"(rows1pt), [alpha]"+r"(alphap) :[s0p]"r"(S0p), [s1p]"r"(S1p),[s0np]"r"(S0np),[s1np]"r"(S1np) @@ -257,7 +218,7 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int } #else - if(nn > 0){ + if (nn > 0){ asm volatile( "vdup.32 q0, %[b0] @dup b0 to q1\n" "vdup.32 q1, %[b1] @dup b1 to q0\n" @@ -291,22 +252,19 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int beta += 2; } - delete[] buf; - - delete[] rowsbuf0; - delete[] rowsbuf1; } void resize(const float* in_data, int count, int h_in, int w_in, \ - float* out_data, int h_out, int w_out, float width_scale, float height_scale){ + float* out_data, int h_out, int w_out, float* coor_buf, std::vector> &rows_buf){ int spatial_in = h_in * w_in; int spatial_out = h_out * w_out; #pragma omp parallel for for(int i = 0; i < count; ++i){ - resize_spatial(in_data + i * spatial_in, w_in, h_in, out_data + i * spatial_out, w_out, h_out, width_scale, height_scale); + resize_spatial(in_data + i * spatial_in, w_in, h_in, \ + out_data + i * spatial_out, w_out, h_out, coor_buf, rows_buf); } } @@ -361,10 +319,66 @@ SaberStatus SaberResize::init(const std::vector *> &inputs return SaberNotInitialized; } this->_ctx = &ctx; + _width_scale = _param->_width_scale; _height_scale = _param->_height_scale; + + int out_w = outputs[0]->valid_shape()[3]; + int out_h = outputs[0]->valid_shape()[2]; + int in_w = inputs[0]->valid_shape()[3]; + int in_h = inputs[0]->valid_shape()[2]; + +#ifdef USE_OPENMP + int num_threads = omp_get_max_threads(); +#else + int num_threads = 1; +#endif + //allocate space + Shape sh_coor(1, 1, 1, 3 * (out_w+out_h)); + _coor_buf.re_alloc(sh_coor); + _rows_buf.resize(2 * num_threads); + //allocate rows buf according to threads num + for (int i = 0; i < _rows_buf.size(); ++i){ + _rows_buf[i].re_alloc(out_w + 1); + } + int* xofs = (int*)_coor_buf.mutable_data(); + int* yofs = xofs + out_w; + float* alpha = (float*)(yofs) + out_h; + float* beta = alpha + out_w * 2; + float fx, fy; + int sx, sy; + + //pre compute coordinate in x and y direction + for (int dx = 0; dx < out_w; dx++){ + fx = dx * (1.0 / _width_scale); + sx = int(fx); + fx -= sx; + + if (sx >= in_w - 1){ + sx = in_w - 2; + fx = 1.f; + } + xofs[dx] = sx; + alpha[dx * 2 ] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } + + for (int dy = 0; dy < out_h; dy++){ + fy = dy * (1.0 / _height_scale); + sy = int(fy); + fy -= sy; + + if (sy >= in_h - 1){ + sy = in_h - 2; + fy = 1.f; + } + yofs[dy] = sy; + beta[dy * 2 ] = 1.f - fy; + beta[dy * 2 + 1] = fy; + } this->_flag_init = true; return SaberSuccess; + } @@ -392,9 +406,7 @@ SaberStatus SaberResize::dispatch(const std::vector*>& inp int out_h = outputs[0]->height(); int out_w = outputs[0]->width(); - - resize(din, count, in_h, in_w, dout, out_h, out_w, 1.0f / _width_scale, 1.0f / _height_scale); - + resize(din, count, in_h, in_w, dout, out_h, out_w, _coor_buf.mutable_data(), _rows_buf); #ifdef ENABLE_OP_TIMER @@ -406,7 +418,9 @@ SaberStatus SaberResize::dispatch(const std::vector*>& inp #endif return SaberSuccess; } + REGISTER_LAYER_CLASS(SaberResize); + } //namespace lite } //namespace saber diff --git a/saber/lite/funcs/saber_resize.h b/saber/lite/funcs/saber_resize.h index 618d24abc..e7ab6c0ee 100644 --- a/saber/lite/funcs/saber_resize.h +++ b/saber/lite/funcs/saber_resize.h @@ -51,6 +51,10 @@ namespace anakin{ private: const ResizeParam* _param; + ////coordinate buffer + Tensor _coor_buf; + //rows buffer + std::vector> _rows_buf; float _width_scale{0.0f}; float _height_scale{0.0f}; }; diff --git a/test/lite/test_resize_lite.cpp b/test/lite/test_resize_lite.cpp index d8e68bb1b..8e2e13bd1 100644 --- a/test/lite/test_resize_lite.cpp +++ b/test/lite/test_resize_lite.cpp @@ -21,9 +21,9 @@ void resize_basic(const float* in_data,int count, int h_in, int w_in, \ int spatial_in = h_in * w_in; int spatial_out = h_out * w_out; -#pragma omp parallel for - for(int i = 0; i < count; ++i){ - for(int s = 0; s < spatial_out; ++s){ + + for (int i = 0; i < count; ++i){ + for (int s = 0; s < spatial_out; ++s){ int x_out = s % w_out; int y_out = s / w_out; float x_in = x_out * width_scale; @@ -67,8 +67,7 @@ TEST(TestSaberLite, test_func_resize_arm) { // start Reshape & doInfer Context ctx1; LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); + ctx1.set_run_mode((PowerMode)cluster, threads); LOG(INFO) << "test threads activated"; #pragma omp parallel { @@ -139,16 +138,16 @@ TEST(TestSaberLite, test_func_resize_arm) { } } + printf("basic resize time: %.4fms\n", basic_tdiff); + printf("saber resize total time : %.4fms, avg time : %.4fms\n", sum, sum / test_iter, min_time); #if COMPARE_RESULT double max_ratio = 0; double max_diff = 0; - tensor_cmp_host(tout_basic.data(), tout.data(), tout_basic.valid_size(), max_ratio, max_diff); CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error" \ << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;; #endif - printf("basic resize time: %.4fms\n", basic_tdiff); - printf("saber resize total time : %.4fms, avg time : %.4fms\n", sum, sum / test_iter, min_time); + //print_tensor(*vin[0]); //print_tensor(tout_basic); //print_tensor(*vout[0]); @@ -161,26 +160,32 @@ int main(int argc, const char** argv){ if (argc >= 2) { cluster = atoi(argv[1]); + if (cluster < 0){ + cluster = 0; + } + if (cluster > 3){ + cluster = 3; + } } if (argc >= 3) { threads = atoi(argv[2]); } - if(argc >= 4){ + if (argc >= 4){ num_in = atoi(argv[3]); } - if(argc >= 5){ + if (argc >= 5){ ch_in = atoi(argv[4]); } - if(argc >= 6){ + if (argc >= 6){ h_in = atoi(argv[5]); } - if(argc >= 7){ + if (argc >= 7){ w_in = atoi(argv[6]); } - if(argc >= 8){ + if (argc >= 8){ width_scale = atof(argv[7]); } - if(argc >= 9){ + if (argc >= 9){ height_scale = atof(argv[8]); } InitTest(); From 1d570e98f79de9b365e073e9f47a371dd19b13cb Mon Sep 17 00:00:00 2001 From: chenjiaoAngel Date: Fri, 24 Aug 2018 13:39:16 +0800 Subject: [PATCH 03/10] update softmax --- saber/lite/funcs/saber_softmax.h | 1 + test/lite/test_softmax_lite.cpp | 31 ++++++++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/saber/lite/funcs/saber_softmax.h b/saber/lite/funcs/saber_softmax.h index 3181f56f9..2686ea2e2 100755 --- a/saber/lite/funcs/saber_softmax.h +++ b/saber/lite/funcs/saber_softmax.h @@ -53,6 +53,7 @@ class SaberSoftmax : public OpBase { private: const SoftmaxParam* _param; + float* _work_space_data; int _axis_size{0}; int _inner_num{0}; int _outer_num{0}; diff --git a/test/lite/test_softmax_lite.cpp b/test/lite/test_softmax_lite.cpp index bd5f3a147..4e0959e99 100644 --- a/test/lite/test_softmax_lite.cpp +++ b/test/lite/test_softmax_lite.cpp @@ -6,7 +6,11 @@ using namespace anakin::saber::lite; int cluster = 0; int threads = 4; - +int num = 1; +int ch = 1971; +int h = 21; +int w = 1; +int axis = 2; typedef Tensor TensorHf4; #define COMPARE_RESULT 1 @@ -78,13 +82,13 @@ TEST(TestSaberLite, test_func_softmax_arm) { #endif } - int test_iter = 100; + int test_iter = 1; - int softmax_axis = 2; // channel - int w_in = 1; - int h_in = 21; - int ch_in = 1917; - int num_in = 1; + int softmax_axis = axis; // channel + int w_in = w; + int h_in = h; + int ch_in = ch; + int num_in = num; Shape shape_in(num_in, ch_in, h_in, w_in); Shape shape_out = shape_in; @@ -107,7 +111,7 @@ TEST(TestSaberLite, test_func_softmax_arm) { #if COMPARE_RESULT softmax_basic(thin, softmax_axis, tout_basic); - //print_tensor_host(tout_basic); + //print_tensor(tout_basic); #endif SaberSoftmax softmax_lite; @@ -145,7 +149,7 @@ TEST(TestSaberLite, test_func_softmax_arm) { } printf("saber softmax total time : %.4f, avg time : %.4f\n", to, to / test_iter, min_time); - //print_tensor_host(*vout[0]); + //print_tensor(*vout[0]); #if COMPARE_RESULT double max_ratio = 0; @@ -170,6 +174,15 @@ int main(int argc, const char** argv){ if (argc >= 3) { threads = atoi(argv[2]); } + if (argc >= 4) { + axis = atoi(argv[3]); + } + if (argc >= 5 && argc <= 8) { + num = atoi(argv[4]); + ch = atoi(argv[5]); + h = atoi(argv[6]); + w = atoi(argv[7]); + } InitTest(); RUN_ALL_TESTS(argv[0]); From aa81f67dc0d2191902d9c53e7837aeae8e68555a Mon Sep 17 00:00:00 2001 From: chenjiaoAngel Date: Fri, 24 Aug 2018 13:54:04 +0800 Subject: [PATCH 04/10] update softmax --- saber/lite/funcs/neon/saber_softmax.cpp | 106 +++++++++++++++++++++++- saber/lite/funcs/saber_softmax.h | 1 - 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/saber/lite/funcs/neon/saber_softmax.cpp b/saber/lite/funcs/neon/saber_softmax.cpp index b609d0869..657874bae 100644 --- a/saber/lite/funcs/neon/saber_softmax.cpp +++ b/saber/lite/funcs/neon/saber_softmax.cpp @@ -48,6 +48,101 @@ void softmax_basic(const float* din, float* dout, \ } } +void softmax_arm_lite(const float* din, float* dout, \ + const int axis_size, const int inner_num, \ + const int outer_num, const int compute_size) { + + int cmp_cnt = compute_size >> 2; + // int cmp_remain = compute_size % 4; + // printf("axis_size: %d, inner_num: %d, outer_num: %d, compute_size: %d \n", axis_size, inner_num, outer_num, compute_size); + //printf("cmp_cnt: %d \n", cmp_cnt); + #pragma omp parallel for + for (int c = 0; c < cmp_cnt; ++c) { + int i = c * 4; + int idx_inner = i % inner_num; + int idx_outer = (i / inner_num) * axis_size; + int real_index = idx_outer * inner_num + idx_inner; + + //float max_data = din[real_index]; + const float* din_ptr = din + real_index; + float32x4_t vmax = vld1q_f32(din_ptr); + //! get max + for (int j = 1; j < axis_size; ++j) { + din_ptr += inner_num; + float32x4_t vdata = vld1q_f32(din_ptr); + vmax = vmaxq_f32(vmax, vdata); + } + + //! sub, exp and sum + // dout[real_index] = expf(din[real_index] - max_data); + din_ptr = din + real_index; + float* dout_ptr = dout + real_index; + float32x4_t vdata = vld1q_f32(din_ptr); + float32x4_t vsum = exp_ps(vsubq_f32(vdata, vmax)); + din_ptr += inner_num; + vst1q_f32(dout_ptr, vsum); + dout_ptr += inner_num; + //float sum_data = dout[real_index]; + for (int j = 1; j < axis_size; ++j) { + // real_index += inner_num; + float32x4_t vdata0 = vld1q_f32(din_ptr); + vdata0 = exp_ps(vsubq_f32(vdata0, vmax)); + din_ptr += inner_num; + vsum = vaddq_f32(vsum, vdata0); + vst1q_f32(dout_ptr, vdata0); + dout_ptr += inner_num; + } + + // float sum_inv = 1.f / sum_data; + float32x4_t vone = vdupq_n_f32(1.0f); + float32x4_t vinf = div_ps(vone, vsum); + dout_ptr = dout + real_index; + //printf("real_index: %d, dout: %x, dout_ptr: %x \n", real_index, dout, dout_ptr); + // real_index = idx_outer * inner_num + idx_inner; + //! get softmax result + for (int j = 0; j < axis_size; ++j) { + float32x4_t vdata0 = vld1q_f32(dout_ptr); + vdata0 = vmulq_f32(vdata0, vinf); + vst1q_f32(dout_ptr, vdata0); + dout_ptr += inner_num; + } + } + + for(int i = cmp_cnt * 4; i < compute_size; i++){ + int idx_inner = i % inner_num; + int idx_outer = (i / inner_num) * axis_size; + int real_index = idx_outer * inner_num + idx_inner; + + // printf("real_index: %d, din: %x\n", real_index, din); + + float max_data = din[real_index]; + //! get max + for (int j = 1; j < axis_size; ++j) { + real_index += inner_num; + max_data = din[real_index] > max_data? din[real_index] : max_data; + } + + real_index = idx_outer * inner_num + idx_inner; + //! sub, exp and sum + dout[real_index] = expf(din[real_index] - max_data); + float sum_data = dout[real_index]; + for (int j = 1; j < axis_size; ++j) { + real_index += inner_num; + dout[real_index] = expf(din[real_index] - max_data); + sum_data += dout[real_index]; + } + + float sum_inv = 1.f / sum_data; + real_index = idx_outer * inner_num + idx_inner; + //! get softmax result + for (int j = 0; j < axis_size; ++j) { + dout[real_index] *= sum_inv; + real_index += inner_num; + } + } + +} + //! for inner size == 1 void softmax_inner1(const float* din, float* dout, \ const int outer_size, const int axis_size) { @@ -242,7 +337,16 @@ SaberStatus SaberSoftmax::dispatch(const std::vector*>& in } } else { int compute_size = inputs[0]->valid_size() / _axis_size; - softmax_basic(din, dout, _axis_size, _inner_num, _outer_num, compute_size); + // softmax_basic(din, dout, _axis_size, _inner_num, _outer_num, compute_size); +#if 1 + if(this->_inner_num % 4){ + // printf("basic \n"); + softmax_basic(din, dout, _axis_size, _inner_num, _outer_num, compute_size); + }else{ + // printf("lite \n"); + softmax_arm_lite(din, dout, _axis_size, _inner_num, _outer_num, compute_size); + } +#endif } #ifdef ENABLE_OP_TIMER this->_timer.end(); diff --git a/saber/lite/funcs/saber_softmax.h b/saber/lite/funcs/saber_softmax.h index 2686ea2e2..3181f56f9 100755 --- a/saber/lite/funcs/saber_softmax.h +++ b/saber/lite/funcs/saber_softmax.h @@ -53,7 +53,6 @@ class SaberSoftmax : public OpBase { private: const SoftmaxParam* _param; - float* _work_space_data; int _axis_size{0}; int _inner_num{0}; int _outer_num{0}; From 6ebaeb6ca62bc4ed0c4641f58b0d3cc373aa16e7 Mon Sep 17 00:00:00 2001 From: chenjiaoAngel Date: Fri, 24 Aug 2018 16:58:02 +0800 Subject: [PATCH 05/10] fix bug in parse priorbox --- framework/operators/priorbox.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/framework/operators/priorbox.cpp b/framework/operators/priorbox.cpp index 27855904b..7ee221f12 100644 --- a/framework/operators/priorbox.cpp +++ b/framework/operators/priorbox.cpp @@ -23,14 +23,17 @@ Status PriorBoxHelper::InitParam() { if (FIND_PARAMETER(fixed_size)) { fixed_size_ = GET_PARAMETER(PTuple, fixed_size); } + // LOG(ERROR) << "fixed_size size " << fixed_size_.size(); PTuple fixed_ratio_; if (FIND_PARAMETER(fixed_ratio)) { fixed_ratio_ = GET_PARAMETER(PTuple, fixed_ratio);; } + //LOG(ERROR) << "fixed_ratio size " << fixed_ratio_.size(); PTuple density_; if (FIND_PARAMETER(density)) { - auto density_ = GET_PARAMETER(PTuple, density); + density_ = GET_PARAMETER(PTuple, density); } + //LOG(ERROR) << "density_ size " << density_.size(); //end auto max_size_ = GET_PARAMETER(PTuple, max_size); auto as_ratio = GET_PARAMETER(PTuple, aspect_ratio); From 5f8bb54b149c928e794fa22dfcc63da3a7d2a238 Mon Sep 17 00:00:00 2001 From: chenjiaoAngel Date: Sat, 25 Aug 2018 14:09:55 +0800 Subject: [PATCH 06/10] update conv_dw3x3 --- .../funcs/neon/impl/conv_arm_depthwise.cpp | 1397 ++++++++++++++++- 1 file changed, 1389 insertions(+), 8 deletions(-) diff --git a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp index 30a1fac03..ac2691bbb 100644 --- a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp +++ b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp @@ -96,7 +96,7 @@ void conv_depthwise_3x3(const float* din, float* dout, \ } } else { //! stride = 2 if (flag_relu) { - if(w_in > 4){ + if(w_in > 7){ conv_depthwise_3x3s2p1_bias_relu(dout, din, weights, bias, flag_bias, \ num, ch_in, h_in, w_in, h_out, w_out); }else{ @@ -104,7 +104,7 @@ void conv_depthwise_3x3(const float* din, float* dout, \ num, ch_in, h_in, w_in, h_out, w_out); } } else { - if(w_in > 4){ + if(w_in > 7){ conv_depthwise_3x3s2p1_bias(dout, din, weights, bias, flag_bias, \ num, ch_in, h_in, w_in, h_out, w_out); }else{ @@ -3088,6 +3088,465 @@ void conv_depthwise_3x3s1p1_bias(float* dout, const float* din, \ */ #ifdef __aarch64__ //one line +#if 1 +//w_in > 7 +void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out) { + + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int cnt_col = (w_out >> 2) - 1; + int cnt_remain = (w_out % 4); + int size_right_remain = w_in - (7 + cnt_col * 8); + + int size_right_pad = w_out * 2 - w_in; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));//0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));//1 3 5 7 + uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));//0 1 2 3 + // printf("w_out %d, cnt_col: %d, remain: %d \n", w_out, cnt_col, size_right_remain); + //printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]); + //printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]); + //printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]); + // size_right_remain *= sizeof(float); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero= vdupq_n_f32(0.f); + + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + const float *dr0 = din_channel; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + + const float *din0_ptr = dr0; + const float *din1_ptr = dr1; + const float *din2_ptr = dr2; + + float *doutr0 = dout_channel; + float *doutr0_ptr = doutr0; + + //! top pad + if(1){ + int cnt = cnt_col; + + //printf("cnt_col: %d, remain: %d \n", cnt_col, size_right_remain); + // printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]); + // printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]); + // printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]); + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v2 = {0,1,3,5} + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {0,1,3,5} + + "fmul v8.4s, v0.4s, %[w1].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w1].s[0] \n" // v2 * w00 + + "sub %[inptr0], %[inptr0], #4 \n" + "sub %[inptr1], %[inptr1], #4 \n" + + "fmla v8.4s, v4.4s, %[w2].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w2].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w2].s[0] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "cmp %[cnt], #1 \n" + "blt 1f \n" + //mid + "2: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "ld2 {v2.4s, v3.4s}, [%[inptr0]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ext v6.16b, v0.16b, v2.16b, #4 \n" // v6 = {2,4,6,8} + + "fmul v8.4s, v0.4s, %[w1].s[0] \n" // v0 * w00 + "fmul v9.4s, v1.4s, %[w1].s[1] \n" // v1 * w01 + "fmla v10.4s, v6.4s, %[w1].s[2] \n" // v6 * w02 + + "ld2 {v6.4s, v7.4s}, [%[inptr1]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ext v11.16b, v4.16b, v6.16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w2].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w2].s[1] \n" // v1 * w02 + "fmla v10.4s, v11.4s, %[w2].s[2] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "subs %[cnt], %[cnt], #1 \n" + + "st1 {v0.4s}, [%[outptr]], #16 \n" + + "bne 2b \n" + + //right + "1: \n" + "cmp %[remain], #1 \n" + "blt 4f \n" + "3: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, v0.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + + "fmul v8.4s, v0.4s, %[w1].s[0] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w1].s[2] \n" // v2 * w00 + + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, v4.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w2].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w2].s[1] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w2].s[2] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "4: \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [cnt] "+r" (cnt), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : [remain] "r" (cnt_remain) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); + } + + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + doutr0 = doutr0 + w_out; + //! mid + for (int j = h_out - size_pad_bottom - 1; j > 0; j--){ + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + int cnt = cnt_col; + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr2]] \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v2 = {0,1,3,5} + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {0,1,3,5} + + "fmul v8.4s, v0.4s, %[w0].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[0] \n" // v2 * w00 + + "ld2 {v12.4s, v13.4s}, [%[inptr2]], #32 \n" + + "sub %[inptr0], %[inptr0], #4 \n" + "sub %[inptr1], %[inptr1], #4 \n" + + "fmla v8.4s, v4.4s, %[w1].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[0] \n" // v2 * w00 + + "ext v14.16b, %[vzero].16b, v13.16b, #12 \n" // v6 = {0,1,3,5} + "sub %[inptr2], %[inptr2], #4 \n" + + "prfm pldl1keep, [%[inptr0]] \n" + + "fmla v8.4s, v12.4s, %[w2].s[1] \n" // v0 * w01 + "fmla v9.4s, v13.4s, %[w2].s[2] \n" // v1 * w02 + "fmla v10.4s, v14.4s, %[w2].s[0] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr2]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "cmp %[cnt], #1 \n" + "blt 1f \n" + //mid + "2: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "ld2 {v2.4s, v3.4s}, [%[inptr0]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ld2 {v6.4s, v7.4s}, [%[inptr1]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ld2 {v12.4s, v13.4s}, [%[inptr2]], #32 \n" + "ext v11.16b, v0.16b, v2.16b, #4 \n" // v6 = {2,4,6,8} + + "prfm pldl1keep, [%[inptr2]] \n" + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w00 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w01 + "fmla v10.4s, v11.4s, %[w0].s[2] \n" // v6 * w02 + + "ext v11.16b, v4.16b, v6.16b, #4 \n" // v6 = {2,4,6,8} + "ld2 {v14.4s, v15.4s}, [%[inptr2]] \n" + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v11.4s, %[w1].s[2] \n" // v2 * w00 + + "ext v11.16b, v12.16b, v14.16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v12.4s, %[w2].s[0] \n" // v0 * w00 + "fmla v9.4s, v13.4s, %[w2].s[1] \n" // v1 * w01 + "fmla v10.4s, v11.4s, %[w2].s[2] \n" // v6 * w02 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr2]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "subs %[cnt], %[cnt], #1 \n" + + "st1 {v0.4s}, [%[outptr]], #16 \n" + + "bne 2b \n" + + //right + "1: \n" + "cmp %[remain], #1 \n" + "blt 4f \n" + "3: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, v0.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + "ld2 {v12.4s, v13.4s}, [%[inptr2]], #32 \n" + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, v4.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[2] \n" // v2 * w00 + + "bif v12.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v13.16b, %[vzero].16b, %[mask2].16b \n" //pipei + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[2] \n" // v2 * w00 + + "ext v14.16b, v12.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v12.4s, %[w2].s[0] \n" // v0 * w01 + "fmla v9.4s, v13.4s, %[w2].s[1] \n" // v1 * w02 + "fmla v10.4s, v14.4s, %[w2].s[2] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "4: \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [inptr2] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [cnt] "+r" (cnt), [w0] "+w" (wr0), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : [remain] "r" (cnt_remain) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + doutr0 = doutr0 + w_out; + } + + if (size_pad_bottom){ + int cnt = cnt_col; + din0_ptr = dr0; + din1_ptr = dr1; + + doutr0_ptr = doutr0; + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v2 = {0,1,3,5} + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {0,1,3,5} + + "fmul v8.4s, v0.4s, %[w0].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[0] \n" // v2 * w00 + + "sub %[inptr0], %[inptr0], #4 \n" + "sub %[inptr1], %[inptr1], #4 \n" + + "fmla v8.4s, v4.4s, %[w1].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[0] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "cmp %[cnt], #1 \n" + "blt 1f \n" + //mid + "2: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "ld2 {v2.4s, v3.4s}, [%[inptr0]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ext v6.16b, v0.16b, v2.16b, #4 \n" // v6 = {2,4,6,8} + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w00 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w01 + "fmla v10.4s, v6.4s, %[w0].s[2] \n" // v6 * w02 + + "ld2 {v6.4s, v7.4s}, [%[inptr1]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ext v11.16b, v4.16b, v6.16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v11.4s, %[w1].s[2] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "subs %[cnt], %[cnt], #1 \n" + + "st1 {v0.4s}, [%[outptr]], #16 \n" + + "bne 2b \n" + + //right + "1: \n" + "cmp %[remain], #1 \n" + "blt 4f \n" + "3: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, v0.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[2] \n" // v2 * w00 + + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, v4.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[2] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "4: \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w0] "+w" (wr0), [w1] "+w" (wr1), [cnt] "+r" (cnt), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : [remain] "r" (cnt_remain) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); + } + + } + } + + +} +#else void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \ const float* weights, const float* bias, bool flag_bias, \ const int num, const int ch_in, const int h_in, const int w_in, \ @@ -3155,9 +3614,9 @@ void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \ prefetch(din0_ptr); prefetch(din1_ptr); - // todo - float *doutr0_ptr = doutr0; + float *doutr0_ptr = doutr0; + // todo float32x4_t din0_1234 = vld1q_f32(din0_ptr); float32x4_t din1_1234 = vld1q_f32(din1_ptr); @@ -3536,7 +3995,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \ } } } - +#endif #else void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \ @@ -6275,7 +6734,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float* dout, const float* din, \ } } - #else void conv_depthwise_3x3s1p1_bias_relu(float* dout, const float* din, \ @@ -7070,6 +7528,476 @@ void conv_depthwise_3x3s1p1_bias_relu(float* dout, const float* din, \ * \brief depthwise convolution kernel 3x3, stride 2, with reulu */ #ifdef __aarch64__ +#if 1 +//w_in > 7 +void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out) { + + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int cnt_col = (w_out >> 2) - 1; + int cnt_remain = (w_out % 4); + int size_right_remain = w_in - (7 + cnt_col * 8); + + int size_right_pad = w_out * 2 - w_in; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));//0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));//1 3 5 7 + uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));//0 1 2 3 + // printf("w_out %d, cnt_col: %d, remain: %d \n", w_out, cnt_col, size_right_remain); + //printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]); + //printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]); + //printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]); + // size_right_remain *= sizeof(float); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero= vdupq_n_f32(0.f); + + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + const float *dr0 = din_channel; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + + const float *din0_ptr = dr0; + const float *din1_ptr = dr1; + const float *din2_ptr = dr2; + + float *doutr0 = dout_channel; + float *doutr0_ptr = doutr0; + + //! top pad + if(1){ + int cnt = cnt_col; + + //printf("cnt_col: %d, remain: %d \n", cnt_col, size_right_remain); + // printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]); + // printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]); + // printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]); + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v2 = {0,1,3,5} + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {0,1,3,5} + + "fmul v8.4s, v0.4s, %[w1].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w1].s[0] \n" // v2 * w00 + + "sub %[inptr0], %[inptr0], #4 \n" + "sub %[inptr1], %[inptr1], #4 \n" + + "fmla v8.4s, v4.4s, %[w2].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w2].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w2].s[0] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + + "cmp %[cnt], #1 \n" + "st1 {v0.4s}, [%[outptr]], #16 \n" + "blt 1f \n" + //mid + "2: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "ld2 {v2.4s, v3.4s}, [%[inptr0]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ext v6.16b, v0.16b, v2.16b, #4 \n" // v6 = {2,4,6,8} + + "fmul v8.4s, v0.4s, %[w1].s[0] \n" // v0 * w00 + "fmul v9.4s, v1.4s, %[w1].s[1] \n" // v1 * w01 + "fmla v10.4s, v6.4s, %[w1].s[2] \n" // v6 * w02 + + "ld2 {v6.4s, v7.4s}, [%[inptr1]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ext v11.16b, v4.16b, v6.16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w2].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w2].s[1] \n" // v1 * w02 + "fmla v10.4s, v11.4s, %[w2].s[2] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "subs %[cnt], %[cnt], #1 \n" + + "fmax v0.4s, v0.4s, %[vzero].4s \n" + "st1 {v0.4s}, [%[outptr]], #16 \n" + + "bne 2b \n" + + //right + "1: \n" + "cmp %[remain], #1 \n" + "blt 4f \n" + "3: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, v0.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + + "fmul v8.4s, v0.4s, %[w1].s[0] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w1].s[2] \n" // v2 * w00 + + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, v4.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w2].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w2].s[1] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w2].s[2] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "4: \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [cnt] "+r" (cnt), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : [remain] "r" (cnt_remain) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); + } + + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + doutr0 = doutr0 + w_out; + //! mid + for (int j = h_out - size_pad_bottom - 1; j > 0; j--){ + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + int cnt = cnt_col; + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr2]] \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v2 = {0,1,3,5} + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {0,1,3,5} + + "fmul v8.4s, v0.4s, %[w0].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[0] \n" // v2 * w00 + + "ld2 {v12.4s, v13.4s}, [%[inptr2]], #32 \n" + + "sub %[inptr0], %[inptr0], #4 \n" + "sub %[inptr1], %[inptr1], #4 \n" + + "fmla v8.4s, v4.4s, %[w1].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[0] \n" // v2 * w00 + + "ext v14.16b, %[vzero].16b, v13.16b, #12 \n" // v6 = {0,1,3,5} + "sub %[inptr2], %[inptr2], #4 \n" + + "prfm pldl1keep, [%[inptr0]] \n" + + "fmla v8.4s, v12.4s, %[w2].s[1] \n" // v0 * w01 + "fmla v9.4s, v13.4s, %[w2].s[2] \n" // v1 * w02 + "fmla v10.4s, v14.4s, %[w2].s[0] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr2]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + + "cmp %[cnt], #1 \n" + "st1 {v0.4s}, [%[outptr]], #16 \n" + "blt 1f \n" + //mid + "2: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "ld2 {v2.4s, v3.4s}, [%[inptr0]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ld2 {v6.4s, v7.4s}, [%[inptr1]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ld2 {v12.4s, v13.4s}, [%[inptr2]], #32 \n" + "ext v11.16b, v0.16b, v2.16b, #4 \n" // v6 = {2,4,6,8} + + "prfm pldl1keep, [%[inptr2]] \n" + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w00 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w01 + "fmla v10.4s, v11.4s, %[w0].s[2] \n" // v6 * w02 + + "ext v11.16b, v4.16b, v6.16b, #4 \n" // v6 = {2,4,6,8} + "ld2 {v14.4s, v15.4s}, [%[inptr2]] \n" + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v11.4s, %[w1].s[2] \n" // v2 * w00 + + "ext v11.16b, v12.16b, v14.16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v12.4s, %[w2].s[0] \n" // v0 * w00 + "fmla v9.4s, v13.4s, %[w2].s[1] \n" // v1 * w01 + "fmla v10.4s, v11.4s, %[w2].s[2] \n" // v6 * w02 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr2]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "subs %[cnt], %[cnt], #1 \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + + "st1 {v0.4s}, [%[outptr]], #16 \n" + + "bne 2b \n" + + //right + "1: \n" + "cmp %[remain], #1 \n" + "blt 4f \n" + "3: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, v0.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + "ld2 {v12.4s, v13.4s}, [%[inptr2]], #32 \n" + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, v4.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[2] \n" // v2 * w00 + + "bif v12.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v13.16b, %[vzero].16b, %[mask2].16b \n" //pipei + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[2] \n" // v2 * w00 + + "ext v14.16b, v12.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v12.4s, %[w2].s[0] \n" // v0 * w01 + "fmla v9.4s, v13.4s, %[w2].s[1] \n" // v1 * w02 + "fmla v10.4s, v14.4s, %[w2].s[2] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "4: \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [inptr2] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [cnt] "+r" (cnt), [w0] "+w" (wr0), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : [remain] "r" (cnt_remain) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + doutr0 = doutr0 + w_out; + } + + if (size_pad_bottom){ + int cnt = cnt_col; + din0_ptr = dr0; + din1_ptr = dr1; + + doutr0_ptr = doutr0; + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v2 = {0,1,3,5} + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {0,1,3,5} + + "fmul v8.4s, v0.4s, %[w0].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[0] \n" // v2 * w00 + + "sub %[inptr0], %[inptr0], #4 \n" + "sub %[inptr1], %[inptr1], #4 \n" + + "fmla v8.4s, v4.4s, %[w1].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[0] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "fmax v0.4s, v0.4s, %[vzero].4s \n" + + "cmp %[cnt], #1 \n" + "st1 {v0.4s}, [%[outptr]], #16 \n" + "blt 1f \n" + //mid + "2: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "ld2 {v2.4s, v3.4s}, [%[inptr0]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ext v6.16b, v0.16b, v2.16b, #4 \n" // v6 = {2,4,6,8} + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w00 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w01 + "fmla v10.4s, v6.4s, %[w0].s[2] \n" // v6 * w02 + + "ld2 {v6.4s, v7.4s}, [%[inptr1]] \n" //v2={8,10,12,14} v3={9,11,13,15} + "ext v11.16b, v4.16b, v6.16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v11.4s, %[w1].s[2] \n" // v2 * w00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + + "subs %[cnt], %[cnt], #1 \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + + "st1 {v0.4s}, [%[outptr]], #16 \n" + + "bne 2b \n" + + //right + "1: \n" + "cmp %[remain], #1 \n" + "blt 4f \n" + "3: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, v0.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[2] \n" // v2 * w00 + + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, v4.16b, %[vzero].16b, #4 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[2] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "4: \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w0] "+w" (wr0), [w1] "+w" (wr1), [cnt] "+r" (cnt), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : [remain] "r" (cnt_remain) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); + } + + } + } + + +} +#else void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \ const float* weights, const float* bias, bool flag_bias, \ const int num, const int ch_in, const int h_in, const int w_in, \ @@ -7529,7 +8457,7 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \ } } } - +#endif #else void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \ @@ -8881,6 +9809,232 @@ void conv_depthwise_3x3s1p1_bias_s(float* dout, const float* din, \ * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 */ #ifdef __aarch64__ +#if 1 //w <= 7 +void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out) { + + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int size_right_remain = w_in; + + int size_right_pad = w_out * 2 - w_in; + + int cnt_remain = w_out; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));//0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));//1 3 5 7 + uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));//0 1 2 3 + //printf("w_in %d, remain: %d \n", w_in, size_right_remain); + // printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]); + //printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]); + //printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]); + // size_right_remain *= sizeof(float); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero= vdupq_n_f32(0.f); + + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + const float *dr0 = din_channel; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + + const float *din0_ptr = dr0; + const float *din1_ptr = dr1; + const float *din2_ptr = dr2; + + float *doutr0 = dout_channel; + float *doutr0_ptr = doutr0; + + //! top pad + if(1){ + //printf("cnt_col: %d, remain: %d \n", cnt_col, size_right_remain); + // printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]); + // printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]); + // printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]); + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v6 = {0, 1, 3, 5} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + + "fmul v8.4s, v0.4s, %[w1].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w1].s[0] \n" // v2 * w00 + + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w2].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w2].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w2].s[0] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + // "fmax v0.4s, v0.4s, %[vzero].4s \n" + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); + } + + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + doutr0 = doutr0 + w_out; + //! mid + for (int j = h_out - size_pad_bottom - 1; j > 0; j--){ + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v6 = {0,1,3,5} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + "ld2 {v12.4s, v13.4s}, [%[inptr2]], #32 \n" + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {2,4,6,8} + + "fmul v8.4s, v0.4s, %[w0].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[0] \n" // v2 * w00 + + "bif v12.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v13.16b, %[vzero].16b, %[mask2].16b \n" //pipei + + "fmla v8.4s, v4.4s, %[w1].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[0] \n" // v2 * w00 + + "ext v14.16b, %[vzero].16b, v13.16b, #12 \n" // v6 = {0,1,3,5} + + "fmla v8.4s, v12.4s, %[w2].s[1] \n" // v0 * w01 + "fmla v9.4s, v13.4s, %[w2].s[2] \n" // v1 * w02 + "fmla v10.4s, v14.4s, %[w2].s[0] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + // "fmax v0.4s, v0.4s, %[vzero].4s \n" + + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [inptr2] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [w0] "+w" (wr0), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + doutr0 = doutr0 + w_out; + } + + if (size_pad_bottom){ + din0_ptr = dr0; + din1_ptr = dr1; + + doutr0_ptr = doutr0; + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v6 = {2,4,6,8} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[2] \n" // v2 * w00 + + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[2] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + // "fmax v0.4s, v0.4s, %[vzero].4s \n" + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "4: \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w0] "+w" (wr0), [w1] "+w" (wr1), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : [remain] "r" (cnt_remain) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); + } + + } + } + + +} +#else void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* din, \ const float* weights, const float* bias, bool flag_bias, \ const int num, const int ch_in, const int h_in, const int w_in, \ @@ -9144,6 +10298,7 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* din, \ } } } +#endif #else void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* din, \ const float* weights, const float* bias, bool flag_bias, \ @@ -9908,7 +11063,6 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, const float* din, \ } #else - void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, const float* din, \ const float* weights, const float* bias, bool flag_bias, \ const int num, const int ch_in, const int h_in, const int w_in, \ @@ -10279,6 +11433,232 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, const float* din, \ * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 */ #ifdef __aarch64__ +#if 1 //w <= 7 +void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out) { + + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int size_right_remain = w_in; + + int size_right_pad = w_out * 2 - w_in; + + int cnt_remain = w_out; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));//0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));//1 3 5 7 + uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));//0 1 2 3 + //printf("w_in %d, remain: %d \n", w_in, size_right_remain); + // printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]); + //printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]); + //printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]); + // size_right_remain *= sizeof(float); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero= vdupq_n_f32(0.f); + + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + const float *dr0 = din_channel; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + + const float *din0_ptr = dr0; + const float *din1_ptr = dr1; + const float *din2_ptr = dr2; + + float *doutr0 = dout_channel; + float *doutr0_ptr = doutr0; + + //! top pad + if(1){ + //printf("cnt_col: %d, remain: %d \n", cnt_col, size_right_remain); + // printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]); + // printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]); + // printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]); + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v6 = {0, 1, 3, 5} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + + "fmul v8.4s, v0.4s, %[w1].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w1].s[0] \n" // v2 * w00 + + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w2].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w2].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w2].s[0] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); + } + + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + doutr0 = doutr0 + w_out; + //! mid + for (int j = h_out - size_pad_bottom - 1; j > 0; j--){ + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v6 = {0,1,3,5} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + "ld2 {v12.4s, v13.4s}, [%[inptr2]], #32 \n" + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {2,4,6,8} + + "fmul v8.4s, v0.4s, %[w0].s[1] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[2] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[0] \n" // v2 * w00 + + "bif v12.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v13.16b, %[vzero].16b, %[mask2].16b \n" //pipei + + "fmla v8.4s, v4.4s, %[w1].s[1] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[2] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[0] \n" // v2 * w00 + + "ext v14.16b, %[vzero].16b, v13.16b, #12 \n" // v6 = {0,1,3,5} + + "fmla v8.4s, v12.4s, %[w2].s[1] \n" // v0 * w01 + "fmla v9.4s, v13.4s, %[w2].s[2] \n" // v1 * w02 + "fmla v10.4s, v14.4s, %[w2].s[0] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [inptr2] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [w0] "+w" (wr0), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + doutr0 = doutr0 + w_out; + } + + if (size_pad_bottom){ + din0_ptr = dr0; + din1_ptr = dr1; + + doutr0_ptr = doutr0; + + asm volatile ( + //top + // Load up 12 elements (3 vectors) from each of 8 sources. + "0: \n" + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" //v0={0,2,4,6} v1={1,3,5,7} + "ld2 {v4.4s, v5.4s}, [%[inptr1]], #32 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" //v10 = vbias + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" //pipei + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" //pipei + // "bif v10.16b, %[vzero].16b, %[wmask].16b \n" //pipei + "ext v2.16b, %[vzero].16b, v1.16b, #12 \n" // v6 = {2,4,6,8} + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" //pipei + + + "fmul v8.4s, v0.4s, %[w0].s[0] \n" // v0 * w01 + "fmul v9.4s, v1.4s, %[w0].s[1] \n" // v1 * w02 + "fmla v10.4s, v2.4s, %[w0].s[2] \n" // v2 * w00 + + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" //pipei + "ext v6.16b, %[vzero].16b, v5.16b, #12 \n" // v6 = {2,4,6,8} + + "fmla v8.4s, v4.4s, %[w1].s[0] \n" // v0 * w01 + "fmla v9.4s, v5.4s, %[w1].s[1] \n" // v1 * w02 + "fmla v10.4s, v6.4s, %[w1].s[2] \n" // v2 * w00 + + "fadd v0.4s, v8.4s, v9.4s \n" + "fadd v0.4s, v0.4s, v10.4s \n" + "fmax v0.4s, v0.4s, %[vzero].4s \n" + "bif v0.16b, %[vzero].16b, %[wmask].16b \n" //pipei + + "st1 {v0.4s}, [%[outptr]], #16 \n" + "4: \n" + : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \ + [vzero] "+w" (vzero), [w0] "+w" (wr0), [w1] "+w" (wr1), \ + [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias) + : [remain] "r" (cnt_remain) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); + } + + } + } + + +} +#else void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, const float* din, \ const float* weights, const float* bias, bool flag_bias, \ const int num, const int ch_in, const int h_in, const int w_in, \ @@ -10546,6 +11926,7 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, const float* din, \ } } } +#endif #else void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, const float* din, \ const float* weights, const float* bias, bool flag_bias, \ From dd63fae45bb71d2f67bd669be79d0e1f526fbc2b Mon Sep 17 00:00:00 2001 From: lixiaoyang05 Date: Sat, 25 Aug 2018 17:33:55 +0800 Subject: [PATCH 07/10] add shufflechannel param --- saber/lite/funcs/neon/impl/sgemm_conv.cpp | 4 ++ saber/lite/funcs/neon/impl/sgemm_conv.h | 8 +++ saber/lite/funcs/op_param.h | 17 +++++ saber/lite/funcs/saber_shuffle_channel.cpp | 83 ++++++++++++++++++++++ saber/lite/funcs/saber_shuffle_channel.h | 64 +++++++++++++++++ 5 files changed, 176 insertions(+) create mode 100644 saber/lite/funcs/neon/impl/sgemm_conv.cpp create mode 100644 saber/lite/funcs/neon/impl/sgemm_conv.h create mode 100644 saber/lite/funcs/saber_shuffle_channel.cpp create mode 100644 saber/lite/funcs/saber_shuffle_channel.h diff --git a/saber/lite/funcs/neon/impl/sgemm_conv.cpp b/saber/lite/funcs/neon/impl/sgemm_conv.cpp new file mode 100644 index 000000000..1da5b6b8e --- /dev/null +++ b/saber/lite/funcs/neon/impl/sgemm_conv.cpp @@ -0,0 +1,4 @@ +// +// Created by Li,Xiaoyang(SYS) on 2018/8/24. +// + diff --git a/saber/lite/funcs/neon/impl/sgemm_conv.h b/saber/lite/funcs/neon/impl/sgemm_conv.h new file mode 100644 index 000000000..a700f19d2 --- /dev/null +++ b/saber/lite/funcs/neon/impl/sgemm_conv.h @@ -0,0 +1,8 @@ +// +// Created by Li,Xiaoyang(SYS) on 2018/8/24. +// + +#ifndef ANAKIN_SGEMM_CONV_H +#define ANAKIN_SGEMM_CONV_H + +#endif //ANAKIN_SGEMM_CONV_H diff --git a/saber/lite/funcs/op_param.h b/saber/lite/funcs/op_param.h index 8dccd5e80..6d2808ac4 100644 --- a/saber/lite/funcs/op_param.h +++ b/saber/lite/funcs/op_param.h @@ -755,6 +755,23 @@ struct ScaleParam : public ParamBase { const float* _scale_w; const float* _scale_b; }; + +struct ShuffleChannelParam : public ParamBase { + ShuffleChannelParam() : _group(1) {} + ShuffleChannelParam(int group) : _group(group) {} + ShuffleChannelParam(const ShuffleChannelParam& right) : ParamBase(right) { + _group = right._group; + } + ShuffleChannelParam& operator=(const ShuffleChannelParam& right){ + _group = right._group; + return *this; + } + bool operator==(const ShuffleChannelParam& right){ + return _group == right._group; + } + int _group; +}; + } //namespace lite } //namespace saber diff --git a/saber/lite/funcs/saber_shuffle_channel.cpp b/saber/lite/funcs/saber_shuffle_channel.cpp new file mode 100644 index 000000000..72cd14689 --- /dev/null +++ b/saber/lite/funcs/saber_shuffle_channel.cpp @@ -0,0 +1,83 @@ +#include "saber/lite/funcs/saber_flatten.h" +#include "saber/lite/net/saber_factory_lite.h" +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +SaberFlatten::SaberFlatten(const ParamBase *param) { + _param = (const FlattenParam*)param; + this->_flag_param = true; +} + +SaberStatus SaberFlatten::load_param(const ParamBase *param) { + _param = (const FlattenParam*)param; + this->_flag_param = true; + return SaberSuccess; +} + +SaberStatus SaberFlatten::load_param(std::istream &stream, const float *weights) { + this->_flag_param = true; + return SaberSuccess; +} +#if 0 +SaberStatus SaberFlatten::load_param(FILE *fp, const float *weights) { + fscanf(fp, "\n"); + this->_flag_param = true; + return SaberSuccess; +} +#endif +SaberStatus SaberFlatten::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + if (!this->_flag_param) { + printf("load flatten param first\n"); + return SaberNotInitialized; + } + + SaberStatus status; + //! input size is equal to 1 + Shape shape_in = inputs[0]->valid_shape(); + LCHECK_EQ(shape_in.dims(), 4, "only support 4d(NCHW) layout"); + shape_in[1] = inputs[0]->valid_size() / inputs[0]->num(); + shape_in[2] = 1; + shape_in[3] = 1; + return outputs[0]->set_shape(shape_in); +} + +SaberStatus SaberFlatten::init(const std::vector *> &inputs, + std::vector *> &outputs, Context &ctx) { + if (!this->_flag_param) { + printf("load flatten param first\n"); + return SaberNotInitialized; + } + // get context + this->_ctx = &ctx; + //outputs[0]->share_from(*inputs[0]); + this->_flag_init = true; + return SaberSuccess; +} + + +//template +SaberStatus SaberFlatten::dispatch(const std::vector *> &inputs, + std::vector *> &outputs) { + + if (!this->_flag_init) { + printf("init flatten first\n"); + return SaberNotInitialized; + } + return SaberSuccess; +} +REGISTER_LAYER_CLASS(SaberFlatten); +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM + + diff --git a/saber/lite/funcs/saber_shuffle_channel.h b/saber/lite/funcs/saber_shuffle_channel.h new file mode 100644 index 000000000..f5f58bb61 --- /dev/null +++ b/saber/lite/funcs/saber_shuffle_channel.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_FLATTEN_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_FLATTEN_H + +#include "saber/lite/funcs/op_base.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberFlatten : public OpBase { +public: + SaberFlatten() {} + + SaberFlatten(const ParamBase* param); + + virtual SaberStatus load_param(const ParamBase* param) override; + + //virtual SaberStatus load_param(FILE* fp, const float* weights) override; + + virtual SaberStatus load_param(std::istream& stream, const float* weights) override; + + ~SaberFlatten() {} + + virtual SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs) override; + + virtual SaberStatus init(const std::vector*>& inputs, \ + std::vector*>& outputs, Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, \ + std::vector*>& outputs) override; + + +private: + const FlattenParam* _param; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_FC_H From 890b66f056ae271c99a993dbf4e5aa94ed51667e Mon Sep 17 00:00:00 2001 From: lixiaoyang05 Date: Sat, 25 Aug 2018 19:11:26 +0800 Subject: [PATCH 08/10] change conv impl api --- saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp index 30a1fac03..f862be444 100644 --- a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp +++ b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp @@ -59,7 +59,7 @@ void conv_depthwise_3x3(const float* din, float* dout, \ int chin, int hin, int win, \ const float* weights, const float* bias, \ int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ - int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) { + int pad_w, int pad_h, bool flag_bias, bool flag_relu, /*Sgemm& gemmer*/Context* ctx, void* work_space) { int w_in = win; int h_in = hin; From 103dbe289610e85d08b3ed1dae7064b01f99b525 Mon Sep 17 00:00:00 2001 From: lixiaoyang05 Date: Tue, 4 Sep 2018 17:04:39 +0800 Subject: [PATCH 09/10] add bn to arm_lite --- framework/lite/code_gen_cpp.cpp | 1 - framework/lite/op_map_cpp.cpp | 85 ++++++++++++++++++- .../funcs/neon/impl/conv_arm_depthwise.cpp | 2 +- saber/lite/funcs/saber_prelu.h | 61 ------------- saber/lite/funcs/saber_shuffle_channel.cpp | 83 ------------------ saber/lite/funcs/saber_shuffle_channel.h | 64 -------------- 6 files changed, 84 insertions(+), 212 deletions(-) delete mode 100644 saber/lite/funcs/saber_prelu.h delete mode 100644 saber/lite/funcs/saber_shuffle_channel.cpp delete mode 100644 saber/lite/funcs/saber_shuffle_channel.h diff --git a/framework/lite/code_gen_cpp.cpp b/framework/lite/code_gen_cpp.cpp index 38fa55542..3a3d7c00d 100644 --- a/framework/lite/code_gen_cpp.cpp +++ b/framework/lite/code_gen_cpp.cpp @@ -29,7 +29,6 @@ void GenCPP::gen_header_start() { _code<<"#include \n"; _code<<"#include \n"; _code<<"#include \n"; - _code<<"#include \n"; _code<<"#include \n"; _code<<"#include \n"; _code<<"#include \n"; diff --git a/framework/lite/op_map_cpp.cpp b/framework/lite/op_map_cpp.cpp index 43c172f15..98a48b44a 100755 --- a/framework/lite/op_map_cpp.cpp +++ b/framework/lite/op_map_cpp.cpp @@ -1545,19 +1545,24 @@ std::string ParserPriorBox(graph::AttrInfo& attr, //add std::vector fixed_size, fixed_ratio, density; if (find_attr("fixed_size", attr) == SaberSuccess) { - LOG(ERROR) << "not exit"; auto fix_size = get_attr>("fixed_size", attr); fixed_size = fix_size.vector(); + } else { + LOG(WARNING) << "not fixed_size param in priorbox"; } if (find_attr("fixed_ratio", attr) == SaberSuccess) { auto fix_ratio = get_attr>("fixed_ratio", attr); fixed_ratio = fix_ratio.vector(); + } else { + LOG(WARNING) << "not fixed_ratio param in priorbox"; } if (find_attr("density", attr) == SaberSuccess) { auto den = get_attr>("density", attr); density = den.vector(); + } else { + LOG(WARNING) << "not density param in priorbox"; } auto flip_flag = get_attr("is_flip", attr); @@ -1740,7 +1745,7 @@ std::string ParserSlice(graph::AttrInfo& attr, return code_w.get_code_string(); } -// SaberSlice +// SaberScale std::string ParserScale(graph::AttrInfo& attr, std::string& code_name, std::string& op_class_name, @@ -1790,6 +1795,81 @@ std::string ParserScale(graph::AttrInfo& attr, return code_w.get_code_string(); } +// SaberScale +std::string ParserBatchNorm(graph::AttrInfo& attr, + std::string& code_name, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter, + bool gen_param) { + + // get batchnorm param + auto eps = get_attr("epsilon", attr); + auto momentum = get_attr("momentum", attr); + auto mean = get_attr>("weight_1", attr); + auto mean_vec = mean.vector(); + auto var = get_attr>("weight_2", attr); + auto var_vec = var.vector(); + auto scale_factor = get_attr>("weight_3", attr); + auto scale_factor_vec = scale_factor.vector(); + + std::vector scale; + std::vector bias; + scale.resize(mean.count()); + bias.resize(mean.count()); + auto scale_val = scale_factor_vec[0] == 0 ? 0 : 1 / scale_factor_vec[0]; + + for (int i = 0; i < mean.count(); i++) { + scale[i] = 1.0f / std::sqrt(var_vec[i] * scale_val + eps); + bias[i] = - mean_vec[i] * scale_val / std::sqrt(var_vec[i] * scale_val + eps); + } + + Shape sh1 = {1, 1, 1, scale.size()}; + Shape sh2 = {1, 1, 1, bias.size()}; + PBlock pscale(sh1); + PBlock pbias(sh2); + float* pscale_ptr = pscale.h_tensor().mutable_data(); + for (int j = 0; j < scale.size(); ++j) { + pscale_ptr[j] = scale[j]; + } + float* pbias_ptr = pbias.h_tensor().mutable_data(); + for (int j = 0; j < bias.size(); ++j) { + pbias_ptr[j] = bias[j]; + } + + writter.register_weights(node_name, pscale); + LOG(INFO) << node_name << " write weights: " << pscale.count(); + + writter.register_weights(node_name, pbias); + LOG(INFO) << node_name << " write bias: " << pbias.count(); + + auto offset_info = writter.get_weights_by_name(node_name); + + // gen cpp code + CodeWritter code_w; + if (gen_param) { + code_w.feed("%d %d %d %d %d\n", + offset_info.weights[0].offset, + offset_info.weights[1].offset, + 1, + 1, + 1); + } else { + code_w.feed("ParamBase* %s_param = new ScaleParam(%s+%d, %s+%d, %s, %d, %d);\n", + node_name.c_str(), + weights_ptr_name.c_str(), + offset_info.weights[0].offset, + weights_ptr_name.c_str(), + offset_info.weights[1].offset, + "true", + 1, + 1); + + code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); + } + return code_w.get_code_string(); +} // SaberSoftmax std::string ParserSoftmax(graph::AttrInfo& attr, @@ -1923,6 +2003,7 @@ std::unordered_map OPERATION_MAP({ {"PriorBox", {"SaberPriorBox", ParserPriorBox} }, // done {"Power", {"SaberPower", ParserPower} }, // done {"Scale", {"SaberScale", ParserScale} }, // done + {"BatchNorm", {"SaberScale", ParserBatchNorm} }, // done {"Slice", {"SaberSlice", ParserSlice} }, // done {"Flatten", {"SaberFlatten", ParserFlatten}}, //done {"Reshape", {"SaberReshape", ParserReshape}}, //done diff --git a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp index 91add9ffe..ac2691bbb 100644 --- a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp +++ b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp @@ -59,7 +59,7 @@ void conv_depthwise_3x3(const float* din, float* dout, \ int chin, int hin, int win, \ const float* weights, const float* bias, \ int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ - int pad_w, int pad_h, bool flag_bias, bool flag_relu, /*Sgemm& gemmer*/Context* ctx, void* work_space) { + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) { int w_in = win; int h_in = hin; diff --git a/saber/lite/funcs/saber_prelu.h b/saber/lite/funcs/saber_prelu.h deleted file mode 100644 index 12069bc97..000000000 --- a/saber/lite/funcs/saber_prelu.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#ifndef ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H -#define ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H - -#include "saber/lite/funcs/op_base.h" -#if 0 -#ifdef USE_ARM_PLACE -namespace anakin{ - -namespace saber{ - -namespace lite{ - -//template -class SaberPrelu : public OpBase { - -public: - - SaberPrelu() {} - - SaberPrelu(bool flag_shared, const float* weights); - - SaberStatus load_param(bool flag_shared, const float* weights); - - ~SaberPrelu() {} - - virtual SaberStatus compute_output_shape(const std::vector*>& inputs, - std::vector*>& outputs) override; - - virtual SaberStatus init(const std::vector*>& inputs, \ - std::vector*>& outputs, Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector*>& inputs, \ - std::vector*>& outputs) override; - -private: - - bool _flag_shared; - const float* _weights{nullptr}; -}; - -} //namespace lite - -} //namespace saber - -} //namespace anakin -#endif // USE_ARM_PLACE -#endif -#endif //ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H diff --git a/saber/lite/funcs/saber_shuffle_channel.cpp b/saber/lite/funcs/saber_shuffle_channel.cpp deleted file mode 100644 index 72cd14689..000000000 --- a/saber/lite/funcs/saber_shuffle_channel.cpp +++ /dev/null @@ -1,83 +0,0 @@ -#include "saber/lite/funcs/saber_flatten.h" -#include "saber/lite/net/saber_factory_lite.h" -#ifdef USE_ARM_PLACE - -namespace anakin{ - -namespace saber{ - -namespace lite{ - -SaberFlatten::SaberFlatten(const ParamBase *param) { - _param = (const FlattenParam*)param; - this->_flag_param = true; -} - -SaberStatus SaberFlatten::load_param(const ParamBase *param) { - _param = (const FlattenParam*)param; - this->_flag_param = true; - return SaberSuccess; -} - -SaberStatus SaberFlatten::load_param(std::istream &stream, const float *weights) { - this->_flag_param = true; - return SaberSuccess; -} -#if 0 -SaberStatus SaberFlatten::load_param(FILE *fp, const float *weights) { - fscanf(fp, "\n"); - this->_flag_param = true; - return SaberSuccess; -} -#endif -SaberStatus SaberFlatten::compute_output_shape(const std::vector *> &inputs, - std::vector *> &outputs) { - if (!this->_flag_param) { - printf("load flatten param first\n"); - return SaberNotInitialized; - } - - SaberStatus status; - //! input size is equal to 1 - Shape shape_in = inputs[0]->valid_shape(); - LCHECK_EQ(shape_in.dims(), 4, "only support 4d(NCHW) layout"); - shape_in[1] = inputs[0]->valid_size() / inputs[0]->num(); - shape_in[2] = 1; - shape_in[3] = 1; - return outputs[0]->set_shape(shape_in); -} - -SaberStatus SaberFlatten::init(const std::vector *> &inputs, - std::vector *> &outputs, Context &ctx) { - if (!this->_flag_param) { - printf("load flatten param first\n"); - return SaberNotInitialized; - } - // get context - this->_ctx = &ctx; - //outputs[0]->share_from(*inputs[0]); - this->_flag_init = true; - return SaberSuccess; -} - - -//template -SaberStatus SaberFlatten::dispatch(const std::vector *> &inputs, - std::vector *> &outputs) { - - if (!this->_flag_init) { - printf("init flatten first\n"); - return SaberNotInitialized; - } - return SaberSuccess; -} -REGISTER_LAYER_CLASS(SaberFlatten); -} //namespace lite - -} //namespace saber - -} //namespace anakin - -#endif //USE_ARM - - diff --git a/saber/lite/funcs/saber_shuffle_channel.h b/saber/lite/funcs/saber_shuffle_channel.h deleted file mode 100644 index f5f58bb61..000000000 --- a/saber/lite/funcs/saber_shuffle_channel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_FLATTEN_H -#define ANAKIN_SABER_LITE_FUNCS_SABER_FLATTEN_H - -#include "saber/lite/funcs/op_base.h" - -#ifdef USE_ARM_PLACE - -namespace anakin{ - -namespace saber{ - -namespace lite{ - -//template -class SaberFlatten : public OpBase { -public: - SaberFlatten() {} - - SaberFlatten(const ParamBase* param); - - virtual SaberStatus load_param(const ParamBase* param) override; - - //virtual SaberStatus load_param(FILE* fp, const float* weights) override; - - virtual SaberStatus load_param(std::istream& stream, const float* weights) override; - - ~SaberFlatten() {} - - virtual SaberStatus compute_output_shape(const std::vector*>& inputs, - std::vector*>& outputs) override; - - virtual SaberStatus init(const std::vector*>& inputs, \ - std::vector*>& outputs, Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector*>& inputs, \ - std::vector*>& outputs) override; - - -private: - const FlattenParam* _param; -}; - -} //namespace lite - -} //namespace saber - -} //namespace anakin - -#endif // USE_ARM_PLACE - -#endif //ANAKIN_SABER_LITE_FUNCS_SABER_FC_H From f940010eb9d4585836908ac0c3952109a4977ab0 Mon Sep 17 00:00:00 2001 From: Xiaoyang LI Date: Fri, 7 Sep 2018 18:15:19 +0800 Subject: [PATCH 10/10] fix issue 428 fix iOS cmake toolchain build failed on some mac, fix issue 428 --- cmake/ios/ios.toolchain.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/ios/ios.toolchain.cmake b/cmake/ios/ios.toolchain.cmake index 4872605db..e6b56c7a5 100755 --- a/cmake/ios/ios.toolchain.cmake +++ b/cmake/ios/ios.toolchain.cmake @@ -47,8 +47,10 @@ endif (CMAKE_UNAME) # Force the compilers to gcc for iOS include (CMakeForceCompiler) -CMAKE_FORCE_C_COMPILER (/usr/bin/clang Apple) -CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Apple) +set(CMAKE_C_COMPILER /usr/bin/clang) +set(CMAKE_CXX_COMPILER /usr/bin/clang++) +#CMAKE_FORCE_C_COMPILER (/usr/bin/clang Apple) +#CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Apple) set(CMAKE_AR ar CACHE FILEPATH "" FORCE) # Skip the platform compiler checks for cross compiling