From f8507272c0b63bbaaec3f306843e885e0c746766 Mon Sep 17 00:00:00 2001
From: chenjiaoAngel <chenjiao04@baidu.com>
Date: Thu, 23 Aug 2018 17:56:30 +0800
Subject: [PATCH 01/10] fix bug in ios build

---
 saber/lite/funcs/neon/saber_eltwise_act.cpp | 38 ++++++++++-----------
 saber/lite/funcs/saber_eltwise_act.h        |  2 +-
 2 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/saber/lite/funcs/neon/saber_eltwise_act.cpp b/saber/lite/funcs/neon/saber_eltwise_act.cpp
index 2b301cecc..3d3435ce7 100644
--- a/saber/lite/funcs/neon/saber_eltwise_act.cpp
+++ b/saber/lite/funcs/neon/saber_eltwise_act.cpp
@@ -9,32 +9,32 @@ namespace lite{
 
 template <typename Dtype>
 void eltwise_prod_relu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \
-    int channel, int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr);
+    int channel, int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr);
 
 template <typename Dtype>
 void eltwise_sum_relu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \
-     int channel, int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr);
+     int channel, int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr);
 
 template <typename Dtype>
 void eltwise_max_relu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \
-     int channel, int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr);
+     int channel, int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr);
 
 template <typename Dtype>
 void eltwise_prod_prelu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \
-    int channel, int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr);
+    int channel, int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr);
 
 template <typename Dtype>
 void eltwise_sum_prelu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \
-     int channel, int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr);
+     int channel, int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr);
 
 template <typename Dtype>
 void eltwise_max_prelu(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int num, \
-     int channel, int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr);
+     int channel, int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr);
 
 
 template <>
 void eltwise_prod_relu(const float* din_a, const float* din_b, float* dout, const int num, \
-     int channel, int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr) {
+     int channel, int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr) {
 
     int cnt = channel_size >> 3;
     int remain = channel_size & 7;
@@ -95,7 +95,7 @@ void eltwise_prod_relu(const float* din_a, const float* din_b, float* dout, cons
 }
 
 void eltwise_sum_relu(const float* din_a, const float* din_b, float* dout, const int num, \
-     int channel, int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr) {
+     int channel, int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr) {
 
     int cnt = channel_size >> 3;
     int remain = channel_size & 7;
@@ -156,7 +156,7 @@ void eltwise_sum_relu(const float* din_a, const float* din_b, float* dout, const
 }
 
 void eltwise_max_relu(const float* din_a, const float* din_b, float* dout, const int num, \
-     int channel, int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr) {
+     int channel, int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr) {
 
     int cnt = channel_size >> 3;
     int remain = channel_size & 7;
@@ -215,7 +215,7 @@ void eltwise_max_relu(const float* din_a, const float* din_b, float* dout, const
 
 //prelu
 void eltwise_prod_prelu(const float* din_a, const float* din_b, float* dout, const int num, \
-    int channel, int channel_size, std::vector<float> coeff, bool channel_shared, float* slop_ptr) {
+    int channel, int channel_size, std::vector<float> coeff, bool channel_shared, const float* slop_ptr) {
 
     int cnt = channel_size >> 3;
     int remain = channel_size & 7;
@@ -250,7 +250,7 @@ void eltwise_prod_prelu(const float* din_a, const float* din_b, float* dout, con
 
                 float32x4_t vout_sel0 = vbslq_f32(vmask0, vout0, vsum0);
                 float32x4_t vout_sel1 = vbslq_f32(vmask1, vout1, vsum1);
-            
+
                 a_ptr += 8;
                 b_ptr += 8;
 
@@ -259,7 +259,7 @@ void eltwise_prod_prelu(const float* din_a, const float* din_b, float* dout, con
 
                 out_ptr += 8;
             }
-    
+
     #else
             int loop_cnt = cnt;
             if (loop_cnt > 0) {
@@ -304,7 +304,7 @@ void eltwise_prod_prelu(const float* din_a, const float* din_b, float* dout, con
 }
 
 void eltwise_sum_prelu(const float* din_a, const float* din_b, float* dout, const int num, \
-    int channel, int channel_size, std::vector<float> coeff, bool channel_shared, float* slop_ptr) {
+    int channel, int channel_size, std::vector<float> coeff, bool channel_shared, const float* slop_ptr) {
 
     int cnt = channel_size >> 3;
     int remain = channel_size & 7;
@@ -339,7 +339,7 @@ void eltwise_sum_prelu(const float* din_a, const float* din_b, float* dout, cons
 
                 float32x4_t vout_sel0 = vbslq_f32(vmask0, vout0, vsum0);
                 float32x4_t vout_sel1 = vbslq_f32(vmask1, vout1, vsum1);
-            
+
                 a_ptr += 8;
                 b_ptr += 8;
 
@@ -348,7 +348,7 @@ void eltwise_sum_prelu(const float* din_a, const float* din_b, float* dout, cons
 
                 out_ptr += 8;
             }
-    
+
 #else
             int loop_cnt = cnt;
             if (loop_cnt > 0) {
@@ -393,7 +393,7 @@ void eltwise_sum_prelu(const float* din_a, const float* din_b, float* dout, cons
 }
 
 void eltwise_max_prelu(const float* din_a, const float* din_b, float* dout, const int num, \
-    int channel, int channel_size, std::vector<float> coeff, bool channel_shared, float* slop_ptr) {
+    int channel, int channel_size, std::vector<float> coeff, bool channel_shared, const float* slop_ptr) {
 
     int cnt = channel_size >> 3;
     int remain = channel_size & 7;
@@ -428,7 +428,7 @@ void eltwise_max_prelu(const float* din_a, const float* din_b, float* dout, cons
 
                 float32x4_t vout_sel0 = vbslq_f32(vmask0, vout0, vsum0);
                 float32x4_t vout_sel1 = vbslq_f32(vmask1, vout1, vsum1);
-            
+
                 a_ptr += 8;
                 b_ptr += 8;
 
@@ -437,7 +437,7 @@ void eltwise_max_prelu(const float* din_a, const float* din_b, float* dout, cons
 
                 out_ptr += 8;
             }
-    
+
 #else
             int loop_cnt = cnt;
             if (loop_cnt > 0) {
@@ -668,4 +668,4 @@ REGISTER_LAYER_CLASS(SaberEltwiseAct);
 
 } // namespace anakin
 
-#endif //USE_ARM_PLACE
\ No newline at end of file
+#endif //USE_ARM_PLACE
diff --git a/saber/lite/funcs/saber_eltwise_act.h b/saber/lite/funcs/saber_eltwise_act.h
index 685e29348..43da72b5e 100644
--- a/saber/lite/funcs/saber_eltwise_act.h
+++ b/saber/lite/funcs/saber_eltwise_act.h
@@ -26,7 +26,7 @@ namespace lite{
 
 typedef void (*eltwise_act_func)(const float* din_a, \
     const float* din_b, float* dout, const int num, const int channel, \
-    const int channel_size, std::vector<float> coef, bool channel_shared, float* slop_ptr);
+    const int channel_size, std::vector<float> coef, bool channel_shared, const float* slop_ptr);
 
 //template <typename Dtype>
 class SaberEltwiseAct : public OpBase {

From 5596a5e8097f2434f555ac7f9e86f664c9d1925b Mon Sep 17 00:00:00 2001
From: Zhang <zhangxi20@baidu.com>
Date: Thu, 23 Aug 2018 18:58:17 +0800
Subject: [PATCH 02/10] some change for resize

---
 saber/lite/funcs/neon/saber_resize.cpp | 162 ++++++++++++++-----------
 saber/lite/funcs/saber_resize.h        |   4 +
 test/lite/test_resize_lite.cpp         |  33 ++---
 3 files changed, 111 insertions(+), 88 deletions(-)

diff --git a/saber/lite/funcs/neon/saber_resize.cpp b/saber/lite/funcs/neon/saber_resize.cpp
index b38b45a38..2b09d2286 100644
--- a/saber/lite/funcs/neon/saber_resize.cpp
+++ b/saber/lite/funcs/neon/saber_resize.cpp
@@ -12,63 +12,25 @@ namespace saber{
 namespace lite{
 
 
-static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int w_out, int h_out, float scale_x, float scale_y)
-{
+static void resize_spatial(const float* src, int w_in, int h_in, float* dst, \
+                            int w_out, int h_out, float* coor_buf, std::vector<Tensor<CPU, AK_FLOAT>> rows_buf){
 
-    int* buf = new int[w_out + h_out + w_out * 2 + h_out * 2];
+    int* xofs = (int*)coor_buf;
+    int* yofs = xofs + w_out;
 
-    int* xofs = buf;
-    int* yofs = buf + w_out;
+    float* alpha = (float*)yofs + h_out;
+    float* beta = alpha + w_out * 2;
 
-    float* alpha = (float*)(buf + w_out + h_out);
-    float* beta = (float*)(buf + w_out + h_out + w_out * 2);
-
-    float fx;
-    float fy;
-    int sx;
-    int sy;
-
-    for (int dx = 0; dx < w_out; dx++){
-        fx = dx * scale_x;
-        sx = int(fx);
-        fx -= sx;
-
-        if (sx >= w_in - 1){
-            sx = w_in - 2;
-            fx = 1.f;
-        }
-
-        xofs[dx] = sx;
-
-        alpha[dx*2    ] = 1.f - fx;
-        alpha[dx*2 + 1] = fx;
-    }
-
-    for (int dy = 0; dy < h_out; dy++){
-        fy = dy * scale_y;
-        sy = int(fy);
-        fy -= sy;
-
-        if (sy >= h_in - 1)
-        {
-            sy = h_in - 2;
-            fy = 1.f;
-        }
-
-        yofs[dy] = sy;
-
-        beta[dy*2    ] = 1.f - fy;
-        beta[dy*2 + 1] = fy;
-    }
-
-    // loop body
-    float* rowsbuf0 = new float[w_out + 1];
-    float* rowsbuf1 = new float[w_out + 1];
-    float* rows0 = rowsbuf0;
-    float* rows1 = rowsbuf1;
+#ifdef USE_OPENMP
+    int thread_id = omp_get_thread_num();
+#else
+    int thread_id = 0;
+#endif
+    float* rows0 = rows_buf[thread_id * 2].mutable_data();
+    float* rows1 = rows_buf[thread_id * 2 + 1].mutable_data();
 
     int prev_sy1 = -1;
-
+    //main loop 
     for (int dy = 0; dy < h_out; dy++ ){
         int sy = yofs[dy];
 
@@ -83,7 +45,6 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int
             float* rows1p = rows1;
             
             int dx = 0;
-            float* rows1pt = rows1;
             for ( ; dx+1 < w_out; dx += 2 ){
                 int sx = xofs[dx];
                 int sxn = xofs[dx+1];
@@ -103,6 +64,7 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int
                 vst1_f32(rows1p + dx, _rows1);
                 alphap += 4;
 #else
+                float* rows1pt = rows1;
                 asm volatile(
                         "vld1.32 {d0-d1}, [%[alpha]]!       @load alpha to q0\n"
                         "vld1.32 d2, [%[s1p]]               @load s1p to d2  \n"
@@ -176,22 +138,21 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int
                   
                 /*
                 asm volatile(
-                            "pld [%[alpha]]        \n"
-                            "vld1.32 {d0-d1}, [%[alpha]]!      \n"
-                            "vld1.32 d2, [%[s0p]]                  \n"
+                            "pld [%[alpha]]                         \n"
+                            "vld1.32 {d0-d1}, [%[alpha]]!           \n"
+                            "vld1.32 d2, [%[s0p]]                   \n"
                             "vld1.32 d3, [%[s0np]]                  \n"
                             "vld1.32 d4, [%[s1p]]                   \n"
-                            "vld1.32 d5, [%[s1np]]                   \n"
+                            "vld1.32 d5, [%[s1np]]                  \n"
                 
-                            "vmul.f32 q3, q1, q0               \n"
-                            "vmul.f32 q4, q2, q0               \n"
+                            "vmul.f32 q3, q1, q0                    \n"
+                            "vmul.f32 q4, q2, q0                    \n"
 
-                            "vpadd.f32 d10, d6, d7              \n"
-                            "vpadd.f32 d11, d8, d9              \n"
+                            "vpadd.f32 d10, d6, d7                  \n"
+                            "vpadd.f32 d11, d8, d9                  \n"
 
-                            "vst1.32 d10, [%[out1]]!             \n"
-                          
-                            "vst1.32 d11, [%[out2]]!            \n"
+                            "vst1.32 d10, [%[out1]]!                \n"
+                            "vst1.32 d11, [%[out2]]!                \n"
 
                             :[out1]"+r"(rows0pt), [out2]"+r"(rows1pt), [alpha]"+r"(alphap)
                             :[s0p]"r"(S0p), [s1p]"r"(S1p),[s0np]"r"(S0np),[s1np]"r"(S1np)
@@ -257,7 +218,7 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int
         }
         
 #else
-        if(nn > 0){
+        if (nn > 0){
             asm volatile(
                 "vdup.32 q0, %[b0]                   @dup b0 to q1\n"
                 "vdup.32 q1, %[b1]                   @dup b1 to q0\n"
@@ -291,22 +252,19 @@ static void resize_spatial(const float* src, int w_in, int h_in, float* dst, int
         beta += 2;
     }
 
-    delete[] buf;
-
-    delete[] rowsbuf0;
-    delete[] rowsbuf1;
 }
 
 
 void resize(const float* in_data, int count, int h_in, int w_in, \
-            float* out_data, int h_out, int w_out, float width_scale, float height_scale){
+            float* out_data, int h_out, int w_out, float* coor_buf, std::vector<Tensor<CPU, AK_FLOAT>> &rows_buf){
 
     int spatial_in = h_in * w_in;
     int spatial_out = h_out * w_out;
 
 #pragma omp parallel for
     for(int i = 0; i < count; ++i){
-        resize_spatial(in_data + i * spatial_in, w_in, h_in, out_data + i * spatial_out, w_out, h_out, width_scale, height_scale);
+        resize_spatial(in_data + i * spatial_in, w_in, h_in, \
+                        out_data + i * spatial_out, w_out, h_out, coor_buf, rows_buf);
     }
 }
 
@@ -361,10 +319,66 @@ SaberStatus SaberResize::init(const std::vector<Tensor<CPU, AK_FLOAT> *> &inputs
         return SaberNotInitialized;
     }
     this->_ctx = &ctx;
+
     _width_scale = _param->_width_scale;
     _height_scale = _param->_height_scale;
+    
+    int out_w = outputs[0]->valid_shape()[3];
+    int out_h = outputs[0]->valid_shape()[2];
+    int in_w = inputs[0]->valid_shape()[3];
+    int in_h = inputs[0]->valid_shape()[2];
+
+#ifdef USE_OPENMP
+    int num_threads = omp_get_max_threads();
+#else
+    int num_threads = 1;
+#endif
+    //allocate space 
+    Shape sh_coor(1, 1, 1, 3 * (out_w+out_h)); 
+    _coor_buf.re_alloc(sh_coor);
+    _rows_buf.resize(2 * num_threads);
+    //allocate rows buf according to threads num
+    for (int i = 0; i < _rows_buf.size(); ++i){
+        _rows_buf[i].re_alloc(out_w + 1);
+    }
+    int* xofs = (int*)_coor_buf.mutable_data();
+    int* yofs =  xofs + out_w; 
+    float* alpha = (float*)(yofs) + out_h;
+    float* beta = alpha + out_w * 2;
+    float fx, fy;
+    int sx, sy;
+
+    //pre compute coordinate in x and y direction
+    for (int dx = 0; dx < out_w; dx++){
+        fx = dx * (1.0 / _width_scale);
+        sx = int(fx);
+        fx -= sx;
+
+        if (sx >= in_w - 1){
+            sx = in_w - 2;
+            fx = 1.f;
+        }
+        xofs[dx] = sx;
+        alpha[dx * 2    ] = 1.f - fx;
+        alpha[dx * 2 + 1] = fx;
+    }
+
+    for (int dy = 0; dy < out_h; dy++){
+        fy = dy * (1.0 / _height_scale);
+        sy = int(fy);
+        fy -= sy;
+
+        if (sy >= in_h - 1){
+            sy = in_h - 2;
+            fy = 1.f;
+        }
+        yofs[dy] = sy;
+        beta[dy * 2    ] = 1.f - fy;
+        beta[dy * 2 + 1] = fy;
+    }
     this->_flag_init = true;
     return SaberSuccess;
+
 }
 
 
@@ -392,9 +406,7 @@ SaberStatus SaberResize::dispatch(const std::vector<Tensor<CPU, AK_FLOAT>*>& inp
     int out_h = outputs[0]->height();
     int out_w = outputs[0]->width();
 
-
-    resize(din, count, in_h, in_w, dout, out_h, out_w, 1.0f / _width_scale, 1.0f / _height_scale);
-
+    resize(din, count, in_h, in_w, dout, out_h, out_w, _coor_buf.mutable_data(), _rows_buf);
 
 
 #ifdef ENABLE_OP_TIMER
@@ -406,7 +418,9 @@ SaberStatus SaberResize::dispatch(const std::vector<Tensor<CPU, AK_FLOAT>*>& inp
 #endif
     return SaberSuccess;
 }
+
 REGISTER_LAYER_CLASS(SaberResize);
+
 } //namespace lite
 
 } //namespace saber
diff --git a/saber/lite/funcs/saber_resize.h b/saber/lite/funcs/saber_resize.h
index 618d24abc..e7ab6c0ee 100644
--- a/saber/lite/funcs/saber_resize.h
+++ b/saber/lite/funcs/saber_resize.h
@@ -51,6 +51,10 @@ namespace anakin{
 
             private:
                 const ResizeParam* _param;
+                ////coordinate buffer
+                Tensor<CPU, AK_FLOAT> _coor_buf; 
+                //rows buffer
+                std::vector<Tensor<CPU, AK_FLOAT>> _rows_buf;
                 float _width_scale{0.0f};
                 float _height_scale{0.0f};
             };
diff --git a/test/lite/test_resize_lite.cpp b/test/lite/test_resize_lite.cpp
index d8e68bb1b..8e2e13bd1 100644
--- a/test/lite/test_resize_lite.cpp
+++ b/test/lite/test_resize_lite.cpp
@@ -21,9 +21,9 @@ void resize_basic(const float* in_data,int count, int h_in, int w_in, \
 
     int spatial_in = h_in * w_in;
     int spatial_out = h_out * w_out;
-#pragma omp parallel for
-    for(int i = 0; i < count; ++i){
-        for(int s = 0; s < spatial_out; ++s){
+
+    for (int i = 0; i < count; ++i){
+        for (int s = 0; s < spatial_out; ++s){
             int x_out = s % w_out;
             int y_out = s / w_out;
             float x_in = x_out * width_scale;
@@ -67,8 +67,7 @@ TEST(TestSaberLite, test_func_resize_arm) {
     // start Reshape & doInfer
     Context ctx1;
     LOG(INFO) << "set runtine context";
-    PowerMode mode = cluster == 0? SABER_POWER_HIGH : SABER_POWER_LOW;
-    ctx1.set_run_mode(mode, threads);
+    ctx1.set_run_mode((PowerMode)cluster, threads);
     LOG(INFO) << "test threads activated";
 #pragma omp parallel
     {
@@ -139,16 +138,16 @@ TEST(TestSaberLite, test_func_resize_arm) {
         }
     }
 
+    printf("basic resize time: %.4fms\n", basic_tdiff);
+    printf("saber resize total time : %.4fms, avg time : %.4fms\n", sum, sum / test_iter, min_time);
 #if COMPARE_RESULT
     double max_ratio = 0;
     double max_diff = 0;
-
     tensor_cmp_host(tout_basic.data(), tout.data(), tout_basic.valid_size(), max_ratio, max_diff);
     CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error" \
      << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;;
 #endif
-    printf("basic resize time: %.4fms\n", basic_tdiff);
-    printf("saber resize total time : %.4fms, avg time : %.4fms\n", sum, sum / test_iter, min_time);
+    
     //print_tensor(*vin[0]);
     //print_tensor(tout_basic);
     //print_tensor(*vout[0]);
@@ -161,26 +160,32 @@ int main(int argc, const char** argv){
 
     if (argc >= 2) {
         cluster = atoi(argv[1]);
+        if (cluster < 0){
+            cluster = 0;
+        }
+        if (cluster > 3){
+            cluster = 3;
+        }
     }
     if (argc >= 3) {
         threads = atoi(argv[2]);
     }
-    if(argc >= 4){
+    if (argc >= 4){
         num_in = atoi(argv[3]);
     }
-    if(argc >= 5){
+    if (argc >= 5){
         ch_in = atoi(argv[4]);
     }
-    if(argc >= 6){
+    if (argc >= 6){
         h_in = atoi(argv[5]);
     }
-    if(argc >= 7){
+    if (argc >= 7){
         w_in = atoi(argv[6]);
     }
-    if(argc >= 8){
+    if (argc >= 8){
         width_scale = atof(argv[7]);
     }
-    if(argc >= 9){
+    if (argc >= 9){
         height_scale = atof(argv[8]);
     }
     InitTest();

From 1d570e98f79de9b365e073e9f47a371dd19b13cb Mon Sep 17 00:00:00 2001
From: chenjiaoAngel <chenjiao04@baidu.com>
Date: Fri, 24 Aug 2018 13:39:16 +0800
Subject: [PATCH 03/10] update softmax

---
 saber/lite/funcs/saber_softmax.h |  1 +
 test/lite/test_softmax_lite.cpp  | 31 ++++++++++++++++++++++---------
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/saber/lite/funcs/saber_softmax.h b/saber/lite/funcs/saber_softmax.h
index 3181f56f9..2686ea2e2 100755
--- a/saber/lite/funcs/saber_softmax.h
+++ b/saber/lite/funcs/saber_softmax.h
@@ -53,6 +53,7 @@ class SaberSoftmax : public OpBase {
 
 private:
     const SoftmaxParam* _param;
+    float* _work_space_data;
     int _axis_size{0};
     int _inner_num{0};
     int _outer_num{0};
diff --git a/test/lite/test_softmax_lite.cpp b/test/lite/test_softmax_lite.cpp
index bd5f3a147..4e0959e99 100644
--- a/test/lite/test_softmax_lite.cpp
+++ b/test/lite/test_softmax_lite.cpp
@@ -6,7 +6,11 @@ using namespace anakin::saber::lite;
 
 int cluster = 0;
 int threads = 4;
-
+int num = 1;
+int ch = 1971;
+int h = 21;
+int w = 1;
+int axis = 2;
 typedef Tensor<CPU, AK_FLOAT> TensorHf4;
 
 #define COMPARE_RESULT 1
@@ -78,13 +82,13 @@ TEST(TestSaberLite, test_func_softmax_arm) {
 #endif
     }
 
-    int test_iter = 100;
+    int test_iter = 1;
 
-    int softmax_axis = 2; // channel
-    int w_in = 1;
-    int h_in = 21;
-    int ch_in = 1917;
-    int num_in = 1;
+    int softmax_axis = axis; // channel
+    int w_in = w;
+    int h_in = h;
+    int ch_in = ch;
+    int num_in = num;
 
     Shape shape_in(num_in, ch_in, h_in, w_in);
     Shape shape_out = shape_in;
@@ -107,7 +111,7 @@ TEST(TestSaberLite, test_func_softmax_arm) {
 
 #if COMPARE_RESULT
     softmax_basic(thin, softmax_axis, tout_basic);
-    //print_tensor_host(tout_basic);
+    //print_tensor(tout_basic);
 #endif
 
     SaberSoftmax softmax_lite;
@@ -145,7 +149,7 @@ TEST(TestSaberLite, test_func_softmax_arm) {
     }
 
     printf("saber softmax total time : %.4f, avg time : %.4f\n", to, to / test_iter, min_time);
-    //print_tensor_host(*vout[0]);
+    //print_tensor(*vout[0]);
 
 #if COMPARE_RESULT
     double max_ratio = 0;
@@ -170,6 +174,15 @@ int main(int argc, const char** argv){
     if (argc >= 3) {
         threads = atoi(argv[2]);
     }
+    if (argc >= 4) {
+        axis = atoi(argv[3]);
+    }
+    if (argc >= 5 && argc <= 8) {
+        num = atoi(argv[4]);
+        ch = atoi(argv[5]);
+        h = atoi(argv[6]);
+        w = atoi(argv[7]);
+    }
 
     InitTest();
     RUN_ALL_TESTS(argv[0]);

From aa81f67dc0d2191902d9c53e7837aeae8e68555a Mon Sep 17 00:00:00 2001
From: chenjiaoAngel <chenjiao04@baidu.com>
Date: Fri, 24 Aug 2018 13:54:04 +0800
Subject: [PATCH 04/10] update softmax

---
 saber/lite/funcs/neon/saber_softmax.cpp | 106 +++++++++++++++++++++++-
 saber/lite/funcs/saber_softmax.h        |   1 -
 2 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/saber/lite/funcs/neon/saber_softmax.cpp b/saber/lite/funcs/neon/saber_softmax.cpp
index b609d0869..657874bae 100644
--- a/saber/lite/funcs/neon/saber_softmax.cpp
+++ b/saber/lite/funcs/neon/saber_softmax.cpp
@@ -48,6 +48,101 @@ void softmax_basic(const float* din, float* dout, \
     }
 }
 
+void softmax_arm_lite(const float* din, float* dout, \
+    const int axis_size, const int inner_num, \
+    const int outer_num, const int compute_size) {
+
+    int cmp_cnt = compute_size >> 2;
+   // int cmp_remain = compute_size % 4;
+   // printf("axis_size: %d, inner_num: %d, outer_num: %d, compute_size: %d \n", axis_size, inner_num, outer_num, compute_size);
+    //printf("cmp_cnt: %d \n", cmp_cnt);
+    #pragma omp parallel for
+    for (int c = 0; c < cmp_cnt; ++c) {
+        int i = c * 4;
+        int idx_inner = i % inner_num;
+        int idx_outer = (i / inner_num) * axis_size;
+        int real_index = idx_outer * inner_num + idx_inner;
+
+        //float max_data = din[real_index];
+        const float* din_ptr =  din + real_index;
+        float32x4_t vmax = vld1q_f32(din_ptr);
+        //! get max
+        for (int j = 1; j < axis_size; ++j) {
+            din_ptr += inner_num;
+            float32x4_t vdata = vld1q_f32(din_ptr);
+            vmax = vmaxq_f32(vmax, vdata);
+        }
+
+        //! sub, exp and sum
+      //  dout[real_index] = expf(din[real_index] - max_data);
+        din_ptr =  din + real_index;
+        float* dout_ptr = dout + real_index;
+        float32x4_t vdata = vld1q_f32(din_ptr);
+        float32x4_t vsum  = exp_ps(vsubq_f32(vdata, vmax));
+        din_ptr += inner_num;
+        vst1q_f32(dout_ptr, vsum);
+        dout_ptr += inner_num;
+        //float sum_data = dout[real_index];
+        for (int j = 1; j < axis_size; ++j) {
+          //  real_index += inner_num;
+            float32x4_t vdata0 = vld1q_f32(din_ptr);
+            vdata0 = exp_ps(vsubq_f32(vdata0, vmax));
+            din_ptr += inner_num;
+            vsum = vaddq_f32(vsum, vdata0);
+            vst1q_f32(dout_ptr, vdata0);
+            dout_ptr += inner_num;
+        }
+
+      //  float sum_inv = 1.f / sum_data;
+        float32x4_t vone = vdupq_n_f32(1.0f);
+        float32x4_t vinf = div_ps(vone, vsum);
+        dout_ptr = dout + real_index;
+       //printf("real_index: %d, dout: %x, dout_ptr: %x \n", real_index, dout, dout_ptr);
+       // real_index = idx_outer * inner_num + idx_inner;
+        //! get softmax result
+        for (int j = 0; j < axis_size; ++j) {
+            float32x4_t vdata0 = vld1q_f32(dout_ptr);
+            vdata0 = vmulq_f32(vdata0, vinf);
+            vst1q_f32(dout_ptr, vdata0);
+            dout_ptr += inner_num;
+        }
+    }
+
+    for(int i = cmp_cnt * 4; i < compute_size; i++){
+        int idx_inner = i % inner_num;
+        int idx_outer = (i / inner_num) * axis_size;
+        int real_index = idx_outer * inner_num + idx_inner;
+
+      //  printf("real_index: %d, din: %x\n", real_index, din);
+
+        float max_data = din[real_index];
+        //! get max
+        for (int j = 1; j < axis_size; ++j) {
+            real_index += inner_num;
+            max_data = din[real_index] > max_data? din[real_index] : max_data;
+        }
+
+        real_index = idx_outer * inner_num + idx_inner;
+        //! sub, exp and sum
+        dout[real_index] = expf(din[real_index] - max_data);
+        float sum_data = dout[real_index];
+        for (int j = 1; j < axis_size; ++j) {
+            real_index += inner_num;
+            dout[real_index] = expf(din[real_index] - max_data);
+            sum_data += dout[real_index];
+        }
+
+        float sum_inv = 1.f / sum_data;
+        real_index = idx_outer * inner_num + idx_inner;
+        //! get softmax result
+        for (int j = 0; j < axis_size; ++j) {
+            dout[real_index] *= sum_inv;
+            real_index += inner_num;
+        }
+    }
+
+}
+
 //! for inner size == 1
 void softmax_inner1(const float* din, float* dout, \
     const int outer_size, const int axis_size) {
@@ -242,7 +337,16 @@ SaberStatus SaberSoftmax::dispatch(const std::vector<Tensor<CPU, AK_FLOAT>*>& in
         }
     } else {
         int compute_size = inputs[0]->valid_size() / _axis_size;
-        softmax_basic(din, dout, _axis_size, _inner_num, _outer_num, compute_size);
+     //   softmax_basic(din, dout, _axis_size, _inner_num, _outer_num, compute_size);
+#if 1
+        if(this->_inner_num % 4){
+           // printf("basic \n");
+            softmax_basic(din, dout, _axis_size, _inner_num, _outer_num, compute_size);
+        }else{
+           // printf("lite \n");
+            softmax_arm_lite(din, dout, _axis_size, _inner_num, _outer_num, compute_size);
+        }
+#endif
     }
 #ifdef ENABLE_OP_TIMER
     this->_timer.end();
diff --git a/saber/lite/funcs/saber_softmax.h b/saber/lite/funcs/saber_softmax.h
index 2686ea2e2..3181f56f9 100755
--- a/saber/lite/funcs/saber_softmax.h
+++ b/saber/lite/funcs/saber_softmax.h
@@ -53,7 +53,6 @@ class SaberSoftmax : public OpBase {
 
 private:
     const SoftmaxParam* _param;
-    float* _work_space_data;
     int _axis_size{0};
     int _inner_num{0};
     int _outer_num{0};

From 6ebaeb6ca62bc4ed0c4641f58b0d3cc373aa16e7 Mon Sep 17 00:00:00 2001
From: chenjiaoAngel <chenjiao04@baidu.com>
Date: Fri, 24 Aug 2018 16:58:02 +0800
Subject: [PATCH 05/10] fix bug in parse priorbox

---
 framework/operators/priorbox.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/framework/operators/priorbox.cpp b/framework/operators/priorbox.cpp
index 27855904b..7ee221f12 100644
--- a/framework/operators/priorbox.cpp
+++ b/framework/operators/priorbox.cpp
@@ -23,14 +23,17 @@ Status PriorBoxHelper<Ttype, Dtype, Ptype>::InitParam() {
     if (FIND_PARAMETER(fixed_size)) {
         fixed_size_ = GET_PARAMETER(PTuple<float>, fixed_size);
     }
+   // LOG(ERROR) << "fixed_size size " << fixed_size_.size();
     PTuple<float> fixed_ratio_;
     if (FIND_PARAMETER(fixed_ratio)) {
         fixed_ratio_ = GET_PARAMETER(PTuple<float>, fixed_ratio);;
     }
+    //LOG(ERROR) << "fixed_ratio size " << fixed_ratio_.size();
     PTuple<float> density_;
     if (FIND_PARAMETER(density)) {
-        auto density_ = GET_PARAMETER(PTuple<float>, density);
+         density_ = GET_PARAMETER(PTuple<float>, density);
     }
+    //LOG(ERROR) << "density_ size " << density_.size();
     //end
     auto max_size_ = GET_PARAMETER(PTuple<float>, max_size);
     auto as_ratio  = GET_PARAMETER(PTuple<float>, aspect_ratio);

From 5f8bb54b149c928e794fa22dfcc63da3a7d2a238 Mon Sep 17 00:00:00 2001
From: chenjiaoAngel <chenjiao04@baidu.com>
Date: Sat, 25 Aug 2018 14:09:55 +0800
Subject: [PATCH 06/10] update conv_dw3x3

---
 .../funcs/neon/impl/conv_arm_depthwise.cpp    | 1397 ++++++++++++++++-
 1 file changed, 1389 insertions(+), 8 deletions(-)

diff --git a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp
index 30a1fac03..ac2691bbb 100644
--- a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp
+++ b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp
@@ -96,7 +96,7 @@ void conv_depthwise_3x3(const float* din, float* dout, \
         }
     } else { //! stride = 2
         if (flag_relu) {
-            if(w_in > 4){
+            if(w_in > 7){
                 conv_depthwise_3x3s2p1_bias_relu(dout, din, weights, bias, flag_bias, \
                 num, ch_in, h_in, w_in, h_out, w_out);
             }else{
@@ -104,7 +104,7 @@ void conv_depthwise_3x3(const float* din, float* dout, \
                 num, ch_in, h_in, w_in, h_out, w_out);
             }
         } else {
-            if(w_in > 4){
+            if(w_in > 7){
                 conv_depthwise_3x3s2p1_bias(dout, din, weights, bias, flag_bias, \
                 num, ch_in, h_in, w_in, h_out, w_out);
             }else{
@@ -3088,6 +3088,465 @@ void conv_depthwise_3x3s1p1_bias(float* dout, const float* din, \
  */
 #ifdef __aarch64__
 //one line
+#if 1
+//w_in > 7
+void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \
+    const float* weights, const float* bias, bool flag_bias, \
+    const int num, const int ch_in, const int h_in, const int w_in, \
+    const int h_out, const int w_out) {
+
+    int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+    int out_pad_idx[4] = {0, 1, 2, 3};
+    int size_pad_bottom = h_out * 2 - h_in;
+
+    int cnt_col = (w_out >> 2) - 1;
+    int cnt_remain = (w_out % 4);
+    int size_right_remain = w_in - (7 + cnt_col * 8);
+
+    int size_right_pad = w_out * 2 - w_in;
+
+    uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));//0 2 4 6
+    uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));//1 3 5 7
+    uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));//0 1 2 3
+   // printf("w_out %d, cnt_col: %d, remain: %d \n", w_out, cnt_col, size_right_remain);
+    //printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]);
+    //printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]);
+    //printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]);
+   // size_right_remain *= sizeof(float);
+
+    int size_in_channel = w_in * h_in;
+    int size_out_channel = w_out * h_out;
+
+    for (int n = 0; n < num; ++n) {
+        const float *din_batch = din + n * ch_in * size_in_channel;
+        float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+        for (int i = 0; i < ch_in; ++i) {
+            const float* din_channel = din_batch + i * size_in_channel;
+            float* dout_channel = dout_batch + i * size_out_channel;
+
+            const float *weight_ptr = weights + i * 9;
+            float32x4_t wr0 = vld1q_f32(weight_ptr);
+            float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+            float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+            float32x4_t vzero= vdupq_n_f32(0.f);
+
+            float32x4_t wbias;
+            if (flag_bias) {
+                wbias  = vdupq_n_f32(bias[i]);
+            } else {
+                wbias = vdupq_n_f32(0.f);
+            }
+
+            const float *dr0 = din_channel;
+            const float *dr1 = dr0 + w_in;
+            const float *dr2 = dr1 + w_in;
+
+            const float *din0_ptr = dr0;
+            const float *din1_ptr = dr1;
+            const float *din2_ptr = dr2;
+
+            float *doutr0 = dout_channel;
+            float *doutr0_ptr = doutr0;
+
+            //! top pad
+            if(1){
+               int cnt = cnt_col;
+
+               //printf("cnt_col: %d, remain: %d \n", cnt_col, size_right_remain);
+             //  printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]);
+             //  printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]);
+              // printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]);
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v2 = {0,1,3,5}
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12    \n" // v6 = {0,1,3,5}
+
+                "fmul v8.4s, v0.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "sub %[inptr0], %[inptr0], #4            \n"
+                "sub %[inptr1], %[inptr1], #4             \n"
+
+                "fmla v8.4s, v4.4s, %[w2].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w2].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w2].s[0]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "cmp %[cnt], #1                             \n"
+                "blt 1f                                     \n"
+                //mid
+                "2:                                          \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "ld2  {v2.4s, v3.4s}, [%[inptr0]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ext  v6.16b, v0.16b, v2.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmul v8.4s, v0.4s, %[w1].s[0]            \n" // v0 * w00
+                "fmul v9.4s, v1.4s, %[w1].s[1]            \n" // v1 * w01
+                "fmla v10.4s, v6.4s, %[w1].s[2]            \n" // v6 * w02
+
+                "ld2  {v6.4s, v7.4s}, [%[inptr1]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ext  v11.16b, v4.16b, v6.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w2].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w2].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v11.4s, %[w2].s[2]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "subs %[cnt], %[cnt], #1                    \n"
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+
+                "bne  2b                                    \n"
+
+                //right
+                "1:                                          \n"
+                "cmp %[remain], #1                           \n"
+                "blt 4f                                     \n"
+                "3:                                         \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, v0.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+
+                "fmul v8.4s, v0.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, v4.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w2].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w2].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w2].s[2]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "4:                                          \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [cnt] "+r" (cnt), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                : [remain] "r" (cnt_remain)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+                );
+            }
+
+            dr0 = dr1;
+            dr1 = dr2;
+            dr2 = dr1 + w_in;
+            doutr0 = doutr0 + w_out;
+            //! mid
+            for (int j = h_out - size_pad_bottom - 1; j > 0; j--){
+                din0_ptr = dr0;
+                din1_ptr = dr1;
+                din2_ptr = dr2;
+
+                doutr0_ptr = doutr0;
+
+               int cnt = cnt_col;
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "prfm pldl1keep, [%[inptr2]]             \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v2 = {0,1,3,5}
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12    \n" // v6 = {0,1,3,5}
+
+                "fmul v8.4s, v0.4s, %[w0].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[0]            \n" // v2 * w00
+
+                "ld2  {v12.4s, v13.4s}, [%[inptr2]], #32    \n"
+
+                "sub %[inptr0], %[inptr0], #4            \n"
+                "sub %[inptr1], %[inptr1], #4             \n"
+
+                "fmla v8.4s, v4.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "ext  v14.16b, %[vzero].16b, v13.16b, #12    \n" // v6 = {0,1,3,5}
+                "sub %[inptr2], %[inptr2], #4             \n"
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+
+                "fmla v8.4s, v12.4s, %[w2].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v13.4s, %[w2].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v14.4s, %[w2].s[0]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "prfm pldl1keep, [%[inptr2]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "cmp %[cnt], #1                             \n"
+                "blt 1f                                     \n"
+                //mid
+                "2:                                          \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "ld2  {v2.4s, v3.4s}, [%[inptr0]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ld2  {v6.4s, v7.4s}, [%[inptr1]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ld2  {v12.4s, v13.4s}, [%[inptr2]], #32    \n"
+                "ext  v11.16b, v0.16b, v2.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "prfm pldl1keep, [%[inptr2]]             \n"
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w00
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w01
+                "fmla v10.4s, v11.4s, %[w0].s[2]            \n" // v6 * w02
+
+                "ext  v11.16b, v4.16b, v6.16b, #4     \n" // v6 = {2,4,6,8}
+                "ld2  {v14.4s, v15.4s}, [%[inptr2]]    \n"
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v11.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "ext  v11.16b, v12.16b, v14.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v12.4s, %[w2].s[0]            \n" // v0 * w00
+                "fmla v9.4s, v13.4s, %[w2].s[1]            \n" // v1 * w01
+                "fmla v10.4s, v11.4s, %[w2].s[2]            \n" // v6 * w02
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "prfm pldl1keep, [%[inptr2]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "subs %[cnt], %[cnt], #1                    \n"
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+
+                "bne  2b                                    \n"
+
+                //right
+                "1:                                          \n"
+                "cmp %[remain], #1                           \n"
+                "blt 4f                                     \n"
+                "3:                                         \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, v0.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+                "ld2  {v12.4s, v13.4s}, [%[inptr2]], #32    \n"
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, v4.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[2]            \n" // v2 * w00
+
+                "bif  v12.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v13.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "ext  v14.16b, v12.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v12.4s, %[w2].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v13.4s, %[w2].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v14.4s, %[w2].s[2]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "4:                                          \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [inptr2] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [cnt] "+r" (cnt), [w0] "+w" (wr0), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                : [remain] "r" (cnt_remain)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+                );
+                dr0 = dr2;
+                dr1 = dr0 + w_in;
+                dr2 = dr1 + w_in;
+                doutr0 = doutr0 + w_out;
+            }
+
+            if (size_pad_bottom){
+               int cnt = cnt_col;
+                din0_ptr = dr0;
+                din1_ptr = dr1;
+
+                doutr0_ptr = doutr0;
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v2 = {0,1,3,5}
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12    \n" // v6 = {0,1,3,5}
+
+                "fmul v8.4s, v0.4s, %[w0].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[0]            \n" // v2 * w00
+
+                "sub %[inptr0], %[inptr0], #4            \n"
+                "sub %[inptr1], %[inptr1], #4             \n"
+
+                "fmla v8.4s, v4.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "cmp %[cnt], #1                             \n"
+                "blt 1f                                     \n"
+                //mid
+                "2:                                          \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "ld2  {v2.4s, v3.4s}, [%[inptr0]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ext  v6.16b, v0.16b, v2.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w00
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w01
+                "fmla v10.4s, v6.4s, %[w0].s[2]            \n" // v6 * w02
+
+                "ld2  {v6.4s, v7.4s}, [%[inptr1]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ext  v11.16b, v4.16b, v6.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v11.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "subs %[cnt], %[cnt], #1                    \n"
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+
+                "bne  2b                                    \n"
+
+                //right
+                "1:                                          \n"
+                "cmp %[remain], #1                           \n"
+                "blt 4f                                     \n"
+                "3:                                         \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, v0.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[2]            \n" // v2 * w00
+
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, v4.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "4:                                          \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w0] "+w" (wr0), [w1] "+w" (wr1), [cnt] "+r" (cnt), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                : [remain] "r" (cnt_remain)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+                );
+            }
+
+        }
+    }
+
+
+}
+#else
 void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \
     const float* weights, const float* bias, bool flag_bias, \
     const int num, const int ch_in, const int h_in, const int w_in, \
@@ -3155,9 +3614,9 @@ void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \
 
             prefetch(din0_ptr);
             prefetch(din1_ptr);
-            // todo
-            float *doutr0_ptr = doutr0;
 
+            float *doutr0_ptr = doutr0;
+            // todo
             float32x4_t din0_1234 = vld1q_f32(din0_ptr);
             float32x4_t din1_1234 = vld1q_f32(din1_ptr);
 
@@ -3536,7 +3995,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \
         }
     }
 }
-
+#endif
 #else
 
 void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \
@@ -6275,7 +6734,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float* dout, const float* din, \
     }
 
 }
-
 #else
 
 void conv_depthwise_3x3s1p1_bias_relu(float* dout, const float* din, \
@@ -7070,6 +7528,476 @@ void conv_depthwise_3x3s1p1_bias_relu(float* dout, const float* din, \
  * \brief depthwise convolution kernel 3x3, stride 2, with reulu
  */
 #ifdef __aarch64__
+#if 1
+//w_in > 7
+void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \
+    const float* weights, const float* bias, bool flag_bias, \
+    const int num, const int ch_in, const int h_in, const int w_in, \
+    const int h_out, const int w_out) {
+
+    int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+    int out_pad_idx[4] = {0, 1, 2, 3};
+    int size_pad_bottom = h_out * 2 - h_in;
+
+    int cnt_col = (w_out >> 2) - 1;
+    int cnt_remain = (w_out % 4);
+    int size_right_remain = w_in - (7 + cnt_col * 8);
+
+    int size_right_pad = w_out * 2 - w_in;
+
+    uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));//0 2 4 6
+    uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));//1 3 5 7
+    uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));//0 1 2 3
+   // printf("w_out %d, cnt_col: %d, remain: %d \n", w_out, cnt_col, size_right_remain);
+    //printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]);
+    //printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]);
+    //printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]);
+   // size_right_remain *= sizeof(float);
+
+    int size_in_channel = w_in * h_in;
+    int size_out_channel = w_out * h_out;
+
+    for (int n = 0; n < num; ++n) {
+        const float *din_batch = din + n * ch_in * size_in_channel;
+        float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+        for (int i = 0; i < ch_in; ++i) {
+            const float* din_channel = din_batch + i * size_in_channel;
+            float* dout_channel = dout_batch + i * size_out_channel;
+
+            const float *weight_ptr = weights + i * 9;
+            float32x4_t wr0 = vld1q_f32(weight_ptr);
+            float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+            float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+            float32x4_t vzero= vdupq_n_f32(0.f);
+
+            float32x4_t wbias;
+            if (flag_bias) {
+                wbias  = vdupq_n_f32(bias[i]);
+            } else {
+                wbias = vdupq_n_f32(0.f);
+            }
+
+            const float *dr0 = din_channel;
+            const float *dr1 = dr0 + w_in;
+            const float *dr2 = dr1 + w_in;
+
+            const float *din0_ptr = dr0;
+            const float *din1_ptr = dr1;
+            const float *din2_ptr = dr2;
+
+            float *doutr0 = dout_channel;
+            float *doutr0_ptr = doutr0;
+
+            //! top pad
+            if(1){
+               int cnt = cnt_col;
+
+               //printf("cnt_col: %d, remain: %d \n", cnt_col, size_right_remain);
+             //  printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]);
+             //  printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]);
+              // printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]);
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v2 = {0,1,3,5}
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12    \n" // v6 = {0,1,3,5}
+
+                "fmul v8.4s, v0.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "sub %[inptr0], %[inptr0], #4            \n"
+                "sub %[inptr1], %[inptr1], #4             \n"
+
+                "fmla v8.4s, v4.4s, %[w2].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w2].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w2].s[0]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+
+                "cmp %[cnt], #1                             \n"
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "blt 1f                                     \n"
+                //mid
+                "2:                                          \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "ld2  {v2.4s, v3.4s}, [%[inptr0]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ext  v6.16b, v0.16b, v2.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmul v8.4s, v0.4s, %[w1].s[0]            \n" // v0 * w00
+                "fmul v9.4s, v1.4s, %[w1].s[1]            \n" // v1 * w01
+                "fmla v10.4s, v6.4s, %[w1].s[2]            \n" // v6 * w02
+
+                "ld2  {v6.4s, v7.4s}, [%[inptr1]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ext  v11.16b, v4.16b, v6.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w2].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w2].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v11.4s, %[w2].s[2]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "subs %[cnt], %[cnt], #1                    \n"
+
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+
+                "bne  2b                                    \n"
+
+                //right
+                "1:                                          \n"
+                "cmp %[remain], #1                           \n"
+                "blt 4f                                     \n"
+                "3:                                         \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, v0.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+
+                "fmul v8.4s, v0.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, v4.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w2].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w2].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w2].s[2]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "4:                                          \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [cnt] "+r" (cnt), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                : [remain] "r" (cnt_remain)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+                );
+            }
+
+            dr0 = dr1;
+            dr1 = dr2;
+            dr2 = dr1 + w_in;
+            doutr0 = doutr0 + w_out;
+            //! mid
+            for (int j = h_out - size_pad_bottom - 1; j > 0; j--){
+                din0_ptr = dr0;
+                din1_ptr = dr1;
+                din2_ptr = dr2;
+
+                doutr0_ptr = doutr0;
+
+               int cnt = cnt_col;
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "prfm pldl1keep, [%[inptr2]]             \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v2 = {0,1,3,5}
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12    \n" // v6 = {0,1,3,5}
+
+                "fmul v8.4s, v0.4s, %[w0].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[0]            \n" // v2 * w00
+
+                "ld2  {v12.4s, v13.4s}, [%[inptr2]], #32    \n"
+
+                "sub %[inptr0], %[inptr0], #4            \n"
+                "sub %[inptr1], %[inptr1], #4             \n"
+
+                "fmla v8.4s, v4.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "ext  v14.16b, %[vzero].16b, v13.16b, #12    \n" // v6 = {0,1,3,5}
+                "sub %[inptr2], %[inptr2], #4             \n"
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+
+                "fmla v8.4s, v12.4s, %[w2].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v13.4s, %[w2].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v14.4s, %[w2].s[0]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "prfm pldl1keep, [%[inptr2]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s            \n"
+
+                "cmp %[cnt], #1                             \n"
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "blt 1f                                     \n"
+                //mid
+                "2:                                          \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "ld2  {v2.4s, v3.4s}, [%[inptr0]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ld2  {v6.4s, v7.4s}, [%[inptr1]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ld2  {v12.4s, v13.4s}, [%[inptr2]], #32    \n"
+                "ext  v11.16b, v0.16b, v2.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "prfm pldl1keep, [%[inptr2]]             \n"
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w00
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w01
+                "fmla v10.4s, v11.4s, %[w0].s[2]            \n" // v6 * w02
+
+                "ext  v11.16b, v4.16b, v6.16b, #4     \n" // v6 = {2,4,6,8}
+                "ld2  {v14.4s, v15.4s}, [%[inptr2]]    \n"
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v11.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "ext  v11.16b, v12.16b, v14.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v12.4s, %[w2].s[0]            \n" // v0 * w00
+                "fmla v9.4s, v13.4s, %[w2].s[1]            \n" // v1 * w01
+                "fmla v10.4s, v11.4s, %[w2].s[2]            \n" // v6 * w02
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "prfm pldl1keep, [%[inptr2]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "subs %[cnt], %[cnt], #1                    \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s            \n"
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+
+                "bne  2b                                    \n"
+
+                //right
+                "1:                                          \n"
+                "cmp %[remain], #1                           \n"
+                "blt 4f                                     \n"
+                "3:                                         \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, v0.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+                "ld2  {v12.4s, v13.4s}, [%[inptr2]], #32    \n"
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, v4.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[2]            \n" // v2 * w00
+
+                "bif  v12.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v13.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "ext  v14.16b, v12.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v12.4s, %[w2].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v13.4s, %[w2].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v14.4s, %[w2].s[2]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "4:                                          \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [inptr2] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [cnt] "+r" (cnt), [w0] "+w" (wr0), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                : [remain] "r" (cnt_remain)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+                );
+                dr0 = dr2;
+                dr1 = dr0 + w_in;
+                dr2 = dr1 + w_in;
+                doutr0 = doutr0 + w_out;
+            }
+
+            if (size_pad_bottom){
+               int cnt = cnt_col;
+                din0_ptr = dr0;
+                din1_ptr = dr1;
+
+                doutr0_ptr = doutr0;
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v2 = {0,1,3,5}
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12    \n" // v6 = {0,1,3,5}
+
+                "fmul v8.4s, v0.4s, %[w0].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[0]            \n" // v2 * w00
+
+                "sub %[inptr0], %[inptr0], #4            \n"
+                "sub %[inptr1], %[inptr1], #4             \n"
+
+                "fmla v8.4s, v4.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+
+                "cmp %[cnt], #1                             \n"
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "blt 1f                                     \n"
+                //mid
+                "2:                                          \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "ld2  {v2.4s, v3.4s}, [%[inptr0]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ext  v6.16b, v0.16b, v2.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w00
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w01
+                "fmla v10.4s, v6.4s, %[w0].s[2]            \n" // v6 * w02
+
+                "ld2  {v6.4s, v7.4s}, [%[inptr1]]      \n" //v2={8,10,12,14} v3={9,11,13,15}
+                "ext  v11.16b, v4.16b, v6.16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v11.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "prfm pldl1keep, [%[inptr0]]             \n"
+                "prfm pldl1keep, [%[inptr1]]             \n"
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+
+                "subs %[cnt], %[cnt], #1                    \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+
+                "bne  2b                                    \n"
+
+                //right
+                "1:                                          \n"
+                "cmp %[remain], #1                           \n"
+                "blt 4f                                     \n"
+                "3:                                         \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, v0.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[2]            \n" // v2 * w00
+
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, v4.16b, %[vzero].16b, #4     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "4:                                          \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w0] "+w" (wr0), [w1] "+w" (wr1), [cnt] "+r" (cnt), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                : [remain] "r" (cnt_remain)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+                );
+            }
+
+        }
+    }
+
+
+}
+#else
 void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \
     const float* weights, const float* bias, bool flag_bias, \
     const int num, const int ch_in, const int h_in, const int w_in, \
@@ -7529,7 +8457,7 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \
         }
     }
 }
-
+#endif
 #else
 
 void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \
@@ -8881,6 +9809,232 @@ void conv_depthwise_3x3s1p1_bias_s(float* dout, const float* din, \
  * \brief depthwise convolution kernel 3x3, stride 2, width <= 4
  */
 #ifdef __aarch64__
+#if 1  //w <= 7
+void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* din, \
+    const float* weights, const float* bias, bool flag_bias, \
+    const int num, const int ch_in, const int h_in, const int w_in, \
+    const int h_out, const int w_out) {
+
+    int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+    int out_pad_idx[4] = {0, 1, 2, 3};
+    int size_pad_bottom = h_out * 2 - h_in;
+
+    int size_right_remain = w_in;
+
+    int size_right_pad = w_out * 2 - w_in;
+
+    int cnt_remain = w_out;
+
+    uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));//0 2 4 6
+    uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));//1 3 5 7
+    uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));//0 1 2 3
+    //printf("w_in %d, remain: %d \n", w_in, size_right_remain);
+   // printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]);
+    //printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]);
+    //printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]);
+   // size_right_remain *= sizeof(float);
+
+    int size_in_channel = w_in * h_in;
+    int size_out_channel = w_out * h_out;
+
+    for (int n = 0; n < num; ++n) {
+        const float *din_batch = din + n * ch_in * size_in_channel;
+        float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+        for (int i = 0; i < ch_in; ++i) {
+            const float* din_channel = din_batch + i * size_in_channel;
+            float* dout_channel = dout_batch + i * size_out_channel;
+
+            const float *weight_ptr = weights + i * 9;
+            float32x4_t wr0 = vld1q_f32(weight_ptr);
+            float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+            float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+            float32x4_t vzero= vdupq_n_f32(0.f);
+
+            float32x4_t wbias;
+            if (flag_bias) {
+                wbias  = vdupq_n_f32(bias[i]);
+            } else {
+                wbias = vdupq_n_f32(0.f);
+            }
+
+            const float *dr0 = din_channel;
+            const float *dr1 = dr0 + w_in;
+            const float *dr2 = dr1 + w_in;
+
+            const float *din0_ptr = dr0;
+            const float *din1_ptr = dr1;
+            const float *din2_ptr = dr2;
+
+            float *doutr0 = dout_channel;
+            float *doutr0_ptr = doutr0;
+
+            //! top pad
+            if(1){
+               //printf("cnt_col: %d, remain: %d \n", cnt_col, size_right_remain);
+             //  printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]);
+             //  printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]);
+              // printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]);
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v6 = {0, 1, 3, 5}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+
+                "fmul v8.4s, v0.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w2].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w2].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w2].s[0]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+              //  "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+                );
+            }
+
+            dr0 = dr1;
+            dr1 = dr2;
+            dr2 = dr1 + w_in;
+            doutr0 = doutr0 + w_out;
+            //! mid
+            for (int j = h_out - size_pad_bottom - 1; j > 0; j--){
+                din0_ptr = dr0;
+                din1_ptr = dr1;
+                din2_ptr = dr2;
+
+                doutr0_ptr = doutr0;
+
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v6 = {0,1,3,5}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+                "ld2  {v12.4s, v13.4s}, [%[inptr2]], #32    \n"
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12     \n" // v6 = {2,4,6,8}
+
+                "fmul v8.4s, v0.4s, %[w0].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[0]            \n" // v2 * w00
+
+                "bif  v12.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v13.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+
+                "fmla v8.4s, v4.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "ext  v14.16b, %[vzero].16b, v13.16b, #12     \n" // v6 = {0,1,3,5}
+
+                "fmla v8.4s, v12.4s, %[w2].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v13.4s, %[w2].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v14.4s, %[w2].s[0]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+               // "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [inptr2] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [w0] "+w" (wr0), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+                );
+                dr0 = dr2;
+                dr1 = dr0 + w_in;
+                dr2 = dr1 + w_in;
+                doutr0 = doutr0 + w_out;
+            }
+
+            if (size_pad_bottom){
+                din0_ptr = dr0;
+                din1_ptr = dr1;
+
+                doutr0_ptr = doutr0;
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12    \n" // v6 = {2,4,6,8}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[2]            \n" // v2 * w00
+
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+             //   "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "4:                                          \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w0] "+w" (wr0), [w1] "+w" (wr1), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                : [remain] "r" (cnt_remain)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+                );
+            }
+
+        }
+    }
+
+
+}
+#else
 void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* din, \
     const float* weights, const float* bias, bool flag_bias, \
     const int num, const int ch_in, const int h_in, const int w_in, \
@@ -9144,6 +10298,7 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* din, \
         }
     }
 }
+#endif
 #else
 void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* din, \
     const float* weights, const float* bias, bool flag_bias, \
@@ -9908,7 +11063,6 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, const float* din, \
 }
 
 #else
-
 void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, const float* din, \
     const float* weights, const float* bias, bool flag_bias, \
     const int num, const int ch_in, const int h_in, const int w_in, \
@@ -10279,6 +11433,232 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, const float* din, \
  * \brief depthwise convolution kernel 3x3, stride 2, width <= 4
  */
 #ifdef __aarch64__
+#if 1  //w <= 7
+void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, const float* din, \
+    const float* weights, const float* bias, bool flag_bias, \
+    const int num, const int ch_in, const int h_in, const int w_in, \
+    const int h_out, const int w_out) {
+
+    int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+    int out_pad_idx[4] = {0, 1, 2, 3};
+    int size_pad_bottom = h_out * 2 - h_in;
+
+    int size_right_remain = w_in;
+
+    int size_right_pad = w_out * 2 - w_in;
+
+    int cnt_remain = w_out;
+
+    uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));//0 2 4 6
+    uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));//1 3 5 7
+    uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));//0 1 2 3
+    //printf("w_in %d, remain: %d \n", w_in, size_right_remain);
+   // printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]);
+    //printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]);
+    //printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]);
+   // size_right_remain *= sizeof(float);
+
+    int size_in_channel = w_in * h_in;
+    int size_out_channel = w_out * h_out;
+
+    for (int n = 0; n < num; ++n) {
+        const float *din_batch = din + n * ch_in * size_in_channel;
+        float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+        for (int i = 0; i < ch_in; ++i) {
+            const float* din_channel = din_batch + i * size_in_channel;
+            float* dout_channel = dout_batch + i * size_out_channel;
+
+            const float *weight_ptr = weights + i * 9;
+            float32x4_t wr0 = vld1q_f32(weight_ptr);
+            float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
+            float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
+
+            float32x4_t vzero= vdupq_n_f32(0.f);
+
+            float32x4_t wbias;
+            if (flag_bias) {
+                wbias  = vdupq_n_f32(bias[i]);
+            } else {
+                wbias = vdupq_n_f32(0.f);
+            }
+
+            const float *dr0 = din_channel;
+            const float *dr1 = dr0 + w_in;
+            const float *dr2 = dr1 + w_in;
+
+            const float *din0_ptr = dr0;
+            const float *din1_ptr = dr1;
+            const float *din2_ptr = dr2;
+
+            float *doutr0 = dout_channel;
+            float *doutr0_ptr = doutr0;
+
+            //! top pad
+            if(1){
+               //printf("cnt_col: %d, remain: %d \n", cnt_col, size_right_remain);
+             //  printf("mask1: %d, %d, %d, %d \n", vmask_rp1[0], vmask_rp1[1], vmask_rp1[2], vmask_rp1[3]);
+             //  printf("mask2: %d, %d, %d, %d \n", vmask_rp2[0], vmask_rp2[1], vmask_rp2[2], vmask_rp2[3]);
+              // printf("wmask: %d, %d, %d, %d \n", wmask[0], wmask[1], wmask[2], wmask[3]);
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v6 = {0, 1, 3, 5}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+
+                "fmul v8.4s, v0.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w2].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w2].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w2].s[0]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+                );
+            }
+
+            dr0 = dr1;
+            dr1 = dr2;
+            dr2 = dr1 + w_in;
+            doutr0 = doutr0 + w_out;
+            //! mid
+            for (int j = h_out - size_pad_bottom - 1; j > 0; j--){
+                din0_ptr = dr0;
+                din1_ptr = dr1;
+                din2_ptr = dr2;
+
+                doutr0_ptr = doutr0;
+
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12     \n" // v6 = {0,1,3,5}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+                "ld2  {v12.4s, v13.4s}, [%[inptr2]], #32    \n"
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12     \n" // v6 = {2,4,6,8}
+
+                "fmul v8.4s, v0.4s, %[w0].s[1]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[0]            \n" // v2 * w00
+
+                "bif  v12.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v13.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+
+                "fmla v8.4s, v4.4s, %[w1].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[0]            \n" // v2 * w00
+
+                "ext  v14.16b, %[vzero].16b, v13.16b, #12     \n" // v6 = {0,1,3,5}
+
+                "fmla v8.4s, v12.4s, %[w2].s[1]            \n" // v0 * w01
+                "fmla v9.4s, v13.4s, %[w2].s[2]            \n" // v1 * w02
+                "fmla v10.4s, v14.4s, %[w2].s[0]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [inptr2] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w1] "+w" (wr1), [w2] "+w" (wr2), [w0] "+w" (wr0), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+                );
+                dr0 = dr2;
+                dr1 = dr0 + w_in;
+                dr2 = dr1 + w_in;
+                doutr0 = doutr0 + w_out;
+            }
+
+            if (size_pad_bottom){
+                din0_ptr = dr0;
+                din1_ptr = dr1;
+
+                doutr0_ptr = doutr0;
+
+                asm volatile (
+                //top
+                // Load up 12 elements (3 vectors) from each of 8 sources.
+                "0:                                      \n"
+                "ld2  {v0.4s, v1.4s}, [%[inptr0]], #32    \n" //v0={0,2,4,6} v1={1,3,5,7}
+                "ld2  {v4.4s, v5.4s}, [%[inptr1]], #32    \n"
+                "and  v10.16b, %[vbias].16b, %[vbias].16b  \n" //v10 = vbias
+                "bif  v0.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+                "bif  v1.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+               // "bif  v10.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+                "ext  v2.16b, %[vzero].16b, v1.16b, #12    \n" // v6 = {2,4,6,8}
+                "bif  v4.16b, %[vzero].16b, %[mask1].16b    \n" //pipei
+
+
+                "fmul v8.4s, v0.4s, %[w0].s[0]            \n" // v0 * w01
+                "fmul v9.4s, v1.4s, %[w0].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v2.4s, %[w0].s[2]            \n" // v2 * w00
+
+                "bif  v5.16b, %[vzero].16b, %[mask2].16b    \n" //pipei
+                "ext  v6.16b, %[vzero].16b, v5.16b, #12     \n" // v6 = {2,4,6,8}
+
+                "fmla v8.4s, v4.4s, %[w1].s[0]            \n" // v0 * w01
+                "fmla v9.4s, v5.4s, %[w1].s[1]            \n" // v1 * w02
+                "fmla v10.4s, v6.4s, %[w1].s[2]            \n" // v2 * w00
+
+                "fadd v0.4s, v8.4s, v9.4s                  \n"
+                "fadd v0.4s, v0.4s, v10.4s                  \n"
+                "fmax  v0.4s, v0.4s, %[vzero].4s             \n"
+                "bif  v0.16b, %[vzero].16b, %[wmask].16b    \n" //pipei
+
+                "st1 {v0.4s}, [%[outptr]], #16              \n"
+                "4:                                          \n"
+                : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr), \
+                  [vzero] "+w" (vzero), [w0] "+w" (wr0), [w1] "+w" (wr1), \
+                  [mask1] "+w" (vmask_rp1), [mask2] "+w" (vmask_rp2), [wmask] "+w" (wmask), [vbias] "+w" (wbias)
+                : [remain] "r" (cnt_remain)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+                );
+            }
+
+        }
+    }
+
+
+}
+#else
 void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, const float* din, \
     const float* weights, const float* bias, bool flag_bias, \
     const int num, const int ch_in, const int h_in, const int w_in, \
@@ -10546,6 +11926,7 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, const float* din, \
         }
     }
 }
+#endif
 #else
 void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, const float* din, \
     const float* weights, const float* bias, bool flag_bias, \

From dd63fae45bb71d2f67bd669be79d0e1f526fbc2b Mon Sep 17 00:00:00 2001
From: lixiaoyang05 <lixiaoyang05@baidu.com>
Date: Sat, 25 Aug 2018 17:33:55 +0800
Subject: [PATCH 07/10] add shufflechannel param

---
 saber/lite/funcs/neon/impl/sgemm_conv.cpp  |  4 ++
 saber/lite/funcs/neon/impl/sgemm_conv.h    |  8 +++
 saber/lite/funcs/op_param.h                | 17 +++++
 saber/lite/funcs/saber_shuffle_channel.cpp | 83 ++++++++++++++++++++++
 saber/lite/funcs/saber_shuffle_channel.h   | 64 +++++++++++++++++
 5 files changed, 176 insertions(+)
 create mode 100644 saber/lite/funcs/neon/impl/sgemm_conv.cpp
 create mode 100644 saber/lite/funcs/neon/impl/sgemm_conv.h
 create mode 100644 saber/lite/funcs/saber_shuffle_channel.cpp
 create mode 100644 saber/lite/funcs/saber_shuffle_channel.h

diff --git a/saber/lite/funcs/neon/impl/sgemm_conv.cpp b/saber/lite/funcs/neon/impl/sgemm_conv.cpp
new file mode 100644
index 000000000..1da5b6b8e
--- /dev/null
+++ b/saber/lite/funcs/neon/impl/sgemm_conv.cpp
@@ -0,0 +1,4 @@
+//
+// Created by Li,Xiaoyang(SYS) on 2018/8/24.
+//
+
diff --git a/saber/lite/funcs/neon/impl/sgemm_conv.h b/saber/lite/funcs/neon/impl/sgemm_conv.h
new file mode 100644
index 000000000..a700f19d2
--- /dev/null
+++ b/saber/lite/funcs/neon/impl/sgemm_conv.h
@@ -0,0 +1,8 @@
+//
+// Created by Li,Xiaoyang(SYS) on 2018/8/24.
+//
+
+#ifndef ANAKIN_SGEMM_CONV_H
+#define ANAKIN_SGEMM_CONV_H
+
+#endif //ANAKIN_SGEMM_CONV_H
diff --git a/saber/lite/funcs/op_param.h b/saber/lite/funcs/op_param.h
index 8dccd5e80..6d2808ac4 100644
--- a/saber/lite/funcs/op_param.h
+++ b/saber/lite/funcs/op_param.h
@@ -755,6 +755,23 @@ struct ScaleParam : public ParamBase {
     const float*  _scale_w;
     const float*  _scale_b;
 };
+
+struct ShuffleChannelParam : public ParamBase {
+    ShuffleChannelParam() : _group(1) {}
+    ShuffleChannelParam(int group) : _group(group) {}
+    ShuffleChannelParam(const ShuffleChannelParam& right) : ParamBase(right) {
+        _group = right._group;
+    }
+    ShuffleChannelParam& operator=(const ShuffleChannelParam& right){
+        _group = right._group;
+        return *this;
+    }
+    bool operator==(const ShuffleChannelParam& right){
+        return _group == right._group;
+    }
+    int _group;
+};
+
 } //namespace lite
 
 } //namespace saber
diff --git a/saber/lite/funcs/saber_shuffle_channel.cpp b/saber/lite/funcs/saber_shuffle_channel.cpp
new file mode 100644
index 000000000..72cd14689
--- /dev/null
+++ b/saber/lite/funcs/saber_shuffle_channel.cpp
@@ -0,0 +1,83 @@
+#include "saber/lite/funcs/saber_flatten.h"
+#include "saber/lite/net/saber_factory_lite.h"
+#ifdef USE_ARM_PLACE
+
+namespace anakin{
+
+namespace saber{
+
+namespace lite{
+
+SaberFlatten::SaberFlatten(const ParamBase *param) {
+    _param = (const FlattenParam*)param;
+    this->_flag_param = true;
+}
+
+SaberStatus SaberFlatten::load_param(const ParamBase *param) {
+    _param = (const FlattenParam*)param;
+    this->_flag_param = true;
+    return SaberSuccess;
+}
+
+SaberStatus SaberFlatten::load_param(std::istream &stream, const float *weights) {
+    this->_flag_param = true;
+    return SaberSuccess;
+}
+#if 0
+SaberStatus SaberFlatten::load_param(FILE *fp, const float *weights) {
+    fscanf(fp, "\n");
+    this->_flag_param = true;
+    return SaberSuccess;
+}
+#endif
+SaberStatus SaberFlatten::compute_output_shape(const std::vector<Tensor<CPU, AK_FLOAT> *> &inputs,
+                                             std::vector<Tensor<CPU, AK_FLOAT> *> &outputs) {
+    if (!this->_flag_param) {
+        printf("load flatten param first\n");
+        return SaberNotInitialized;
+    }
+
+    SaberStatus status;
+    //! input size is equal to 1
+    Shape shape_in = inputs[0]->valid_shape();
+    LCHECK_EQ(shape_in.dims(), 4, "only support 4d(NCHW) layout");
+    shape_in[1] = inputs[0]->valid_size() / inputs[0]->num();
+    shape_in[2] = 1;
+    shape_in[3] = 1;
+    return outputs[0]->set_shape(shape_in);
+}
+
+SaberStatus SaberFlatten::init(const std::vector<Tensor<CPU, AK_FLOAT> *> &inputs,
+                             std::vector<Tensor<CPU, AK_FLOAT> *> &outputs, Context &ctx) {
+    if (!this->_flag_param) {
+        printf("load flatten param first\n");
+        return SaberNotInitialized;
+    }
+    // get context
+    this->_ctx = &ctx;
+    //outputs[0]->share_from(*inputs[0]);
+    this->_flag_init = true;
+    return SaberSuccess;
+}
+
+
+//template <typename Dtype>
+SaberStatus SaberFlatten::dispatch(const std::vector<Tensor<CPU, AK_FLOAT> *> &inputs,
+                                 std::vector<Tensor<CPU, AK_FLOAT> *> &outputs) {
+
+    if (!this->_flag_init) {
+        printf("init flatten first\n");
+        return SaberNotInitialized;
+    }
+    return SaberSuccess;
+}
+REGISTER_LAYER_CLASS(SaberFlatten);
+} //namespace lite
+
+} //namespace saber
+
+} //namespace anakin
+
+#endif //USE_ARM
+
+
diff --git a/saber/lite/funcs/saber_shuffle_channel.h b/saber/lite/funcs/saber_shuffle_channel.h
new file mode 100644
index 000000000..f5f58bb61
--- /dev/null
+++ b/saber/lite/funcs/saber_shuffle_channel.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_FLATTEN_H
+#define ANAKIN_SABER_LITE_FUNCS_SABER_FLATTEN_H
+
+#include "saber/lite/funcs/op_base.h"
+
+#ifdef USE_ARM_PLACE
+
+namespace anakin{
+
+namespace saber{
+
+namespace lite{
+
+//template <typename Dtype>
+class SaberFlatten : public OpBase {
+public:
+    SaberFlatten() {}
+
+    SaberFlatten(const ParamBase* param);
+
+    virtual SaberStatus load_param(const ParamBase* param) override;
+
+    //virtual SaberStatus load_param(FILE* fp, const float* weights) override;
+
+    virtual SaberStatus load_param(std::istream& stream, const float* weights) override;
+
+    ~SaberFlatten() {}
+
+    virtual SaberStatus compute_output_shape(const std::vector<Tensor<CPU, AK_FLOAT>*>& inputs,
+                                     std::vector<Tensor<CPU, AK_FLOAT>*>& outputs) override;
+
+    virtual SaberStatus init(const std::vector<Tensor<CPU, AK_FLOAT>*>& inputs, \
+        std::vector<Tensor<CPU, AK_FLOAT>*>& outputs, Context &ctx) override;
+
+    virtual SaberStatus dispatch(const std::vector<Tensor<CPU, AK_FLOAT>*>& inputs, \
+        std::vector<Tensor<CPU, AK_FLOAT>*>& outputs) override;
+
+
+private:
+    const FlattenParam* _param;
+};
+
+} //namespace lite
+
+} //namespace saber
+
+} //namespace anakin
+
+#endif // USE_ARM_PLACE
+
+#endif //ANAKIN_SABER_LITE_FUNCS_SABER_FC_H

From 890b66f056ae271c99a993dbf4e5aa94ed51667e Mon Sep 17 00:00:00 2001
From: lixiaoyang05 <lixiaoyang05@baidu.com>
Date: Sat, 25 Aug 2018 19:11:26 +0800
Subject: [PATCH 08/10] change conv impl api

---
 saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp
index 30a1fac03..f862be444 100644
--- a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp
+++ b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp
@@ -59,7 +59,7 @@ void conv_depthwise_3x3(const float* din, float* dout, \
                           int chin, int hin, int win, \
                           const float* weights, const float* bias, \
                           int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \
-                          int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) {
+                          int pad_w, int pad_h, bool flag_bias, bool flag_relu, /*Sgemm& gemmer*/Context* ctx, void* work_space) {
 
     int w_in = win;
     int h_in = hin;

From 103dbe289610e85d08b3ed1dae7064b01f99b525 Mon Sep 17 00:00:00 2001
From: lixiaoyang05 <lixiaoyang05@baidu.com>
Date: Tue, 4 Sep 2018 17:04:39 +0800
Subject: [PATCH 09/10] add bn to arm_lite

---
 framework/lite/code_gen_cpp.cpp               |  1 -
 framework/lite/op_map_cpp.cpp                 | 85 ++++++++++++++++++-
 .../funcs/neon/impl/conv_arm_depthwise.cpp    |  2 +-
 saber/lite/funcs/saber_prelu.h                | 61 -------------
 saber/lite/funcs/saber_shuffle_channel.cpp    | 83 ------------------
 saber/lite/funcs/saber_shuffle_channel.h      | 64 --------------
 6 files changed, 84 insertions(+), 212 deletions(-)
 delete mode 100644 saber/lite/funcs/saber_prelu.h
 delete mode 100644 saber/lite/funcs/saber_shuffle_channel.cpp
 delete mode 100644 saber/lite/funcs/saber_shuffle_channel.h

diff --git a/framework/lite/code_gen_cpp.cpp b/framework/lite/code_gen_cpp.cpp
index 38fa55542..3a3d7c00d 100644
--- a/framework/lite/code_gen_cpp.cpp
+++ b/framework/lite/code_gen_cpp.cpp
@@ -29,7 +29,6 @@ void GenCPP<Ttype, Dtype, Ptype>::gen_header_start() {
     _code<<"#include <saber/lite/funcs/saber_eltwise.h>\n";
     _code<<"#include <saber/lite/funcs/saber_eltwise_act.h>\n";
     _code<<"#include <saber/lite/funcs/saber_permute.h>\n";
-    _code<<"#include <saber/lite/funcs/saber_prelu.h>\n";
     _code<<"#include <saber/lite/funcs/saber_power.h>\n";
     _code<<"#include <saber/lite/funcs/saber_priorbox.h>\n";
     _code<<"#include <saber/lite/funcs/saber_scale.h>\n";
diff --git a/framework/lite/op_map_cpp.cpp b/framework/lite/op_map_cpp.cpp
index 43c172f15..98a48b44a 100755
--- a/framework/lite/op_map_cpp.cpp
+++ b/framework/lite/op_map_cpp.cpp
@@ -1545,19 +1545,24 @@ std::string ParserPriorBox(graph::AttrInfo& attr,
     //add
     std::vector<float> fixed_size, fixed_ratio, density;
     if (find_attr("fixed_size", attr) == SaberSuccess) {
-        LOG(ERROR) << "not exit";
         auto fix_size  = get_attr<PTuple<float>>("fixed_size", attr);
         fixed_size = fix_size.vector();
+    } else {
+        LOG(WARNING) << "not fixed_size param in priorbox";
     }
 
     if (find_attr("fixed_ratio", attr) == SaberSuccess) {
         auto fix_ratio  = get_attr<PTuple<float>>("fixed_ratio", attr);
         fixed_ratio = fix_ratio.vector();
+    } else {
+        LOG(WARNING) << "not fixed_ratio param in priorbox";
     }
 
     if (find_attr("density", attr) == SaberSuccess) {
         auto den = get_attr<PTuple<float>>("density", attr);
         density = den.vector();
+    } else {
+        LOG(WARNING) << "not density param in priorbox";
     }
 
     auto flip_flag = get_attr<bool>("is_flip", attr); 
@@ -1740,7 +1745,7 @@ std::string ParserSlice(graph::AttrInfo& attr,
 	return code_w.get_code_string();
 }
 
-// SaberSlice
+// SaberScale
 std::string ParserScale(graph::AttrInfo& attr,
                         std::string& code_name,
                         std::string& op_class_name,
@@ -1790,6 +1795,81 @@ std::string ParserScale(graph::AttrInfo& attr,
     return code_w.get_code_string();
 }
 
+// SaberScale
+std::string ParserBatchNorm(graph::AttrInfo& attr,
+                            std::string& code_name,
+                            std::string& op_class_name,
+                            std::string& node_name,
+                            std::string& weights_ptr_name,
+                            WeightsWritter& writter,
+                            bool gen_param) {
+
+    // get batchnorm param
+    auto eps = get_attr<float>("epsilon", attr);
+    auto momentum = get_attr<float>("momentum", attr);
+    auto mean = get_attr<PBlock<float, X86>>("weight_1", attr);
+    auto mean_vec = mean.vector();
+    auto var = get_attr<PBlock<float, X86>>("weight_2", attr);
+    auto var_vec = var.vector();
+    auto scale_factor = get_attr<PBlock<float, X86>>("weight_3", attr);
+    auto scale_factor_vec = scale_factor.vector();
+
+    std::vector<float> scale;
+    std::vector<float> bias;
+    scale.resize(mean.count());
+    bias.resize(mean.count());
+    auto scale_val = scale_factor_vec[0] == 0 ? 0 : 1 / scale_factor_vec[0];
+
+    for (int i = 0; i < mean.count(); i++) {
+        scale[i] = 1.0f / std::sqrt(var_vec[i] * scale_val + eps);
+        bias[i] = - mean_vec[i] * scale_val / std::sqrt(var_vec[i] * scale_val + eps);
+    }
+
+    Shape sh1 = {1, 1, 1, scale.size()};
+    Shape sh2 = {1, 1, 1, bias.size()};
+    PBlock<float, X86> pscale(sh1);
+    PBlock<float, X86> pbias(sh2);
+    float* pscale_ptr = pscale.h_tensor().mutable_data();
+    for (int j = 0; j < scale.size(); ++j) {
+        pscale_ptr[j] = scale[j];
+    }
+    float* pbias_ptr = pbias.h_tensor().mutable_data();
+    for (int j = 0; j < bias.size(); ++j) {
+        pbias_ptr[j] = bias[j];
+    }
+
+    writter.register_weights(node_name, pscale);
+            LOG(INFO) << node_name << " write weights: " << pscale.count();
+
+    writter.register_weights(node_name, pbias);
+            LOG(INFO) << node_name << " write bias: " << pbias.count();
+
+    auto offset_info = writter.get_weights_by_name(node_name);
+
+    // gen cpp code
+    CodeWritter code_w;
+    if (gen_param) {
+        code_w.feed("%d %d %d %d %d\n",
+                    offset_info.weights[0].offset,
+                    offset_info.weights[1].offset,
+                    1,
+                    1,
+                    1);
+    } else {
+        code_w.feed("ParamBase* %s_param = new ScaleParam(%s+%d, %s+%d, %s, %d, %d);\n",
+                    node_name.c_str(),
+                    weights_ptr_name.c_str(),
+                    offset_info.weights[0].offset,
+                    weights_ptr_name.c_str(),
+                    offset_info.weights[1].offset,
+                    "true",
+                    1,
+                    1);
+
+        code_w.feed("    %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str());
+    }
+    return code_w.get_code_string();
+}
 
 // SaberSoftmax
 std::string ParserSoftmax(graph::AttrInfo& attr,
@@ -1923,6 +2003,7 @@ std::unordered_map<std::string, OpParser> OPERATION_MAP({
 	{"PriorBox", {"SaberPriorBox", ParserPriorBox} }, // done
 	{"Power", {"SaberPower", ParserPower} }, // done
 	{"Scale", {"SaberScale", ParserScale} }, // done
+    {"BatchNorm", {"SaberScale", ParserBatchNorm} }, // done
 	{"Slice", {"SaberSlice", ParserSlice} }, // done
     {"Flatten", {"SaberFlatten", ParserFlatten}}, //done
     {"Reshape", {"SaberReshape", ParserReshape}}, //done
diff --git a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp
index 91add9ffe..ac2691bbb 100644
--- a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp
+++ b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp
@@ -59,7 +59,7 @@ void conv_depthwise_3x3(const float* din, float* dout, \
                           int chin, int hin, int win, \
                           const float* weights, const float* bias, \
                           int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \
-                          int pad_w, int pad_h, bool flag_bias, bool flag_relu, /*Sgemm& gemmer*/Context* ctx, void* work_space) {
+                          int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) {
 
     int w_in = win;
     int h_in = hin;
diff --git a/saber/lite/funcs/saber_prelu.h b/saber/lite/funcs/saber_prelu.h
deleted file mode 100644
index 12069bc97..000000000
--- a/saber/lite/funcs/saber_prelu.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-*/
-#ifndef ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H
-#define ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H
-
-#include "saber/lite/funcs/op_base.h"
-#if 0
-#ifdef USE_ARM_PLACE
-namespace anakin{
-
-namespace saber{
-
-namespace lite{
-
-//template <typename Dtype>
-class SaberPrelu : public OpBase {
-
-public:
-
-    SaberPrelu() {}
-
-    SaberPrelu(bool flag_shared, const float* weights);
-
-    SaberStatus load_param(bool flag_shared, const float* weights);
-
-    ~SaberPrelu() {}
-
-    virtual SaberStatus compute_output_shape(const std::vector<Tensor<CPU, AK_FLOAT>*>& inputs,
-                                     std::vector<Tensor<CPU, AK_FLOAT>*>& outputs) override;
-
-    virtual SaberStatus init(const std::vector<Tensor<CPU, AK_FLOAT>*>& inputs, \
-        std::vector<Tensor<CPU, AK_FLOAT>*>& outputs,  Context &ctx) override;
-
-    virtual SaberStatus dispatch(const std::vector<Tensor<CPU, AK_FLOAT>*>& inputs, \
-        std::vector<Tensor<CPU, AK_FLOAT>*>& outputs) override;
-
-private:
-
-    bool _flag_shared;
-    const float* _weights{nullptr};
-};
-
-} //namespace lite
-
-} //namespace saber
-
-} //namespace anakin
-#endif // USE_ARM_PLACE
-#endif
-#endif //ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H
diff --git a/saber/lite/funcs/saber_shuffle_channel.cpp b/saber/lite/funcs/saber_shuffle_channel.cpp
deleted file mode 100644
index 72cd14689..000000000
--- a/saber/lite/funcs/saber_shuffle_channel.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-#include "saber/lite/funcs/saber_flatten.h"
-#include "saber/lite/net/saber_factory_lite.h"
-#ifdef USE_ARM_PLACE
-
-namespace anakin{
-
-namespace saber{
-
-namespace lite{
-
-SaberFlatten::SaberFlatten(const ParamBase *param) {
-    _param = (const FlattenParam*)param;
-    this->_flag_param = true;
-}
-
-SaberStatus SaberFlatten::load_param(const ParamBase *param) {
-    _param = (const FlattenParam*)param;
-    this->_flag_param = true;
-    return SaberSuccess;
-}
-
-SaberStatus SaberFlatten::load_param(std::istream &stream, const float *weights) {
-    this->_flag_param = true;
-    return SaberSuccess;
-}
-#if 0
-SaberStatus SaberFlatten::load_param(FILE *fp, const float *weights) {
-    fscanf(fp, "\n");
-    this->_flag_param = true;
-    return SaberSuccess;
-}
-#endif
-SaberStatus SaberFlatten::compute_output_shape(const std::vector<Tensor<CPU, AK_FLOAT> *> &inputs,
-                                             std::vector<Tensor<CPU, AK_FLOAT> *> &outputs) {
-    if (!this->_flag_param) {
-        printf("load flatten param first\n");
-        return SaberNotInitialized;
-    }
-
-    SaberStatus status;
-    //! input size is equal to 1
-    Shape shape_in = inputs[0]->valid_shape();
-    LCHECK_EQ(shape_in.dims(), 4, "only support 4d(NCHW) layout");
-    shape_in[1] = inputs[0]->valid_size() / inputs[0]->num();
-    shape_in[2] = 1;
-    shape_in[3] = 1;
-    return outputs[0]->set_shape(shape_in);
-}
-
-SaberStatus SaberFlatten::init(const std::vector<Tensor<CPU, AK_FLOAT> *> &inputs,
-                             std::vector<Tensor<CPU, AK_FLOAT> *> &outputs, Context &ctx) {
-    if (!this->_flag_param) {
-        printf("load flatten param first\n");
-        return SaberNotInitialized;
-    }
-    // get context
-    this->_ctx = &ctx;
-    //outputs[0]->share_from(*inputs[0]);
-    this->_flag_init = true;
-    return SaberSuccess;
-}
-
-
-//template <typename Dtype>
-SaberStatus SaberFlatten::dispatch(const std::vector<Tensor<CPU, AK_FLOAT> *> &inputs,
-                                 std::vector<Tensor<CPU, AK_FLOAT> *> &outputs) {
-
-    if (!this->_flag_init) {
-        printf("init flatten first\n");
-        return SaberNotInitialized;
-    }
-    return SaberSuccess;
-}
-REGISTER_LAYER_CLASS(SaberFlatten);
-} //namespace lite
-
-} //namespace saber
-
-} //namespace anakin
-
-#endif //USE_ARM
-
-
diff --git a/saber/lite/funcs/saber_shuffle_channel.h b/saber/lite/funcs/saber_shuffle_channel.h
deleted file mode 100644
index f5f58bb61..000000000
--- a/saber/lite/funcs/saber_shuffle_channel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-*/
-#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_FLATTEN_H
-#define ANAKIN_SABER_LITE_FUNCS_SABER_FLATTEN_H
-
-#include "saber/lite/funcs/op_base.h"
-
-#ifdef USE_ARM_PLACE
-
-namespace anakin{
-
-namespace saber{
-
-namespace lite{
-
-//template <typename Dtype>
-class SaberFlatten : public OpBase {
-public:
-    SaberFlatten() {}
-
-    SaberFlatten(const ParamBase* param);
-
-    virtual SaberStatus load_param(const ParamBase* param) override;
-
-    //virtual SaberStatus load_param(FILE* fp, const float* weights) override;
-
-    virtual SaberStatus load_param(std::istream& stream, const float* weights) override;
-
-    ~SaberFlatten() {}
-
-    virtual SaberStatus compute_output_shape(const std::vector<Tensor<CPU, AK_FLOAT>*>& inputs,
-                                     std::vector<Tensor<CPU, AK_FLOAT>*>& outputs) override;
-
-    virtual SaberStatus init(const std::vector<Tensor<CPU, AK_FLOAT>*>& inputs, \
-        std::vector<Tensor<CPU, AK_FLOAT>*>& outputs, Context &ctx) override;
-
-    virtual SaberStatus dispatch(const std::vector<Tensor<CPU, AK_FLOAT>*>& inputs, \
-        std::vector<Tensor<CPU, AK_FLOAT>*>& outputs) override;
-
-
-private:
-    const FlattenParam* _param;
-};
-
-} //namespace lite
-
-} //namespace saber
-
-} //namespace anakin
-
-#endif // USE_ARM_PLACE
-
-#endif //ANAKIN_SABER_LITE_FUNCS_SABER_FC_H

From f940010eb9d4585836908ac0c3952109a4977ab0 Mon Sep 17 00:00:00 2001
From: Xiaoyang LI <lxy890123@tju.edu.cn>
Date: Fri, 7 Sep 2018 18:15:19 +0800
Subject: [PATCH 10/10] fix issue 428

fix iOS cmake toolchain build failed on some mac,  fix issue 428
---
 cmake/ios/ios.toolchain.cmake | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cmake/ios/ios.toolchain.cmake b/cmake/ios/ios.toolchain.cmake
index 4872605db..e6b56c7a5 100755
--- a/cmake/ios/ios.toolchain.cmake
+++ b/cmake/ios/ios.toolchain.cmake
@@ -47,8 +47,10 @@ endif (CMAKE_UNAME)
 
 # Force the compilers to gcc for iOS
 include (CMakeForceCompiler)
-CMAKE_FORCE_C_COMPILER (/usr/bin/clang Apple)
-CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Apple)
+set(CMAKE_C_COMPILER /usr/bin/clang)
+set(CMAKE_CXX_COMPILER /usr/bin/clang++)
+#CMAKE_FORCE_C_COMPILER (/usr/bin/clang Apple)
+#CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Apple)
 set(CMAKE_AR ar CACHE FILEPATH "" FORCE)
 
 # Skip the platform compiler checks for cross compiling