diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index b6532f23d3..f8ce75a824 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -648,6 +648,7 @@ public: bool useAVX; bool useAVX2; bool useAVX512; + int blk_size_cn; ParallelConv() : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0), @@ -704,12 +705,17 @@ public: p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D; p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D; - int ncn = std::min(inpCn, (int)BLK_SIZE_CN); - int kernel_d = !isConv2D? kernel_size[0] : 1; int kernel_h = kernel_size[kernel_size.size() - 2]; int kernel_w = kernel_size.back(); + int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h)); + int ncn = 16; + while (ncn*2 < blk_size_cn0 && ncn < inpCn) + ncn *= 2; + ncn = std::min(ncn, inpCn); + p.blk_size_cn = ncn; + int dil_d = !isConv2D? dilations[0] : 1; int dil_h = dilations[dilations.size() - 2]; int dil_w = dilations.back(); @@ -777,18 +783,26 @@ public: int dilation_w = dilations.back(); int i, j, k, d; - size_t inpPlaneSize = input_->total(2); - size_t outPlaneSize = output_->total(2); + int inpPlaneSize = (int)input_->total(2); + int outPlaneSize = (int)output_->total(2); bool is1x1 = is1x1_; int stripesPerSample; - size_t stripeSize; + int stripeSize; Range r = r0; + bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 && + outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 && + width >= 16 + dilation_w*(kernel_w - 1); + // for now only 3x3 depth-wise convolutions are supported + depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 && + // computing at most 1 pixel from each side can involve padding + max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t && + pad_l <= 1 && pad_t <= 1; - if( nstripes >= batchSize*2 ) + if( !depthWiseConvolution && nstripes >= batchSize*2 ) { stripesPerSample = nstripes/batchSize; - stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign); stripeSize = std::min(stripeSize, outPlaneSize); } else @@ -807,20 +821,29 @@ public: const float* biasptr_ = &biasvec_->at(0); const float* reluptr_ = reluslope_->empty() ? 0 : &reluslope_->at(0); float* data_out0_ = output_->ptr<float>(); - size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE; - AutoBuffer<float> rowbuf0_(rowbufsz + valign); - float* rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float))); + AutoBuffer<float> rowbuf0_; + float* rowbuf0 = 0; + bool use_rowbuf = !depthWiseConvolution; + int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize); - // we clear the buffer once; ultimately, it lets us to avoid - // tail processing after running the unrolled/vectorized loop. - // the main idea is to make sure that the tail (a.k.a. padding) of each row - // (i.e. the elements with indices between vsz=karea*ncn and vsz_a) - // does not contain NaNs or Infs. Because the padding in the weights - // matrix is explicitly initialized with 0's, we handle all other - // cases nicely, i.e. we can skip expliciting re-initialization - // of the padding - we just retain elements from the previous iteration - // of the loop over channels (cn0). 
- memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) ); + // im2row buffer is not used for depth-wise convolution + if(use_rowbuf) + { + size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size); + //printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize); + rowbuf0_.allocate(rowbufsz + valign); + rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float))); + // we clear the buffer once; ultimately, it lets us avoid + // tail processing after running the unrolled/vectorized loop. + // the main idea is to make sure that the tail (a.k.a. padding) of each row + // (i.e. the elements with indices between vsz=karea*ncn and vsz_a) + // does not contain NaNs or Infs. Because the padding in the weights + // matrix is explicitly initialized with 0's, we handle all other + // cases nicely, i.e. we can skip explicit re-initialization + // of the padding - we just retain elements from the previous iteration + // of the loop over channels (cn0). + memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) ); + } for( int stripe = r.start; stripe < r.end; stripe++ ) { @@ -835,28 +858,213 @@ public: const float* wptr_orig = wptr_orig_ + wstep*startOutCn; const float* biasptr = biasptr_ + startOutCn; - for( int cn0 = 0; cn0 < inpCn; cn0 += BLK_SIZE_CN ) + for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn ) { - int cn1 = std::min(cn0 + BLK_SIZE_CN, inpCn); + int cn1 = std::min(cn0 + blk_size_cn, inpCn); int ncn = cn1 - cn0, vsz = karea*ncn; int vsz_a = (int)alignSize(vsz, valign); const float* wptr = wptr_orig + cn0*karea; // we apply [Channels][P]ReLU (if any) during the final pass only. const float* relu = cn1 == inpCn && reluptr_ ? reluptr_ + startOutCn : 0; - for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE ) + for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size ) { - int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd); + int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd); + int bsz = ofs1 - ofs0; int out_d = ofs0 / (outH * outW); int out_i = (ofs0 - out_d * outH * outW) / outW; int out_j = ofs0 % outW; + if (depthWiseConvolution) + { + CV_Assert(out_i == 0 && out_j == 0); + int in_d = out_d * stride_d - pad_d; + const float* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width; + float* outptr_ = data_out0 + ofs0; + + #if CV_TRY_AVX2 + if(useAVX2) + opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); + else + #endif + #if CV_TRY_AVX + if(useAVX) + opt_AVX::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); + else + #endif + { + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? 
relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + #if CV_SIMD + // maybe with AVX or AVX512 strided depthwise convolution + // can be accelerated with vector code, but with 4xfloat vectors + // it's hardly the case + if( stride_w == 1 ) + { + const int VECSZ = v_float32::nlanes; + const int out_delta = VECSZ/stride_w; + v_float32 vw00 = vx_setall_f32(w00), vw01 = vx_setall_f32(w01), vw02 = vx_setall_f32(w02), + vw10 = vx_setall_f32(w10), vw11 = vx_setall_f32(w11), vw12 = vx_setall_f32(w12), + vw20 = vx_setall_f32(w20), vw21 = vx_setall_f32(w21), vw22 = vx_setall_f32(w22); + v_float32 z = vx_setzero_f32(), vbias = vx_setall_f32(bias), vrc = vx_setall_f32(relu_coeff); + for( ; out_j < outW1; out_j += out_delta ) + { + if (out_j + out_delta > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - out_delta; + } + int in_j = out_j * stride_w - pad_l; + v_float32 v00 = vx_load(imgptr0 + in_j), + v01 = vx_load(imgptr0 + in_j + dilation_w), + v02 = vx_load(imgptr0 + in_j + dilation_w*2), + v10 = vx_load(imgptr1 + in_j), + v11 = vx_load(imgptr1 + in_j + dilation_w), + v12 = vx_load(imgptr1 + in_j + dilation_w*2), + v20 = vx_load(imgptr2 + in_j), + v21 = vx_load(imgptr2 + in_j + dilation_w), + v22 = vx_load(imgptr2 + in_j + dilation_w*2); + + v_float32 vout = v00*vw00 + v01*vw01 + v02*vw02 + + v10*vw10 + v11*vw11 + v12*vw12 + + v20*vw20 + v21*vw21 + v22*vw22 + vbias; + if (relu) + vout = v_select(vout > z, vout, vout*vrc); + vx_store(outptr + out_j, vout); + } + } + #endif + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[out_j] = out; + } + } + } + continue; + } + // do im2row for a part of input tensor float* rowbuf = rowbuf0; if (isConv2D) { + if( is1x1 && stride_w == 1 && stride_h == 1 ) + { + const float* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j; + for( int j = 0; j < bsz; j++, rowbuf += vsz_a ) + { + if( j + 4 <= bsz ) + { + k = 0; + #if CV_SIMD128 + for( ; k <= vsz - 4; k += 4 ) + { + const float* inp = imgptr + j + k*inpPlaneSize; + v_float32x4 p0 = v_load(inp), p1 = v_load(inp + inpPlaneSize); + v_float32x4 p2 = v_load(inp + inpPlaneSize*2), p3 = v_load(inp + inpPlaneSize*3); + v_float32x4 r0, r1, r2, r3; + v_transpose4x4(p0, p1, p2, p3, r0, r1, r2, r3); + v_store(rowbuf + k, r0); + v_store(rowbuf + k + vsz_a, r1); + v_store(rowbuf + k + vsz_a*2, r2); + v_store(rowbuf + k + vsz_a*3, r3); + } + #endif + for( ; k < vsz; k++ ) + { + const float* inp = imgptr + j + k*inpPlaneSize; + float v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3]; + rowbuf[k] = v0; + rowbuf[k + vsz_a] = v1; + rowbuf[k + vsz_a*2] = v2; + rowbuf[k + vsz_a*3] = v3; + } + j += 3; + rowbuf += vsz_a*3; + } + else + { + for( k = 0; k < vsz; k++ ) + { + rowbuf[k] = imgptr[j + k*inpPlaneSize]; + } + } + } + } + else for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i ) { int delta = std::min(ofs1 - ofs, outW - out_j); @@ -976,7 +1184,6 @@ public: // now compute dot product of the weights // and im2row-transformed part of the tensor - int bsz = ofs1 - ofs0; #if CV_TRY_AVX512_SKX /* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */ if(useAVX512) diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp index b2c0aa6a29..706695a7b2 100644 --- a/modules/dnn/src/layers/layers_common.simd.hpp +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -50,6 +50,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias, const float* rowbuf, float* output, const int* outShape, int blockSize, int vecsize, int vecsize_aligned, const float* relu, bool initOutput ); +void fastDepthwiseConv( const float* weights, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* bias, const float* relu, + const float* inptr, + int height, int width, + float* outptr, + int out_d, int outH, int outW ); void fastGEMM1T( const float* vec, const float* weights, size_t wstep, const float* bias, float* dst, int nvecs, int vecsize ); @@ -64,6 +74,8 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr, #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) #endif +enum { FASCONV_BASE_VECSZ = 4 }; + void fastConv( const float* weights, size_t wstep, const float* bias, const float* rowbuf, float* output, const int* outShape, int blockSize, int vecsize, int vecsize_aligned, @@ -73,6 +85,11 @@ void fastConv( const float* weights, size_t wstep, const float* bias, size_t outPlaneSize = outShape[2]*outShape[3]; float r0 = 1.f, r1 = 1.f, r2 = 1.f; __m128 vr0 = _mm_set1_ps(1.f), vr1 = vr0, vr2 = vr0, z = _mm_setzero_ps(); + int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; + int rsz = blockSize % FASCONV_BASE_VECSZ; + for( int i = 0; i < rsz; i++ ) + maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; + __m128 mask = _mm_loadu_ps((const float*)maskbuf); // now compute dot product of the weights // and im2row-transformed part of the tensor @@ -114,8 +131,16 @@ void fastConv( const float* weights, size_t wstep, const float* 
bias, } int j = 0; - for( ; j <= blockSize - 4; j += 4 ) + for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) { + bool tail = false; + if (j + FASCONV_BASE_VECSZ > blockSize) + { + if (j == 0) + break; + j = blockSize - FASCONV_BASE_VECSZ; + tail = true; + } int k = 0; const float* rptr = rowbuf + j*vecsize_aligned; @@ -243,9 +268,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias, __m128 m0 = _mm_cmp_ps(s0, z, _CMP_GT_OS); __m128 m1 = _mm_cmp_ps(s1, z, _CMP_GT_OS); __m128 m2 = _mm_cmp_ps(s2, z, _CMP_GT_OS); - s0 = _mm_xor_ps(s0, _mm_andnot_ps(m0, _mm_xor_ps(_mm_mul_ps(s0, vr0), s0))); - s1 = _mm_xor_ps(s1, _mm_andnot_ps(m1, _mm_xor_ps(_mm_mul_ps(s1, vr1), s1))); - s2 = _mm_xor_ps(s2, _mm_andnot_ps(m2, _mm_xor_ps(_mm_mul_ps(s2, vr2), s2))); + s0 = _mm_blendv_ps(_mm_mul_ps(s0, vr0), s0, m0); + s1 = _mm_blendv_ps(_mm_mul_ps(s1, vr1), s1, m1); + s2 = _mm_blendv_ps(_mm_mul_ps(s2, vr2), s2, m2); + } + + if( tail ) + { + s0 = _mm_blendv_ps(_mm_loadu_ps(outptr0 + j), s0, mask); + s1 = _mm_blendv_ps(_mm_loadu_ps(outptr1 + j), s1, mask); + s2 = _mm_blendv_ps(_mm_loadu_ps(outptr2 + j), s2, mask); } _mm_storeu_ps(outptr0 + j, s0); @@ -253,9 +285,55 @@ void fastConv( const float* weights, size_t wstep, const float* bias, _mm_storeu_ps(outptr2 + j, s2); } + for( ; j <= blockSize - 2; j += 2 ) + { + const float* rptr0 = rowbuf + j*vecsize_aligned; + const float* rptr1 = rowbuf + (j+1)*vecsize_aligned; + float s00, s01, s10, s11, s20, s21; + + if( initOutput ) + { + s00 = s01 = bias0; + s10 = s11 = bias1; + s20 = s21 = bias2; + } + else + { + s00 = outptr0[j]; s01 = outptr0[j+1]; + s10 = outptr1[j]; s11 = outptr1[j+1]; + s20 = outptr2[j]; s21 = outptr2[j+1]; + } + + for( int k = 0; k < vecsize; k++ ) + { + float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + float r = rptr0[k]; + s00 += w0*r; s10 += w1*r; s20 += w2*r; + r = rptr1[k]; + s01 += w0*r; s11 += w1*r; s21 += w2*r; + } + + if( relu ) + { + s00 = s00 > 0.f ? s00 : s00*r0; + s01 = s01 > 0.f ? s01 : s01*r0; + s10 = s10 > 0.f ? s10 : s10*r1; + s11 = s11 > 0.f ? s11 : s11*r1; + s20 = s20 > 0.f ? s20 : s20*r2; + s21 = s21 > 0.f ? 
s21 : s21*r2; + } + + outptr0[j] = s00; + outptr0[j+1] = s01; + outptr1[j] = s10; + outptr1[j+1] = s11; + outptr2[j] = s20; + outptr2[j+1] = s21; + } + for( ; j < blockSize; j++ ) { - const float* rptr = rowbuf + j*vecsize_aligned; + const float* rptr0 = rowbuf + j*vecsize_aligned; float s00, s10, s20; if( initOutput ) @@ -273,10 +351,9 @@ void fastConv( const float* weights, size_t wstep, const float* bias, for( int k = 0; k < vecsize; k++ ) { - float r0 = rptr[k]; - s00 += wptr0[k]*r0; - s10 += wptr1[k]*r0; - s20 += wptr2[k]*r0; + float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + float r = rptr0[k]; + s00 += w0*r; s10 += w1*r; s20 += w2*r; } if( relu ) @@ -294,6 +371,185 @@ void fastConv( const float* weights, size_t wstep, const float* bias, _mm256_zeroupper(); } +static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b) +{ + __m256 t0 = _mm256_loadu_ps(ptr); + __m256 t1 = _mm256_loadu_ps(ptr + 8); + + __m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16); + __m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16); + a = _mm256_shuffle_ps(lo, hi, 0x88); + b = _mm256_shuffle_ps(lo, hi, 0xdd); +} + +void fastDepthwiseConv( const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW ) +{ + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 8; + __m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02), + vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12), + vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22); + __m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff); + + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00 = _mm256_loadu_ps(imgptr0 + in_j), + v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w), + v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2), + v10 = _mm256_loadu_ps(imgptr1 + in_j), + v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w), + v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2), + v20 = _mm256_loadu_ps(imgptr2 + in_j), + v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w), + v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2); + + __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); + __m256 vout1 = _mm256_mul_ps(v01, vw01); + __m256 vout2 = _mm256_mul_ps(v02, vw02); + + vout0 = _mm256_fmadd_ps(v10, vw10, vout0); + vout1 = _mm256_fmadd_ps(v11, vw11, vout1); + vout2 = _mm256_fmadd_ps(v12, vw12, vout2); + + vout0 = _mm256_fmadd_ps(v20, vw20, vout0); + vout1 = _mm256_fmadd_ps(v21, vw21, vout1); + vout2 = _mm256_fmadd_ps(v22, vw22, vout2); + + vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); + if (relu) + { + __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); + vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); + } + _mm256_storeu_ps(outptr + out_j, vout0); + } + else + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _mm256_load_deinterleave(imgptr0 + in_j, v00, v01); + _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _mm256_load_deinterleave(imgptr1 + in_j, v10, v11); + _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _mm256_load_deinterleave(imgptr2 + in_j, v20, v21); + _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); + __m256 vout1 = _mm256_mul_ps(v01, vw01); + __m256 vout2 = _mm256_mul_ps(v02, vw02); + + vout0 = _mm256_fmadd_ps(v10, vw10, vout0); + vout1 = _mm256_fmadd_ps(v11, vw11, vout1); + vout2 = _mm256_fmadd_ps(v12, vw12, vout2); + + vout0 = _mm256_fmadd_ps(v20, vw20, vout0); + vout1 = _mm256_fmadd_ps(v21, vw21, vout1); + vout2 = _mm256_fmadd_ps(v22, vw22, vout2); + + vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); + if (relu) + { + __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); + vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); + } + _mm256_storeu_ps(outptr + out_j, vout0); + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } + _mm256_zeroupper(); +} + // dst = vec * weights^t + bias void fastGEMM1T( const float* vec, const float* weights, size_t wstep, const float* bias,