// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx). // Here is the original license: /* This file is a part of ficus language project. See ficus/LICENSE for the licensing terms */ #include "../../precomp.hpp" #include "fast_convolution.hpp" namespace cv { namespace dnn { static void depthWiseBlock(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab, float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left, int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop, int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3) { #if CV_SIMD128 const int VEC_NLANES = 4; v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval); v_float32x4 w0 = v_setall_f32( 0.f), w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0; if (useSIMD) { vbias = v_setall_f32(biasval); if (is3x3) { w0 = v_setall_f32(weights[0]); w1 = v_setall_f32(weights[1]); w2 = v_setall_f32(weights[2]); w3 = v_setall_f32(weights[3]); w4 = v_setall_f32(weights[4]); w5 = v_setall_f32(weights[5]); w6 = v_setall_f32(weights[6]); w7 = v_setall_f32(weights[7]); w8 = v_setall_f32(weights[8]); } } #endif int dy0 = 1; for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0) { #if CV_SIMD128 dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1 ? 3 : 1; #endif int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0; int yi_ = y0 * stride_y - pad_top; for (;;) { float s_0, s_1, s_2; if (dy0 == 3) { for (; x0 < x1; x0++) { int xi_ = x0 * stride_x - pad_left; s_0 = s_1 = s_2 = biasval; for (int k = 0; k < ksize; k++) { int dy = yxtab[k * 2]; int yi = yi_ + dy; int xi = xi_ + yxtab[k * 2 + 1]; float w = weights[k]; if ((unsigned) xi < (unsigned) Wi) { s_0 += inptr[yi * Wi + xi] * w; s_1 += inptr[(yi + 1) * Wi + xi] * w; s_2 += inptr[(yi + 2) * Wi + xi] * w; } } s_0 = std::min(std::max(s_0, minval), maxval); s_1 = std::min(std::max(s_1, minval), maxval); s_2 = std::min(std::max(s_2, minval), maxval); outptr[x0] = s_0; outptr[x0 + W0] = s_1; outptr[x0 + W0 * 2] = s_2; } } else { for (; x0 < x1; x0++) { int xi_ = x0 * stride_x - pad_left; s_0 = biasval; for (int k = 0; k < ksize; k++) { int dy = yxtab[k * 2]; int yi = yi_ + dy; int xi = xi_ + yxtab[k * 2 + 1]; float w = weights[k]; if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi)) s_0 += inptr[yi * Wi + xi] * w; } s_0 = std::min(std::max(s_0, minval), maxval); outptr[x0] = s_0; } } if (x0 == W0) break; x1 = inner_xright; #if CV_SIMD128 if (useSIMD) { if (is3x3) { if (dy0 == 3) { for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES) { int xi_ = x0 * stride_x - pad_left; const float *inptr_xi = inptr + Wi * yi_ + xi_; v_float32x4 s0, s1, s2; v_float32x4 x00 = v_load(inptr_xi); v_float32x4 x01 = v_load(inptr_xi + 1); v_float32x4 x02 = v_load(inptr_xi + 2); v_float32x4 x10 = v_load(inptr_xi + Wi); v_float32x4 x11 = v_load(inptr_xi + Wi + 1); v_float32x4 x12 = v_load(inptr_xi + Wi + 2); v_float32x4 x20 = v_load(inptr_xi + Wi * 2); v_float32x4 x21 = v_load(inptr_xi + Wi * 2 + 1); v_float32x4 x22 = v_load(inptr_xi + Wi * 2 + 2); v_float32x4 x30 = v_load(inptr_xi + Wi * 3); v_float32x4 x31 = v_load(inptr_xi + Wi * 3 + 1); v_float32x4 x32 = v_load(inptr_xi + Wi * 3 + 2); v_float32x4 x40 = v_load(inptr_xi + Wi * 4); v_float32x4 x41 = v_load(inptr_xi + Wi * 4 + 1); v_float32x4 x42 = v_load(inptr_xi + Wi * 4 + 2); s0 = v_fma(x00, w0, vbias); s1 = v_fma(x10, w0, vbias); s2 = v_fma(x20, w0, vbias); s0 = v_fma(x01, w1, s0); s1 = v_fma(x11, w1, s1); s2 = v_fma(x21, w1, s2); s0 = v_fma(x02, w2, s0); s1 = v_fma(x12, w2, s1); s2 = v_fma(x22, w2, s2); s0 = v_fma(x10, w3, s0); s1 = v_fma(x20, w3, s1); s2 = v_fma(x30, w3, s2); s0 = v_fma(x11, w4, s0); s1 = v_fma(x21, w4, s1); s2 = v_fma(x31, w4, s2); s0 = v_fma(x12, w5, s0); s1 = v_fma(x22, w5, s1); s2 = v_fma(x32, w5, s2); s0 = v_fma(x20, w6, s0); s1 = v_fma(x30, w6, s1); s2 = v_fma(x40, w6, s2); s0 = v_fma(x21, w7, s0); s1 = v_fma(x31, w7, s1); s2 = v_fma(x41, w7, s2); s0 = v_fma(x22, w8, s0); s1 = v_fma(x32, w8, s1); s2 = v_fma(x42, w8, s2); if (ifMinMaxAct) { s0 = v_min(v_max(s0, vminval), vmaxval); s1 = v_min(v_max(s1, vminval), vmaxval); s2 = v_min(v_max(s2, vminval), vmaxval); } v_store(outptr + x0, s0); v_store(outptr + W0 + x0, s1); v_store(outptr + W0 * 2 + x0, s2); } } else { for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES) { int xi_ = x0 * stride_x - pad_left; const float *inptr_xi = inptr + Wi * yi_ + xi_; v_float32x4 s0 = v_fma(v_load(inptr_xi + ofstab[0]), w0, vbias); v_float32x4 s1 = v_load(inptr_xi + ofstab[1]) * w1; v_float32x4 s2 = v_load(inptr_xi + ofstab[2]) * w2; s0 = v_fma(v_load(inptr_xi + ofstab[3]), w3, s0); s1 = v_fma(v_load(inptr_xi + ofstab[4]), w4, s1); s2 = v_fma(v_load(inptr_xi + ofstab[5]), w5, s2); s0 = v_fma(v_load(inptr_xi + ofstab[6]), w6, s0); s1 = v_fma(v_load(inptr_xi + ofstab[7]), w7, s1); s2 = v_fma(v_load(inptr_xi + ofstab[8]), w8, s2); s0 = s0 + s1 + s2; if (ifMinMaxAct) s0 = v_min(v_max(s0, vminval), vmaxval); v_store(outptr + x0, s0); } } } else { for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES) { int xi_ = x0 * stride_x - pad_left, k = 0; const float *inptr_xi = inptr + Wi * yi_ + xi_; v_float32x4 s0 = vbias; for (; k <= ksize - 4; k += 4) { v_float32x4 v0 = v_load(inptr_xi + ofstab[k]); v_float32x4 v1 = v_load(inptr_xi + ofstab[k + 1]); v_float32x4 v2 = v_load(inptr_xi + ofstab[k + 2]); v_float32x4 v3 = v_load(inptr_xi + ofstab[k + 3]); v_float32x4 ww0 = v_setall_f32(weights[k]); v_float32x4 ww1 = v_setall_f32(weights[k+1]); v_float32x4 ww2 = v_setall_f32(weights[k+2]); v_float32x4 ww3 = v_setall_f32(weights[k+3]); s0 = v_fma(v0, ww0, s0); s0 = v_fma(v1, ww1, s0); s0 = v_fma(v2, ww2, s0); s0 = v_fma(v3, ww3, s0); } for (; k < ksize; k++) s0 = v_fma(v_load(inptr_xi + ofstab[k]), v_setall_f32(weights[k]), s0); if (ifMinMaxAct) s0 = v_min(v_max(s0, vminval), vmaxval); v_store(outptr + x0, s0); } } } #endif if (dy0 == 3) { for (; x0 < x1; x0++) { int xi_ = x0 * stride_x - pad_left; const float *inptr_xi = inptr + W0 * yi_ + xi_; s_0 = s_1 = s_2 = biasval; for (int k = 0; k < ksize; k++) { int inp_ofs = ofstab[k]; float w = weights[k]; s_0 += inptr_xi[inp_ofs] * w; s_1 += inptr_xi[inp_ofs + Wi] * w; s_2 += inptr_xi[inp_ofs + Wi * 2] * w; } if (ifMinMaxAct) { s_0 = std::min(std::max(s_0, minval), maxval); s_1 = std::min(std::max(s_1, minval), maxval); s_2 = std::min(std::max(s_2, minval), maxval); } outptr[x0] = s_0; outptr[x0 + W0] = s_1; outptr[x0 + W0 * 2] = s_2; } } else { for (; x0 < x1; x0++) { int xi_ = x0 * stride_x - pad_left; const float *inptr_xi = inptr + Wi * yi_ + xi_; s_0 = biasval; for (int k = 0; k < ksize; k++) { s_0 += inptr_xi[ofstab[k]] * weights[k]; } if (ifMinMaxAct) s_0 = std::min(std::max(s_0, minval), maxval); outptr[x0] = s_0; } } x1 = W0; } } } void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) { Mat input = _input.getMat(); Mat output = _output.getMat(); MatShape inputShape = shape(input); MatShape outputShape = shape(output); CV_Assert(inputShape.size() == 4 && outputShape.size() == 4); int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W] int K = conv->K, Hk = conv->Hk, Wk = conv->Wk; int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups; const size_t inp_planesize = (size_t) Hi * Wi; const size_t out_planesize = (size_t) H0 * W0; CV_Assert(ngroups > 1 && ngroups == K && ngroups == C); int stride_y = conv->stride_y, stride_x = conv->stride_x; int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x; int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom; int pad_left = conv->pad_left, pad_right = conv->pad_right; int VEC_NLANES = 4; #if CV_TRY_AVX2 if (conv->useAVX2) VEC_NLANES = 8; #endif int ksize = Hk * Wk, padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES; const float *inp = input.ptr(); float *out = output.ptr(); std::vector ofstab_(3 * padded_ksize, 0); int *ofstab = ofstab_.data(); int *yxtab = ofstab + padded_ksize; for (int k = 0; k < padded_ksize; k++) { int y = k < ksize ? k / Wk : 0; int x = k < ksize ? k % Wk : 0; int dy = y * dilation_y, dx = x * dilation_x; yxtab[k * 2] = dy; yxtab[k * 2 + 1] = dx; ofstab[k] = dy * Wi + dx; } const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data(); int inner_ytop = (pad_bottom + stride_y - 1) / stride_y, inner_ybottom = 3; int inner_xleft = (pad_left + stride_x - 1) / stride_x, inner_xright = 4; CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0)); inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x; inner_xright += inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi; inner_ybottom = (Hi - (Hk - 1) * dilation_y + pad_top) / stride_y; inner_ybottom += inner_ybottom * stride_y - pad_top + (Hk - 1) * dilation_y < Hi; if (inner_xleft >= inner_xright || inner_ytop >= inner_ybottom) { inner_xleft = W0; inner_ytop = H0; } inner_ybottom = inner_ybottom < H0 ? inner_ybottom : H0; bool useSIMD = stride_x == 1 && inner_xleft < W0; bool is3x3 = Hk == 3 && Wk == 3; parallel_for_(Range(0, N * C), [&](const Range &r0) { for (int nc = r0.start; nc < r0.end; nc++) { int c = nc % C; const float *inptr = inp + inp_planesize * nc; float *outptr0 = out + out_planesize * nc; float biasval = bias[c]; const float *weights = weights0 + c * padded_ksize; #if CV_TRY_AVX2 if (conv->useAVX2) opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize, pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop, inner_ybottom, ifMinMaxAct, useSIMD, is3x3); else #endif depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize, pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop, inner_ybottom, ifMinMaxAct, useSIMD, is3x3); if (activ) activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1); } }); } }} // namespace cv::dnn