opencv/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp

391 lines
16 KiB
C++

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
// Here is the original license:
/*
This file is a part of ficus language project.
See ficus/LICENSE for the licensing terms
*/
#include "../../precomp.hpp"
#include "fast_convolution.hpp"
namespace cv { namespace dnn {
// Computes one H0 x W0 output plane of a depthwise convolution from one Hi x Wi input plane.
//
// Parameters:
//   inptr, outptr   - input plane (row stride Wi) and output plane (row stride W0).
//   weights         - ksize filter taps for this plane.
//   biasval         - value every accumulator starts from.
//   ofstab          - ofstab[k] is the flat input offset of tap k relative to the
//                     top-left sample of the current window (used where no bounds
//                     checks are needed).
//   yxtab           - yxtab[2*k] / yxtab[2*k+1] are the row / column displacement of tap k
//                     (used where bounds checks are needed).
//   minval, maxval  - clamp bounds; applied only when ifMinMaxAct is true.
//   pad_top, pad_left, stride_x, stride_y - map an output coordinate to the input
//                     coordinate of the window's top-left sample.
//   dilation_y      - only consulted to decide whether three rows can be produced per pass.
//   inner_xleft/inner_xright, inner_ytop/inner_ybottom - half-open output ranges inside
//                     which the input is read without bounds checks.
//   useSIMD         - enables the vector loops (compiled only when CV_SIMD128 is set).
//   is3x3           - selects the 9-tap vector loops and permits three output rows per pass.
static void depthWiseBlock(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
                           float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
                           int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                           int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
{
#if CV_SIMD128
    const int VEC_NLANES = 4;
    v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);
    v_float32x4 w0 = v_setall_f32(
            0.f), w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0;
    if (useSIMD)
    {
        vbias = v_setall_f32(biasval);
        if (is3x3)
        {
            // Broadcast the nine taps once; they are reused for every output pixel.
            w0 = v_setall_f32(weights[0]);
            w1 = v_setall_f32(weights[1]);
            w2 = v_setall_f32(weights[2]);
            w3 = v_setall_f32(weights[3]);
            w4 = v_setall_f32(weights[4]);
            w5 = v_setall_f32(weights[5]);
            w6 = v_setall_f32(weights[6]);
            w7 = v_setall_f32(weights[7]);
            w8 = v_setall_f32(weights[8]);
        }
    }
#endif
    int dy0 = 1;
    for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0)
    {
#if CV_SIMD128
        // Produce three output rows per pass when all of them lie in the interior band
        // and the kernel is an undilated (vertically), unit-stride 3x3.
        dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1
              ? 3 : 1;
#endif
        // Rows outside the interior band are handled entirely by the bounds-checked loop.
        int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0;
        int yi_ = y0 * stride_y - pad_top;
        // Each row is processed as: left border -> interior -> right border.
        for (;;)
        {
            float s_0, s_1, s_2;
            // ---- border columns: every tap is bounds-checked ----
            if (dy0 == 3)
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    s_0 = s_1 = s_2 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int dy = yxtab[k * 2];
                        int yi = yi_ + dy;
                        int xi = xi_ + yxtab[k * 2 + 1];
                        float w = weights[k];
                        // Only the column needs checking: dy0 == 3 implies the rows are interior.
                        if ((unsigned) xi < (unsigned) Wi)
                        {
                            s_0 += inptr[yi * Wi + xi] * w;
                            s_1 += inptr[(yi + 1) * Wi + xi] * w;
                            s_2 += inptr[(yi + 2) * Wi + xi] * w;
                        }
                    }
                    // Clamp only when requested, matching the interior paths below.
                    if (ifMinMaxAct)
                    {
                        s_0 = std::min(std::max(s_0, minval), maxval);
                        s_1 = std::min(std::max(s_1, minval), maxval);
                        s_2 = std::min(std::max(s_2, minval), maxval);
                    }
                    outptr[x0] = s_0;
                    outptr[x0 + W0] = s_1;
                    outptr[x0 + W0 * 2] = s_2;
                }
            }
            else
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    s_0 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int dy = yxtab[k * 2];
                        int yi = yi_ + dy;
                        int xi = xi_ + yxtab[k * 2 + 1];
                        float w = weights[k];
                        // Unsigned compare folds the "< 0" and ">= size" tests into one.
                        if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi))
                            s_0 += inptr[yi * Wi + xi] * w;
                    }
                    // Clamp only when requested, matching the interior paths below.
                    if (ifMinMaxAct)
                        s_0 = std::min(std::max(s_0, minval), maxval);
                    outptr[x0] = s_0;
                }
            }
            if (x0 == W0)
                break;
            // ---- interior columns: no bounds checks ----
            x1 = inner_xright;
#if CV_SIMD128
            if (useSIMD)
            {
                if (is3x3)
                {
                    if (dy0 == 3)
                    {
                        // 3x3, three output rows x four output columns per iteration.
                        // NOTE(review): the loads hard-code column offsets 0/1/2 and row offsets
                        // of Wi; this presumably relies on the horizontal dilation being 1,
                        // which is not checked in this function - confirm against the caller.
                        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
                            v_float32x4 s0, s1, s2;
                            v_float32x4 x00 = v_load(inptr_xi);
                            v_float32x4 x01 = v_load(inptr_xi + 1);
                            v_float32x4 x02 = v_load(inptr_xi + 2);
                            v_float32x4 x10 = v_load(inptr_xi + Wi);
                            v_float32x4 x11 = v_load(inptr_xi + Wi + 1);
                            v_float32x4 x12 = v_load(inptr_xi + Wi + 2);
                            v_float32x4 x20 = v_load(inptr_xi + Wi * 2);
                            v_float32x4 x21 = v_load(inptr_xi + Wi * 2 + 1);
                            v_float32x4 x22 = v_load(inptr_xi + Wi * 2 + 2);
                            v_float32x4 x30 = v_load(inptr_xi + Wi * 3);
                            v_float32x4 x31 = v_load(inptr_xi + Wi * 3 + 1);
                            v_float32x4 x32 = v_load(inptr_xi + Wi * 3 + 2);
                            v_float32x4 x40 = v_load(inptr_xi + Wi * 4);
                            v_float32x4 x41 = v_load(inptr_xi + Wi * 4 + 1);
                            v_float32x4 x42 = v_load(inptr_xi + Wi * 4 + 2);
                            s0 = v_fma(x00, w0, vbias);
                            s1 = v_fma(x10, w0, vbias);
                            s2 = v_fma(x20, w0, vbias);
                            s0 = v_fma(x01, w1, s0);
                            s1 = v_fma(x11, w1, s1);
                            s2 = v_fma(x21, w1, s2);
                            s0 = v_fma(x02, w2, s0);
                            s1 = v_fma(x12, w2, s1);
                            s2 = v_fma(x22, w2, s2);
                            s0 = v_fma(x10, w3, s0);
                            s1 = v_fma(x20, w3, s1);
                            s2 = v_fma(x30, w3, s2);
                            s0 = v_fma(x11, w4, s0);
                            s1 = v_fma(x21, w4, s1);
                            s2 = v_fma(x31, w4, s2);
                            s0 = v_fma(x12, w5, s0);
                            s1 = v_fma(x22, w5, s1);
                            s2 = v_fma(x32, w5, s2);
                            s0 = v_fma(x20, w6, s0);
                            s1 = v_fma(x30, w6, s1);
                            s2 = v_fma(x40, w6, s2);
                            s0 = v_fma(x21, w7, s0);
                            s1 = v_fma(x31, w7, s1);
                            s2 = v_fma(x41, w7, s2);
                            s0 = v_fma(x22, w8, s0);
                            s1 = v_fma(x32, w8, s1);
                            s2 = v_fma(x42, w8, s2);
                            if (ifMinMaxAct)
                            {
                                s0 = v_min(v_max(s0, vminval), vmaxval);
                                s1 = v_min(v_max(s1, vminval), vmaxval);
                                s2 = v_min(v_max(s2, vminval), vmaxval);
                            }
                            v_store(outptr + x0, s0);
                            v_store(outptr + W0 + x0, s1);
                            v_store(outptr + W0 * 2 + x0, s2);
                        }
                    }
                    else
                    {
                        // 3x3, one output row; three independent accumulators summed at the end.
                        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
                            v_float32x4 s0 = v_fma(v_load(inptr_xi + ofstab[0]), w0, vbias);
                            v_float32x4 s1 = v_load(inptr_xi + ofstab[1]) * w1;
                            v_float32x4 s2 = v_load(inptr_xi + ofstab[2]) * w2;
                            s0 = v_fma(v_load(inptr_xi + ofstab[3]), w3, s0);
                            s1 = v_fma(v_load(inptr_xi + ofstab[4]), w4, s1);
                            s2 = v_fma(v_load(inptr_xi + ofstab[5]), w5, s2);
                            s0 = v_fma(v_load(inptr_xi + ofstab[6]), w6, s0);
                            s1 = v_fma(v_load(inptr_xi + ofstab[7]), w7, s1);
                            s2 = v_fma(v_load(inptr_xi + ofstab[8]), w8, s2);
                            s0 = s0 + s1 + s2;
                            if (ifMinMaxAct)
                                s0 = v_min(v_max(s0, vminval), vmaxval);
                            v_store(outptr + x0, s0);
                        }
                    }
                }
                else
                {
                    // Generic kernel size: taps consumed four at a time, then one by one.
                    for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                    {
                        int xi_ = x0 * stride_x - pad_left, k = 0;
                        const float *inptr_xi = inptr + Wi * yi_ + xi_;
                        v_float32x4 s0 = vbias;
                        for (; k <= ksize - 4; k += 4)
                        {
                            v_float32x4 v0 = v_load(inptr_xi + ofstab[k]);
                            v_float32x4 v1 = v_load(inptr_xi + ofstab[k + 1]);
                            v_float32x4 v2 = v_load(inptr_xi + ofstab[k + 2]);
                            v_float32x4 v3 = v_load(inptr_xi + ofstab[k + 3]);
                            v_float32x4 ww0 = v_setall_f32(weights[k]);
                            v_float32x4 ww1 = v_setall_f32(weights[k+1]);
                            v_float32x4 ww2 = v_setall_f32(weights[k+2]);
                            v_float32x4 ww3 = v_setall_f32(weights[k+3]);
                            s0 = v_fma(v0, ww0, s0);
                            s0 = v_fma(v1, ww1, s0);
                            s0 = v_fma(v2, ww2, s0);
                            s0 = v_fma(v3, ww3, s0);
                        }
                        for (; k < ksize; k++)
                            s0 = v_fma(v_load(inptr_xi + ofstab[k]),
                                       v_setall_f32(weights[k]), s0);
                        if (ifMinMaxAct)
                            s0 = v_min(v_max(s0, vminval), vmaxval);
                        v_store(outptr + x0, s0);
                    }
                }
            }
#endif
            // Scalar interior: everything when SIMD is off, otherwise the leftover columns.
            if (dy0 == 3)
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    // Input rows are Wi wide (not W0), same as in the vector path above.
                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
                    s_0 = s_1 = s_2 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int inp_ofs = ofstab[k];
                        float w = weights[k];
                        s_0 += inptr_xi[inp_ofs] * w;
                        s_1 += inptr_xi[inp_ofs + Wi] * w;
                        s_2 += inptr_xi[inp_ofs + Wi * 2] * w;
                    }
                    if (ifMinMaxAct)
                    {
                        s_0 = std::min(std::max(s_0, minval), maxval);
                        s_1 = std::min(std::max(s_1, minval), maxval);
                        s_2 = std::min(std::max(s_2, minval), maxval);
                    }
                    outptr[x0] = s_0;
                    outptr[x0 + W0] = s_1;
                    outptr[x0 + W0 * 2] = s_2;
                }
            }
            else
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
                    s_0 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        s_0 += inptr_xi[ofstab[k]] * weights[k];
                    }
                    if (ifMinMaxAct)
                        s_0 = std::min(std::max(s_0, minval), maxval);
                    outptr[x0] = s_0;
                }
            }
            // Next pass of the for(;;) handles the right border up to W0.
            x1 = W0;
        }
    }
}
// Runs a depthwise convolution over a 4-D float tensor laid out as [N, C, H, W].
//
//   _input, _output - source and destination tensors; both must be 4-dimensional.
//   conv            - kernel geometry (Hk, Wk, strides, dilations, pads), group count,
//                     per-channel weights (weightsBufPtr) and biases (biasBuf).
//   minval, maxval  - clamp bounds forwarded to the per-plane kernel.
//   activ           - optional activation; when non-null it is applied to each output
//                     plane after the convolution.
//   ifMinMaxAct     - forwarded to the per-plane kernel to enable the clamp.
//
// Requires ngroups > 1 and ngroups == K == C. Planes are processed in parallel over
// the N * C range.
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) {
    Mat input = _input.getMat();
    Mat output = _output.getMat();
    MatShape inputShape = shape(input);
    MatShape outputShape = shape(output);
    CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);
    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W]
    int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
    int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups;
    const size_t inp_planesize = (size_t) Hi * Wi;
    const size_t out_planesize = (size_t) H0 * W0;
    CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);
    int stride_y = conv->stride_y, stride_x = conv->stride_x;
    int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x;
    int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
    int pad_left = conv->pad_left, pad_right = conv->pad_right;
    int VEC_NLANES = 4;
#if CV_TRY_AVX2
    if (conv->useAVX2)
        VEC_NLANES = 8;
#endif
    // Per-channel weights are stored with the tap count rounded up to a multiple of the lane count.
    int ksize = Hk * Wk, padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES;
    const float *inp = input.ptr<float>();
    float *out = output.ptr<float>();
    // One allocation holds both tables: ofstab (padded_ksize ints) followed by yxtab (2 * padded_ksize ints).
    std::vector<int> ofstab_(3 * padded_ksize, 0);
    int *ofstab = ofstab_.data();
    int *yxtab = ofstab + padded_ksize;
    for (int k = 0; k < padded_ksize; k++)
    {
        // Padding entries (k >= ksize) point at tap (0, 0).
        int y = k < ksize ? k / Wk : 0;
        int x = k < ksize ? k % Wk : 0;
        int dy = y * dilation_y, dx = x * dilation_x;
        yxtab[k * 2] = dy;
        yxtab[k * 2 + 1] = dx;
        ofstab[k] = dy * Wi + dx;
    }
    const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();
    CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));
    // Interior = output pixels whose whole window lies inside the input, so the kernel can
    // skip bounds checks there. The first such row/column is the first one whose window
    // origin (y0 * stride_y - pad_top, x0 * stride_x - pad_left) is non-negative, hence the
    // top bound is derived from pad_top (not pad_bottom).
    int inner_ytop = (pad_top + stride_y - 1) / stride_y;
    int inner_xleft = (pad_left + stride_x - 1) / stride_x;
    // Exclusive right/bottom bounds: one past the last output pixel whose window still fits.
    int inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x;
    if (inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi)
        inner_xright++;
    int inner_ybottom = (Hi - (Hk - 1) * dilation_y + pad_top) / stride_y;
    if (inner_ybottom * stride_y - pad_top + (Hk - 1) * dilation_y < Hi)
        inner_ybottom++;
    if (inner_xleft >= inner_xright || inner_ytop >= inner_ybottom)
    {
        // Empty interior: route every pixel through the bounds-checked path.
        inner_xleft = W0;
        inner_ytop = H0;
    }
    inner_ybottom = inner_ybottom < H0 ? inner_ybottom : H0;
    bool useSIMD = stride_x == 1 && inner_xleft < W0;
    bool is3x3 = Hk == 3 && Wk == 3;
    parallel_for_(Range(0, N * C), [&](const Range &r0) {
        for (int nc = r0.start; nc < r0.end; nc++)
        {
            int c = nc % C;
            const float *inptr = inp + inp_planesize * nc;
            float *outptr0 = out + out_planesize * nc;
            float biasval = bias[c];
            const float *weights = weights0 + c * padded_ksize;
#if CV_TRY_AVX2
            if (conv->useAVX2)
                opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
                                              pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
                                              inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
            else
#endif
            depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
                           pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
                           inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
            if (activ)
                activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
        }
    });
}
}} // namespace cv::dnn