mirror of
https://github.com/opencv/opencv.git
synced 2025-06-12 20:42:53 +08:00
391 lines
16 KiB
C++
391 lines
16 KiB
C++
// This file is part of OpenCV project.
|
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
// of this distribution and at http://opencv.org/license.html.
|
|
|
|
// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
|
|
// Here is the original license:
|
|
/*
|
|
This file is a part of ficus language project.
|
|
See ficus/LICENSE for the licensing terms
|
|
*/
|
|
|
|
#include "../../precomp.hpp"
|
|
#include "fast_convolution.hpp"
|
|
|
|
namespace cv { namespace dnn {
|
|
|
|
/**
 * Computes one output plane of a depth-wise convolution (one channel, one kernel).
 *
 * The output plane is split into a "border" region, where every kernel tap is
 * bounds-checked against the input, and an "inner" rectangle
 * [inner_ytop, inner_ybottom) x [inner_xleft, inner_xright), where taps are read
 * without any bounds checks through the precomputed offsets in ofstab.
 *
 * @param inptr        input plane, addressed as inptr[y * Wi + x]
 * @param outptr       output plane, addressed as outptr[y * W0 + x]
 * @param weights      kernel weights, at least ksize values
 * @param biasval      value every accumulator starts from
 * @param ofstab       per-tap offsets added to the address of the top-left tap (inner region only)
 * @param yxtab        per-tap (dy, dx) pairs: yxtab[2*k] is dy, yxtab[2*k + 1] is dx (border region only)
 * @param minval       lower clamp bound, applied only when ifMinMaxAct is true
 * @param maxval       upper clamp bound, applied only when ifMinMaxAct is true
 * @param Hi, Wi       input plane height and width
 * @param H0, W0       output plane height and width
 * @param ksize        number of kernel taps
 * @param pad_top      rows of implicit padding above the input
 * @param pad_left     columns of implicit padding left of the input
 * @param dilation_y   vertical dilation (only used to decide whether the 3-row path is eligible)
 * @param stride_x, stride_y   horizontal and vertical strides
 * @param inner_xleft, inner_xright, inner_ytop, inner_ybottom   bounds of the unchecked inner rectangle
 * @param ifMinMaxAct  whether results are clamped to [minval, maxval]
 * @param useSIMD      enables the vectorized inner loops (only when CV_SIMD128 is available)
 * @param is3x3        selects the specialised 3x3 vector kernels
 */
static void depthWiseBlock(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
                           float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
                           int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                           int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
{
#if CV_SIMD128
    const int VEC_NLANES = 4;
    v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);

    v_float32x4 w0 = v_setall_f32(
            0.f), w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0;
    if (useSIMD)
    {
        vbias = v_setall_f32(biasval);
        if (is3x3)
        {
            // Broadcast the nine weights once; they are reused for every output pixel.
            w0 = v_setall_f32(weights[0]);
            w1 = v_setall_f32(weights[1]);
            w2 = v_setall_f32(weights[2]);
            w3 = v_setall_f32(weights[3]);
            w4 = v_setall_f32(weights[4]);
            w5 = v_setall_f32(weights[5]);
            w6 = v_setall_f32(weights[6]);
            w7 = v_setall_f32(weights[7]);
            w8 = v_setall_f32(weights[8]);
        }
    }
#endif
    int dy0 = 1; // number of output rows produced per iteration of the row loop (1 or 3)
    for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0)
    {
#if CV_SIMD128
        // Three output rows share most of their input rows for a dense 3x3 kernel,
        // so they are produced together when all three lie inside the inner rectangle.
        dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1
              ? 3 : 1;
#endif
        // Rows outside the inner band are handled entirely by the bounds-checked loop.
        int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0;
        int yi_ = y0 * stride_y - pad_top;

        // Each row is processed as: left border -> inner part -> right border.
        for (;;)
        {
            float s_0, s_1, s_2;
            if (dy0 == 3)
            {
                // Border columns, three rows at once. Only x needs checking:
                // dy0 == 3 is chosen only for rows inside the inner band.
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    s_0 = s_1 = s_2 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int dy = yxtab[k * 2];
                        int yi = yi_ + dy;
                        int xi = xi_ + yxtab[k * 2 + 1];
                        float w = weights[k];

                        // Unsigned compare folds "xi >= 0 && xi < Wi" into one test.
                        if ((unsigned) xi < (unsigned) Wi)
                        {
                            s_0 += inptr[yi * Wi + xi] * w;
                            s_1 += inptr[(yi + 1) * Wi + xi] * w;
                            s_2 += inptr[(yi + 2) * Wi + xi] * w;
                        }
                    }
                    // Clamp only on request, exactly like the inner-region paths below.
                    if (ifMinMaxAct)
                    {
                        s_0 = std::min(std::max(s_0, minval), maxval);
                        s_1 = std::min(std::max(s_1, minval), maxval);
                        s_2 = std::min(std::max(s_2, minval), maxval);
                    }

                    outptr[x0] = s_0;
                    outptr[x0 + W0] = s_1;
                    outptr[x0 + W0 * 2] = s_2;
                }
            }
            else
            {
                // Border columns, single row: both coordinates are bounds-checked.
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    s_0 = biasval;
                    for (int k = 0; k < ksize; k++) {
                        int dy = yxtab[k * 2];
                        int yi = yi_ + dy;
                        int xi = xi_ + yxtab[k * 2 + 1];
                        float w = weights[k];
                        if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi))
                            s_0 += inptr[yi * Wi + xi] * w;
                    }
                    // Clamp only on request, exactly like the inner-region paths below.
                    if (ifMinMaxAct)
                        s_0 = std::min(std::max(s_0, minval), maxval);
                    outptr[x0] = s_0;
                }
            }
            if (x0 == W0)
                break;
            x1 = inner_xright;
#if CV_SIMD128
            if (useSIMD)
            {
                if (is3x3)
                {
                    if (dy0 == 3)
                    {
                        // 3x3 kernel, 3 output rows x VEC_NLANES columns per iteration:
                        // five input rows are loaded and shared between the three outputs.
                        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;

                            v_float32x4 s0, s1, s2;
                            v_float32x4 x00 = v_load(inptr_xi);
                            v_float32x4 x01 = v_load(inptr_xi + 1);
                            v_float32x4 x02 = v_load(inptr_xi + 2);

                            v_float32x4 x10 = v_load(inptr_xi + Wi);
                            v_float32x4 x11 = v_load(inptr_xi + Wi + 1);
                            v_float32x4 x12 = v_load(inptr_xi + Wi + 2);

                            v_float32x4 x20 = v_load(inptr_xi + Wi * 2);
                            v_float32x4 x21 = v_load(inptr_xi + Wi * 2 + 1);
                            v_float32x4 x22 = v_load(inptr_xi + Wi * 2 + 2);

                            v_float32x4 x30 = v_load(inptr_xi + Wi * 3);
                            v_float32x4 x31 = v_load(inptr_xi + Wi * 3 + 1);
                            v_float32x4 x32 = v_load(inptr_xi + Wi * 3 + 2);

                            v_float32x4 x40 = v_load(inptr_xi + Wi * 4);
                            v_float32x4 x41 = v_load(inptr_xi + Wi * 4 + 1);
                            v_float32x4 x42 = v_load(inptr_xi + Wi * 4 + 2);

                            s0 = v_fma(x00, w0, vbias);
                            s1 = v_fma(x10, w0, vbias);
                            s2 = v_fma(x20, w0, vbias);

                            s0 = v_fma(x01, w1, s0);
                            s1 = v_fma(x11, w1, s1);
                            s2 = v_fma(x21, w1, s2);

                            s0 = v_fma(x02, w2, s0);
                            s1 = v_fma(x12, w2, s1);
                            s2 = v_fma(x22, w2, s2);

                            s0 = v_fma(x10, w3, s0);
                            s1 = v_fma(x20, w3, s1);
                            s2 = v_fma(x30, w3, s2);

                            s0 = v_fma(x11, w4, s0);
                            s1 = v_fma(x21, w4, s1);
                            s2 = v_fma(x31, w4, s2);

                            s0 = v_fma(x12, w5, s0);
                            s1 = v_fma(x22, w5, s1);
                            s2 = v_fma(x32, w5, s2);

                            s0 = v_fma(x20, w6, s0);
                            s1 = v_fma(x30, w6, s1);
                            s2 = v_fma(x40, w6, s2);

                            s0 = v_fma(x21, w7, s0);
                            s1 = v_fma(x31, w7, s1);
                            s2 = v_fma(x41, w7, s2);

                            s0 = v_fma(x22, w8, s0);
                            s1 = v_fma(x32, w8, s1);
                            s2 = v_fma(x42, w8, s2);

                            if (ifMinMaxAct)
                            {
                                s0 = v_min(v_max(s0, vminval), vmaxval);
                                s1 = v_min(v_max(s1, vminval), vmaxval);
                                s2 = v_min(v_max(s2, vminval), vmaxval);
                            }

                            v_store(outptr + x0, s0);
                            v_store(outptr + W0 + x0, s1);
                            v_store(outptr + W0 * 2 + x0, s2);
                        }
                    }
                    else
                    {
                        // 3x3 kernel, one output row: three independent accumulators, summed at the end.
                        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
                            v_float32x4 s0 = v_fma(v_load(inptr_xi + ofstab[0]), w0, vbias);
                            v_float32x4 s1 = v_load(inptr_xi + ofstab[1]) * w1;
                            v_float32x4 s2 = v_load(inptr_xi + ofstab[2]) * w2;

                            s0 = v_fma(v_load(inptr_xi + ofstab[3]), w3, s0);
                            s1 = v_fma(v_load(inptr_xi + ofstab[4]), w4, s1);
                            s2 = v_fma(v_load(inptr_xi + ofstab[5]), w5, s2);

                            s0 = v_fma(v_load(inptr_xi + ofstab[6]), w6, s0);
                            s1 = v_fma(v_load(inptr_xi + ofstab[7]), w7, s1);
                            s2 = v_fma(v_load(inptr_xi + ofstab[8]), w8, s2);

                            s0 = s0 + s1 + s2;
                            if (ifMinMaxAct)
                                s0 = v_min(v_max(s0, vminval), vmaxval);
                            v_store(outptr + x0, s0);
                        }
                    }
                }
                else
                {
                    // Generic kernel size: taps are consumed four at a time, then one by one.
                    for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
                    {
                        int xi_ = x0 * stride_x - pad_left, k = 0;
                        const float *inptr_xi = inptr + Wi * yi_ + xi_;
                        v_float32x4 s0 = vbias;
                        for (; k <= ksize - 4; k += 4)
                        {
                            v_float32x4 v0 = v_load(inptr_xi + ofstab[k]);
                            v_float32x4 v1 = v_load(inptr_xi + ofstab[k + 1]);
                            v_float32x4 v2 = v_load(inptr_xi + ofstab[k + 2]);
                            v_float32x4 v3 = v_load(inptr_xi + ofstab[k + 3]);

                            v_float32x4 ww0 = v_setall_f32(weights[k]);
                            v_float32x4 ww1 = v_setall_f32(weights[k+1]);
                            v_float32x4 ww2 = v_setall_f32(weights[k+2]);
                            v_float32x4 ww3 = v_setall_f32(weights[k+3]);

                            s0 = v_fma(v0, ww0, s0);
                            s0 = v_fma(v1, ww1, s0);
                            s0 = v_fma(v2, ww2, s0);
                            s0 = v_fma(v3, ww3, s0);
                        }
                        for (; k < ksize; k++)
                            s0 = v_fma(v_load(inptr_xi + ofstab[k]),
                                       v_setall_f32(weights[k]), s0);
                        if (ifMinMaxAct)
                            s0 = v_min(v_max(s0, vminval), vmaxval);
                        v_store(outptr + x0, s0);
                    }
                }
            }
#endif
            // Scalar inner loops: the whole inner part without SIMD, or the vector loop's tail.
            if (dy0 == 3)
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    // The input row stride is Wi (not W0): this pointer walks the input plane.
                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
                    s_0 = s_1 = s_2 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int inp_ofs = ofstab[k];
                        float w = weights[k];
                        s_0 += inptr_xi[inp_ofs] * w;
                        s_1 += inptr_xi[inp_ofs + Wi] * w;
                        s_2 += inptr_xi[inp_ofs + Wi * 2] * w;
                    }
                    if (ifMinMaxAct)
                    {
                        s_0 = std::min(std::max(s_0, minval), maxval);
                        s_1 = std::min(std::max(s_1, minval), maxval);
                        s_2 = std::min(std::max(s_2, minval), maxval);
                    }

                    outptr[x0] = s_0;
                    outptr[x0 + W0] = s_1;
                    outptr[x0 + W0 * 2] = s_2;
                }
            }
            else
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
                    s_0 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        s_0 += inptr_xi[ofstab[k]] * weights[k];
                    }

                    if (ifMinMaxAct)
                        s_0 = std::min(std::max(s_0, minval), maxval);
                    outptr[x0] = s_0;
                }
            }
            // Next pass of the for(;;) handles the right border with the checked loop.
            x1 = W0;
        }
    }
}
|
|
|
|
/**
 * Runs a depth-wise convolution over a 4-dimensional input, one (sample, channel) plane at a time.
 *
 * The function asserts that both shapes are 4-dimensional and that
 * conv->ngroups == conv->K == number of input channels (and is > 1).
 * Planes are distributed over threads with parallel_for_; each plane is convolved by
 * depthWiseBlock (or its AVX2 counterpart when conv->useAVX2 is set) and, if activ is
 * non-null, post-processed in place with activ->forwardSlice.
 *
 * @param _input       input tensor, read as contiguous float planes of Hi x Wi
 * @param _output      pre-allocated output tensor, written as contiguous float planes of H0 x W0
 * @param conv         convolution parameters, packed weights and bias
 * @param minval       lower clamp bound, used when ifMinMaxAct is true
 * @param maxval       upper clamp bound, used when ifMinMaxAct is true
 * @param activ        optional activation applied to every output plane; may be null
 * @param ifMinMaxAct  whether the convolution result is clamped to [minval, maxval]
 */
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) {
    Mat input = _input.getMat();
    Mat output = _output.getMat();
    MatShape inputShape = shape(input);
    MatShape outputShape = shape(output);
    CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);

    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W]
    int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
    int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups;

    const size_t inp_planesize = (size_t) Hi * Wi;
    const size_t out_planesize = (size_t) H0 * W0;

    CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);

    int stride_y = conv->stride_y, stride_x = conv->stride_x;
    int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x;

    int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
    int pad_left = conv->pad_left, pad_right = conv->pad_right;

    int VEC_NLANES = 4;
#if CV_TRY_AVX2
    if (conv->useAVX2)
        VEC_NLANES = 8;
#endif
    // Per-channel weights are addressed with a stride of ksize rounded up to the vector width.
    int ksize = Hk * Wk, padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES;

    const float *inp = input.ptr<float>();
    float *out = output.ptr<float>();

    // One allocation holds both tables: padded_ksize flat offsets followed by
    // padded_ksize (dy, dx) pairs. Padding entries (k >= ksize) stay at offset 0.
    std::vector<int> ofstab_(3 * padded_ksize, 0);
    int *ofstab = ofstab_.data();
    int *yxtab = ofstab + padded_ksize;

    for (int k = 0; k < padded_ksize; k++)
    {
        int y = k < ksize ? k / Wk : 0;
        int x = k < ksize ? k % Wk : 0;
        int dy = y * dilation_y, dx = x * dilation_x;
        yxtab[k * 2] = dy;
        yxtab[k * 2 + 1] = dx;
        ofstab[k] = dy * Wi + dx;
    }

    const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();

    CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));

    // First output row/column whose top-left tap lies inside the input.
    // The row bound depends on pad_top because the first input row touched by output
    // row y0 is y0 * stride_y - pad_top; using pad_bottom here would let the unchecked
    // inner path read above the plane when pad_top > pad_bottom.
    int inner_ytop = (pad_top + stride_y - 1) / stride_y;
    int inner_xleft = (pad_left + stride_x - 1) / stride_x;

    // One past the last output row/column whose bottom-right tap lies inside the input.
    int inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x;
    inner_xright += inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi;
    int inner_ybottom = (Hi - (Hk - 1) * dilation_y + pad_top) / stride_y;
    inner_ybottom += inner_ybottom * stride_y - pad_top + (Hk - 1) * dilation_y < Hi;

    // Empty inner rectangle: route the whole plane through the bounds-checked path.
    if (inner_xleft >= inner_xright || inner_ytop >= inner_ybottom)
    {
        inner_xleft = W0;
        inner_ytop = H0;
    }

    inner_ybottom = inner_ybottom < H0 ? inner_ybottom : H0;

    bool useSIMD = stride_x == 1 && inner_xleft < W0;
    bool is3x3 = Hk == 3 && Wk == 3;

    parallel_for_(Range(0, N * C), [&](const Range &r0) {
        for (int nc = r0.start; nc < r0.end; nc++)
        {
            int c = nc % C;
            const float *inptr = inp + inp_planesize * nc;
            float *outptr0 = out + out_planesize * nc;

            float biasval = bias[c];
            const float *weights = weights0 + c * padded_ksize;

#if CV_TRY_AVX2
            if (conv->useAVX2)
                opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
                                              pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
                                              inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
            else
#endif
            depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
                           pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
                           inner_ybottom, ifMinMaxAct, useSIMD, is3x3);

            // Optional fused activation, applied in place to the plane just produced.
            if (activ)
                activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
        }
    });
}
|
|
|
|
}} // namespace cv::dnn
|