mirror of https://github.com/opencv/opencv.git
synced 2025-06-09 18:43:05 +08:00
Fixed a bug in the Winograd SIMD128 path and made the code more robust.
parent 5d292826b2
commit cee8c86b6e
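In outline, the diff below replaces the compile-time FAST_VEC_NLANES constant with a VEC_NLANES value chosen where it is used (4 lanes for SIMD128/NEON, 8 for AVX2, checked at runtime), corrects the output offsets in the SIMD128 Winograd accumulator, and changes runWinograd63 from void to int so callers can fall back to the generic convolution. A minimal standalone sketch of the round-up used for the padded kernel size (paddedKsize is a hypothetical helper, not library code):

#include <cstdio>

// Round ksize up to a multiple of the vector width, as the diff does:
// padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES
static int paddedKsize(int ksize, int vecNlanes)
{
    return ((ksize + vecNlanes - 1) / vecNlanes) * vecNlanes;
}

int main()
{
    // A 3x3 kernel (ksize = 9) padded for 4-lane SIMD128 vs. 8-lane AVX2.
    printf("%d\n", paddedKsize(9, 4)); // 12
    printf("%d\n", paddedKsize(9, 8)); // 16
    return 0;
}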
@@ -20,6 +20,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
                            int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
 {
 #if CV_SIMD128
+    const int VEC_NLANES = 4;
     v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);

     v_float32x4 w0 = v_setall_f32(
@@ -110,7 +111,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
     {
         if (dy0 == 3)
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -186,7 +187,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
         }
         else
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -211,7 +212,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weights,
     }
     else
     {
-        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
         {
             int xi_ = x0 * stride_x - pad_left, k = 0;
             const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -314,7 +315,12 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
     int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
     int pad_left = conv->pad_left, pad_right = conv->pad_right;

-    int ksize = Hk * Wk, padded_ksize = ((ksize + FAST_VEC_NLANES - 1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    int VEC_NLANES = 4;
+#if CV_TRY_AVX2
+    if (conv->useAVX2)
+        VEC_NLANES = 8;
+#endif
+    int ksize = Hk * Wk, padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES;

     const float *inp = input.ptr<float>();
     float *out = output.ptr<float>();

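A note on the hunk above (a reading of the diff, not part of it): CV_TRY_AVX2 only says AVX2 kernels were compiled in; conv->useAVX2 is the runtime CPU check. Tying VEC_NLANES to both avoids padding buffers for 8 lanes on a machine that will execute the 4-lane path. A standalone sketch (vecNlanes is a hypothetical helper):

#include <cstdio>

// 8 lanes only if AVX2 is both compiled in and present at runtime.
static int vecNlanes(bool useAVX2)
{
#if CV_TRY_AVX2
    if (useAVX2)
        return 8;
#endif
    return 4; // SIMD128 / NEON width
}

int main()
{
    printf("%d lanes\n", vecNlanes(false)); // 4 unless runtime AVX2 is available
    return 0;
}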
@@ -78,6 +78,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
                          int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                          int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
 {
+    const int VEC_NLANES = 8;
     __m256 vminval = _mm256_set1_ps(minval);
     __m256 vmaxval = _mm256_set1_ps(maxval);

@@ -174,7 +175,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
     {
         if (dy0 == 3)
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -250,7 +251,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
         }
         else
         {
-            for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+            for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
             {
                 int xi_ = x0 * stride_x - pad_left;
                 const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -276,7 +277,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights,
     }
     else
     {
-        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
         {
             int xi_ = x0 * stride_x - pad_left, k = 0;
             const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -701,7 +702,6 @@ void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
         z50 = _mm256_add_ps(vbias, z50);
     }
-
     // TODO: make sure the length of bpptr is 8.
     if (bpptr)
    {
         z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));

@@ -49,6 +49,15 @@ Ptr<FastConv2d> initFastConv2d(
         useWinograd && ((conv->useSIMD128 || conv->useAVX2 || conv->useNEON) && Hk == 3 && Wk == 3 &&
         dilation_y == 1 && dilation_x == 1 && stride_y == 1 && stride_x == 1) ? _FX_CONV_TYPE_WINOGRAD3X3 :
         _FX_CONV_TYPE_GENERIC;

+    int VEC_NLANES = 4;
+#if CV_TRY_AVX2
+    if (!conv->useAVX2 && conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // convert Winograd to generic conv.
+        conv->conv_type = _FX_CONV_TYPE_GENERIC;
+    if (conv->useAVX2)
+        VEC_NLANES = 8;
+#endif
+
     Mat weightsMat = _weightsMat.getMat();
     auto wShape = shape(weightsMat);
     const size_t wstep = weightsMat.step1();
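Why this hunk downgrades to generic rather than falling back to SIMD128 Winograd (a reading of the diff): in a CV_TRY_AVX2 build the Winograd blocking is compiled with _FX_WINO_IBLOCK = 6, while the universal-intrinsics accumulator further down asserts _FX_WINO_IBLOCK == 3 and _FX_WINO_KBLOCK == 4, so a CPU without runtime AVX2 cannot safely run either Winograd variant and must take the generic path.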
@@ -61,7 +70,7 @@ Ptr<FastConv2d> initFastConv2d(
     int ksize = Hk*Wk;

     // Pad the kernel size so the weights buffer is a multiple of the vector width.
-    int padded_ksize = ((ksize + FAST_VEC_NLANES-1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
     int nweights = C*padded_ksize;
     conv->weightsBuf.reserve(nweights + VEC_ALIGN);
     conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
@@ -265,7 +274,8 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
     else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && inputShape[2] >= 12 && inputShape[3] >= 12) // winograd
     {
         CV_Assert(conv->weightsWinoBufPtr);
-        return runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct);
+        if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
+            return;
     }

     int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W]

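The void-to-int change makes the Winograd call a recoverable attempt instead of an unconditional return. A standalone sketch of the resulting control flow (the stub functions are illustrative stand-ins, not library code):

#include <cstdio>

// Stand-ins: the real runWinograd63 returns 1 from the SIMD builds and 0
// from the no-SIMD stub added at the end of this commit.
static int runWinograd63Stub(bool simdAvailable) { return simdAvailable ? 1 : 0; }
static void runGenericConv() { puts("generic conv"); }

int main()
{
    if (!runWinograd63Stub(false))
        runGenericConv(); // Winograd unavailable: fall through to the generic path
    return 0;
}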
@@ -12,35 +12,25 @@
 #if CV_NEON && CV_NEON_AARCH64 // 32 registers.
 #define CONV_MR 4
 #define CONV_NR 28
-enum { FAST_VEC_NLANES=4 };
 #elif CV_NEON // 16 registers.
 #define CONV_MR 4
 #define CONV_NR 12
-enum { FAST_VEC_NLANES=4 };
 #else // SIMD 128, AVX or AVX2
 #define CONV_MR 4
 #define CONV_NR 24

-#if CV_TRY_AVX2
-enum { FAST_VEC_NLANES=8 }; // AVX2
-#else
-enum { FAST_VEC_NLANES=4 }; // SIMD 128
-#endif
-
-#endif
 #endif

 // Winograd Params
 enum {
     _FX_WINO_STEP=6,
     _FX_WINO_KSIZE=3,
     _FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1,
     _FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE,

-#if CV_TRY_AVX2 || (CV_NEON && CV_NEON_AARCH64)
     _FX_WINO_KBLOCK = 4,
+#if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX2
     _FX_WINO_IBLOCK = 6,
 #else
-    _FX_WINO_KBLOCK = 4,
     _FX_WINO_IBLOCK = 3,
 #endif

@@ -52,8 +42,8 @@ enum {
     _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
 };

 enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2 };
 #endif

 namespace cv {
 namespace dnn {
@@ -77,8 +67,18 @@ struct FastConv2d
 #else
     bool useSIMD128 = false;
 #endif

+#if CV_TRY_AVX2
+    bool useAVX2 = checkHardwareSupport(CPU_AVX2);
+#else
+    bool useAVX2 = false;
+#endif

+#if CV_NEON
+    bool useNEON = checkHardwareSupport(CPU_NEON);
+#else
+    bool useNEON = false;
+#endif
 };

 // return a FastConv2d instance.
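The flags above follow OpenCV's usual compile-time-plus-runtime dispatch: CV_TRY_* says the kernels exist in the binary, checkHardwareSupport says the running CPU can execute them. A minimal usage sketch against the public API (cv::checkHardwareSupport and CV_CPU_AVX2 are real OpenCV names; the program itself is illustrative only):

#include <opencv2/core/utility.hpp>
#include <cstdio>

int main()
{
    // True only when the running CPU supports AVX2, independent of
    // whether this particular OpenCV build compiled AVX2 kernels.
    bool avx2 = cv::checkHardwareSupport(CV_CPU_AVX2);
    printf("AVX2 %s\n", avx2 ? "available" : "not available");
    return 0;
}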
@@ -99,7 +99,7 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv,
 void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval,
                   ActivationLayer* activ, bool ifMinMaxAct);

-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
                   float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);

 } // namespace dnn

@@ -13,6 +13,8 @@
 #include "fast_convolution.hpp"

 namespace cv { namespace dnn {
+
+#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
 enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.

 static void
@@ -141,7 +143,7 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
             vst1q_f32(outbuf + 20*64, s32);
         }
     }
-#elif CV_SIMD
+#elif CV_SIMD128
     CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4);
     for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
                           outbuf += _FX_WINO_ATOM_F32)
@@ -183,15 +185,15 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
             v_store(outbuf, s00);
             v_store(outbuf + 1*64, s01);
             v_store(outbuf + 2*64, s02);
-            v_store(outbuf + 6*64, s10);
-            v_store(outbuf + 7*64, s11);
-            v_store(outbuf + 8*64, s12);
-            v_store(outbuf + 12*64, s20);
-            v_store(outbuf + 13*64, s21);
-            v_store(outbuf + 14*64, s22);
-            v_store(outbuf + 18*64, s30);
-            v_store(outbuf + 19*64, s31);
-            v_store(outbuf + 20*64, s32);
+            v_store(outbuf + 3*64, s10);
+            v_store(outbuf + 4*64, s11);
+            v_store(outbuf + 5*64, s12);
+            v_store(outbuf + 6*64, s20);
+            v_store(outbuf + 7*64, s21);
+            v_store(outbuf + 8*64, s22);
+            v_store(outbuf + 9*64, s30);
+            v_store(outbuf + 10*64, s31);
+            v_store(outbuf + 11*64, s32);
         }
 #else
     for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32;
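This hunk is the SIMD128 bug named in the commit message (a reading of the diff): the accumulator writes a KBLOCK x IBLOCK grid of output tiles at outbuf + (k*IBLOCK + i)*64, and with _FX_WINO_IBLOCK == 3 on the SIMD128 path the correct offsets are 0..11. The old offsets (6,7,8, 12,13,14, 18,19,20) used the IBLOCK = 6 spacing of the AVX2/NEON configuration, scattering rows s10..s32 into the wrong tiles. A standalone check of the corrected indexing:

#include <cstdio>

int main()
{
    const int KBLOCK = 4, IBLOCK = 3; // SIMD128 Winograd blocking
    // Prints the offsets 0..11 that the fixed v_store calls use.
    for (int k = 0; k < KBLOCK; k++)
        for (int i = 0; i < IBLOCK; i++)
            printf("s%d%d -> outbuf + %d*64\n", k, i, k*IBLOCK + i);
    return 0;
}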
@@ -406,7 +408,7 @@ _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep,
     vst1q_f32(outptr + outstep*13, z61);
     vst1q_f32(outptr + outstep*14, z70);
     vst1q_f32(outptr + outstep*15, z71);
-#elif CV_SIMD
+#elif CV_SIMD128
     v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
     v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
     v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@@ -750,8 +752,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
     vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
     vst1q_f32(outptr + outstep*5, z50);
     vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
-//#elif CV_AVX2
-#elif CV_SIMD
+#elif CV_SIMD128
     v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
     v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
     v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@@ -919,7 +920,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
 #endif
 }

-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
                    int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
 {
     Mat input = _input.getMat();
@@ -1138,5 +1139,15 @@ void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output,
         }
     }
     }});
+    return 1;
 }
+
+#else
+
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
+                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
+{
+    return 0;
+}
+#endif
 }} // namespace cv::dnn