Optimize the Winograd convolution further.
This commit is contained in:
parent ec26541771
commit 0fa43e3aac
@@ -333,7 +333,7 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
            ofstab[k] = dy * Wi + dx;
    }

    const float *weights0 = conv->weightsBuf.data(), *bias = conv->biasBuf.data();
    const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();
    int inner_ytop = (pad_bottom + stride_y - 1) / stride_y, inner_ybottom = 3;
    int inner_xleft = (pad_left + stride_x - 1) / stride_x, inner_xright = 4;

@@ -354,7 +354,387 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
                x1 = W0;
            }
        }
    }
    _mm256_zeroupper();
}

void _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
                            float* outbuf, int Cg, int iblock)
{
    CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4);// && _FX_WINO_ATOM_F32 == 8);
    if (iblock > 3)
    {
        for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
                outbuf += _FX_WINO_ATOM_F32)
        {
            __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
            __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
            __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
            __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
            for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32,
                    wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32)
            {
                __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
                __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
                __m256 x0, x1;
                x0 = _mm256_load_ps(inwptr);
                x1 = _mm256_load_ps(inwptr + 8);
                s00 = _mm256_fmadd_ps(w0, x0, s00);
                s01 = _mm256_fmadd_ps(w0, x1, s01);
                s10 = _mm256_fmadd_ps(w1, x0, s10);
                s11 = _mm256_fmadd_ps(w1, x1, s11);
                s20 = _mm256_fmadd_ps(w2, x0, s20);
                s21 = _mm256_fmadd_ps(w2, x1, s21);
                s30 = _mm256_fmadd_ps(w3, x0, s30);
                s31 = _mm256_fmadd_ps(w3, x1, s31);
                x0 = _mm256_load_ps(inwptr + 16);
                x1 = _mm256_load_ps(inwptr + 24);
                s02 = _mm256_fmadd_ps(w0, x0, s02);
                s03 = _mm256_fmadd_ps(w0, x1, s03);
                s12 = _mm256_fmadd_ps(w1, x0, s12);
                s13 = _mm256_fmadd_ps(w1, x1, s13);
                s22 = _mm256_fmadd_ps(w2, x0, s22);
                s23 = _mm256_fmadd_ps(w2, x1, s23);
                s32 = _mm256_fmadd_ps(w3, x0, s32);
                s33 = _mm256_fmadd_ps(w3, x1, s33);
                x0 = _mm256_load_ps(inwptr + 32);
                x1 = _mm256_load_ps(inwptr + 40);
                s04 = _mm256_fmadd_ps(w0, x0, s04);
                s05 = _mm256_fmadd_ps(w0, x1, s05);
                s14 = _mm256_fmadd_ps(w1, x0, s14);
                s15 = _mm256_fmadd_ps(w1, x1, s15);
                s24 = _mm256_fmadd_ps(w2, x0, s24);
                s25 = _mm256_fmadd_ps(w2, x1, s25);
                s34 = _mm256_fmadd_ps(w3, x0, s34);
                s35 = _mm256_fmadd_ps(w3, x1, s35);
            }

            _mm256_store_ps(outbuf, s00);
            _mm256_store_ps(outbuf + 1*64, s01);
            _mm256_store_ps(outbuf + 2*64, s02);
            _mm256_store_ps(outbuf + 3*64, s03);
            _mm256_store_ps(outbuf + 4*64, s04);
            _mm256_store_ps(outbuf + 5*64, s05);

            _mm256_store_ps(outbuf + 6*64, s10);
            _mm256_store_ps(outbuf + 7*64, s11);
            _mm256_store_ps(outbuf + 8*64, s12);
            _mm256_store_ps(outbuf + 9*64, s13);
            _mm256_store_ps(outbuf + 10*64, s14);
            _mm256_store_ps(outbuf + 11*64, s15);

            _mm256_store_ps(outbuf + 12*64, s20);
            _mm256_store_ps(outbuf + 13*64, s21);
            _mm256_store_ps(outbuf + 14*64, s22);
            _mm256_store_ps(outbuf + 15*64, s23);
            _mm256_store_ps(outbuf + 16*64, s24);
            _mm256_store_ps(outbuf + 17*64, s25);

            _mm256_store_ps(outbuf + 18*64, s30);
            _mm256_store_ps(outbuf + 19*64, s31);
            _mm256_store_ps(outbuf + 20*64, s32);
            _mm256_store_ps(outbuf + 21*64, s33);
            _mm256_store_ps(outbuf + 22*64, s34);
            _mm256_store_ps(outbuf + 23*64, s35);
        }
    }
    else
    {
        for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
                outbuf += _FX_WINO_ATOM_F32)
        {
            __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00;
            __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00;
            __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00;
            __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00;
            for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32,
                    wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) {
                __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
                __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
                __m256 x0, x1, x2;
                x0 = _mm256_load_ps(inwptr);
                x1 = _mm256_load_ps(inwptr + 8);
                x2 = _mm256_load_ps(inwptr + 16);
                s00 = _mm256_fmadd_ps(w0, x0, s00);
                s01 = _mm256_fmadd_ps(w0, x1, s01);
                s02 = _mm256_fmadd_ps(w0, x2, s02);
                s10 = _mm256_fmadd_ps(w1, x0, s10);
                s11 = _mm256_fmadd_ps(w1, x1, s11);
                s12 = _mm256_fmadd_ps(w1, x2, s12);
                s20 = _mm256_fmadd_ps(w2, x0, s20);
                s21 = _mm256_fmadd_ps(w2, x1, s21);
                s22 = _mm256_fmadd_ps(w2, x2, s22);
                s30 = _mm256_fmadd_ps(w3, x0, s30);
                s31 = _mm256_fmadd_ps(w3, x1, s31);
                s32 = _mm256_fmadd_ps(w3, x2, s32);
            }

            _mm256_store_ps(outbuf, s00);
            _mm256_store_ps(outbuf + 1*64, s01);
            _mm256_store_ps(outbuf + 2*64, s02);
            _mm256_store_ps(outbuf + 6*64, s10);
            _mm256_store_ps(outbuf + 7*64, s11);
            _mm256_store_ps(outbuf + 8*64, s12);
            _mm256_store_ps(outbuf + 12*64, s20);
            _mm256_store_ps(outbuf + 13*64, s21);
            _mm256_store_ps(outbuf + 14*64, s22);
            _mm256_store_ps(outbuf + 18*64, s30);
            _mm256_store_ps(outbuf + 19*64, s31);
            _mm256_store_ps(outbuf + 20*64, s32);
        }
    }
    _mm256_zeroupper();
}
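For reference, here is a minimal scalar sketch of what this accumulation kernel computes in the AVX2 configuration (ATOM = 8, KBLOCK = 4, IBLOCK = 6). The helper name winograd_accum_ref and the explicit offset arithmetic are illustrative assumptions, not part of the patch; the offsets simply mirror the pointer increments in the loops above.

// Hypothetical scalar reference for _fx_winograd_accum_f32 (AVX2 layout assumed).
// For every 8-float "atom" of the 8x8 Winograd tile, accumulate over all Cg channels
// the lane-wise products of 4 packed kernel blocks with up to 6 packed input blocks,
// then scatter the sums to 64-float output tiles, matching the store offsets above.
static void winograd_accum_ref(const float* inwptr, const float* wptr,
                               float* outbuf, int Cg, int iblock)
{
    const int ATOM = 8, NATOMS = 8, KBLOCK = 4, IBLOCK = 6;
    const int ib = iblock > 3 ? 6 : 3;          // the SIMD kernel handles 6 or 3 blocks
    for (int atom_id = 0; atom_id < NATOMS; atom_id++, outbuf += ATOM)
    {
        float s[4][6][8] = {};
        for (int c = 0; c < Cg; c++)
        {
            // Assumed packing: one KBLOCK*ATOM weight block and one IBLOCK*ATOM input
            // block per (atom, channel) pair, laid out consecutively.
            const float* w = wptr + (atom_id*Cg + c)*KBLOCK*ATOM;
            const float* x = inwptr + (atom_id*Cg + c)*IBLOCK*ATOM;
            for (int k = 0; k < KBLOCK; k++)
                for (int i = 0; i < ib; i++)
                    for (int t = 0; t < ATOM; t++)
                        s[k][i][t] += w[k*ATOM + t] * x[i*ATOM + t];
        }
        for (int k = 0; k < KBLOCK; k++)
            for (int i = 0; i < ib; i++)
                for (int t = 0; t < ATOM; t++)
                    outbuf[(k*IBLOCK + i)*64 + t] = s[k][i][t];
    }
}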

static inline
void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7)
{
    __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
    __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
    __t0 = _mm256_unpacklo_ps(row0, row1);
    __t1 = _mm256_unpackhi_ps(row0, row1);
    __t2 = _mm256_unpacklo_ps(row2, row3);
    __t3 = _mm256_unpackhi_ps(row2, row3);
    __t4 = _mm256_unpacklo_ps(row4, row5);
    __t5 = _mm256_unpackhi_ps(row4, row5);
    __t6 = _mm256_unpacklo_ps(row6, row7);
    __t7 = _mm256_unpackhi_ps(row6, row7);
    __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}
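As a sanity check, the unpack/shuffle/permute sequence above is simply an 8x8 float transpose. A minimal sketch of how it could be verified against a scalar transpose (the helper name check_transpose8 is hypothetical):

// Hypothetical check: transpose8_ps must agree with a plain scalar 8x8 transpose.
#include <immintrin.h>
#include <cassert>

static void check_transpose8(const float a[8][8])
{
    __m256 rows[8];
    for (int i = 0; i < 8; i++)
        rows[i] = _mm256_loadu_ps(a[i]);
    transpose8_ps(rows[0], rows[1], rows[2], rows[3], rows[4], rows[5], rows[6], rows[7]);

    float b[8][8];
    for (int i = 0; i < 8; i++)
        _mm256_storeu_ps(b[i], rows[i]);
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            assert(b[i][j] == a[j][i]);   // row i of the result is column i of the input
}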

/* Input transform */
void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg)
{
    __m256 x00 = _mm256_loadu_ps(inptr);
    __m256 x10 = _mm256_loadu_ps(inptr + inpstep);
    __m256 x20 = _mm256_loadu_ps(inptr + inpstep*2);
    __m256 x30 = _mm256_loadu_ps(inptr + inpstep*3);
    __m256 x40 = _mm256_loadu_ps(inptr + inpstep*4);
    __m256 x50 = _mm256_loadu_ps(inptr + inpstep*5);
    __m256 x60 = _mm256_loadu_ps(inptr + inpstep*6);
    __m256 x70 = _mm256_loadu_ps(inptr + inpstep*7);

    __m256 z00, z10, z20, z30, z40, z50, z60, z70;

    {
        /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
        /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
        __m256 q5_25 = _mm256_set1_ps(5.25f), t00, t10;
        t00 = _mm256_sub_ps(x40, x20);
        t10 = _mm256_sub_ps(x30, x50);

        __m256 y00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(x00, x60));
        __m256 y70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(x70, x10));

        /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
        /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
        __m256 qm4_25 = _mm256_set1_ps(-4.25f);
        t00 = _mm256_fmadd_ps(x30, qm4_25, _mm256_add_ps(x10, x50));
        t10 = _mm256_fmadd_ps(x40, qm4_25, _mm256_add_ps(x20, x60));

        __m256 y10 = _mm256_add_ps(t00, t10);
        __m256 y20 = _mm256_sub_ps(t10, t00);

        /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
        /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
        __m256 q0_5 = _mm256_set1_ps(0.5f), q0_25 = _mm256_set1_ps(0.25f);
        __m256 qm2_5 = _mm256_set1_ps(-2.5f), qm1_25 = _mm256_set1_ps(-1.25f);
        t00 = _mm256_fmadd_ps(x10, q0_5, _mm256_add_ps(x50, x50));
        t10 = _mm256_fmadd_ps(x20, q0_25, x60);
        t00 = _mm256_fmadd_ps(x30, qm2_5, t00);
        t10 = _mm256_fmadd_ps(x40, qm1_25, t10);

        __m256 y30 = _mm256_add_ps(t00, t10);
        __m256 y40 = _mm256_sub_ps(t10, t00);

        /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
        /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
        __m256 q4 = _mm256_set1_ps(4.f), qm5 = _mm256_set1_ps(-5.f);
        t00 = _mm256_fmadd_ps(x50, q0_5, _mm256_add_ps(x10, x10));
        t10 = _mm256_fmadd_ps(x20, q4 , x60);
        t00 = _mm256_fmadd_ps(x30, qm2_5, t00);
        t10 = _mm256_fmadd_ps(x40, qm5 , t10);

        __m256 y50 = _mm256_add_ps(t00, t10);
        __m256 y60 = _mm256_sub_ps(t10, t00);

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70);

        /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
        /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
        t00 = _mm256_sub_ps(y40, y20);
        t10 = _mm256_sub_ps(y30, y50);
        z00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(y00, y60));
        z70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(y70, y10));

        /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
        /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
        t00 = _mm256_fmadd_ps(y30, qm4_25, _mm256_add_ps(y10, y50));
        t10 = _mm256_fmadd_ps(y40, qm4_25, _mm256_add_ps(y20, y60));
        z10 = _mm256_add_ps(t00, t10);
        z20 = _mm256_sub_ps(t10, t00);

        /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
        /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
        t00 = _mm256_fmadd_ps(y10, q0_5, _mm256_add_ps(y50, y50));
        t10 = _mm256_fmadd_ps(y20, q0_25, y60);
        t00 = _mm256_fmadd_ps(y30, qm2_5, t00);
        t10 = _mm256_fmadd_ps(y40, qm1_25, t10);

        z30 = _mm256_add_ps(t00, t10);
        z40 = _mm256_sub_ps(t10, t00);

        /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
        /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
        t00 = _mm256_fmadd_ps(y50, q0_5, _mm256_add_ps(y10, y10));
        t10 = _mm256_fmadd_ps(y20, q4, y60);
        t00 = _mm256_fmadd_ps(y30, qm2_5, t00);
        t10 = _mm256_fmadd_ps(y40, qm5, t10);

        z50 = _mm256_add_ps(t00, t10);
        z60 = _mm256_sub_ps(t10, t00);
    }

    const int outstep = _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg;

    _mm256_storeu_ps(outptr, z00);
    _mm256_storeu_ps(outptr + outstep, z10);
    _mm256_storeu_ps(outptr + outstep*2, z20);
    _mm256_storeu_ps(outptr + outstep*3, z30);
    _mm256_storeu_ps(outptr + outstep*4, z40);
    _mm256_storeu_ps(outptr + outstep*5, z50);
    _mm256_storeu_ps(outptr + outstep*6, z60);
    _mm256_storeu_ps(outptr + outstep*7, z70);
    _mm256_zeroupper();
}
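The row vectors in the comments above are the rows of the 8x8 input-transform matrix B^T of Winograd F(6x6, 3x3). A scalar sketch of the underlying math, out = B^T * X * B, is shown below; the SIMD code folds a transpose into the computation, so the element order of its packed output differs. The helper name winograd_BtXB_ref is hypothetical.

// Hypothetical scalar reference for the 8x8 input transform: out = Bt * X * Bt^T,
// with the rows of Bt taken from the Y[0]..Y[7] comments above.
static void winograd_BtXB_ref(const float X[8][8], float out[8][8])
{
    static const float Bt[8][8] = {
        {1.f,  0.f,  -5.25f,  0.f,    5.25f,   0.f,    -1.f, 0.f},
        {0.f,  1.f,   1.f,   -4.25f, -4.25f,   1.f,     1.f, 0.f},
        {0.f, -1.f,   1.f,    4.25f, -4.25f,  -1.f,     1.f, 0.f},
        {0.f,  0.5f,  0.25f, -2.5f,  -1.25f,   2.f,     1.f, 0.f},
        {0.f, -0.5f,  0.25f,  2.5f,  -1.25f,  -2.f,     1.f, 0.f},
        {0.f,  2.f,   4.f,   -2.5f,  -5.f,     0.5f,    1.f, 0.f},
        {0.f, -2.f,   4.f,    2.5f,  -5.f,    -0.5f,    1.f, 0.f},
        {0.f, -1.f,   0.f,    5.25f,  0.f,    -5.25f,   0.f, 1.f}
    };
    float T[8][8];                       // T = Bt * X
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += Bt[i][k] * X[k][j];
            T[i][j] = s;
        }
    for (int i = 0; i < 8; i++)          // out = T * Bt^T
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += T[i][k] * Bt[j][k];
            out[i][j] = s;
        }
}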

#define STORE6_ELE_FROM_16(ptr, z00, lowM, highM) \
    lowM = _mm256_castps256_ps128(z00);           \
    highM = _mm256_extractf128_ps(z00, 1);        \
    _mm_storeu_ps(ptr, lowM);                     \
    _mm_storel_epi64((__m128i*)(ptr + 4), _mm_castps_si128(highM))
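The macro writes only the first six of the eight lanes, because the inverse transform of F(6x6, 3x3) produces a 6x6 output tile. A scalar equivalent, for illustration only:

// Scalar equivalent of STORE6_ELE_FROM_16: write lanes 0..5 of an 8-float vector.
static inline void store6(float* ptr, const float v[8])
{
    for (int i = 0; i < 6; i++)
        ptr[i] = v[i];
}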

/* Inverse Winograd 8x8 transform:
   out = (A'*inp*A)', where
   inp is input 8x8 FP32 matrix,
   A' is
   [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
    0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
    0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
    0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
    0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
    0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
*/
void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
                               float* bpptr, int bpstep, float* outptr, int outstep,
                               float bias, float minval, float maxval, bool ifMinMaxAct)
{
    __m256 x00 = _mm256_load_ps(inptr);
    __m256 x10 = _mm256_load_ps(inptr + inpstep);
    __m256 x20 = _mm256_load_ps(inptr + inpstep*2);
    __m256 x30 = _mm256_load_ps(inptr + inpstep*3);
    __m256 x40 = _mm256_load_ps(inptr + inpstep*4);
    __m256 x50 = _mm256_load_ps(inptr + inpstep*5);
    __m256 x60 = _mm256_load_ps(inptr + inpstep*6);
    __m256 x70 = _mm256_load_ps(inptr + inpstep*7);
    __m256 z00, z10, z20, z30, z40, z50;

    {
        __m256 s12_0, s34_0, s56_0;
        s12_0 = _mm256_add_ps(x10, x20);
        s34_0 = _mm256_add_ps(x30, x40);
        s56_0 = _mm256_add_ps(x50, x60);

        __m256 y00 = _mm256_add_ps(x00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
        __m256 y20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
        __m256 y40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));

        s12_0 = _mm256_sub_ps(x10, x20);
        s34_0 = _mm256_sub_ps(x30, x40);
        s56_0 = _mm256_sub_ps(x50, x60);
        __m256 y50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.f), _mm256_add_ps(x70, s12_0)));
        __m256 y10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.f), s12_0));
        __m256 y30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.f), s12_0));
        __m256 y60 = _mm256_set1_ps(0.f), y70 = y60;

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70);

        s12_0 = _mm256_add_ps(y10, y20);
        s34_0 = _mm256_add_ps(y30, y40);
        s56_0 = _mm256_add_ps(y50, y60);

        z00 = _mm256_add_ps(y00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
        z20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
        z40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));

        s12_0 = _mm256_sub_ps(y10, y20);
        s34_0 = _mm256_sub_ps(y30, y40);
        s56_0 = _mm256_sub_ps(y50, y60);

        z50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.0f), _mm256_add_ps(y70, s12_0)));
        z10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.0f), s12_0));
        z30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.0f), s12_0));

        __m256 vbias = _mm256_set1_ps(bias);
        z00 = _mm256_add_ps(vbias, z00);
        z10 = _mm256_add_ps(vbias, z10);
        z20 = _mm256_add_ps(vbias, z20);
        z30 = _mm256_add_ps(vbias, z30);
        z40 = _mm256_add_ps(vbias, z40);
        z50 = _mm256_add_ps(vbias, z50);
    }

    // TODO: make sure the length of bpptr is 8.
    if (bpptr)
    {
        z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));
        z10 = _mm256_add_ps(z10, _mm256_loadu_ps(bpptr + bpstep));
        z20 = _mm256_add_ps(z20, _mm256_loadu_ps(bpptr + bpstep*2));
        z30 = _mm256_add_ps(z30, _mm256_loadu_ps(bpptr + bpstep*3));
        z40 = _mm256_add_ps(z40, _mm256_loadu_ps(bpptr + bpstep*4));
        z50 = _mm256_add_ps(z50, _mm256_loadu_ps(bpptr + bpstep*5));
    }

    if (ifMinMaxAct)
    {
        __m256 vmax = _mm256_set1_ps(maxval);
        __m256 vmin = _mm256_set1_ps(minval);

        z00 = _mm256_min_ps(_mm256_max_ps(z00, vmin), vmax);
        z10 = _mm256_min_ps(_mm256_max_ps(z10, vmin), vmax);
        z20 = _mm256_min_ps(_mm256_max_ps(z20, vmin), vmax);
        z30 = _mm256_min_ps(_mm256_max_ps(z30, vmin), vmax);
        z40 = _mm256_min_ps(_mm256_max_ps(z40, vmin), vmax);
        z50 = _mm256_min_ps(_mm256_max_ps(z50, vmin), vmax);
    }

    __m128 lowM, highM;
    STORE6_ELE_FROM_16(outptr, z00, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep, z10, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 2, z20, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 3, z30, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 4, z40, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM);
    _mm256_zeroupper();
}
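A scalar sketch of the inverse transform described in the comment, out = A' * inp * A with the 6x8 matrix A' given above; the bias, the optional bpptr addition, and the min/max activation are omitted for brevity, and winograd_AtXA_ref is a hypothetical name.

// Hypothetical scalar reference for the inverse transform: out = At * X * At^T,
// with At (6x8) copied from the comment above.
static void winograd_AtXA_ref(const float X[8][8], float out[6][6])
{
    static const float At[6][8] = {
        {1.f, 1.f,  1.f,  1.f,   1.f,   1.f,      1.f,      0.f},
        {0.f, 1.f, -1.f,  2.f,  -2.f,   0.5f,    -0.5f,     0.f},
        {0.f, 1.f,  1.f,  4.f,   4.f,   0.25f,    0.25f,    0.f},
        {0.f, 1.f, -1.f,  8.f,  -8.f,   0.125f,  -0.125f,   0.f},
        {0.f, 1.f,  1.f,  16.f,  16.f,  1.f/16,   1.f/16,   0.f},
        {0.f, 1.f, -1.f,  32.f, -32.f,  1.f/32,  -1.f/32,   1.f}
    };
    float T[6][8];                       // T = At * X
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += At[i][k] * X[k][j];
            T[i][j] = s;
        }
    for (int i = 0; i < 6; i++)          // out = T * At^T
        for (int j = 0; j < 6; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += T[i][k] * At[j][k];
            out[i][j] = s;
        }
}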

#endif
} // namespace opt_AVX2
} // namespace cv
@@ -14,7 +14,7 @@
#include "fast_convolution.simd.hpp"

namespace cv { namespace dnn {

enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
Ptr<FastConv2d> initFastConv2d(
        int ngroups,
        int K, int C, int Hk, int Wk,
@@ -44,20 +44,17 @@ Ptr<FastConv2d> initFastConv2d(
    conv->pad_bottom = pads_end[0];
    conv->pad_left = pads_begin[1];
    conv->pad_right = pads_end[1];
    conv->conv_type =
        (ngroups > 1 && ngroups == K && ngroups == C) ? _FX_CONV_TYPE_DEPTHWISE :
        useWinograd && ((conv->useSIMD128 || conv->useAVX2 || conv->useNEON) && Hk == 3 && Wk == 3 &&
            dilation_y == 1 && dilation_x == 1 && stride_y == 1 && stride_x == 1) ? _FX_CONV_TYPE_WINOGRAD3X3 :
        _FX_CONV_TYPE_GENERIC;
    Mat weightsMat = _weightsMat.getMat();
    auto wShape = shape(weightsMat);
    const size_t wstep = weightsMat.step1();

#if CV_NEON // For now, winograd is ARM platform only.
    if (useWinograd && ngroups == 1 && Hk == 3 && Wk == 3 && stride_x == 1 && stride_y == 1 &&
        dilation_x == 1 && dilation_y == 1 && K >= 16 && C >= 16)
        conv->useWinograd63 = true;
#else
    conv->useWinograd63 = false;
#endif

    float *srcWeights = (float *)weightsMat.data;
    if (ngroups > 1 && ngroups == K && ngroups == C)
    if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE)
    {
        // for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout,
        // but add some padding to make the weights array layout more SIMD-friendly
@@ -66,17 +63,97 @@ Ptr<FastConv2d> initFastConv2d(
        // pad each kernel so that its size is a multiple of the vector width.
        int padded_ksize = ((ksize + FAST_VEC_NLANES-1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
        int nweights = C*padded_ksize;
        conv->weightsBuf.reserve(nweights);
        float* weightsBufPtr = conv->weightsBuf.data();
        memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0]));
        for(int c = 0; c < C; c++)
        conv->weightsBuf.reserve(nweights + VEC_ALIGN);
        conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
        memset(conv->weightsBufPtr, 0, nweights*sizeof(conv->weightsBufPtr[0]));
        auto weightsBufPtr = conv->weightsBufPtr;
        parallel_for_(Range(0, C), [&](const Range& r0){
        for(int c = r0.start; c < r0.end; c++)
        {
            for (int k = 0; k < ksize; k++)
                weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k];
        }
        }});
    }
    else
    {
        if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd
        {
            static const float ktm[8][3] = {
                {1.0f, 0.0f, 0.0f},
                {-2.0f / 9, -2.0f / 9, -2.0f / 9},
                {-2.0f / 9, 2.0f / 9, -2.0f / 9},
                {1.0f / 90, 1.0f / 45, 2.0f / 45},
                {1.0f / 90, -1.0f / 45, 2.0f / 45},
                {32.f/45, 16.f/45, 8.f/45},
                {32.f/45, -16.f/45, 8.f/45},
                {0.0f, 0.0f, 1.0f}
            };

            // the weights are packed as a 6-dim tensor:
            // ngroups * ceil((K/ngroups)/KBLOCK) * (W*W/ATOM_SIZE) * (C/ngroups) * KBLOCK * ATOM_SIZE,
            // where W is the size of the Winograd-transformed kernel (8x8),
            // ATOM_SIZE is the number of lanes in a SIMD register (4 for NEON FP32, 8 for AVX2),
            // KBLOCK is a platform-dependent constant derived from the number of SIMD registers.
            int ksize = _FX_WINO_KSIZE * _FX_WINO_KSIZE;
            int Cg = C/ngroups;
            int Kg = K/ngroups;
            int Kg_nblocks = (Kg + _FX_WINO_KBLOCK - 1)/_FX_WINO_KBLOCK;
            size_t nweights = ngroups*Kg_nblocks*Cg*_FX_WINO_KBLOCK*_FX_WINO_AREA;
            conv->weightsWinoBuf.reserve(nweights + VEC_ALIGN);
            conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN);
            float* wptrWino = conv->weightsWinoBufPtr;
            memset(wptrWino, 0, nweights * sizeof(wptrWino[0]));

            parallel_for_(Range(0, K), [&](const Range& r0){
            float kernelTm[_FX_WINO_AREA];
            for (int k = r0.start; k < r0.end; k++)
            {
                int g = k / Kg;
                int k_ = k - g*Kg;
                int ki = k_ / _FX_WINO_KBLOCK;
                int dk = k_ - ki*_FX_WINO_KBLOCK;

                for (int c = 0; c < Cg; c++)
                {
                    // wstep = Hk*Wk*Cg
                    const float *kernel0 = srcWeights + k * wstep + c * ksize;

                    // transform kernel, transposed
                    const float *k0 = kernel0;
                    const float *k1 = kernel0 + 3;
                    const float *k2 = kernel0 + 6;

                    // h
                    float tmp[8][3];
                    for (int i = 0; i < 8; i++)
                    {
                        tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                        tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                        tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
                    }

                    // v
                    for (int j = 0; j < 8; j++)
                    {
                        float *tmpp = &tmp[j][0];

                        for (int i = 0; i < 8; i++)
                            kernelTm[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                    }

                    // repack the data.
                    float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *_FX_WINO_KBLOCK*_FX_WINO_AREA +
                                  (c*_FX_WINO_KBLOCK + dk)*_FX_WINO_ATOM_F32;
                    for (int i = 0; i < _FX_WINO_NATOMS_F32; i++,
                            wptr += Cg * _FX_WINO_KBLOCK * _FX_WINO_ATOM_F32)
                    {
                        CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + _FX_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights);
                        memcpy(wptr, kernelTm + i * _FX_WINO_ATOM_F32, _FX_WINO_ATOM_F32*sizeof (wptr[0]));
                    }
                }
            }});
        }
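The pointer arithmetic in the repacking loop above should correspond to the 6-dimensional layout described in the comment. A hypothetical helper that spells out the flat offset of one packed float (illustrative only, not part of the patch):

// Hypothetical helper: flat index of one float in the packed Winograd weight buffer.
// g  - group index, ki - K-block index, atom - which ATOM-sized slice of the 8x8 tile,
// c  - input channel within the group, dk - kernel within the K-block, t - lane.
static inline size_t wino_weight_offset(int g, int ki, int atom, int c, int dk, int t,
                                        int Kg_nblocks, int Cg)
{
    return ((((size_t)(g*Kg_nblocks + ki)*_FX_WINO_NATOMS_F32 + atom)*Cg + c)
             *_FX_WINO_KBLOCK + dk)*_FX_WINO_ATOM_F32 + t;
}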

        // The weights are packed as
        // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk) x CONV_MR tensor
        int Kg = K/ngroups, Cg = max(C/ngroups, 1);
@@ -84,8 +161,9 @@ Ptr<FastConv2d> initFastConv2d(
        int Kg_aligned = numStripsMR * CONV_MR;
        int HkWkCg = Hk*Wk*Cg;
        size_t nweights = ngroups*Kg_aligned*HkWkCg;
        conv->weightsBuf.reserve(nweights);
        float* weightsBufPtr = conv->weightsBuf.data();
        conv->weightsBuf.reserve(nweights + VEC_ALIGN);
        conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
        float* weightsBufPtr = conv->weightsBufPtr;
        memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0]));

        // Pack the weight.
@@ -114,18 +192,12 @@ Ptr<FastConv2d> initFastConv2d(
            }
        }
        }});

        // Prepare Weight for Winograd F(6x6, 3x3)
        if (conv->useWinograd63)
        {
            initWinograd63(conv, weightsMat, K, C);
        }
    }

    // store bias; append some zeros to make sure that
    // we can always read MR elements starting from any valid index
    {
        int k = 0, nbias = K + CONV_MR - 1;
        int k = 0, nbias = K + 32;
        conv->biasBuf.reserve(nbias);
        float* biasBufPtr = conv->biasBuf.data();
        for(; k < K; k++)
@@ -185,19 +257,16 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>
    else
        activ = nullptr;

    if (conv->ngroups > 1 && conv->ngroups == conv->K && conv->ngroups == conv->C)
    if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE)
    {
        CV_Assert(fusedAddMat.empty()); // A depthwise convolution layer should not be followed by an Add layer.
        return runDepthwise(input, output, conv, minval, maxval, activ, ifMinMaxAct);
    }

#if CV_NEON
    if (conv->useWinograd63 && inputShape[2] > 12 && inputShape[3] > 12)
    else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && inputShape[2] >= 12 && inputShape[3] >= 12) // winograd
    {
        if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
            return;
        CV_Assert(conv->weightsWinoBufPtr);
        return runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct);
    }
#endif

    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W]
    int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
@@ -217,7 +286,6 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>
    bool fast_1x1 = stride_x == 1 && stride_y == 1 && ksize == 1;
    int HkWkCg = Hk*Wk*Cg;

    enum { VEC_ALIGN = 8, DFT_TYPE = CV_32F }; // Memory alignment.
    int MAX_STRIPES = 2; // (56 + CONV_NR - 1)/CONV_NR;

    // Friendly to L1 cache
@@ -447,7 +515,7 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>
        }

        yx0 = yx0_saved;
        float* weights = conv->weightsBuf.data() + g * Kg_aligned * HkWkCg;
        float* weights = conv->weightsBufPtr + g * Kg_aligned * HkWkCg;
        const float* biasptr = conv->biasBuf.data() + Kg * g;
        int ldc = nstripes * CONV_NR;

@@ -21,7 +21,7 @@ enum { FAST_VEC_NLANES=4 };
#define CONV_MR 4
#define CONV_NR 24

#if CV_AVX2
#if CV_TRY_AVX2
enum { FAST_VEC_NLANES=8 }; // AVX2
#else
enum { FAST_VEC_NLANES=4 }; // SIMD 128
@@ -30,6 +30,31 @@ enum { FAST_VEC_NLANES=4 }; // SIMD 128
#endif
#endif

enum {
    _FX_WINO_STEP=6,
    _FX_WINO_KSIZE=3,
    _FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1,
    _FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE,

#if CV_TRY_AVX2 || (CV_NEON && CV_NEON_AARCH64)
    _FX_WINO_KBLOCK = 4,
    _FX_WINO_IBLOCK = 6,
#else
    _FX_WINO_KBLOCK = 4,
    _FX_WINO_IBLOCK = 3,
#endif

#if CV_TRY_AVX2
    _FX_WINO_ATOM_F32 = 8,
#else
    _FX_WINO_ATOM_F32 = 4,
#endif

    _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
};
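With these definitions, the derived constants for the AVX2 configuration work out as follows (a compile-time sanity sketch; the static_assert is illustrative, not part of the patch):

// Sanity sketch: derived Winograd tile constants.
// _FX_WINO_SIZE       = 6 + 3 - 1 = 8   (the transformed tile is 8x8)
// _FX_WINO_AREA       = 8 * 8     = 64
// _FX_WINO_NATOMS_F32 = 64 / 8    = 8 atoms of 8 floats on AVX2 (64 / 4 = 16 on 128-bit SIMD)
static_assert(_FX_WINO_SIZE == 8 && _FX_WINO_AREA == 64,
              "Winograd F(6x6, 3x3) uses 8x8 transformed tiles");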

enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2 };

namespace cv {
namespace dnn {

@@ -41,10 +66,17 @@ struct FastConv2d
    int dilation_y, dilation_x;
    int pad_top, pad_bottom, pad_left, pad_right;

    std::vector<float> weightsBuf; // For generic Conv 2D
    std::vector<float> weightsWino63Buf; // For Winograd F(6x6, 3x3).
    std::vector<float> weightsBuf; // For generic Conv 2D
    float* weightsBufPtr;
    std::vector<float> weightsWinoBuf; // For Winograd F(6x6, 3x3).
    float* weightsWinoBufPtr;
    std::vector<float> biasBuf;
    bool useWinograd63 = false;
    int conv_type;
#if CV_SIMD128
    bool useSIMD128 = true;
#else
    bool useSIMD128 = false;
#endif
    bool useAVX2 = checkHardwareSupport(CPU_AVX2);
    bool useNEON = checkHardwareSupport(CPU_NEON);
};
@@ -67,10 +99,7 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval,
                  ActivationLayer* activ, bool ifMinMaxAct);

// winograd init
void initWinograd63(Ptr<FastConv2d>& conv, InputArray weightsMat, int K, int C);

int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
                   float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);

} // namespace dnn
@@ -84,6 +113,12 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
                         float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
                         int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                         int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3);

void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock);
void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg);
void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
                               float bias, float minval, float maxval, bool ifMinMaxAct);

#endif
} // namespace opt_AVX2

(File diff suppressed because it is too large.)
@@ -545,7 +545,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
    Mat img = imread(findDataFile("dnn/googlenet_1.png"));
    Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false);
    // Output image has values in range [-143.526, 148.539].
    float l1 = 1e-4, lInf = 2e-3;
    float l1 = 2e-4, lInf = 2e-3;
    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
    {
        l1 = 0.4;

@@ -1221,6 +1221,7 @@ TEST_P(Layer_Test_DWconv_Prelu, Accuracy)
    Mat in_blob(4, &shape[0], CV_32FC1, Scalar(1));

    net.setPreferableBackend(DNN_BACKEND_OPENCV);
    net.enableWinograd(false);
    net.setInput(in_blob);
    Mat out = net.forward();