mirror of
https://github.com/opencv/opencv.git
synced 2024-11-25 19:50:38 +08:00
Merge pull request #23112 from zihaomu:fix_x86_winograd
This commit is contained in:
commit
65c2d6a2be
@ -119,7 +119,7 @@ void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, b
|
||||
void _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
|
||||
float* outbuf, int Cg, int iblock)
|
||||
{
|
||||
CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4);// && _FX_WINO_ATOM_F32 == 8);
|
||||
CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 8);
|
||||
if (iblock > 3)
|
||||
{
|
||||
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
|
||||
|
@ -105,6 +105,12 @@ Ptr<FastConv> initFastConv(
|
||||
conv->conv_type = _FX_CONV_TYPE_GENERIC;
|
||||
#endif
|
||||
|
||||
#if CV_TRY_AVX2
|
||||
// Disable Winograd when CV_TRY_AVX2 is true, but conv->useAVX2 is false.
|
||||
if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && !conv->useAVX2)
|
||||
conv->conv_type = _FX_CONV_TYPE_GENERIC;
|
||||
#endif
|
||||
|
||||
Mat weightsMat = _weightsMat.getMat();
|
||||
auto wShape = shape(weightsMat);
|
||||
const size_t wstep = weightsMat.step1();
|
||||
@ -257,7 +263,7 @@ Ptr<FastConv> initFastConv(
|
||||
// we can always read MR elements starting from any valid index
|
||||
{
|
||||
int k = 0, nbias = K + VEC_ALIGN;
|
||||
conv->biasBuf.reserve(nbias);
|
||||
conv->biasBuf.resize(nbias);
|
||||
float* biasBufPtr = conv->biasBuf.data();
|
||||
for(; k < K; k++)
|
||||
biasBufPtr[k] = srcBias ? srcBias[k] : 0.f;
|
||||
|
@ -22,7 +22,7 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
|
||||
float* outbuf, int Cg, int iblock)
|
||||
{
|
||||
#if CV_NEON && CV_NEON_AARCH64
|
||||
CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4);
|
||||
CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 4);
|
||||
if (iblock > 3)
|
||||
{
|
||||
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
|
||||
@ -144,7 +144,7 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
|
||||
}
|
||||
}
|
||||
#elif CV_SIMD128
|
||||
CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4);
|
||||
CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 4);
|
||||
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
|
||||
outbuf += _FX_WINO_ATOM_F32)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user