mirror of
https://github.com/opencv/opencv.git
synced 2024-11-24 19:20:28 +08:00
Merge pull request #23763 from zihaomu:add_runtime_check
DNN: fix bug for X86 Winograd #23763 Address https://github.com/opencv/opencv/issues/23760 The patch aims to add a runtime check for X86 platform without AVX(2). ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
5d913f4d72
commit
eec8a20c33
@ -31,7 +31,6 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
|
||||
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
|
||||
float bias, float minval, float maxval, bool ifMinMaxAct);
|
||||
|
||||
|
||||
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
|
||||
int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
|
||||
{
|
||||
@ -51,6 +50,23 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
|
||||
int pad_left = conv->pad_left;
|
||||
|
||||
int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups;
|
||||
|
||||
const int CONV_WINO_KBLOCK = 4;
|
||||
#if (CV_NEON && CV_NEON_AARCH64)
|
||||
const int CONV_WINO_IBLOCK = 6;
|
||||
#elif CV_TRY_AVX || CV_TRY_AVX2
|
||||
const int CONV_WINO_IBLOCK = (conv->useAVX || conv->useAVX2) ? 6 : 3;
|
||||
#else
|
||||
const int CONV_WINO_IBLOCK = 3;
|
||||
#endif
|
||||
|
||||
#if CV_TRY_AVX || CV_TRY_AVX2
|
||||
const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
|
||||
#else
|
||||
const int CONV_WINO_ATOM_F32 = 4;
|
||||
#endif
|
||||
const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
|
||||
|
||||
int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
|
||||
const size_t inp_planesize = (size_t)Hi*Wi;
|
||||
const size_t out_planesize = (size_t)H0*W0;
|
||||
@ -398,7 +414,7 @@ void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, i
|
||||
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
|
||||
float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
|
||||
{
|
||||
CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
|
||||
CV_Assert(winoIblock == 3 && winoAtomF32 == 4);
|
||||
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
|
||||
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
|
||||
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
|
||||
@ -573,7 +589,6 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
|
||||
float* bpptr, int bpstep, float* outptr, int outstep,
|
||||
float bias, float minval, float maxval, bool ifMinMaxAct)
|
||||
{
|
||||
CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
|
||||
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
|
||||
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
|
||||
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
|
||||
|
@ -181,6 +181,21 @@ Ptr<FastConv> initFastConv(
|
||||
{0.0f, 0.0f, 1.0f}
|
||||
};
|
||||
|
||||
const int CONV_WINO_KBLOCK = 4;
|
||||
|
||||
#if CV_TRY_AVX || CV_TRY_AVX2
|
||||
const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
|
||||
#else
|
||||
const int CONV_WINO_ATOM_F32 = 4;
|
||||
#endif
|
||||
const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
|
||||
|
||||
#ifdef CONV_ARM_FP16
|
||||
// FP 16
|
||||
const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2;
|
||||
const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
|
||||
#endif
|
||||
|
||||
// the weights are packed as 6-dim tensor:
|
||||
// ngroups * ceil((K/ngroups)/KBLOCK) * (W*W/ATOM_SIZE) * (C/ngroups) * KBLOCK * ATOM_SIZE,
|
||||
// where W is the size of Winograd-transformed kernel (8x8),
|
||||
|
@ -33,36 +33,17 @@ typedef __fp16 float16_t; // Fix conflict between float16_t in arm_neon.h and fl
|
||||
#define CONV_NR_FP32 24
|
||||
#endif
|
||||
|
||||
// Winograd Params
|
||||
enum {
|
||||
CONV_WINO_STEP=6,
|
||||
CONV_WINO_KSIZE=3,
|
||||
CONV_WINO_SIZE=CONV_WINO_STEP+CONV_WINO_KSIZE - 1, // 8
|
||||
CONV_WINO_AREA=CONV_WINO_SIZE*CONV_WINO_SIZE,
|
||||
|
||||
CONV_WINO_KBLOCK = 4,
|
||||
#if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX || CV_TRY_AVX2
|
||||
CONV_WINO_IBLOCK = 6,
|
||||
#else
|
||||
CONV_WINO_IBLOCK = 3,
|
||||
#endif
|
||||
|
||||
#if CV_TRY_AVX || CV_TRY_AVX2
|
||||
CONV_WINO_ATOM_F32 = 8,
|
||||
#else
|
||||
CONV_WINO_ATOM_F32 = 4,
|
||||
#endif
|
||||
|
||||
CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
|
||||
|
||||
// FP 16
|
||||
CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2,
|
||||
CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16,
|
||||
};
|
||||
|
||||
// NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN.
|
||||
enum { CONV_TYPE_GENERIC=0, CONV_TYPE_DEPTHWISE=1, CONV_TYPE_WINOGRAD3X3=2, CONV_TYPE_DEPTHWISE_REMAIN=3 };
|
||||
enum { CONV_1D = 0, CONV_2D = 1, CONV_3D = 2 };
|
||||
|
||||
#endif
|
||||
|
||||
namespace cv {
|
||||
|
Loading…
Reference in New Issue
Block a user