Merge pull request #23310 from hanliutong:fix_hal_compatibility

Fix HAL compatibility layer
This commit is contained in:
Alexander Smorkalov 2023-04-11 12:43:54 +03:00 committed by GitHub
commit 3f02c9d5b9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 263 additions and 91 deletions

View File

@ -758,6 +758,36 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
// Fixed-width 128-bit vector types: instantiated when 128-bit SIMD is available
// (CV_SIMD128) but CV_SIMD_WIDTH is not 16 bytes, i.e. when CV_SIMD128 is used
// together with 256/512-bit SIMD (e.g. AVX2 or AVX512).
// NOTE(review): the macro body is defined outside this view; presumably it emits
// the function-style add/sub wrappers for the given type -- confirm.
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
// The double-precision type is covered only when CV_SIMD_64F is enabled.
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
// Fixed-width 256-bit vector types: instantiated when 256-bit SIMD is available
// (CV_SIMD256) but CV_SIMD_WIDTH is not 32 bytes, i.e. when CV_SIMD256 is used
// together with 512-bit SIMD (e.g. AVX512).
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
// The double-precision type is covered only when CV_SIMD_64F is enabled.
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
@ -785,6 +815,26 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
// Logic wrappers (v_and and the rest of the macro body, which is outside this
// view) for the fixed-width 128-bit types. Emitted only when CV_SIMD128 is
// available and CV_SIMD_WIDTH is not 16 bytes. Only integer vector types are
// listed; no floating-point types are instantiated here.
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
#endif
// Same set of integer types at 256 bits. Emitted only when CV_SIMD256 is
// available and CV_SIMD_WIDTH is not 32 bytes.
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
@ -805,17 +855,51 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
inline v_float32 v_div(const v_float32& a, const v_float32& b) \
{ \
return a / b; \
}
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
inline v_float64 v_div(const v_float64& a, const v_float64& b) \
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
// v_mul wrappers for the fixed-width 256-bit types. Emitted only when
// CV_SIMD256 is available and CV_SIMD_WIDTH is not 32 bytes.
// The list has no 64-bit integer types (no v_uint64x4 / v_int64x4).
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
// The double-precision type is covered only when CV_SIMD_64F is enabled.
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif
// Defines the function-style division wrapper v_div(lhs, rhs) for one vector
// type; it simply forwards to that type's operator/.
#define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \
inline _Tpvec v_div(const _Tpvec& lhs, const _Tpvec& rhs) \
{ \
    return lhs / rhs; \
}
// Division is wrapped for floating-point vector types only; the
// double-precision variants additionally require CV_SIMD_64F.
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
#endif
// Fixed-width 128-bit types, when CV_SIMD128 is available and the default
// width is not 16 bytes.
#if CV_SIMD128 && CV_SIMD_WIDTH != 16/*128*/
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
#endif
#endif
// Fixed-width 256-bit types, when CV_SIMD256 is available and the default
// width is not 32 bytes.
#if CV_SIMD256 && CV_SIMD_WIDTH != 32/*256*/
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
@ -844,44 +928,124 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
#endif
// Comparison wrappers for the fixed-width 128-bit types. Emitted only when
// CV_SIMD128 is available and CV_SIMD_WIDTH is not 16 bytes.
// NOTE(review): OPENCV_HAL_WRAP_CMP is defined outside this view; presumably it
// expands to one OPENCV_HAL_WRAP_CMP_OP per comparison -- confirm.
// The list has no 64-bit integer types (no v_uint64x2 / v_int64x2).
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_CMP(v_float32x4)
// The double-precision type is covered only when CV_SIMD_64F is enabled.
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
#endif
#endif
// Same set of types at 256 bits. Emitted only when CV_SIMD256 is available and
// CV_SIMD_WIDTH is not 32 bytes.
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_CMP(v_float32x8)
// The double-precision type is covered only when CV_SIMD_64F is enabled.
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
#endif
#endif
//////////// get0 ////////////
#define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
inline _Tp v_get0(const v_##_Tpvec& v) \
// Defines the free function v_get0(v) for one vector type. It forwards to the
// member v.get0(), and the return type is taken from
// VTraits<_Tpvec>::lane_type, so the macro needs only the vector type.
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
{ \
return v.get0(); \
}
OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar)
OPENCV_HAL_WRAP_GRT0_INT(int8, schar)
OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort)
OPENCV_HAL_WRAP_GRT0_INT(int16, short)
OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned)
OPENCV_HAL_WRAP_GRT0_INT(int32, int)
OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64)
OPENCV_HAL_WRAP_GRT0_INT(int64, int64)
OPENCV_HAL_WRAP_GRT0_INT(float32, float)
OPENCV_HAL_WRAP_GRT0(v_uint8)
OPENCV_HAL_WRAP_GRT0(v_int8)
OPENCV_HAL_WRAP_GRT0(v_uint16)
OPENCV_HAL_WRAP_GRT0(v_int16)
OPENCV_HAL_WRAP_GRT0(v_uint32)
OPENCV_HAL_WRAP_GRT0(v_int32)
OPENCV_HAL_WRAP_GRT0(v_uint64)
OPENCV_HAL_WRAP_GRT0(v_int64)
OPENCV_HAL_WRAP_GRT0(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0_INT(float64, double)
OPENCV_HAL_WRAP_GRT0(v_float64)
#endif
// v_get0 wrappers for the fixed-width 128-bit types. Emitted only when
// CV_SIMD128 is available and CV_SIMD_WIDTH is not 16 bytes.
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_GRT0(v_uint8x16)
OPENCV_HAL_WRAP_GRT0(v_uint16x8)
OPENCV_HAL_WRAP_GRT0(v_uint32x4)
OPENCV_HAL_WRAP_GRT0(v_uint64x2)
OPENCV_HAL_WRAP_GRT0(v_int8x16)
OPENCV_HAL_WRAP_GRT0(v_int16x8)
OPENCV_HAL_WRAP_GRT0(v_int32x4)
OPENCV_HAL_WRAP_GRT0(v_int64x2)
OPENCV_HAL_WRAP_GRT0(v_float32x4)
// The double-precision type is covered only when CV_SIMD_64F is enabled.
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x2)
#endif
#endif
// v_get0 wrappers for the fixed-width 256-bit types. Emitted only when
// CV_SIMD256 is available and CV_SIMD_WIDTH is not 32 bytes.
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_GRT0(v_uint8x32)
OPENCV_HAL_WRAP_GRT0(v_uint16x16)
OPENCV_HAL_WRAP_GRT0(v_uint32x8)
OPENCV_HAL_WRAP_GRT0(v_uint64x4)
OPENCV_HAL_WRAP_GRT0(v_int8x32)
OPENCV_HAL_WRAP_GRT0(v_int16x16)
OPENCV_HAL_WRAP_GRT0(v_int32x8)
OPENCV_HAL_WRAP_GRT0(v_int64x4)
OPENCV_HAL_WRAP_GRT0(v_float32x8)
// The double-precision type is covered only when CV_SIMD_64F is enabled.
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
inline _Tp v_extract_highest(const _Tpvec& v) \
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
return v_extract_n<vl-1>(v); \
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
// v_extract_highest wrappers for the fixed-width 128-bit types. Emitted only
// when CV_SIMD128 is available and CV_SIMD_WIDTH is not 16 bytes.
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_EXTRACT(v_uint8x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x4)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x2)
OPENCV_HAL_WRAP_EXTRACT(v_int8x16)
OPENCV_HAL_WRAP_EXTRACT(v_int16x8)
OPENCV_HAL_WRAP_EXTRACT(v_int32x4)
OPENCV_HAL_WRAP_EXTRACT(v_int64x2)
OPENCV_HAL_WRAP_EXTRACT(v_float32x4)
// The double-precision type is covered only when CV_SIMD_64F is enabled.
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x2)
#endif
#endif
// v_extract_highest wrappers for the fixed-width 256-bit types. Emitted only
// when CV_SIMD256 is available and CV_SIMD_WIDTH is not 32 bytes.
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_EXTRACT(v_uint8x32)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x4)
OPENCV_HAL_WRAP_EXTRACT(v_int8x32)
OPENCV_HAL_WRAP_EXTRACT(v_int16x16)
OPENCV_HAL_WRAP_EXTRACT(v_int32x8)
OPENCV_HAL_WRAP_EXTRACT(v_int64x4)
OPENCV_HAL_WRAP_EXTRACT(v_float32x8)
// The double-precision type is covered only when CV_SIMD_64F is enabled.
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
@ -893,7 +1057,16 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
// Broadcast wrappers for the fixed-width 128-bit types. Emitted only when
// CV_SIMD128 is available and CV_SIMD_WIDTH is not 16 bytes. Only the
// 32-bit-lane types are listed, matching the width-agnostic list just above.
// NOTE(review): the macro body is outside this view -- confirm what it emits.
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BROADCAST(v_uint32x4)
OPENCV_HAL_WRAP_BROADCAST(v_int32x4)
OPENCV_HAL_WRAP_BROADCAST(v_float32x4)
#endif
// Same 32-bit-lane types at 256 bits. Emitted only when CV_SIMD256 is
// available and CV_SIMD_WIDTH is not 32 bytes.
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BROADCAST(v_uint32x8)
OPENCV_HAL_WRAP_BROADCAST(v_int32x8)
OPENCV_HAL_WRAP_BROADCAST(v_float32x8)
#endif
#endif //!CV_SIMD_SCALABLE

View File

@ -1028,11 +1028,10 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
{
for (; j + 7 < out_width; j += 8)
{
v_float32x4 v0 = v_load(cptr + j) + vbias;
v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
v0 += v_load(pbptr + j);
v1 += v_load(pbptr + j + 4);
v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
v0 = v_add(v0, v_load(pbptr + j));
v1 = v_add(v1, v_load(pbptr + j + 4));
if (ifMinMaxAct)
{
@ -1048,8 +1047,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
{
for (; j + 7 < out_width; j += 8)
{
v_float32x4 v0 = v_load(cptr + j) + vbias;
v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
if (ifMinMaxAct)
{
@ -1154,13 +1153,13 @@ static void convBlockMR1x28(int np, const float* a, const float* b, float *c, co
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c6 += v_load(c + 24);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 = v_add(c2, v_load(c + 8));
c3 = v_add(c3, v_load(c + 12));
c4 = v_add(c4, v_load(c + 16));
c5 = v_add(c5, v_load(c + 20));
c6 = v_add(c6, v_load(c + 24));
}
if (ifMinMaxAct)
@ -1207,12 +1206,12 @@ static void convBlockMR1x24(int np, const float* a, const float* b, float *c, co
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 = v_add(c2, v_load(c + 8));
c3 = v_add(c3, v_load(c + 12));
c4 = v_add(c4, v_load(c + 16));
c5 = v_add(c5, v_load(c + 20));
}
if (ifMinMaxAct)
@ -1251,9 +1250,9 @@ static void convBlockMR1x12(int np, const float* a, const float* b, float *c, co
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 = v_add(c2, v_load(c + 8));
}
if (ifMinMaxAct)
@ -1343,33 +1342,33 @@ static void convBlock4x24(int np, const float* a, const float* b, float* c, int
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 = v_add(c2, v_load(c + 8));
c3 = v_add(c3, v_load(c + 12));
c4 = v_add(c4, v_load(c + 16));
c5 = v_add(c5, v_load(c + 20));
c6 += v_load(c + ldc);
c7 += v_load(c + ldc + 4);
c8 += v_load(c + ldc + 8);
c9 += v_load(c + ldc + 12);
c10 += v_load(c + ldc + 16);
c11 += v_load(c + ldc + 20);
c6 = v_add(c6 , v_load(c + ldc));
c7 = v_add(c7 , v_load(c + ldc + 4));
c8 = v_add(c8 , v_load(c + ldc + 8));
c9 = v_add(c9 , v_load(c + ldc + 12));
c10 = v_add(c10, v_load(c + ldc + 16));
c11 = v_add(c11, v_load(c + ldc + 20));
c12 += v_load(c + ldc*2);
c13 += v_load(c + ldc*2 + 4);
c14 += v_load(c + ldc*2 + 8);
c15 += v_load(c + ldc*2 + 12);
c16 += v_load(c + ldc*2 + 16);
c17 += v_load(c + ldc*2 + 20);
c12 = v_add(c12, v_load(c + ldc*2));
c13 = v_add(c13, v_load(c + ldc*2 + 4));
c14 = v_add(c14, v_load(c + ldc*2 + 8));
c15 = v_add(c15, v_load(c + ldc*2 + 12));
c16 = v_add(c16, v_load(c + ldc*2 + 16));
c17 = v_add(c17, v_load(c + ldc*2 + 20));
c18 += v_load(c + ldc*3);
c19 += v_load(c + ldc*3 + 4);
c20 += v_load(c + ldc*3 + 8);
c21 += v_load(c + ldc*3 + 12);
c22 += v_load(c + ldc*3 + 16);
c23 += v_load(c + ldc*3 + 20);
c18 = v_add(c18, v_load(c + ldc*3));
c19 = v_add(c19, v_load(c + ldc*3 + 4));
c20 = v_add(c20, v_load(c + ldc*3 + 8));
c21 = v_add(c21, v_load(c + ldc*3 + 12));
c22 = v_add(c22, v_load(c + ldc*3 + 16));
c23 = v_add(c23, v_load(c + ldc*3 + 20));
}
v_store(c, c0);
@ -1431,17 +1430,17 @@ static void convBlock4x8(int np, const float* a, const float* b, float* c, int l
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 += v_load(c + ldc);
c3 += v_load(c + ldc + 4);
c2 = v_add(c2, v_load(c + ldc));
c3 = v_add(c3, v_load(c + ldc + 4));
c4 += v_load(c + ldc*2);
c5 += v_load(c + ldc*2 + 4);
c4 = v_add(c4, v_load(c + ldc*2));
c5 = v_add(c5, v_load(c + ldc*2 + 4));
c6 += v_load(c + ldc*3);
c7 += v_load(c + ldc*3 + 4);
c6 = v_add(c6, v_load(c + ldc*3));
c7 = v_add(c7, v_load(c + ldc*3 + 4));
}
v_store(c, c0);
@ -1476,10 +1475,10 @@ static void convBlock4x4(int np, const float* a, const float* b, float* c, int l
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + ldc);
c2 += v_load(c + ldc*2);
c3 += v_load(c + ldc*3);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + ldc));
c2 = v_add(c2, v_load(c + ldc*2));
c3 = v_add(c3, v_load(c + ldc*3));
}
v_store(c, c0);