mirror of
https://github.com/opencv/opencv.git
synced 2024-11-24 19:20:28 +08:00
Merge pull request #23310 from hanliutong:fix_hal_compatibility
Fix HAL compatibility layer
This commit is contained in:
commit
3f02c9d5b9
@ -758,6 +758,36 @@ namespace CV__SIMD_NAMESPACE {
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
|
||||
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
|
||||
@ -785,6 +815,26 @@ namespace CV__SIMD_NAMESPACE {
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
|
||||
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
|
||||
@ -805,17 +855,51 @@ namespace CV__SIMD_NAMESPACE {
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
|
||||
#endif
|
||||
|
||||
|
||||
inline v_float32 v_div(const v_float32& a, const v_float32& b) \
|
||||
{ \
|
||||
return a / b; \
|
||||
}
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
inline v_float64 v_div(const v_float64& a, const v_float64& b) \
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \
|
||||
inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a / b; \
|
||||
}
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
|
||||
@ -844,44 +928,124 @@ namespace CV__SIMD_NAMESPACE {
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_CMP(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_CMP(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_CMP(v_int8x16)
|
||||
OPENCV_HAL_WRAP_CMP(v_int16x8)
|
||||
OPENCV_HAL_WRAP_CMP(v_int32x4)
|
||||
OPENCV_HAL_WRAP_CMP(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_CMP(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_CMP(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_CMP(v_int8x32)
|
||||
OPENCV_HAL_WRAP_CMP(v_int16x16)
|
||||
OPENCV_HAL_WRAP_CMP(v_int32x8)
|
||||
OPENCV_HAL_WRAP_CMP(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_CMP(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////// get0 ////////////
|
||||
#define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
|
||||
inline _Tp v_get0(const v_##_Tpvec& v) \
|
||||
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
|
||||
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
|
||||
{ \
|
||||
return v.get0(); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar)
|
||||
OPENCV_HAL_WRAP_GRT0_INT(int8, schar)
|
||||
OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort)
|
||||
OPENCV_HAL_WRAP_GRT0_INT(int16, short)
|
||||
OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned)
|
||||
OPENCV_HAL_WRAP_GRT0_INT(int32, int)
|
||||
OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64)
|
||||
OPENCV_HAL_WRAP_GRT0_INT(int64, int64)
|
||||
OPENCV_HAL_WRAP_GRT0_INT(float32, float)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint8)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int8)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint16)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int16)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint32)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int32)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint64)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int64)
|
||||
OPENCV_HAL_WRAP_GRT0(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_GRT0_INT(float64, double)
|
||||
OPENCV_HAL_WRAP_GRT0(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int8x16)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int16x8)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int32x4)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int64x2)
|
||||
OPENCV_HAL_WRAP_GRT0(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_GRT0(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_GRT0(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int8x32)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int16x16)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int32x8)
|
||||
OPENCV_HAL_WRAP_GRT0(v_int64x4)
|
||||
OPENCV_HAL_WRAP_GRT0(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_GRT0(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
|
||||
inline _Tp v_extract_highest(const _Tpvec& v) \
|
||||
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
|
||||
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
|
||||
{ \
|
||||
return v_extract_n<vl-1>(v); \
|
||||
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int8)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int16)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int32)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int64)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int8x16)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int16x8)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int32x4)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int64x2)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int8x32)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int16x16)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int32x8)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int64x4)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
|
||||
@ -893,7 +1057,16 @@ namespace CV__SIMD_NAMESPACE {
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_int32)
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_float32)
|
||||
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_int32x4)
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_float32x4)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_int32x8)
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_float32x8)
|
||||
#endif
|
||||
|
||||
#endif //!CV_SIMD_SCALABLE
|
||||
|
||||
|
@ -1028,11 +1028,10 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
|
||||
{
|
||||
for (; j + 7 < out_width; j += 8)
|
||||
{
|
||||
v_float32x4 v0 = v_load(cptr + j) + vbias;
|
||||
v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
|
||||
|
||||
v0 += v_load(pbptr + j);
|
||||
v1 += v_load(pbptr + j + 4);
|
||||
v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
|
||||
v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
|
||||
v0 = v_add(v0, v_load(pbptr + j));
|
||||
v1 = v_add(v1, v_load(pbptr + j + 4));
|
||||
|
||||
if (ifMinMaxAct)
|
||||
{
|
||||
@ -1048,8 +1047,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
|
||||
{
|
||||
for (; j + 7 < out_width; j += 8)
|
||||
{
|
||||
v_float32x4 v0 = v_load(cptr + j) + vbias;
|
||||
v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
|
||||
v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
|
||||
v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
|
||||
|
||||
if (ifMinMaxAct)
|
||||
{
|
||||
@ -1154,13 +1153,13 @@ static void convBlockMR1x28(int np, const float* a, const float* b, float *c, co
|
||||
|
||||
if (init_c)
|
||||
{
|
||||
c0 += v_load(c);
|
||||
c1 += v_load(c + 4);
|
||||
c2 += v_load(c + 8);
|
||||
c3 += v_load(c + 12);
|
||||
c4 += v_load(c + 16);
|
||||
c5 += v_load(c + 20);
|
||||
c6 += v_load(c + 24);
|
||||
c0 = v_add(c0, v_load(c));
|
||||
c1 = v_add(c1, v_load(c + 4));
|
||||
c2 = v_add(c2, v_load(c + 8));
|
||||
c3 = v_add(c3, v_load(c + 12));
|
||||
c4 = v_add(c4, v_load(c + 16));
|
||||
c5 = v_add(c5, v_load(c + 20));
|
||||
c6 = v_add(c6, v_load(c + 24));
|
||||
}
|
||||
|
||||
if (ifMinMaxAct)
|
||||
@ -1207,12 +1206,12 @@ static void convBlockMR1x24(int np, const float* a, const float* b, float *c, co
|
||||
|
||||
if (init_c)
|
||||
{
|
||||
c0 += v_load(c);
|
||||
c1 += v_load(c + 4);
|
||||
c2 += v_load(c + 8);
|
||||
c3 += v_load(c + 12);
|
||||
c4 += v_load(c + 16);
|
||||
c5 += v_load(c + 20);
|
||||
c0 = v_add(c0, v_load(c));
|
||||
c1 = v_add(c1, v_load(c + 4));
|
||||
c2 = v_add(c2, v_load(c + 8));
|
||||
c3 = v_add(c3, v_load(c + 12));
|
||||
c4 = v_add(c4, v_load(c + 16));
|
||||
c5 = v_add(c5, v_load(c + 20));
|
||||
}
|
||||
|
||||
if (ifMinMaxAct)
|
||||
@ -1251,9 +1250,9 @@ static void convBlockMR1x12(int np, const float* a, const float* b, float *c, co
|
||||
|
||||
if (init_c)
|
||||
{
|
||||
c0 += v_load(c);
|
||||
c1 += v_load(c + 4);
|
||||
c2 += v_load(c + 8);
|
||||
c0 = v_add(c0, v_load(c));
|
||||
c1 = v_add(c1, v_load(c + 4));
|
||||
c2 = v_add(c2, v_load(c + 8));
|
||||
}
|
||||
|
||||
if (ifMinMaxAct)
|
||||
@ -1343,33 +1342,33 @@ static void convBlock4x24(int np, const float* a, const float* b, float* c, int
|
||||
|
||||
if (!init_c)
|
||||
{
|
||||
c0 += v_load(c);
|
||||
c1 += v_load(c + 4);
|
||||
c2 += v_load(c + 8);
|
||||
c3 += v_load(c + 12);
|
||||
c4 += v_load(c + 16);
|
||||
c5 += v_load(c + 20);
|
||||
c0 = v_add(c0, v_load(c));
|
||||
c1 = v_add(c1, v_load(c + 4));
|
||||
c2 = v_add(c2, v_load(c + 8));
|
||||
c3 = v_add(c3, v_load(c + 12));
|
||||
c4 = v_add(c4, v_load(c + 16));
|
||||
c5 = v_add(c5, v_load(c + 20));
|
||||
|
||||
c6 += v_load(c + ldc);
|
||||
c7 += v_load(c + ldc + 4);
|
||||
c8 += v_load(c + ldc + 8);
|
||||
c9 += v_load(c + ldc + 12);
|
||||
c10 += v_load(c + ldc + 16);
|
||||
c11 += v_load(c + ldc + 20);
|
||||
c6 = v_add(c6 , v_load(c + ldc));
|
||||
c7 = v_add(c7 , v_load(c + ldc + 4));
|
||||
c8 = v_add(c8 , v_load(c + ldc + 8));
|
||||
c9 = v_add(c9 , v_load(c + ldc + 12));
|
||||
c10 = v_add(c10, v_load(c + ldc + 16));
|
||||
c11 = v_add(c11, v_load(c + ldc + 20));
|
||||
|
||||
c12 += v_load(c + ldc*2);
|
||||
c13 += v_load(c + ldc*2 + 4);
|
||||
c14 += v_load(c + ldc*2 + 8);
|
||||
c15 += v_load(c + ldc*2 + 12);
|
||||
c16 += v_load(c + ldc*2 + 16);
|
||||
c17 += v_load(c + ldc*2 + 20);
|
||||
c12 = v_add(c12, v_load(c + ldc*2));
|
||||
c13 = v_add(c13, v_load(c + ldc*2 + 4));
|
||||
c14 = v_add(c14, v_load(c + ldc*2 + 8));
|
||||
c15 = v_add(c15, v_load(c + ldc*2 + 12));
|
||||
c16 = v_add(c16, v_load(c + ldc*2 + 16));
|
||||
c17 = v_add(c17, v_load(c + ldc*2 + 20));
|
||||
|
||||
c18 += v_load(c + ldc*3);
|
||||
c19 += v_load(c + ldc*3 + 4);
|
||||
c20 += v_load(c + ldc*3 + 8);
|
||||
c21 += v_load(c + ldc*3 + 12);
|
||||
c22 += v_load(c + ldc*3 + 16);
|
||||
c23 += v_load(c + ldc*3 + 20);
|
||||
c18 = v_add(c18, v_load(c + ldc*3));
|
||||
c19 = v_add(c19, v_load(c + ldc*3 + 4));
|
||||
c20 = v_add(c20, v_load(c + ldc*3 + 8));
|
||||
c21 = v_add(c21, v_load(c + ldc*3 + 12));
|
||||
c22 = v_add(c22, v_load(c + ldc*3 + 16));
|
||||
c23 = v_add(c23, v_load(c + ldc*3 + 20));
|
||||
}
|
||||
|
||||
v_store(c, c0);
|
||||
@ -1431,17 +1430,17 @@ static void convBlock4x8(int np, const float* a, const float* b, float* c, int l
|
||||
|
||||
if (!init_c)
|
||||
{
|
||||
c0 += v_load(c);
|
||||
c1 += v_load(c + 4);
|
||||
c0 = v_add(c0, v_load(c));
|
||||
c1 = v_add(c1, v_load(c + 4));
|
||||
|
||||
c2 += v_load(c + ldc);
|
||||
c3 += v_load(c + ldc + 4);
|
||||
c2 = v_add(c2, v_load(c + ldc));
|
||||
c3 = v_add(c3, v_load(c + ldc + 4));
|
||||
|
||||
c4 += v_load(c + ldc*2);
|
||||
c5 += v_load(c + ldc*2 + 4);
|
||||
c4 = v_add(c4, v_load(c + ldc*2));
|
||||
c5 = v_add(c5, v_load(c + ldc*2 + 4));
|
||||
|
||||
c6 += v_load(c + ldc*3);
|
||||
c7 += v_load(c + ldc*3 + 4);
|
||||
c6 = v_add(c6, v_load(c + ldc*3));
|
||||
c7 = v_add(c7, v_load(c + ldc*3 + 4));
|
||||
}
|
||||
|
||||
v_store(c, c0);
|
||||
@ -1476,10 +1475,10 @@ static void convBlock4x4(int np, const float* a, const float* b, float* c, int l
|
||||
|
||||
if (!init_c)
|
||||
{
|
||||
c0 += v_load(c);
|
||||
c1 += v_load(c + ldc);
|
||||
c2 += v_load(c + ldc*2);
|
||||
c3 += v_load(c + ldc*3);
|
||||
c0 = v_add(c0, v_load(c));
|
||||
c1 = v_add(c1, v_load(c + ldc));
|
||||
c2 = v_add(c2, v_load(c + ldc*2));
|
||||
c3 = v_add(c3, v_load(c + ldc*3));
|
||||
}
|
||||
|
||||
v_store(c, c0);
|
||||
|
Loading…
Reference in New Issue
Block a user