From eefa327f30cc51372599d7afad4224d44cede71a Mon Sep 17 00:00:00 2001 From: Yuantao Feng Date: Wed, 12 Mar 2025 21:43:10 +0800 Subject: [PATCH] Merge pull request #27042 from fengyuentau:4x/core/normDiff_simd core: vectorize normDiff with universal intrinsics #27042 Merge with https://github.com/opencv/opencv_extra/pull/1242. Performance results on Desktop Intel i7-12700K, Apple M2, Jetson Orin and SpaceMIT K1: [perf-normDiff.zip](https://github.com/user-attachments/files/19178689/perf-normDiff.zip) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- modules/core/perf/perf_norm.cpp | 4 +- modules/core/src/norm.dispatch.cpp | 145 +----- modules/core/src/norm.simd.hpp | 739 +++++++++++++++++++++++++++++ 3 files changed, 754 insertions(+), 134 deletions(-) diff --git a/modules/core/perf/perf_norm.cpp b/modules/core/perf/perf_norm.cpp index 07f989f21c..8bcf9ea224 100644 --- a/modules/core/perf/perf_norm.cpp +++ b/modules/core/perf/perf_norm.cpp @@ -59,7 +59,7 @@ PERF_TEST_P(Size_MatType_NormType, norm_mask, PERF_TEST_P(Size_MatType_NormType, norm2, testing::Combine( testing::Values(TYPICAL_MAT_SIZES), - testing::Values(TYPICAL_MAT_TYPES), + testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1), testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2, (int)(NORM_RELATIVE+NORM_INF), (int)(NORM_RELATIVE+NORM_L1), (int)(NORM_RELATIVE+NORM_L2)) ) ) @@ -82,7 +82,7 @@ PERF_TEST_P(Size_MatType_NormType, norm2, PERF_TEST_P(Size_MatType_NormType, norm2_mask, testing::Combine( testing::Values(TYPICAL_MAT_SIZES), - testing::Values(TYPICAL_MAT_TYPES), + testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1), testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2, (int)(NORM_RELATIVE|NORM_INF), (int)(NORM_RELATIVE|NORM_L1), (int)(NORM_RELATIVE|NORM_L2)) ) ) diff --git a/modules/core/src/norm.dispatch.cpp b/modules/core/src/norm.dispatch.cpp index a67df07eba..fed4896677 100644 --- a/modules/core/src/norm.dispatch.cpp +++ b/modules/core/src/norm.dispatch.cpp @@ -218,120 +218,9 @@ int normL1_(const uchar* a, const uchar* b, int n) //================================================================================================== -template int -normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) -{ - ST result = *_result; - if( !mask ) - { - result = std::max(result, normInf(src1, src2, len*cn)); - } - else - { - for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - result = std::max(result, (ST)std::abs(src1[k] - src2[k])); - } - } - *_result = result; - return 0; -} - -template int -normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) -{ - ST result = *_result; - if( !mask ) - { - result += normL1(src1, src2, len*cn); - } - else - { - for( int i = 0; i < len; 
i++, src1 += cn, src2 += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - result += std::abs(src1[k] - src2[k]); - } - } - *_result = result; - return 0; -} - -template int -normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) -{ - ST result = *_result; - if( !mask ) - { - result += normL2Sqr(src1, src2, len*cn); - } - else - { - for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - { - ST v = src1[k] - src2[k]; - result += v*v; - } - } - } - *_result = result; - return 0; -} - -#define CV_DEF_NORM_DIFF_FUNC(L, suffix, type, ntype) \ - static int normDiff##L##_##suffix(const type* src1, const type* src2, \ - const uchar* mask, ntype* r, int len, int cn) \ -{ return normDiff##L##_(src1, src2, mask, r, (int)len, cn); } - -#define CV_DEF_NORM_DIFF_ALL(suffix, type, inftype, l1type, l2type) \ - CV_DEF_NORM_DIFF_FUNC(Inf, suffix, type, inftype) \ - CV_DEF_NORM_DIFF_FUNC(L1, suffix, type, l1type) \ - CV_DEF_NORM_DIFF_FUNC(L2, suffix, type, l2type) - -CV_DEF_NORM_DIFF_ALL(8u, uchar, int, int, int) -CV_DEF_NORM_DIFF_ALL(8s, schar, int, int, int) -CV_DEF_NORM_DIFF_ALL(16u, ushort, int, int, double) -CV_DEF_NORM_DIFF_ALL(16s, short, int, int, double) -CV_DEF_NORM_DIFF_ALL(32s, int, int, double, double) -CV_DEF_NORM_DIFF_ALL(32f, float, float, double, double) -CV_DEF_NORM_DIFF_ALL(64f, double, double, double, double) - typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int); typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int); -static NormDiffFunc getNormDiffFunc(int normType, int depth) -{ - static NormDiffFunc normDiffTab[3][8] = - { - { - (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s, - (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s, - (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f), - (NormDiffFunc)normDiffInf_64f, 0 - }, - { - (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s, - (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s, - (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f), - (NormDiffFunc)normDiffL1_64f, 0 - }, - { - (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s, - (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s, - (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f), - (NormDiffFunc)normDiffL2_64f, 0 - } - }; - - return normDiffTab[normType][depth]; -} - #ifdef HAVE_OPENCL static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result ) @@ -520,6 +409,10 @@ static NormFunc getNormFunc(int normType, int depth) { CV_INSTRUMENT_REGION(); CV_CPU_DISPATCH(getNormFunc, (normType, depth), CV_CPU_DISPATCH_MODES_ALL); } +static NormDiffFunc getNormDiffFunc(int normType, int depth) { + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getNormDiffFunc, (normType, depth), CV_CPU_DISPATCH_MODES_ALL); +} double norm( InputArray _src, int normType, InputArray _mask ) { @@ -1050,6 +943,9 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask normType == NORM_L2 || normType == NORM_L2SQR || ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) ); + NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? 
CV_32F : depth); + CV_Assert( func != 0 ); + if( src1.isContinuous() && src2.isContinuous() && mask.empty() ) { size_t len = src1.total()*src1.channels(); @@ -1057,31 +953,19 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask { if( src1.depth() == CV_32F ) { - const float* data1 = src1.ptr(); - const float* data2 = src2.ptr(); + const uchar* data1 = src1.ptr(); + const uchar* data2 = src2.ptr(); - if( normType == NORM_L2 ) + if( normType == NORM_L2 || normType == NORM_L2SQR || normType == NORM_L1 ) { double result = 0; - GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1); - return std::sqrt(result); - } - if( normType == NORM_L2SQR ) - { - double result = 0; - GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1); - return result; - } - if( normType == NORM_L1 ) - { - double result = 0; - GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1); - return result; + func(data1, data2, 0, (uchar*)&result, (int)len, 1); + return normType == NORM_L2 ? std::sqrt(result) : result; } if( normType == NORM_INF ) { float result = 0; - GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1); + func(data1, data2, 0, (uchar*)&result, (int)len, 1); return result; } } @@ -1115,9 +999,6 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask return result; } - NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth); - CV_Assert( func != 0 ); - const Mat* arrays[] = {&src1, &src2, &mask, 0}; uchar* ptrs[3] = {}; union diff --git a/modules/core/src/norm.simd.hpp b/modules/core/src/norm.simd.hpp index fd7b658ba1..c2b72d2e13 100644 --- a/modules/core/src/norm.simd.hpp +++ b/modules/core/src/norm.simd.hpp @@ -11,10 +11,12 @@ namespace cv { using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int); +using NormDiffFunc = int (*)(const uchar*, const uchar*, const uchar*, uchar*, int, int); CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN NormFunc getNormFunc(int normType, int depth); +NormDiffFunc getNormDiffFunc(int normType, int depth); #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY @@ -52,6 +54,42 @@ struct NormL2_SIMD { } }; +template +struct NormDiffInf_SIMD { + inline ST operator() (const T* src1, const T* src2, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + ST v = ST(src1[i] - src2[i]); + s = std::max(s, (ST)cv_abs(v)); + } + return s; + } +}; + +template +struct NormDiffL1_SIMD { + inline ST operator() (const T* src1, const T* src2, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + ST v = ST(src1[i] - src2[i]); + s += cv_abs(v); + } + return s; + } +}; + +template +struct NormDiffL2_SIMD { + inline ST operator() (const T* src1, const T* src2, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + ST v = ST(src1[i] - src2[i]); + s += v * v; + } + return s; + } +}; + #if (CV_SIMD || CV_SIMD_SCALABLE) template<> @@ -348,6 +386,367 @@ struct NormL2_SIMD { } }; +template<> +struct NormDiffInf_SIMD { + int operator() (const uchar* src1, const uchar* src2, int n) const { + int j = 0; + int s = 0; + v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8(); + v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_uint8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_max(r0, v_absdiff(v01, v02)); + + v_uint8 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_max(r1, v_absdiff(v11, v12)); + + v_uint8 v21 = vx_load(src1 
+ j + 2 * VTraits::vlanes()), + v22 = vx_load(src2 + j + 2 * VTraits::vlanes()); + r2 = v_max(r2, v_absdiff(v21, v22)); + + v_uint8 v31 = vx_load(src1 + j + 3 * VTraits::vlanes()), + v32 = vx_load(src2 + j + 3 * VTraits::vlanes()); + r3 = v_max(r3, v_absdiff(v31, v32)); + } + s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); + for (; j < n; j++) { + int v = src1[j] - src2[j]; + s = std::max(s, (int)cv_abs(v)); + } + return s; + } +}; + +template<> +struct NormDiffInf_SIMD { + int operator() (const schar* src1, const schar* src2, int n) const { + int j = 0; + int s = 0; + v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8(); + v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_int8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_max(r0, v_absdiff(v01, v02)); + + v_int8 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_max(r1, v_absdiff(v11, v12)); + + v_int8 v21 = vx_load(src1 + j + 2 * VTraits::vlanes()), + v22 = vx_load(src2 + j + 2 * VTraits::vlanes()); + r2 = v_max(r2, v_absdiff(v21, v22)); + + v_int8 v31 = vx_load(src1 + j + 3 * VTraits::vlanes()), + v32 = vx_load(src2 + j + 3 * VTraits::vlanes()); + r3 = v_max(r3, v_absdiff(v31, v32)); + } + s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); + for (; j < n; j++) { + int v = src1[j] - src2[j]; + s = std::max(s, (int)cv_abs(v)); + } + return s; + } +}; + +template<> +struct NormDiffInf_SIMD { + int operator() (const ushort* src1, const ushort* src2, int n) const { + int j = 0; + int s = 0; + v_uint16 r0 = vx_setzero_u16(), r1 = vx_setzero_u16(); + v_uint16 r2 = vx_setzero_u16(), r3 = vx_setzero_u16(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_uint16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_max(r0, v_absdiff(v01, v02)); + + v_uint16 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_max(r1, v_absdiff(v11, v12)); + + v_uint16 v21 = vx_load(src1 + j + 2 * VTraits::vlanes()), + v22 = vx_load(src2 + j + 2 * VTraits::vlanes()); + r2 = v_max(r2, v_absdiff(v21, v22)); + + v_uint16 v31 = vx_load(src1 + j + 3 * VTraits::vlanes()), + v32 = vx_load(src2 + j + 3 * VTraits::vlanes()); + r3 = v_max(r3, v_absdiff(v31, v32)); + } + s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); + for (; j < n; j++) { + int v = src1[j] - src2[j]; + s = std::max(s, (int)cv_abs(v)); + } + return s; + } +}; + +template<> +struct NormDiffInf_SIMD { + int operator() (const short* src1, const short* src2, int n) const { + int j = 0; + int s = 0; + v_uint16 r0 = vx_setzero_u16(), r1 = vx_setzero_u16(); + v_uint16 r2 = vx_setzero_u16(), r3 = vx_setzero_u16(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_int16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_max(r0, v_absdiff(v01, v02)); + + v_int16 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_max(r1, v_absdiff(v11, v12)); + + v_int16 v21 = vx_load(src1 + j + 2 * VTraits::vlanes()), + v22 = vx_load(src2 + j + 2 * VTraits::vlanes()); + r2 = v_max(r2, v_absdiff(v21, v22)); + + v_int16 v31 = vx_load(src1 + j + 3 * VTraits::vlanes()), + v32 = vx_load(src2 + j + 3 * VTraits::vlanes()); + r3 = v_max(r3, v_absdiff(v31, v32)); + } + s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); + for (; j < n; j++) { + int v = src1[j] - src2[j]; + s = std::max(s, (int)cv_abs(v)); + 
} + return s; + } +}; + +template<> +struct NormDiffInf_SIMD { + int operator() (const int* src1, const int* src2, int n) const { + int j = 0; + int s = 0; + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_max(r0, v_abs(v_sub(v01, v02))); + + v_int32 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_max(r1, v_abs(v_sub(v11, v12))); + + v_int32 v21 = vx_load(src1 + j + 2 * VTraits::vlanes()), + v22 = vx_load(src2 + j + 2 * VTraits::vlanes()); + r2 = v_max(r2, v_abs(v_sub(v21, v22))); + + v_int32 v31 = vx_load(src1 + j + 3 * VTraits::vlanes()), + v32 = vx_load(src2 + j + 3 * VTraits::vlanes()); + r3 = v_max(r3, v_abs(v_sub(v31, v32))); + } + s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); + for (; j < n; j++) { + int v = src1[j] - src2[j]; + s = std::max(s, (int)cv_abs(v)); + } + return s; + } +}; + +template<> +struct NormDiffInf_SIMD { + float operator() (const float* src1, const float* src2, int n) const { + int j = 0; + float s = 0; + v_float32 r0 = vx_setzero_f32(), r1 = vx_setzero_f32(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_float32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_max(r0, v_absdiff(v01, v02)); + + v_float32 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_max(r1, v_absdiff(v11, v12)); + } + s = v_reduce_max(v_max(r0, r1)); + for (; j < n; j++) { + float v = src1[j] - src2[j]; + s = std::max(s, cv_abs(v)); + } + return s; + } +}; + +template<> +struct NormDiffL1_SIMD { + int operator() (const uchar* src1, const uchar* src2, int n) const { + int j = 0; + int s = 0; + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + v_uint8 one = vx_setall_u8(1); + for (; j<= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_uint8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_dotprod_expand_fast(v_absdiff(v01, v02), one, r0); + + v_uint8 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_dotprod_expand_fast(v_absdiff(v11, v12), one, r1); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + int v = src1[j] - src2[j]; + s += (int)cv_abs(v); + } + return s; + } +}; + +template<> +struct NormDiffL1_SIMD { + int operator() (const schar* src1, const schar* src2, int n) const { + int j = 0; + int s = 0; + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + v_uint8 one = vx_setall_u8(1); + for (; j<= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_int8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_dotprod_expand_fast(v_absdiff(v01, v02), one, r0); + + v_int8 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_dotprod_expand_fast(v_absdiff(v11, v12), one, r1); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + int v =src1[j] - src2[j]; + s += (int)cv_abs(v); + } + return s; + } +}; + +template<> +struct NormDiffL1_SIMD { + int operator() (const ushort* src1, const ushort* src2, int n) const { + int j = 0; + int s = 0; + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32(); + for (; j<= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_uint16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + 
j); + v_uint32 u00, u01; + v_expand(v_absdiff(v01, v02), u00, u01); + r0 = v_add(r0, v_add(u00, u01)); + + v_uint16 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_uint32 u10, u11; + v_expand(v_absdiff(v11, v12), u10, u11); + r1 = v_add(r1, v_add(u10, u11)); + + v_uint16 v21 = vx_load(src1 + j + 2 * VTraits::vlanes()), + v22 = vx_load(src2 + j + 2 * VTraits::vlanes()); + v_uint32 u20, u21; + v_expand(v_absdiff(v21, v22), u20, u21); + r2 = v_add(r2, v_add(u20, u21)); + + v_uint16 v31 = vx_load(src1 + j + 3 * VTraits::vlanes()), + v32 = vx_load(src2 + j + 3 * VTraits::vlanes()); + v_uint32 u30, u31; + v_expand(v_absdiff(v31, v32), u30, u31); + r3 = v_add(r3, v_add(u30, u31)); + } + s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3)); + for (; j < n; j++) { + int v = src1[j] - src2[j]; + s += (int)cv_abs(v); + } + return s; + } +}; + +template<> +struct NormDiffL1_SIMD { + int operator() (const short* src1, const short* src2, int n) const { + int j = 0; + int s = 0; + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32(); + for (; j<= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_int16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_uint32 u00, u01; + v_expand(v_absdiff(v01, v02), u00, u01); + r0 = v_add(r0, v_add(u00, u01)); + + v_int16 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_uint32 u10, u11; + v_expand(v_absdiff(v11, v12), u10, u11); + r1 = v_add(r1, v_add(u10, u11)); + + v_int16 v21 = vx_load(src1 + j + 2 * VTraits::vlanes()), + v22 = vx_load(src2 + j + 2 * VTraits::vlanes()); + v_uint32 u20, u21; + v_expand(v_absdiff(v21, v22), u20, u21); + r2 = v_add(r2, v_add(u20, u21)); + + v_int16 v31 = vx_load(src1 + j + 3 * VTraits::vlanes()), + v32 = vx_load(src2 + j + 3 * VTraits::vlanes()); + v_uint32 u30, u31; + v_expand(v_absdiff(v31, v32), u30, u31); + r3 = v_add(r3, v_add(u30, u31)); + } + s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3)); + for (; j < n; j++) { + int v = src1[j] - src2[j]; + s += (int)cv_abs(v); + } + return s; + } +}; + +template<> +struct NormDiffL2_SIMD { + int operator() (const uchar* src1, const uchar* src2, int n) const { + int j = 0; + int s = 0; + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_uint8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_uint8 v0 = v_absdiff(v01, v02); + r0 = v_dotprod_expand_fast(v0, v0, r0); + + v_uint8 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_uint8 v1 = v_absdiff(v11, v12); + r1 = v_dotprod_expand_fast(v1, v1, r1); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + int v = saturate_cast(src1[j] - src2[j]); + s += v * v; + } + return s; + } +}; + +template<> +struct NormDiffL2_SIMD { + int operator() (const schar* src1, const schar* src2, int n) const { + int j = 0; + int s = 0; + v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_int8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_uint8 v0 = v_absdiff(v01, v02); + r0 = v_dotprod_expand_fast(v0, v0, r0); + + v_int8 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_uint8 v1 = v_absdiff(v11, v12); + r1 = v_dotprod_expand_fast(v1, v1, r1); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + int v = 
saturate_cast(src1[j] - src2[j]); + s += v * v; + } + return s; + } +}; + #endif #if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) @@ -570,6 +969,257 @@ struct NormL2_SIMD { } }; +template<> +struct NormDiffInf_SIMD { + double operator() (const double* src1, const double* src2, int n) const { + int j = 0; + double s = 0; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_float64 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_max(r0, v_absdiff(v01, v02)); + + v_float64 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_max(r1, v_absdiff(v11, v12)); + } + // [TODO]: use v_reduce_max when it supports float64 + double t[VTraits::max_nlanes]; + vx_store(t, v_max(r0, r1)); + for (int i = 0; i < VTraits::vlanes(); i++) { + s = std::max(s, cv_abs(t[i])); + } + for (; j < n; j++) { + double v = src1[j] - src2[j]; + s = std::max(s, cv_abs(v)); + } + return s; + } +}; + +template<> +struct NormDiffL1_SIMD { + double operator() (const int* src1, const int* src2, int n) const { + int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02))); + r0 = v_add(r0, v_cvt_f64(v0)); r1 = v_add(r1, v_cvt_f64_high(v0)); + + v_int32 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12))); + r2 = v_add(r2, v_cvt_f64(v1)); r3 = v_add(r3, v_cvt_f64_high(v1)); + } + s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3)); + for (; j < n; j++) { + double v = src1[j] - src2[j]; + s += cv_abs(v); + } + return s; + } +}; + +template<> +struct NormDiffL1_SIMD { + double operator() (const float* src1, const float* src2, int n) const { + int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_float32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_float32 v0 = v_absdiff(v01, v02); + r0 = v_add(r0, v_cvt_f64(v0)); r1 = v_add(r1, v_cvt_f64_high(v0)); + + v_float32 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_float32 v1 = v_absdiff(v11, v12); + r2 = v_add(r2, v_cvt_f64(v1)); r3 = v_add(r3, v_cvt_f64_high(v1)); + } + s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3)); + for (; j < n; j++) { + double v = src1[j] - src2[j]; + s += cv_abs(v); + } + return s; + } +}; + +template<> +struct NormDiffL1_SIMD { + double operator() (const double* src1, const double* src2, int n) const { + int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_float64 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + r0 = v_add(r0, v_absdiff(v01, v02)); + + v_float64 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + r1 = v_add(r1, v_absdiff(v11, v12)); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + double v = src1[j] - src2[j]; + s += cv_abs(v); + } + return s; + } +}; + +template<> +struct NormDiffL2_SIMD { + double operator() (const ushort* src1, const ushort* src2, int n) const { + int j 
= 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_uint16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_uint16 v0 = v_absdiff(v01, v02); + v_uint64 u0 = v_dotprod_expand_fast(v0, v0); + r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(u0))); + + v_uint16 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_uint16 v1 = v_absdiff(v11, v12); + v_uint64 u1 = v_dotprod_expand_fast(v1, v1); + r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(u1))); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + double v = saturate_cast(src1[j] - src2[j]); + s += v * v; + } + return s; + } +}; + +template<> +struct NormDiffL2_SIMD { + double operator() (const short* src1, const short* src2, int n) const { + int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_int16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_uint16 v0 = v_absdiff(v01, v02); + v_uint64 u0 = v_dotprod_expand_fast(v0, v0); + r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(u0))); + + v_int16 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_uint16 v1 = v_absdiff(v11, v12); + v_uint64 u1 = v_dotprod_expand_fast(v1, v1); + r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(u1))); + } + s += v_reduce_sum(v_add(r0, r1)); + for (; j < n; j++) { + double v = saturate_cast(src1[j] - src2[j]); + s += v * v; + } + return s; + } +}; + +template<> +struct NormDiffL2_SIMD { + double operator() (const int* src1, const int* src2, int n) const { + int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02))); + v_float64 f00, f01; + f00 = v_cvt_f64(v0); f01 = v_cvt_f64_high(v0); + r0 = v_fma(f00, f00, r0); r1 = v_fma(f01, f01, r1); + + v_int32 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12))); + v_float64 f10, f11; + f10 = v_cvt_f64(v1); f11 = v_cvt_f64_high(v1); + r2 = v_fma(f10, f10, r2); r3 = v_fma(f11, f11, r3); + } + s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3)); + for (; j < n; j++) { + double v = src1[j] - src2[j]; + s += v * v; + } + return s; + } +}; + +template<> +struct NormDiffL2_SIMD { + double operator() (const float* src1, const float* src2, int n) const { + int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64(); + for (; j <= n - 2 * VTraits::vlanes(); j += 2 * VTraits::vlanes()) { + v_float32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_float32 v0 = v_absdiff(v01, v02); + v_float64 f01 = v_cvt_f64(v0), f02 = v_cvt_f64_high(v0); + r0 = v_fma(f01, f01, r0); r1 = v_fma(f02, f02, r1); + + v_float32 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_float32 v1 = v_absdiff(v11, v12); + v_float64 f11 = v_cvt_f64(v1), f12 = v_cvt_f64_high(v1); + r2 = v_fma(f11, f11, r2); r3 = v_fma(f12, f12, r3); + } + s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3)); + for (; j < n; j++) { + double v = src1[j] - src2[j]; + s += v * v; + } + return 
s; + } +}; + +template<> +struct NormDiffL2_SIMD { + double operator() (const double* src1, const double* src2, int n) const { + int j = 0; + double s = 0.f; + v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64(); + v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64(); + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { + v_float64 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j); + v_float64 v0 = v_absdiff(v01, v02); + r0 = v_fma(v0, v0, r0); + + v_float64 v11 = vx_load(src1 + j + VTraits::vlanes()), + v12 = vx_load(src2 + j + VTraits::vlanes()); + v_float64 v1 = v_absdiff(v11, v12); + r1 = v_fma(v1, v1, r1); + + v_float64 v21 = vx_load(src1 + j + 2 * VTraits::vlanes()), + v22 = vx_load(src2 + j + 2 * VTraits::vlanes()); + v_float64 v2 = v_absdiff(v21, v22); + r2 = v_fma(v2, v2, r2); + + v_float64 v31 = vx_load(src1 + j + 3 * VTraits::vlanes()), + v32 = vx_load(src2 + j + 3 * VTraits::vlanes()); + v_float64 v3 = v_absdiff(v31, v32); + r3 = v_fma(v3, v3, r3); + } + s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3)); + for (; j < n; j++) { + double v = src1[j] - src2[j]; + s += v * v; + } + return s; + } +}; + #endif template int @@ -630,9 +1280,71 @@ normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn) { return 0; } +template int +normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormDiffInf_SIMD op; + result = std::max(result, op(src1, src2, len*cn)); + } else { + for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + result = std::max(result, (ST)std::abs(src1[k] - src2[k])); + } + } + } + } + *_result = result; + return 0; +} + +template int +normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormDiffL1_SIMD op; + result += op(src1, src2, len*cn); + } + else { + for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + result += std::abs(src1[k] - src2[k]); + } + } + } + } + *_result = result; + return 0; +} + +template int +normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormDiffL2_SIMD op; + result += op(src1, src2, len*cn); + } else { + for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + ST v = src1[k] - src2[k]; + result += v*v; + } + } + } + } + *_result = result; + return 0; +} + #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \ static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \ { CV_INSTRUMENT_REGION(); return norm##L##_(src, mask, r, len, cn); } \ + static int normDiff##L##_##suffix(const type* src1, const type* src2, \ + const uchar* mask, ntype* r, int len, int cn) \ +{ return normDiff##L##_(src1, src2, mask, r, (int)len, cn); } #define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \ CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \ @@ -669,6 +1381,33 @@ NormFunc getNormFunc(int normType, int depth) return normTab[normType][depth]; } +NormDiffFunc getNormDiffFunc(int normType, int depth) +{ + static NormDiffFunc normDiffTab[3][8] = + { + { + (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s, + (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s, + (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f), + 
(NormDiffFunc)normDiffInf_64f, 0 + }, + { + (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s, + (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s, + (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f), + (NormDiffFunc)normDiffL1_64f, 0 + }, + { + (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s, + (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s, + (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f), + (NormDiffFunc)normDiffL2_64f, 0 + } + }; + + return normDiffTab[normType][depth]; +} + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END
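
The new `NormDiffInf_SIMD`, `NormDiffL1_SIMD` and `NormDiffL2_SIMD` specializations above all follow the same universal-intrinsics pattern: load one or more vectors per iteration, take the element-wise absolute difference with `v_absdiff` (or `v_abs(v_sub(...))` for signed 32-bit), widen and accumulate (via `v_dotprod_expand_fast`, `v_expand` or `v_fma`), reduce once after the loop, and finish the remainder with a scalar tail. Below is a minimal sketch of that pattern for the `uchar`/L1 case; it is illustrative only (not part of the patch), the helper name `l1DiffSketch` is made up, and it assumes an OpenCV build where `opencv2/core/hal/intrin.hpp` and the `VTraits`-based intrinsics API are available.

```cpp
// Sketch of the absdiff + widen-accumulate pattern used by NormDiffL1_SIMD<uchar, int>.
// Not part of the patch; l1DiffSketch is a hypothetical name.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

static int l1DiffSketch(const uchar* a, const uchar* b, int n)
{
    int j = 0, s = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    using namespace cv;
    v_uint32 acc = vx_setzero_u32();            // 32-bit accumulator lanes
    const v_uint8 one = vx_setall_u8(1);
    const int step = VTraits<v_uint8>::vlanes();
    for (; j <= n - step; j += step)
    {
        // |a - b| per 8-bit lane, widened and summed into acc via a dot product with 1s
        v_uint8 d = v_absdiff(vx_load(a + j), vx_load(b + j));
        acc = v_dotprod_expand_fast(d, one, acc);
    }
    s += (int)v_reduce_sum(acc);
#endif
    for (; j < n; j++)                          // scalar tail
    {
        int v = (int)a[j] - (int)b[j];
        s += v < 0 ? -v : v;
    }
    return s;
}

int main()
{
    cv::Mat m1(1, 1000, CV_8UC1), m2(1, 1000, CV_8UC1);
    cv::randu(m1, cv::Scalar::all(0), cv::Scalar::all(255));
    cv::randu(m2, cv::Scalar::all(0), cv::Scalar::all(255));
    double ref = cv::norm(m1, m2, cv::NORM_L1);  // code path this PR vectorizes
    int sketch = l1DiffSketch(m1.ptr<uchar>(), m2.ptr<uchar>(), (int)m1.total());
    std::printf("cv::norm NORM_L1 = %.0f, sketch = %d\n", ref, sketch);
    return 0;
}
```

The kernels in the patch additionally unroll two to four vector accumulators per iteration to hide latency, and pick a wider accumulator type per depth (for example `double` for the 16-bit and 32-bit L2 cases), which is why the `CV_DEF_NORM_DIFF_ALL` table maps each depth to its own result type.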