Merge pull request #27042 from fengyuentau:4x/core/normDiff_simd

core: vectorize normDiff with universal intrinsics #27042

Merge with https://github.com/opencv/opencv_extra/pull/1242.

Performance results on Desktop Intel i7-12700K, Apple M2, Jetson Orin and SpaceMIT K1:

[perf-normDiff.zip](https://github.com/user-attachments/files/19178689/perf-normDiff.zip)


### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Yuantao Feng 2025-03-12 21:43:10 +08:00 committed by GitHub
parent 46dbc57a86
commit eefa327f30
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 754 additions and 134 deletions

View File

@ -59,7 +59,7 @@ PERF_TEST_P(Size_MatType_NormType, norm_mask,
PERF_TEST_P(Size_MatType_NormType, norm2,
testing::Combine(
testing::Values(TYPICAL_MAT_SIZES),
testing::Values(TYPICAL_MAT_TYPES),
testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1),
testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2, (int)(NORM_RELATIVE+NORM_INF), (int)(NORM_RELATIVE+NORM_L1), (int)(NORM_RELATIVE+NORM_L2))
)
)
@ -82,7 +82,7 @@ PERF_TEST_P(Size_MatType_NormType, norm2,
PERF_TEST_P(Size_MatType_NormType, norm2_mask,
testing::Combine(
testing::Values(TYPICAL_MAT_SIZES),
testing::Values(TYPICAL_MAT_TYPES),
testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1),
testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2, (int)(NORM_RELATIVE|NORM_INF), (int)(NORM_RELATIVE|NORM_L1), (int)(NORM_RELATIVE|NORM_L2))
)
)

View File

@ -218,120 +218,9 @@ int normL1_(const uchar* a, const uchar* b, int n)
//==================================================================================================
template<typename T, typename ST> int
normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
{
ST result = *_result;
if( !mask )
{
result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
}
else
{
for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
}
}
*_result = result;
return 0;
}
template<typename T, typename ST> int
normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
{
ST result = *_result;
if( !mask )
{
result += normL1<T, ST>(src1, src2, len*cn);
}
else
{
for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
result += std::abs(src1[k] - src2[k]);
}
}
*_result = result;
return 0;
}
template<typename T, typename ST> int
normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
{
ST result = *_result;
if( !mask )
{
result += normL2Sqr<T, ST>(src1, src2, len*cn);
}
else
{
for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
{
ST v = src1[k] - src2[k];
result += v*v;
}
}
}
*_result = result;
return 0;
}
#define CV_DEF_NORM_DIFF_FUNC(L, suffix, type, ntype) \
static int normDiff##L##_##suffix(const type* src1, const type* src2, \
const uchar* mask, ntype* r, int len, int cn) \
{ return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }
#define CV_DEF_NORM_DIFF_ALL(suffix, type, inftype, l1type, l2type) \
CV_DEF_NORM_DIFF_FUNC(Inf, suffix, type, inftype) \
CV_DEF_NORM_DIFF_FUNC(L1, suffix, type, l1type) \
CV_DEF_NORM_DIFF_FUNC(L2, suffix, type, l2type)
CV_DEF_NORM_DIFF_ALL(8u, uchar, int, int, int)
CV_DEF_NORM_DIFF_ALL(8s, schar, int, int, int)
CV_DEF_NORM_DIFF_ALL(16u, ushort, int, int, double)
CV_DEF_NORM_DIFF_ALL(16s, short, int, int, double)
CV_DEF_NORM_DIFF_ALL(32s, int, int, double, double)
CV_DEF_NORM_DIFF_ALL(32f, float, float, double, double)
CV_DEF_NORM_DIFF_ALL(64f, double, double, double, double)
typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
static NormDiffFunc getNormDiffFunc(int normType, int depth)
{
static NormDiffFunc normDiffTab[3][8] =
{
{
(NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
(NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
(NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
(NormDiffFunc)normDiffInf_64f, 0
},
{
(NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
(NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
(NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
(NormDiffFunc)normDiffL1_64f, 0
},
{
(NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
(NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
(NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
(NormDiffFunc)normDiffL2_64f, 0
}
};
return normDiffTab[normType][depth];
}
#ifdef HAVE_OPENCL
static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
@ -520,6 +409,10 @@ static NormFunc getNormFunc(int normType, int depth) {
CV_INSTRUMENT_REGION();
CV_CPU_DISPATCH(getNormFunc, (normType, depth), CV_CPU_DISPATCH_MODES_ALL);
}
static NormDiffFunc getNormDiffFunc(int normType, int depth) {
CV_INSTRUMENT_REGION();
CV_CPU_DISPATCH(getNormDiffFunc, (normType, depth), CV_CPU_DISPATCH_MODES_ALL);
}
double norm( InputArray _src, int normType, InputArray _mask )
{
@ -1050,6 +943,9 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask
normType == NORM_L2 || normType == NORM_L2SQR ||
((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
CV_Assert( func != 0 );
if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
{
size_t len = src1.total()*src1.channels();
@ -1057,31 +953,19 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask
{
if( src1.depth() == CV_32F )
{
const float* data1 = src1.ptr<float>();
const float* data2 = src2.ptr<float>();
const uchar* data1 = src1.ptr<const uchar>();
const uchar* data2 = src2.ptr<const uchar>();
if( normType == NORM_L2 )
if( normType == NORM_L2 || normType == NORM_L2SQR || normType == NORM_L1 )
{
double result = 0;
GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
return std::sqrt(result);
}
if( normType == NORM_L2SQR )
{
double result = 0;
GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
return result;
}
if( normType == NORM_L1 )
{
double result = 0;
GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1);
return result;
func(data1, data2, 0, (uchar*)&result, (int)len, 1);
return normType == NORM_L2 ? std::sqrt(result) : result;
}
if( normType == NORM_INF )
{
float result = 0;
GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1);
func(data1, data2, 0, (uchar*)&result, (int)len, 1);
return result;
}
}
@ -1115,9 +999,6 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask
return result;
}
NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
CV_Assert( func != 0 );
const Mat* arrays[] = {&src1, &src2, &mask, 0};
uchar* ptrs[3] = {};
union

View File

@ -11,10 +11,12 @@
namespace cv {
using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int);
using NormDiffFunc = int (*)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
NormFunc getNormFunc(int normType, int depth);
NormDiffFunc getNormDiffFunc(int normType, int depth);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
@ -52,6 +54,42 @@ struct NormL2_SIMD {
}
};
// Scalar fallback computing the Chebyshev (L-inf) distance:
// max over i of |src1[i] - src2[i]|, widened to the accumulator type ST.
template <typename T, typename ST>
struct NormDiffInf_SIMD {
    inline ST operator() (const T* src1, const T* src2, int n) const {
        ST best = 0;
        for (int idx = 0; idx != n; ++idx) {
            const ST mag = (ST)cv_abs(ST(src1[idx] - src2[idx]));
            if (mag > best)
                best = mag;
        }
        return best;
    }
};
// Scalar fallback computing the L1 distance: sum over i of
// |src1[i] - src2[i]|, accumulated in the wider type ST.
template <typename T, typename ST>
struct NormDiffL1_SIMD {
    inline ST operator() (const T* src1, const T* src2, int n) const {
        ST total = 0;
        for (int idx = 0; idx != n; ++idx) {
            const ST diff = ST(src1[idx] - src2[idx]);
            total += cv_abs(diff);
        }
        return total;
    }
};
// Scalar fallback computing the squared L2 distance:
// sum over i of (src1[i] - src2[i])^2, accumulated in ST.
template <typename T, typename ST>
struct NormDiffL2_SIMD {
    inline ST operator() (const T* src1, const T* src2, int n) const {
        ST acc = 0;
        for (int idx = 0; idx != n; ++idx) {
            const ST diff = ST(src1[idx] - src2[idx]);
            acc += diff * diff;
        }
        return acc;
    }
};
#if (CV_SIMD || CV_SIMD_SCALABLE)
template<>
@ -348,6 +386,367 @@ struct NormL2_SIMD<schar, int> {
}
};
// L-inf distance for 8-bit unsigned data: 4x-unrolled loop keeps four
// independent running-max vectors of v_absdiff results (breaks the
// dependency chain), folded at the end with v_reduce_max.
template<>
struct NormDiffInf_SIMD<uchar, int> {
int operator() (const uchar* src1, const uchar* src2, int n) const {
int j = 0;
int s = 0;
v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8();
v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8();
for (; j <= n - 4 * VTraits<v_uint8>::vlanes(); j += 4 * VTraits<v_uint8>::vlanes()) {
v_uint8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_max(r0, v_absdiff(v01, v02));
v_uint8 v11 = vx_load(src1 + j + VTraits<v_uint8>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_uint8>::vlanes());
r1 = v_max(r1, v_absdiff(v11, v12));
v_uint8 v21 = vx_load(src1 + j + 2 * VTraits<v_uint8>::vlanes()),
v22 = vx_load(src2 + j + 2 * VTraits<v_uint8>::vlanes());
r2 = v_max(r2, v_absdiff(v21, v22));
v_uint8 v31 = vx_load(src1 + j + 3 * VTraits<v_uint8>::vlanes()),
v32 = vx_load(src2 + j + 3 * VTraits<v_uint8>::vlanes());
r3 = v_max(r3, v_absdiff(v31, v32));
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
// Scalar tail: fewer than 4 vector widths of elements remain.
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
}
return s;
}
};
// L-inf distance for 8-bit signed data: |a - b| of two schars fits in a
// uchar, so v_absdiff yields unsigned vectors and the accumulators are
// v_uint8; same 4x-unrolled running-max scheme as the uchar version.
template<>
struct NormDiffInf_SIMD<schar, int> {
int operator() (const schar* src1, const schar* src2, int n) const {
int j = 0;
int s = 0;
v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8();
v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8();
for (; j <= n - 4 * VTraits<v_int8>::vlanes(); j += 4 * VTraits<v_int8>::vlanes()) {
v_int8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_max(r0, v_absdiff(v01, v02));
v_int8 v11 = vx_load(src1 + j + VTraits<v_int8>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int8>::vlanes());
r1 = v_max(r1, v_absdiff(v11, v12));
v_int8 v21 = vx_load(src1 + j + 2 * VTraits<v_int8>::vlanes()),
v22 = vx_load(src2 + j + 2 * VTraits<v_int8>::vlanes());
r2 = v_max(r2, v_absdiff(v21, v22));
v_int8 v31 = vx_load(src1 + j + 3 * VTraits<v_int8>::vlanes()),
v32 = vx_load(src2 + j + 3 * VTraits<v_int8>::vlanes());
r3 = v_max(r3, v_absdiff(v31, v32));
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
}
return s;
}
};
// L-inf distance for 16-bit unsigned data: 4x-unrolled running max of
// v_absdiff in four v_uint16 accumulators, reduced once after the loop.
template<>
struct NormDiffInf_SIMD<ushort, int> {
int operator() (const ushort* src1, const ushort* src2, int n) const {
int j = 0;
int s = 0;
v_uint16 r0 = vx_setzero_u16(), r1 = vx_setzero_u16();
v_uint16 r2 = vx_setzero_u16(), r3 = vx_setzero_u16();
for (; j <= n - 4 * VTraits<v_uint16>::vlanes(); j += 4 * VTraits<v_uint16>::vlanes()) {
v_uint16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_max(r0, v_absdiff(v01, v02));
v_uint16 v11 = vx_load(src1 + j + VTraits<v_uint16>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_uint16>::vlanes());
r1 = v_max(r1, v_absdiff(v11, v12));
v_uint16 v21 = vx_load(src1 + j + 2 * VTraits<v_uint16>::vlanes()),
v22 = vx_load(src2 + j + 2 * VTraits<v_uint16>::vlanes());
r2 = v_max(r2, v_absdiff(v21, v22));
v_uint16 v31 = vx_load(src1 + j + 3 * VTraits<v_uint16>::vlanes()),
v32 = vx_load(src2 + j + 3 * VTraits<v_uint16>::vlanes());
r3 = v_max(r3, v_absdiff(v31, v32));
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
}
return s;
}
};
// L-inf distance for 16-bit signed data: v_absdiff of two v_int16 produces
// a v_uint16 (|a - b| of shorts fits in ushort), hence unsigned accumulators.
template<>
struct NormDiffInf_SIMD<short, int> {
int operator() (const short* src1, const short* src2, int n) const {
int j = 0;
int s = 0;
v_uint16 r0 = vx_setzero_u16(), r1 = vx_setzero_u16();
v_uint16 r2 = vx_setzero_u16(), r3 = vx_setzero_u16();
for (; j <= n - 4 * VTraits<v_int16>::vlanes(); j += 4 * VTraits<v_int16>::vlanes()) {
v_int16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_max(r0, v_absdiff(v01, v02));
v_int16 v11 = vx_load(src1 + j + VTraits<v_int16>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int16>::vlanes());
r1 = v_max(r1, v_absdiff(v11, v12));
v_int16 v21 = vx_load(src1 + j + 2 * VTraits<v_int16>::vlanes()),
v22 = vx_load(src2 + j + 2 * VTraits<v_int16>::vlanes());
r2 = v_max(r2, v_absdiff(v21, v22));
v_int16 v31 = vx_load(src1 + j + 3 * VTraits<v_int16>::vlanes()),
v32 = vx_load(src2 + j + 3 * VTraits<v_int16>::vlanes());
r3 = v_max(r3, v_absdiff(v31, v32));
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
}
return s;
}
};
// L-inf distance for 32-bit signed data: computed as v_abs(v_sub(a, b))
// into unsigned accumulators. NOTE(review): v_sub wraps on int overflow,
// so extreme operand differences (|a - b| > INT_MAX) would wrap — same
// behavior as the scalar tail below; confirm inputs keep diffs in range.
template<>
struct NormDiffInf_SIMD<int, int> {
int operator() (const int* src1, const int* src2, int n) const {
int j = 0;
int s = 0;
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32();
for (; j <= n - 4 * VTraits<v_int32>::vlanes(); j += 4 * VTraits<v_int32>::vlanes()) {
v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_max(r0, v_abs(v_sub(v01, v02)));
v_int32 v11 = vx_load(src1 + j + VTraits<v_int32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int32>::vlanes());
r1 = v_max(r1, v_abs(v_sub(v11, v12)));
v_int32 v21 = vx_load(src1 + j + 2 * VTraits<v_int32>::vlanes()),
v22 = vx_load(src2 + j + 2 * VTraits<v_int32>::vlanes());
r2 = v_max(r2, v_abs(v_sub(v21, v22)));
v_int32 v31 = vx_load(src1 + j + 3 * VTraits<v_int32>::vlanes()),
v32 = vx_load(src2 + j + 3 * VTraits<v_int32>::vlanes());
r3 = v_max(r3, v_abs(v_sub(v31, v32)));
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
}
return s;
}
};
// L-inf distance for single-precision data: 2x-unrolled running max of
// v_absdiff in two v_float32 accumulators, reduced after the loop.
template<>
struct NormDiffInf_SIMD<float, float> {
float operator() (const float* src1, const float* src2, int n) const {
int j = 0;
float s = 0;
v_float32 r0 = vx_setzero_f32(), r1 = vx_setzero_f32();
for (; j <= n - 2 * VTraits<v_float32>::vlanes(); j += 2 * VTraits<v_float32>::vlanes()) {
v_float32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_max(r0, v_absdiff(v01, v02));
v_float32 v11 = vx_load(src1 + j + VTraits<v_float32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_float32>::vlanes());
r1 = v_max(r1, v_absdiff(v11, v12));
}
s = v_reduce_max(v_max(r0, r1));
// Scalar tail.
for (; j < n; j++) {
float v = src1[j] - src2[j];
s = std::max(s, cv_abs(v));
}
return s;
}
};
// L1 distance for 8-bit unsigned data: |a - b| is summed by a widening dot
// product with an all-ones vector, accumulating into 32-bit lanes.
// NOTE(review): assumes the caller limits n so the int total cannot
// overflow (norm() processes data block-wise) — confirm at call sites.
template<>
struct NormDiffL1_SIMD<uchar, int> {
int operator() (const uchar* src1, const uchar* src2, int n) const {
int j = 0;
int s = 0;
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
v_uint8 one = vx_setall_u8(1);
for (; j<= n - 2 * VTraits<v_uint8>::vlanes(); j += 2 * VTraits<v_uint8>::vlanes()) {
v_uint8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_dotprod_expand_fast(v_absdiff(v01, v02), one, r0);
v_uint8 v11 = vx_load(src1 + j + VTraits<v_uint8>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_uint8>::vlanes());
r1 = v_dotprod_expand_fast(v_absdiff(v11, v12), one, r1);
}
s += v_reduce_sum(v_add(r0, r1));
// Scalar tail.
for (; j < n; j++) {
int v = src1[j] - src2[j];
s += (int)cv_abs(v);
}
return s;
}
};
// L1 distance for 8-bit signed data: v_absdiff of schar vectors yields
// v_uint8, then the same ones-vector widening dot product as the uchar case.
template<>
struct NormDiffL1_SIMD<schar, int> {
int operator() (const schar* src1, const schar* src2, int n) const {
int j = 0;
int s = 0;
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
v_uint8 one = vx_setall_u8(1);
for (; j<= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
v_int8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_dotprod_expand_fast(v_absdiff(v01, v02), one, r0);
v_int8 v11 = vx_load(src1 + j + VTraits<v_int8>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int8>::vlanes());
r1 = v_dotprod_expand_fast(v_absdiff(v11, v12), one, r1);
}
s += v_reduce_sum(v_add(r0, r1));
// Scalar tail.
for (; j < n; j++) {
int v =src1[j] - src2[j];
s += (int)cv_abs(v);
}
return s;
}
};
// L1 distance for 16-bit unsigned data: 16-bit absolute differences are
// widened to two 32-bit halves with v_expand and summed into four
// independent accumulators (4x unrolled), reduced once after the loop.
template<>
struct NormDiffL1_SIMD<ushort, int> {
int operator() (const ushort* src1, const ushort* src2, int n) const {
int j = 0;
int s = 0;
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32();
for (; j<= n - 4 * VTraits<v_uint16>::vlanes(); j += 4 * VTraits<v_uint16>::vlanes()) {
v_uint16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_uint32 u00, u01;
v_expand(v_absdiff(v01, v02), u00, u01);
r0 = v_add(r0, v_add(u00, u01));
v_uint16 v11 = vx_load(src1 + j + VTraits<v_uint16>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_uint16>::vlanes());
v_uint32 u10, u11;
v_expand(v_absdiff(v11, v12), u10, u11);
r1 = v_add(r1, v_add(u10, u11));
v_uint16 v21 = vx_load(src1 + j + 2 * VTraits<v_uint16>::vlanes()),
v22 = vx_load(src2 + j + 2 * VTraits<v_uint16>::vlanes());
v_uint32 u20, u21;
v_expand(v_absdiff(v21, v22), u20, u21);
r2 = v_add(r2, v_add(u20, u21));
v_uint16 v31 = vx_load(src1 + j + 3 * VTraits<v_uint16>::vlanes()),
v32 = vx_load(src2 + j + 3 * VTraits<v_uint16>::vlanes());
v_uint32 u30, u31;
v_expand(v_absdiff(v31, v32), u30, u31);
r3 = v_add(r3, v_add(u30, u31));
}
s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
int v = src1[j] - src2[j];
s += (int)cv_abs(v);
}
return s;
}
};
// L1 distance for 16-bit signed data: v_absdiff of v_int16 gives v_uint16,
// which is expanded to 32-bit halves and summed into four accumulators.
template<>
struct NormDiffL1_SIMD<short, int> {
int operator() (const short* src1, const short* src2, int n) const {
int j = 0;
int s = 0;
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32();
for (; j<= n - 4 * VTraits<v_int16>::vlanes(); j += 4 * VTraits<v_int16>::vlanes()) {
v_int16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_uint32 u00, u01;
v_expand(v_absdiff(v01, v02), u00, u01);
r0 = v_add(r0, v_add(u00, u01));
v_int16 v11 = vx_load(src1 + j + VTraits<v_int16>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int16>::vlanes());
v_uint32 u10, u11;
v_expand(v_absdiff(v11, v12), u10, u11);
r1 = v_add(r1, v_add(u10, u11));
v_int16 v21 = vx_load(src1 + j + 2 * VTraits<v_int16>::vlanes()),
v22 = vx_load(src2 + j + 2 * VTraits<v_int16>::vlanes());
v_uint32 u20, u21;
v_expand(v_absdiff(v21, v22), u20, u21);
r2 = v_add(r2, v_add(u20, u21));
v_int16 v31 = vx_load(src1 + j + 3 * VTraits<v_int16>::vlanes()),
v32 = vx_load(src2 + j + 3 * VTraits<v_int16>::vlanes());
v_uint32 u30, u31;
v_expand(v_absdiff(v31, v32), u30, u31);
r3 = v_add(r3, v_add(u30, u31));
}
s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
int v = src1[j] - src2[j];
s += (int)cv_abs(v);
}
return s;
}
};
// Squared L2 distance for 8-bit unsigned data: d = |a - b| fits in 8 bits,
// so d*d is accumulated via the widening dot product of d with itself into
// 32-bit lanes. NOTE(review): assumes the caller blocks the input so the
// int accumulator cannot overflow — confirm at call sites.
template<>
struct NormDiffL2_SIMD<uchar, int> {
int operator() (const uchar* src1, const uchar* src2, int n) const {
int j = 0;
int s = 0;
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
for (; j <= n - 2 * VTraits<v_uint8>::vlanes(); j += 2 * VTraits<v_uint8>::vlanes()) {
v_uint8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_uint8 v0 = v_absdiff(v01, v02);
r0 = v_dotprod_expand_fast(v0, v0, r0);
v_uint8 v11 = vx_load(src1 + j + VTraits<v_uint8>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_uint8>::vlanes());
v_uint8 v1 = v_absdiff(v11, v12);
r1 = v_dotprod_expand_fast(v1, v1, r1);
}
s += v_reduce_sum(v_add(r0, r1));
// Scalar tail.
for (; j < n; j++) {
int v = saturate_cast<int>(src1[j] - src2[j]);
s += v * v;
}
return s;
}
};
// Squared L2 distance for 8-bit signed data: v_absdiff of schar vectors
// yields v_uint8; squared terms come from the widening self dot product.
template<>
struct NormDiffL2_SIMD<schar, int> {
int operator() (const schar* src1, const schar* src2, int n) const {
int j = 0;
int s = 0;
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
for (; j <= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
v_int8 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_uint8 v0 = v_absdiff(v01, v02);
r0 = v_dotprod_expand_fast(v0, v0, r0);
v_int8 v11 = vx_load(src1 + j + VTraits<v_int8>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int8>::vlanes());
v_uint8 v1 = v_absdiff(v11, v12);
r1 = v_dotprod_expand_fast(v1, v1, r1);
}
s += v_reduce_sum(v_add(r0, r1));
// Scalar tail.
for (; j < n; j++) {
int v = saturate_cast<int>(src1[j] - src2[j]);
s += v * v;
}
return s;
}
};
#endif
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
@ -570,6 +969,257 @@ struct NormL2_SIMD<double, double> {
}
};
// L-inf distance for double-precision data: 2x-unrolled vector max of
// v_absdiff; lanes are reduced manually through a stack buffer because
// v_reduce_max has no float64 overload (see TODO below).
template<>
struct NormDiffInf_SIMD<double, double> {
double operator() (const double* src1, const double* src2, int n) const {
int j = 0;
double s = 0;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_float64>::vlanes(); j += 2 * VTraits<v_float64>::vlanes()) {
v_float64 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_max(r0, v_absdiff(v01, v02));
v_float64 v11 = vx_load(src1 + j + VTraits<v_float64>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_float64>::vlanes());
r1 = v_max(r1, v_absdiff(v11, v12));
}
// [TODO]: use v_reduce_max when it supports float64
double t[VTraits<v_float64>::max_nlanes];
vx_store(t, v_max(r0, r1));
for (int i = 0; i < VTraits<v_float64>::vlanes(); i++) {
s = std::max(s, cv_abs(t[i]));
}
// Scalar tail.
for (; j < n; j++) {
double v = src1[j] - src2[j];
s = std::max(s, cv_abs(v));
}
return s;
}
};
// L1 distance for 32-bit signed data accumulated in double: the int
// difference is converted to float32, |.| taken, then widened to two
// float64 halves. NOTE(review): the float32 step is exact only while
// |a - b| < 2^24 (float mantissa) — confirm that precision is acceptable.
template<>
struct NormDiffL1_SIMD<int, double> {
double operator() (const int* src1, const int* src2, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02)));
r0 = v_add(r0, v_cvt_f64(v0)); r1 = v_add(r1, v_cvt_f64_high(v0));
v_int32 v11 = vx_load(src1 + j + VTraits<v_int32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int32>::vlanes());
v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12)));
r2 = v_add(r2, v_cvt_f64(v1)); r3 = v_add(r3, v_cvt_f64_high(v1));
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += cv_abs(v);
}
return s;
}
};
// L1 distance for single-precision data accumulated in double: float32
// absdiff widened to low/high float64 halves, four independent accumulators.
template<>
struct NormDiffL1_SIMD<float, double> {
double operator() (const float* src1, const float* src2, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_float32>::vlanes(); j += 2 * VTraits<v_float32>::vlanes()) {
v_float32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_float32 v0 = v_absdiff(v01, v02);
r0 = v_add(r0, v_cvt_f64(v0)); r1 = v_add(r1, v_cvt_f64_high(v0));
v_float32 v11 = vx_load(src1 + j + VTraits<v_float32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_float32>::vlanes());
v_float32 v1 = v_absdiff(v11, v12);
r2 = v_add(r2, v_cvt_f64(v1)); r3 = v_add(r3, v_cvt_f64_high(v1));
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += cv_abs(v);
}
return s;
}
};
// L1 distance for double-precision data: 2x-unrolled sum of v_absdiff in
// two v_float64 accumulators, reduced after the loop.
template<>
struct NormDiffL1_SIMD<double, double> {
double operator() (const double* src1, const double* src2, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_float64>::vlanes(); j += 2 * VTraits<v_float64>::vlanes()) {
v_float64 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
r0 = v_add(r0, v_absdiff(v01, v02));
v_float64 v11 = vx_load(src1 + j + VTraits<v_float64>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_float64>::vlanes());
r1 = v_add(r1, v_absdiff(v11, v12));
}
s += v_reduce_sum(v_add(r0, r1));
// Scalar tail.
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += cv_abs(v);
}
return s;
}
};
// Squared L2 distance for 16-bit unsigned data accumulated in double:
// d = |a - b| fits in 16 bits, so d*d is summed by the widening self dot
// product into u64 lanes, reinterpreted as s64 and converted to f64
// (per-iteration sums stay far below 2^63, so the reinterpret is safe).
template<>
struct NormDiffL2_SIMD<ushort, double> {
double operator() (const ushort* src1, const ushort* src2, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_uint16>::vlanes(); j += 2 * VTraits<v_uint16>::vlanes()) {
v_uint16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_uint16 v0 = v_absdiff(v01, v02);
v_uint64 u0 = v_dotprod_expand_fast(v0, v0);
r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(u0)));
v_uint16 v11 = vx_load(src1 + j + VTraits<v_uint16>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_uint16>::vlanes());
v_uint16 v1 = v_absdiff(v11, v12);
v_uint64 u1 = v_dotprod_expand_fast(v1, v1);
r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(u1)));
}
s += v_reduce_sum(v_add(r0, r1));
// Scalar tail.
for (; j < n; j++) {
double v = saturate_cast<double>(src1[j] - src2[j]);
s += v * v;
}
return s;
}
};
// Squared L2 distance for 16-bit signed data accumulated in double:
// |a - b| of two shorts fits in a ushort, so v_absdiff yields v_uint16 and
// the squared terms come from the widening self dot product into u64 lanes,
// reinterpreted as s64 and converted to f64 (sums stay far below 2^63).
template<>
struct NormDiffL2_SIMD<short, double> {
    double operator() (const short* src1, const short* src2, int n) const {
        int j = 0;
        double s = 0.0;
        v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
        for (; j <= n - 2 * VTraits<v_int16>::vlanes(); j += 2 * VTraits<v_int16>::vlanes()) {
            v_int16 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
            v_uint16 v0 = v_absdiff(v01, v02);
            v_uint64 u0 = v_dotprod_expand_fast(v0, v0);
            r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(u0)));
            // Offset uses the v_int16 lane count to match the loop bound
            // (v_int16 and v_uint16 have the same number of lanes).
            v_int16 v11 = vx_load(src1 + j + VTraits<v_int16>::vlanes()),
                    v12 = vx_load(src2 + j + VTraits<v_int16>::vlanes());
            v_uint16 v1 = v_absdiff(v11, v12);
            v_uint64 u1 = v_dotprod_expand_fast(v1, v1);
            r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(u1)));
        }
        s += v_reduce_sum(v_add(r0, r1));
        // Scalar tail.
        for (; j < n; j++) {
            double v = saturate_cast<double>(src1[j] - src2[j]);
            s += v * v;
        }
        return s;
    }
};
// Squared L2 distance for 32-bit signed data accumulated in double: the
// int difference goes through float32, is widened to float64 halves, and
// squared with fused multiply-add. NOTE(review): the float32 step is exact
// only while |a - b| < 2^24 — confirm that precision is acceptable.
template<>
struct NormDiffL2_SIMD<int, double> {
double operator() (const int* src1, const int* src2, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02)));
v_float64 f00, f01;
f00 = v_cvt_f64(v0); f01 = v_cvt_f64_high(v0);
r0 = v_fma(f00, f00, r0); r1 = v_fma(f01, f01, r1);
v_int32 v11 = vx_load(src1 + j + VTraits<v_int32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int32>::vlanes());
v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12)));
v_float64 f10, f11;
f10 = v_cvt_f64(v1); f11 = v_cvt_f64_high(v1);
r2 = v_fma(f10, f10, r2); r3 = v_fma(f11, f11, r3);
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += v * v;
}
return s;
}
};
// Squared L2 distance for single-precision data accumulated in double:
// float32 absdiff widened to float64 halves, squared with v_fma into four
// independent accumulators (2x unroll * low/high halves).
template<>
struct NormDiffL2_SIMD<float, double> {
double operator() (const float* src1, const float* src2, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_float32>::vlanes(); j += 2 * VTraits<v_float32>::vlanes()) {
v_float32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_float32 v0 = v_absdiff(v01, v02);
v_float64 f01 = v_cvt_f64(v0), f02 = v_cvt_f64_high(v0);
r0 = v_fma(f01, f01, r0); r1 = v_fma(f02, f02, r1);
v_float32 v11 = vx_load(src1 + j + VTraits<v_float32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_float32>::vlanes());
v_float32 v1 = v_absdiff(v11, v12);
v_float64 f11 = v_cvt_f64(v1), f12 = v_cvt_f64_high(v1);
r2 = v_fma(f11, f11, r2); r3 = v_fma(f12, f12, r3);
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += v * v;
}
return s;
}
};
// Squared L2 distance for double-precision data: 4x-unrolled v_fma of the
// absolute difference with itself (sign is irrelevant once squared; the
// absdiff form mirrors the other specializations), reduced after the loop.
template<>
struct NormDiffL2_SIMD<double, double> {
double operator() (const double* src1, const double* src2, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
v_float64 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_float64 v0 = v_absdiff(v01, v02);
r0 = v_fma(v0, v0, r0);
v_float64 v11 = vx_load(src1 + j + VTraits<v_float64>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_float64>::vlanes());
v_float64 v1 = v_absdiff(v11, v12);
r1 = v_fma(v1, v1, r1);
v_float64 v21 = vx_load(src1 + j + 2 * VTraits<v_float64>::vlanes()),
v22 = vx_load(src2 + j + 2 * VTraits<v_float64>::vlanes());
v_float64 v2 = v_absdiff(v21, v22);
r2 = v_fma(v2, v2, r2);
v_float64 v31 = vx_load(src1 + j + 3 * VTraits<v_float64>::vlanes()),
v32 = vx_load(src2 + j + 3 * VTraits<v_float64>::vlanes());
v_float64 v3 = v_absdiff(v31, v32);
r3 = v_fma(v3, v3, r3);
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
// Scalar tail.
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += v * v;
}
return s;
}
};
#endif
template<typename T, typename ST> int
@ -630,9 +1280,71 @@ normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn) {
return 0;
}
template<typename T, typename ST> int
normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) {
ST result = *_result;
if( !mask ) {
NormDiffInf_SIMD<T, ST> op;
result = std::max(result, op(src1, src2, len*cn));
} else {
for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) {
if( mask[i] ) {
for( int k = 0; k < cn; k++ ) {
result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
}
}
}
}
*_result = result;
return 0;
}
template<typename T, typename ST> int
normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) {
ST result = *_result;
if( !mask ) {
NormDiffL1_SIMD<T, ST> op;
result += op(src1, src2, len*cn);
}
else {
for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) {
if( mask[i] ) {
for( int k = 0; k < cn; k++ ) {
result += std::abs(src1[k] - src2[k]);
}
}
}
}
*_result = result;
return 0;
}
template<typename T, typename ST> int
normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) {
ST result = *_result;
if( !mask ) {
NormDiffL2_SIMD<T, ST> op;
result += op(src1, src2, len*cn);
} else {
for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) {
if( mask[i] ) {
for( int k = 0; k < cn; k++ ) {
ST v = src1[k] - src2[k];
result += v*v;
}
}
}
}
*_result = result;
return 0;
}
#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
{ CV_INSTRUMENT_REGION(); return norm##L##_(src, mask, r, len, cn); } \
static int normDiff##L##_##suffix(const type* src1, const type* src2, \
const uchar* mask, ntype* r, int len, int cn) \
{ return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }
#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
@ -669,6 +1381,33 @@ NormFunc getNormFunc(int normType, int depth)
return normTab[normType][depth];
}
// Returns the normDiff kernel for the given norm and depth.
// Rows: 0 = Inf, 1 = L1, 2 = L2 — the caller passes normType >> 1 to map
// NORM_INF/NORM_L1/NORM_L2 onto these indices. Columns: CV_8U..CV_64F;
// slot 7 (CV_16F) is 0 — callers remap 16F inputs to the CV_32F kernel.
NormDiffFunc getNormDiffFunc(int normType, int depth)
{
static NormDiffFunc normDiffTab[3][8] =
{
{
(NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
(NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
(NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
(NormDiffFunc)normDiffInf_64f, 0
},
{
(NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
(NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
(NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
(NormDiffFunc)normDiffL1_64f, 0
},
{
(NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
(NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
(NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
(NormDiffFunc)normDiffL2_64f, 0
}
};
return normDiffTab[normType][depth];
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END