Merge pull request #26885 from fengyuentau:4x/core/normalize_simd

core: vectorize cv::normalize / cv::norm #26885

Vectorization coverage checklist:
|      | normInf | normL1 | normL2 |
| ---- | ------- | ------ | ------ |
| bool |    -    |   -    |   -    |
| 8u   |    √    |   √    |   √    |
| 8s   |    √    |   √    |   √    |
| 16u  |    √    |   √    |   √    |
| 16s  |    √    |   √    |   √    |
| 16f  |    -    |   -    |   -    |
| 16bf |    -    |   -    |   -    |
| 32u  |    -    |   -    |   -    |
| 32s  |    √    |   √    |   √    |
| 32f  |    √    |   √    |   √    |
| 64u  |    -    |   -    |   -    |
| 64s  |    -    |   -    |   -    |
| 64f  |    √    |   √    |   √    |

*: Vectorization of the bool, 16f, 16bf, 32u, 64u and 64s data types needs to be done on the 5.x branch.
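For reference, a minimal usage sketch (the matrix size and values are illustrative, not taken from this PR) of the `cv::norm` / `cv::normalize` calls that exercise the vectorized kernels listed above:

```cpp
#include <opencv2/core.hpp>

int main()
{
    // 8u input: NORM_INF, NORM_L1 and NORM_L2 are all vectorized for this type.
    cv::Mat img(1080, 1920, CV_8UC1, cv::Scalar(42));
    double nInf = cv::norm(img, cv::NORM_INF);
    double nL1  = cv::norm(img, cv::NORM_L1);
    double nL2  = cv::norm(img, cv::NORM_L2);

    // cv::normalize computes the norm of the source internally, so it benefits
    // from the same kernels.
    cv::Mat dst;
    cv::normalize(img, dst, 1.0, 0.0, cv::NORM_L2, CV_32F);

    return (nInf >= 0 && nL1 >= 0 && nL2 >= 0) ? 0 : 1;
}
```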

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
Yuantao Feng 2025-02-21 18:49:11 +08:00 committed by GitHub
parent 7a2b048c92
commit e2803bee5c
4 changed files with 905 additions and 122 deletions


@@ -12,6 +12,7 @@ ocv_add_dispatched_file(mean SSE2 AVX2 LASX)
ocv_add_dispatched_file(merge SSE2 AVX2 LASX)
ocv_add_dispatched_file(split SSE2 AVX2 LASX)
ocv_add_dispatched_file(sum SSE2 AVX2 LASX)
ocv_add_dispatched_file(norm SSE2 SSE4_1 AVX AVX2 NEON_DOTPROD LASX)
# dispatching for accuracy tests
ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX)


@@ -7,6 +7,9 @@
#include "opencl_kernels_core.hpp"
#include "stat.hpp"
#include "norm.simd.hpp"
#include "norm.simd_declarations.hpp"
/****************************************************************************************\
* norm *
\****************************************************************************************/
@@ -215,72 +218,6 @@ int normL1_(const uchar* a, const uchar* b, int n)
//==================================================================================================
template<typename T, typename ST> int
normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
{
ST result = *_result;
if( !mask )
{
result = std::max(result, normInf<T, ST>(src, len*cn));
}
else
{
for( int i = 0; i < len; i++, src += cn )
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
result = std::max(result, ST(cv_abs(src[k])));
}
}
*_result = result;
return 0;
}
template<typename T, typename ST> int
normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
{
ST result = *_result;
if( !mask )
{
result += normL1<T, ST>(src, len*cn);
}
else
{
for( int i = 0; i < len; i++, src += cn )
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
result += cv_abs(src[k]);
}
}
*_result = result;
return 0;
}
template<typename T, typename ST> int
normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
{
ST result = *_result;
if( !mask )
{
result += normL2Sqr<T, ST>(src, len*cn);
}
else
{
for( int i = 0; i < len; i++, src += cn )
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
{
T v = src[k];
result += (ST)v*v;
}
}
}
*_result = result;
return 0;
}
template<typename T, typename ST> int
normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
{
@@ -347,51 +284,27 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
return 0;
}
#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
{ return norm##L##_(src, mask, r, len, cn); } \
#define CV_DEF_NORM_DIFF_FUNC(L, suffix, type, ntype) \
static int normDiff##L##_##suffix(const type* src1, const type* src2, \
const uchar* mask, ntype* r, int len, int cn) \
{ return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }
#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
CV_DEF_NORM_FUNC(L2, suffix, type, l2type)
CV_DEF_NORM_ALL(8u, uchar, int, int, int)
CV_DEF_NORM_ALL(8s, schar, int, int, int)
CV_DEF_NORM_ALL(16u, ushort, int, int, double)
CV_DEF_NORM_ALL(16s, short, int, int, double)
CV_DEF_NORM_ALL(32s, int, int, double, double)
CV_DEF_NORM_ALL(32f, float, float, double, double)
CV_DEF_NORM_ALL(64f, double, double, double, double)
#define CV_DEF_NORM_DIFF_ALL(suffix, type, inftype, l1type, l2type) \
CV_DEF_NORM_DIFF_FUNC(Inf, suffix, type, inftype) \
CV_DEF_NORM_DIFF_FUNC(L1, suffix, type, l1type) \
CV_DEF_NORM_DIFF_FUNC(L2, suffix, type, l2type)
CV_DEF_NORM_DIFF_ALL(8u, uchar, int, int, int)
CV_DEF_NORM_DIFF_ALL(8s, schar, int, int, int)
CV_DEF_NORM_DIFF_ALL(16u, ushort, int, int, double)
CV_DEF_NORM_DIFF_ALL(16s, short, int, int, double)
CV_DEF_NORM_DIFF_ALL(32s, int, int, double, double)
CV_DEF_NORM_DIFF_ALL(32f, float, float, double, double)
CV_DEF_NORM_DIFF_ALL(64f, double, double, double, double)
typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
static NormFunc getNormFunc(int normType, int depth)
{
static NormFunc normTab[3][8] =
{
{
(NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
(NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
},
{
(NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
(NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
},
{
(NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
(NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
}
};
return normTab[normType][depth];
}
static NormDiffFunc getNormDiffFunc(int normType, int depth)
{
static NormDiffFunc normDiffTab[3][8] =
@@ -603,6 +516,11 @@ static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result)
} // ipp_norm()
#endif // HAVE_IPP
static NormFunc getNormFunc(int normType, int depth) {
CV_INSTRUMENT_REGION();
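// CV_CPU_DISPATCH forwards to the best available CPU-specific getNormFunc
// compiled from norm.simd.hpp (e.g. the AVX2 or NEON_DOTPROD variant), falling
// back to the baseline implementation when no optimized variant is supported.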
CV_CPU_DISPATCH(getNormFunc, (normType, depth), CV_CPU_DISPATCH_MODES_ALL);
}
double norm( InputArray _src, int normType, InputArray _mask )
{
CV_INSTRUMENT_REGION();
@@ -637,6 +555,9 @@ double norm( InputArray _src, int normType, InputArray _mask )
CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(src, normType, mask, _result), _result);
NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
CV_Assert( func != 0 );
if( src.isContinuous() && mask.empty() )
{
size_t len = src.total()*cn;
@@ -644,30 +565,18 @@ double norm( InputArray _src, int normType, InputArray _mask )
{
if( depth == CV_32F )
{
const float* data = src.ptr<float>();
const uchar* data = src.ptr<const uchar>();
if( normType == NORM_L2 )
if( normType == NORM_L2 || normType == NORM_L2SQR || normType == NORM_L1 )
{
double result = 0;
GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
return std::sqrt(result);
}
if( normType == NORM_L2SQR )
{
double result = 0;
GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
return result;
}
if( normType == NORM_L1 )
{
double result = 0;
GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
return result;
func(data, 0, (uchar*)&result, (int)len, 1);
return normType == NORM_L2 ? std::sqrt(result) : result;
}
if( normType == NORM_INF )
{
float result = 0;
GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
func(data, 0, (uchar*)&result, (int)len, 1);
return result;
}
}
@@ -714,9 +623,6 @@ double norm( InputArray _src, int normType, InputArray _mask )
return result;
}
NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
CV_Assert( func != 0 );
const Mat* arrays[] = {&src, &mask, 0};
uchar* ptrs[2] = {};
union


@@ -0,0 +1,200 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
#include "opencv2/core/hal/intrin.hpp"
namespace cv {
namespace {
// [TODO] Drop these helpers once RVV provides dedicated intrinsics for abs on integers.
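// The specializations below compute a branchless absolute value: mask = v >> (element_bits - 1)
// replicates the sign bit, and (v ^ mask) - mask equals |v| in two's complement;
// the result is then reinterpreted as an unsigned vector.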
template<typename T, typename ST> inline ST __riscv_vabs(const T&);
template<> inline
vuint8m1_t __riscv_vabs(const vint8m1_t& v) {
const int vle8m1 = __riscv_vsetvlmax_e8m1();
vint8m1_t mask = __riscv_vsra_vx_i8m1(v, 7, vle8m1);
vint8m1_t v_xor = __riscv_vxor_vv_i8m1(v, mask, vle8m1);
return __riscv_vreinterpret_v_i8m1_u8m1(
__riscv_vsub_vv_i8m1(v_xor, mask, vle8m1)
);
}
template<> inline
vuint16m1_t __riscv_vabs(const vint16m1_t& v) {
const int vle16m1 = __riscv_vsetvlmax_e16m1();
vint16m1_t mask = __riscv_vsra_vx_i16m1(v, 15, vle16m1);
vint16m1_t v_xor = __riscv_vxor_vv_i16m1(v, mask, vle16m1);
return __riscv_vreinterpret_v_i16m1_u16m1(
__riscv_vsub_vv_i16m1(v_xor, mask, vle16m1)
);
}
template<> inline
vuint32m1_t __riscv_vabs(const vint32m1_t& v) {
const int vle32m1 = __riscv_vsetvlmax_e32m1();
vint32m1_t mask = __riscv_vsra_vx_i32m1(v, 31, vle32m1);
vint32m1_t v_xor = __riscv_vxor_vv_i32m1(v, mask, vle32m1);
return __riscv_vreinterpret_v_i32m1_u32m1(
__riscv_vsub_vv_i32m1(v_xor, mask, vle32m1)
);
}
}
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
template <typename T, typename ST> inline
ST normInf_rvv(const T* src, int n, int& j);
template<> inline
int normInf_rvv(const int* src, int n, int& j) {
const int vle32m1 = __riscv_vsetvlmax_e32m1();
vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
for (; j <= n - 2 * vle32m1; j += 2 * vle32m1) {
vuint32m1_t v0 = __riscv_vabs<vint32m1_t, vuint32m1_t>(__riscv_vle32_v_i32m1(src + j, vle32m1));
r0 = __riscv_vmaxu(r0, v0, vle32m1);
vuint32m1_t v1 = __riscv_vabs<vint32m1_t, vuint32m1_t>(__riscv_vle32_v_i32m1(src + j + vle32m1, vle32m1));
r1 = __riscv_vmaxu(r1, v1, vle32m1);
}
r0 = __riscv_vmaxu(r0, r1, vle32m1);
return (int)__riscv_vmv_x(__riscv_vredmaxu(r0, __riscv_vmv_v_x_u32m1(0, vle32m1), vle32m1));
}
template <typename T, typename ST> inline
ST normL1_rvv(const T* src, int n, int& j);
template<> inline
int normL1_rvv(const schar* src, int n, int& j) {
const int vle8m1 = __riscv_vsetvlmax_e8m1();
const int vle16m1 = __riscv_vsetvlmax_e16m1();
const int vle32m1 = __riscv_vsetvlmax_e32m1();
vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0, vle16m1);
for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
vuint8m1_t v0 = __riscv_vabs<vint8m1_t, vuint8m1_t>(__riscv_vle8_v_i8m1(src + j, vle8m1));
vuint16m1_t u0 = __riscv_vwredsumu_tu(zero, v0, zero, vle8m1);
r0 = __riscv_vwredsumu(u0, r0, vle16m1);
vuint8m1_t v1 = __riscv_vabs<vint8m1_t, vuint8m1_t>(__riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1));
vuint16m1_t u1 = __riscv_vwredsumu_tu(zero, v1, zero, vle8m1);
r1 = __riscv_vwredsumu(u1, r1, vle16m1);
}
return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}
template<> inline
int normL1_rvv(const ushort* src, int n, int& j) {
const int vle16m1 = __riscv_vsetvlmax_e16m1();
const int vle32m1 = __riscv_vsetvlmax_e32m1();
vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
for (; j <= n - 2 * vle16m1; j += 2 * vle16m1) {
vuint16m1_t v0 = __riscv_vle16_v_u16m1(src + j, vle16m1);
r0 = __riscv_vwredsumu(v0, r0, vle16m1);
vuint16m1_t v1 = __riscv_vle16_v_u16m1(src + j + vle16m1, vle16m1);
r1 = __riscv_vwredsumu(v1, r1, vle16m1);
}
return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}
template<> inline
int normL1_rvv(const short* src, int n, int& j) {
const int vle16m1 = __riscv_vsetvlmax_e16m1();
const int vle32m1 = __riscv_vsetvlmax_e32m1();
vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
for (; j <= n - 2 * vle16m1; j += 2 * vle16m1) {
vuint16m1_t v0 = __riscv_vabs<vint16m1_t, vuint16m1_t>(__riscv_vle16_v_i16m1(src + j, vle16m1));
r0 = __riscv_vwredsumu(v0, r0, vle16m1);
vuint16m1_t v1 = __riscv_vabs<vint16m1_t, vuint16m1_t>(__riscv_vle16_v_i16m1(src + j + vle16m1, vle16m1));
r1 = __riscv_vwredsumu(v1, r1, vle16m1);
}
return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}
template<> inline
double normL1_rvv(const double* src, int n, int& j) {
const int vle64m1 = __riscv_vsetvlmax_e64m1();
vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) {
vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1);
v0 = __riscv_vfabs(v0, vle64m1);
r0 = __riscv_vfadd(r0, v0, vle64m1);
vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1);
v1 = __riscv_vfabs(v1, vle64m1);
r1 = __riscv_vfadd(r1, v1, vle64m1);
}
r0 = __riscv_vfadd(r0, r1, vle64m1);
return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1));
}
template <typename T, typename ST> inline
ST normL2_rvv(const T* src, int n, int& j);
template<> inline
int normL2_rvv(const uchar* src, int n, int& j) {
const int vle8m1 = __riscv_vsetvlmax_e8m1();
const int vle16m1 = __riscv_vsetvlmax_e16m1();
const int vle32m1 = __riscv_vsetvlmax_e32m1();
vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
vuint8m1_t v0 = __riscv_vle8_v_u8m1(src + j, vle8m1);
vuint16m2_t u0 = __riscv_vwmulu(v0, v0, vle8m1);
r0 = __riscv_vwredsumu(u0, r0, vle16m1 * 2);
vuint8m1_t v1 = __riscv_vle8_v_u8m1(src + j + vle8m1, vle8m1);
vuint16m2_t u1 = __riscv_vwmulu(v1, v1, vle8m1);
r1 = __riscv_vwredsumu(u1, r1, vle16m1 * 2);
}
return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}
template<> inline
int normL2_rvv(const schar* src, int n, int& j) {
const int vle8m1 = __riscv_vsetvlmax_e8m1();
const int vle16m1 = __riscv_vsetvlmax_e16m1();
const int vle32m1 = __riscv_vsetvlmax_e32m1();
vint32m1_t r0 = __riscv_vmv_v_x_i32m1(0, vle32m1);
vint32m1_t r1 = __riscv_vmv_v_x_i32m1(0, vle32m1);
for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
vint8m1_t v0 = __riscv_vle8_v_i8m1(src + j, vle8m1);
vint16m2_t u0 = __riscv_vwmul(v0, v0, vle8m1);
r0 = __riscv_vwredsum(u0, r0, vle16m1 * 2);
vint8m1_t v1 = __riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1);
vint16m2_t u1 = __riscv_vwmul(v1, v1, vle8m1);
r1 = __riscv_vwredsum(u1, r1, vle16m1 * 2);
}
return __riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}
template<> inline
double normL2_rvv(const double* src, int n, int& j) {
const int vle64m1 = __riscv_vsetvlmax_e64m1();
vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) {
vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1);
r0 = __riscv_vfmacc(r0, v0, v0, vle64m1);
vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1);
r1 = __riscv_vfmacc(r1, v1, v1, vle64m1);
}
r0 = __riscv_vfadd(r0, r1, vle64m1);
return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1));
}
CV_CPU_OPTIMIZATION_NAMESPACE_END
} // cv::


@@ -0,0 +1,676 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#if CV_RVV
#include "norm.rvv1p0.hpp"
#endif
namespace cv {
using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int);
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
NormFunc getNormFunc(int normType, int depth);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
template <typename T, typename ST>
struct NormInf_SIMD {
inline ST operator() (const T* src, int n) const {
ST s = 0;
for (int i = 0; i < n; i++) {
s = std::max(s, (ST)cv_abs(src[i]));
}
return s;
}
};
template <typename T, typename ST>
struct NormL1_SIMD {
inline ST operator() (const T* src, int n) const {
ST s = 0;
for (int i = 0; i < n; i++) {
s += cv_abs(src[i]);
}
return s;
}
};
template <typename T, typename ST>
struct NormL2_SIMD {
inline ST operator() (const T* src, int n) const {
ST s = 0;
for (int i = 0; i < n; i++) {
ST v = src[i];
s += v * v;
}
return s;
}
};
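// The operators above are generic scalar fallbacks; when universal intrinsics are
// available (CV_SIMD / CV_SIMD_SCALABLE), the specializations below override them
// with vectorized implementations.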
#if (CV_SIMD || CV_SIMD_SCALABLE)
template<>
struct NormInf_SIMD<uchar, int> {
int operator() (const uchar* src, int n) const {
int j = 0;
int s = 0;
v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8();
v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8();
for (; j <= n - 4 * VTraits<v_uint8>::vlanes(); j += 4 * VTraits<v_uint8>::vlanes()) {
r0 = v_max(r0, vx_load(src + j ));
r1 = v_max(r1, vx_load(src + j + VTraits<v_uint8>::vlanes()));
r2 = v_max(r2, vx_load(src + j + 2 * VTraits<v_uint8>::vlanes()));
r3 = v_max(r3, vx_load(src + j + 3 * VTraits<v_uint8>::vlanes()));
}
r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
for (; j < n; j++) {
s = std::max(s, (int)src[j]);
}
return std::max(s, (int)v_reduce_max(r0));
}
};
template<>
struct NormInf_SIMD<schar, int> {
int operator() (const schar* src, int n) const {
int j = 0;
int s = 0;
v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8();
v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8();
for (; j <= n - 4 * VTraits<v_int8>::vlanes(); j += 4 * VTraits<v_int8>::vlanes()) {
r0 = v_max(r0, v_abs(vx_load(src + j )));
r1 = v_max(r1, v_abs(vx_load(src + j + VTraits<v_int8>::vlanes())));
r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits<v_int8>::vlanes())));
r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits<v_int8>::vlanes())));
}
r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
for (; j < n; j++) {
s = std::max(s, cv_abs(src[j]));
}
return std::max(s, saturate_cast<int>(v_reduce_max(r0)));
}
};
template<>
struct NormInf_SIMD<ushort, int> {
int operator() (const ushort* src, int n) const {
int j = 0;
int s = 0;
v_uint16 d0 = vx_setzero_u16(), d1 = vx_setzero_u16();
v_uint16 d2 = vx_setzero_u16(), d3 = vx_setzero_u16();
for (; j <= n - 4 * VTraits<v_uint16>::vlanes(); j += 4 * VTraits<v_uint16>::vlanes()) {
d0 = v_max(d0, vx_load(src + j ));
d1 = v_max(d1, vx_load(src + j + VTraits<v_uint16>::vlanes()));
d2 = v_max(d2, vx_load(src + j + 2 * VTraits<v_uint16>::vlanes()));
d3 = v_max(d3, vx_load(src + j + 3 * VTraits<v_uint16>::vlanes()));
}
d0 = v_max(d0, v_max(d1, v_max(d2, d3)));
for (; j < n; j++) {
s = std::max(s, (int)src[j]);
}
return std::max(s, (int)v_reduce_max(d0));
}
};
template<>
struct NormInf_SIMD<short, int> {
int operator() (const short* src, int n) const {
int j = 0;
int s = 0;
v_uint16 d0 = vx_setzero_u16(), d1 = vx_setzero_u16();
v_uint16 d2 = vx_setzero_u16(), d3 = vx_setzero_u16();
for (; j <= n - 4 * VTraits<v_int16>::vlanes(); j += 4 * VTraits<v_int16>::vlanes()) {
d0 = v_max(d0, v_abs(vx_load(src + j )));
d1 = v_max(d1, v_abs(vx_load(src + j + VTraits<v_int16>::vlanes())));
d2 = v_max(d2, v_abs(vx_load(src + j + 2 * VTraits<v_int16>::vlanes())));
d3 = v_max(d3, v_abs(vx_load(src + j + 3 * VTraits<v_int16>::vlanes())));
}
d0 = v_max(d0, v_max(d1, v_max(d2, d3)));
for (; j < n; j++) {
s = std::max(s, saturate_cast<int>(cv_abs(src[j])));
}
return std::max(s, saturate_cast<int>(v_reduce_max(d0)));
}
};
template<>
struct NormInf_SIMD<int, int> {
int operator() (const int* src, int n) const {
int j = 0;
int s = 0;
#if CV_RVV
s = normInf_rvv<int, int>(src, n, j);
#else
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32();
for (; j <= n - 4 * VTraits<v_int32>::vlanes(); j += 4 * VTraits<v_int32>::vlanes()) {
r0 = v_max(r0, v_abs(vx_load(src + j )));
r1 = v_max(r1, v_abs(vx_load(src + j + VTraits<v_int32>::vlanes())));
r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits<v_int32>::vlanes())));
r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits<v_int32>::vlanes())));
}
r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
s = std::max(s, saturate_cast<int>(v_reduce_max(r0)));
#endif
for (; j < n; j++) {
s = std::max(s, cv_abs(src[j]));
}
return s;
}
};
template<>
struct NormInf_SIMD<float, float> {
float operator() (const float* src, int n) const {
int j = 0;
float s = 0.f;
v_float32 r0 = vx_setzero_f32(), r1 = vx_setzero_f32();
v_float32 r2 = vx_setzero_f32(), r3 = vx_setzero_f32();
for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes()) {
r0 = v_max(r0, v_abs(vx_load(src + j )));
r1 = v_max(r1, v_abs(vx_load(src + j + VTraits<v_float32>::vlanes())));
r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits<v_float32>::vlanes())));
r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits<v_float32>::vlanes())));
}
r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
for (; j < n; j++) {
s = std::max(s, cv_abs(src[j]));
}
return std::max(s, v_reduce_max(r0));
}
};
template<>
struct NormL1_SIMD<uchar, int> {
int operator() (const uchar* src, int n) const {
int j = 0;
int s = 0;
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
v_uint8 one = vx_setall_u8(1);
for (; j <= n - 2 * VTraits<v_uint8>::vlanes(); j += 2 * VTraits<v_uint8>::vlanes()) {
v_uint8 v0 = vx_load(src + j);
r0 = v_dotprod_expand_fast(v0, one, r0);
v_uint8 v1 = vx_load(src + j + VTraits<v_uint8>::vlanes());
r1 = v_dotprod_expand_fast(v1, one, r1);
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
s += src[j];
}
return s;
}
};
template<>
struct NormL1_SIMD<schar, int> {
int operator() (const schar* src, int n) const {
int j = 0;
int s = 0;
#if CV_RVV
s = normL1_rvv<schar, int>(src, n, j);
#else
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
v_uint8 one = vx_setall_u8(1);
for (; j <= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
v_uint8 v0 = v_abs(vx_load(src + j));
r0 = v_dotprod_expand_fast(v0, one, r0);
v_uint8 v1 = v_abs(vx_load(src + j + VTraits<v_int8>::vlanes()));
r1 = v_dotprod_expand_fast(v1, one, r1);
}
s += v_reduce_sum(v_add(r0, r1));
#endif
for (; j < n; j++) {
s += saturate_cast<int>(cv_abs(src[j]));
}
return s;
}
};
template<>
struct NormL1_SIMD<ushort, int> {
int operator() (const ushort* src, int n) const {
int j = 0;
int s = 0;
#if CV_RVV
s = normL1_rvv<ushort, int>(src, n, j);
#else
v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
for (; j <= n - 2 * VTraits<v_uint16>::vlanes(); j += 2 * VTraits<v_uint16>::vlanes()) {
v_uint16 v0 = vx_load(src + j);
v_uint32 v00, v01;
v_expand(v0, v00, v01);
r00 = v_add(r00, v00);
r01 = v_add(r01, v01);
v_uint16 v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
v_uint32 v10, v11;
v_expand(v1, v10, v11);
r10 = v_add(r10, v10);
r11 = v_add(r11, v11);
}
s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
for (; j < n; j++) {
s += src[j];
}
return s;
}
};
template<>
struct NormL1_SIMD<short, int> {
int operator() (const short* src, int n) const {
int j = 0;
int s = 0;
#if CV_RVV
s = normL1_rvv<short, int>(src, n, j);
#else
v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
for (; j <= n - 2 * VTraits<v_int16>::vlanes(); j += 2 * VTraits<v_int16>::vlanes()) {
v_uint16 v0 = v_abs(vx_load(src + j));
v_uint32 v00, v01;
v_expand(v0, v00, v01);
r00 = v_add(r00, v00);
r01 = v_add(r01, v01);
v_uint16 v1 = v_abs(vx_load(src + j + VTraits<v_int16>::vlanes()));
v_uint32 v10, v11;
v_expand(v1, v10, v11);
r10 = v_add(r10, v10);
r11 = v_add(r11, v11);
}
s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
for (; j < n; j++) {
s += saturate_cast<int>(cv_abs(src[j]));
}
return s;
}
};
template<>
struct NormL2_SIMD<uchar, int> {
int operator() (const uchar* src, int n) const {
int j = 0;
int s = 0;
#if CV_RVV
s = normL2_rvv<uchar, int>(src, n, j);
#else
v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
for (; j <= n - 2 * VTraits<v_uint8>::vlanes(); j += 2 * VTraits<v_uint8>::vlanes()) {
v_uint8 v0 = vx_load(src + j);
r0 = v_dotprod_expand_fast(v0, v0, r0);
v_uint8 v1 = vx_load(src + j + VTraits<v_uint8>::vlanes());
r1 = v_dotprod_expand_fast(v1, v1, r1);
}
s += v_reduce_sum(v_add(r0, r1));
#endif
for (; j < n; j++) {
int v = saturate_cast<int>(src[j]);
s += v * v;
}
return s;
}
};
template<>
struct NormL2_SIMD<schar, int> {
int operator() (const schar* src, int n) const {
int j = 0;
int s = 0;
#if CV_RVV
s = normL2_rvv<schar, int>(src, n, j);
#else
v_int32 r0 = vx_setzero_s32(), r1 = vx_setzero_s32();
for (; j <= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
v_int8 v0 = vx_load(src + j);
r0 = v_dotprod_expand_fast(v0, v0, r0);
v_int8 v1 = vx_load(src + j + VTraits<v_int8>::vlanes());
r1 = v_dotprod_expand_fast(v1, v1, r1);
}
s += v_reduce_sum(v_add(r0, r1));
#endif
for (; j < n; j++) {
int v = saturate_cast<int>(src[j]);
s += v * v;
}
return s;
}
};
#endif
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template<>
struct NormInf_SIMD<double, double> {
double operator() (const double* src, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
r0 = v_max(r0, v_abs(vx_load(src + j )));
r1 = v_max(r1, v_abs(vx_load(src + j + VTraits<v_float64>::vlanes())));
r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
}
r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
for (; j < n; j++) {
s = std::max(s, cv_abs(src[j]));
}
// [TODO]: use v_reduce_max when it supports float64
double t[VTraits<v_float64>::max_nlanes];
vx_store(t, r0);
for (int i = 0; i < VTraits<v_float64>::vlanes(); i++) {
s = std::max(s, cv_abs(t[i]));
}
return s;
}
};
template<>
struct NormL1_SIMD<int, double> {
double operator() (const int* src, int n) const {
int j = 0;
double s = 0.f;
v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
v_float32 v0 = v_abs(v_cvt_f32(vx_load(src + j))), v1 = v_abs(v_cvt_f32(vx_load(src + j + VTraits<v_int32>::vlanes())));
r00 = v_add(r00, v_cvt_f64(v0)); r01 = v_add(r01, v_cvt_f64_high(v0));
r10 = v_add(r10, v_cvt_f64(v1)); r11 = v_add(r11, v_cvt_f64_high(v1));
}
s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
for (; j < n; j++) {
s += cv_abs(src[j]);
}
return s;
}
};
template<>
struct NormL1_SIMD<float, double> {
double operator() (const float* src, int n) const {
int j = 0;
double s = 0.f;
v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
v_float64 r20 = vx_setzero_f64(), r21 = vx_setzero_f64();
v_float64 r30 = vx_setzero_f64(), r31 = vx_setzero_f64();
for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes()) {
v_float32 v0 = v_abs(vx_load(src + j)), v1 = v_abs(vx_load(src + j + VTraits<v_float32>::vlanes()));
r00 = v_add(r00, v_cvt_f64(v0)); r01 = v_add(r01, v_cvt_f64_high(v0));
r10 = v_add(r10, v_cvt_f64(v1)); r11 = v_add(r11, v_cvt_f64_high(v1));
v_float32 v2 = v_abs(vx_load(src + j + 2 * VTraits<v_float32>::vlanes())), v3 = v_abs(vx_load(src + j + 3 * VTraits<v_float32>::vlanes()));
r20 = v_add(r20, v_cvt_f64(v2)); r21 = v_add(r21, v_cvt_f64_high(v2));
r30 = v_add(r30, v_cvt_f64(v3)); r31 = v_add(r31, v_cvt_f64_high(v3));
}
s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
s += v_reduce_sum(v_add(v_add(v_add(r20, r21), r30), r31));
for (; j < n; j++) {
s += cv_abs(src[j]);
}
return s;
}
};
template<>
struct NormL1_SIMD<double, double> {
double operator() (const double* src, int n) const {
int j = 0;
double s = 0.f;
#if CV_RVV // This is introduced to work around the accuracy issue on CI
s = normL1_rvv<double, double>(src, n, j);
#else
v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
r00 = v_add(r00, v_abs(vx_load(src + j )));
r01 = v_add(r01, v_abs(vx_load(src + j + VTraits<v_float64>::vlanes())));
r10 = v_add(r10, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
r11 = v_add(r11, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
}
s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
for (; j < n; j++) {
s += cv_abs(src[j]);
}
return s;
}
};
template<>
struct NormL2_SIMD<ushort, double> {
double operator() (const ushort* src, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_uint16>::vlanes(); j += 2 * VTraits<v_uint16>::vlanes()) {
v_uint16 v0 = vx_load(src + j);
v_uint64 u0 = v_dotprod_expand_fast(v0, v0);
r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(u0)));
v_uint16 v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
v_uint64 u1 = v_dotprod_expand_fast(v1, v1);
r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(u1)));
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = saturate_cast<double>(src[j]);
s += v * v;
}
return s;
}
};
template<>
struct NormL2_SIMD<short, double> {
double operator() (const short* src, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_int16>::vlanes(); j += 2 * VTraits<v_int16>::vlanes()) {
v_int16 v0 = vx_load(src + j);
r0 = v_add(r0, v_cvt_f64(v_dotprod_expand_fast(v0, v0)));
v_int16 v1 = vx_load(src + j + VTraits<v_int16>::vlanes());
r1 = v_add(r1, v_cvt_f64(v_dotprod_expand_fast(v1, v1)));
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = saturate_cast<double>(src[j]);
s += v * v;
}
return s;
}
};
template<>
struct NormL2_SIMD<int, double> {
double operator() (const int* src, int n) const {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
v_int32 v0 = vx_load(src + j);
r0 = v_dotprod_expand_fast(v0, v0, r0);
v_int32 v1 = vx_load(src + j + VTraits<v_int32>::vlanes());
r1 = v_dotprod_expand_fast(v1, v1, r1);
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = src[j];
s += v * v;
}
return s;
}
};
template<>
struct NormL2_SIMD<float, double> {
double operator() (const float* src, int n) const {
int j = 0;
double s = 0.f;
v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_float32>::vlanes(); j += 2 * VTraits<v_float32>::vlanes()) {
v_float32 v0 = vx_load(src + j), v1 = vx_load(src + j + VTraits<v_float32>::vlanes());
v_float64 v00 = v_cvt_f64(v0), v01 = v_cvt_f64_high(v0);
v_float64 v10 = v_cvt_f64(v1), v11 = v_cvt_f64_high(v1);
r00 = v_fma(v00, v00, r00); r01 = v_fma(v01, v01, r01);
r10 = v_fma(v10, v10, r10); r11 = v_fma(v11, v11, r11);
}
s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
for (; j < n; j++) {
double v = src[j];
s += v * v;
}
return s;
}
};
template<>
struct NormL2_SIMD<double, double> {
double operator() (const double* src, int n) const {
int j = 0;
double s = 0.f;
#if CV_RVV // This is introduced to work around the accuracy issue on CI
s = normL2_rvv<double, double>(src, n, j);
#else
v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
v_float64 v00 = vx_load(src + j );
v_float64 v01 = vx_load(src + j + VTraits<v_float64>::vlanes());
v_float64 v10 = vx_load(src + j + 2 * VTraits<v_float64>::vlanes());
v_float64 v11 = vx_load(src + j + 3 * VTraits<v_float64>::vlanes());
r00 = v_fma(v00, v00, r00); r01 = v_fma(v01, v01, r01);
r10 = v_fma(v10, v10, r10); r11 = v_fma(v11, v11, r11);
}
s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
for (; j < n; j++) {
double v = src[j];
s += v * v;
}
return s;
}
};
#endif
template<typename T, typename ST> int
normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) {
ST result = *_result;
if( !mask ) {
NormInf_SIMD<T, ST> op;
result = std::max(result, op(src, len*cn));
} else {
for( int i = 0; i < len; i++, src += cn ) {
if( mask[i] ) {
for( int k = 0; k < cn; k++ ) {
result = std::max(result, ST(cv_abs(src[k])));
}
}
}
}
*_result = result;
return 0;
}
template<typename T, typename ST> int
normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn) {
ST result = *_result;
if( !mask ) {
NormL1_SIMD<T, ST> op;
result += op(src, len*cn);
} else {
for( int i = 0; i < len; i++, src += cn ) {
if( mask[i] ) {
for( int k = 0; k < cn; k++ ) {
result += cv_abs(src[k]);
}
}
}
}
*_result = result;
return 0;
}
template<typename T, typename ST> int
normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn) {
ST result = *_result;
if( !mask ) {
NormL2_SIMD<T, ST> op;
result += op(src, len*cn);
} else {
for( int i = 0; i < len; i++, src += cn ) {
if( mask[i] ) {
for( int k = 0; k < cn; k++ ) {
T v = src[k];
result += (ST)v*v;
}
}
}
}
*_result = result;
return 0;
}
#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
{ CV_INSTRUMENT_REGION(); return norm##L##_(src, mask, r, len, cn); } \
#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
CV_DEF_NORM_FUNC(L2, suffix, type, l2type)
CV_DEF_NORM_ALL(8u, uchar, int, int, int)
CV_DEF_NORM_ALL(8s, schar, int, int, int)
CV_DEF_NORM_ALL(16u, ushort, int, int, double)
CV_DEF_NORM_ALL(16s, short, int, int, double)
CV_DEF_NORM_ALL(32s, int, int, double, double)
CV_DEF_NORM_ALL(32f, float, float, double, double)
CV_DEF_NORM_ALL(64f, double, double, double, double)
NormFunc getNormFunc(int normType, int depth)
{
CV_INSTRUMENT_REGION();
static NormFunc normTab[3][8] =
{
{
(NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
(NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
},
{
(NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
(NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
},
{
(NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
(NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
}
};
return normTab[normType][depth];
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
} // cv::