mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 06:26:29 +08:00
Merge pull request #26885 from fengyuentau:4x/core/normalize_simd
core: vectorize cv::normalize / cv::norm #26885

Checklist:

|      | normInf | normL1 | normL2 |
| ---- | ------- | ------ | ------ |
| bool | -       | -      | -      |
| 8u   | √       | √      | √      |
| 8s   | √       | √      | √      |
| 16u  | √       | √      | √      |
| 16s  | √       | √      | √      |
| 16f  | -       | -      | -      |
| 16bf | -       | -      | -      |
| 32u  | -       | -      | -      |
| 32s  | √       | √      | √      |
| 32f  | √       | √      | √      |
| 64u  | -       | -      | -      |
| 64s  | -       | -      | -      |
| 64f  | √       | √      | √      |

*: Vectorization of the data types bool, 16f, 16bf, 32u, 64u and 64s needs to be done on 5.x.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
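For reference, a minimal usage sketch (not part of the patch) of the public API paths this PR accelerates; the matrix sizes and fill values below are arbitrary:

```cpp
// Minimal sketch: cv::norm / cv::normalize calls whose inner loops go through the
// vectorized normInf / normL1 / normL2 kernels listed in the table above.
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Mat a(480, 640, CV_8UC1, cv::Scalar(7));      // 8u row of the table
    cv::Mat b(480, 640, CV_32FC1, cv::Scalar(0.5f));  // 32f row of the table

    std::cout << cv::norm(a, cv::NORM_INF) << "\n";   // normInf
    std::cout << cv::norm(a, cv::NORM_L1)  << "\n";   // normL1
    std::cout << cv::norm(b, cv::NORM_L2)  << "\n";   // normL2

    cv::Mat dst;
    cv::normalize(b, dst, 1.0, 0.0, cv::NORM_L2);     // cv::normalize reuses the same kernels
    return 0;
}
```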
This commit is contained in:
parent
7a2b048c92
commit
e2803bee5c
@@ -12,6 +12,7 @@ ocv_add_dispatched_file(mean SSE2 AVX2 LASX)
 ocv_add_dispatched_file(merge SSE2 AVX2 LASX)
 ocv_add_dispatched_file(split SSE2 AVX2 LASX)
 ocv_add_dispatched_file(sum SSE2 AVX2 LASX)
+ocv_add_dispatched_file(norm SSE2 SSE4_1 AVX AVX2 NEON_DOTPROD LASX)

 # dispatching for accuracy tests
 ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX)
@@ -7,6 +7,9 @@
 #include "opencl_kernels_core.hpp"
 #include "stat.hpp"

+#include "norm.simd.hpp"
+#include "norm.simd_declarations.hpp"
+
 /****************************************************************************************\
 *                                         norm                                          *
 \****************************************************************************************/
@@ -215,72 +218,6 @@ int normL1_(const uchar* a, const uchar* b, int n)

 //==================================================================================================

-template<typename T, typename ST> int
-normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
-{
-    ST result = *_result;
-    if( !mask )
-    {
-        result = std::max(result, normInf<T, ST>(src, len*cn));
-    }
-    else
-    {
-        for( int i = 0; i < len; i++, src += cn )
-            if( mask[i] )
-            {
-                for( int k = 0; k < cn; k++ )
-                    result = std::max(result, ST(cv_abs(src[k])));
-            }
-    }
-    *_result = result;
-    return 0;
-}
-
-template<typename T, typename ST> int
-normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
-{
-    ST result = *_result;
-    if( !mask )
-    {
-        result += normL1<T, ST>(src, len*cn);
-    }
-    else
-    {
-        for( int i = 0; i < len; i++, src += cn )
-            if( mask[i] )
-            {
-                for( int k = 0; k < cn; k++ )
-                    result += cv_abs(src[k]);
-            }
-    }
-    *_result = result;
-    return 0;
-}
-
-template<typename T, typename ST> int
-normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
-{
-    ST result = *_result;
-    if( !mask )
-    {
-        result += normL2Sqr<T, ST>(src, len*cn);
-    }
-    else
-    {
-        for( int i = 0; i < len; i++, src += cn )
-            if( mask[i] )
-            {
-                for( int k = 0; k < cn; k++ )
-                {
-                    T v = src[k];
-                    result += (ST)v*v;
-                }
-            }
-    }
-    *_result = result;
-    return 0;
-}
-
 template<typename T, typename ST> int
 normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
 {
@@ -347,51 +284,27 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
     return 0;
 }

-#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
-    static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
-{ return norm##L##_(src, mask, r, len, cn); } \
+#define CV_DEF_NORM_DIFF_FUNC(L, suffix, type, ntype) \
     static int normDiff##L##_##suffix(const type* src1, const type* src2, \
     const uchar* mask, ntype* r, int len, int cn) \
 { return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }

-#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
-    CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
-    CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
-    CV_DEF_NORM_FUNC(L2, suffix, type, l2type)
-
-CV_DEF_NORM_ALL(8u, uchar, int, int, int)
-CV_DEF_NORM_ALL(8s, schar, int, int, int)
-CV_DEF_NORM_ALL(16u, ushort, int, int, double)
-CV_DEF_NORM_ALL(16s, short, int, int, double)
-CV_DEF_NORM_ALL(32s, int, int, double, double)
-CV_DEF_NORM_ALL(32f, float, float, double, double)
-CV_DEF_NORM_ALL(64f, double, double, double, double)
+#define CV_DEF_NORM_DIFF_ALL(suffix, type, inftype, l1type, l2type) \
+    CV_DEF_NORM_DIFF_FUNC(Inf, suffix, type, inftype) \
+    CV_DEF_NORM_DIFF_FUNC(L1, suffix, type, l1type) \
+    CV_DEF_NORM_DIFF_FUNC(L2, suffix, type, l2type)
+
+CV_DEF_NORM_DIFF_ALL(8u, uchar, int, int, int)
+CV_DEF_NORM_DIFF_ALL(8s, schar, int, int, int)
+CV_DEF_NORM_DIFF_ALL(16u, ushort, int, int, double)
+CV_DEF_NORM_DIFF_ALL(16s, short, int, int, double)
+CV_DEF_NORM_DIFF_ALL(32s, int, int, double, double)
+CV_DEF_NORM_DIFF_ALL(32f, float, float, double, double)
+CV_DEF_NORM_DIFF_ALL(64f, double, double, double, double)

-typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
 typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);

-static NormFunc getNormFunc(int normType, int depth)
-{
-    static NormFunc normTab[3][8] =
-    {
-        {
-            (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
-            (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
-        },
-        {
-            (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
-            (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
-        },
-        {
-            (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
-            (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
-        }
-    };
-
-    return normTab[normType][depth];
-}
-
 static NormDiffFunc getNormDiffFunc(int normType, int depth)
 {
     static NormDiffFunc normDiffTab[3][8] =
@@ -603,6 +516,11 @@ static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result)
 } // ipp_norm()
 #endif // HAVE_IPP

+static NormFunc getNormFunc(int normType, int depth) {
+    CV_INSTRUMENT_REGION();
+    CV_CPU_DISPATCH(getNormFunc, (normType, depth), CV_CPU_DISPATCH_MODES_ALL);
+}
+
 double norm( InputArray _src, int normType, InputArray _mask )
 {
     CV_INSTRUMENT_REGION();
@@ -637,6 +555,9 @@ double norm( InputArray _src, int normType, InputArray _mask )

     CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(src, normType, mask, _result), _result);

+    NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
+    CV_Assert( func != 0 );
+
     if( src.isContinuous() && mask.empty() )
     {
         size_t len = src.total()*cn;
@@ -644,30 +565,18 @@ double norm( InputArray _src, int normType, InputArray _mask )
         {
             if( depth == CV_32F )
             {
-                const float* data = src.ptr<float>();
+                const uchar* data = src.ptr<const uchar>();

-                if( normType == NORM_L2 )
+                if( normType == NORM_L2 || normType == NORM_L2SQR || normType == NORM_L1 )
                 {
                     double result = 0;
-                    GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
-                    return std::sqrt(result);
-                }
-                if( normType == NORM_L2SQR )
-                {
-                    double result = 0;
-                    GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
-                    return result;
-                }
-                if( normType == NORM_L1 )
-                {
-                    double result = 0;
-                    GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
-                    return result;
+                    func(data, 0, (uchar*)&result, (int)len, 1);
+                    return normType == NORM_L2 ? std::sqrt(result) : result;
                 }
                 if( normType == NORM_INF )
                 {
                     float result = 0;
-                    GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
+                    func(data, 0, (uchar*)&result, (int)len, 1);
                     return result;
                 }
             }
@@ -714,9 +623,6 @@ double norm( InputArray _src, int normType, InputArray _mask )
         return result;
     }

-    NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
-    CV_Assert( func != 0 );
-
     const Mat* arrays[] = {&src, &mask, 0};
     uchar* ptrs[2] = {};
     union
modules/core/src/norm.rvv1p0.hpp (new file, 200 lines)
@@ -0,0 +1,200 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.

#include "opencv2/core/hal/intrin.hpp"

namespace cv {

namespace {

// [TODO] Drop this once rvv has dedicated intrinsics for abs on integers.
template<typename T, typename ST> inline ST __riscv_vabs(const T&);

template<> inline
vuint8m1_t __riscv_vabs(const vint8m1_t& v) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    vint8m1_t mask = __riscv_vsra_vx_i8m1(v, 7, vle8m1);
    vint8m1_t v_xor = __riscv_vxor_vv_i8m1(v, mask, vle8m1);
    return __riscv_vreinterpret_v_i8m1_u8m1(
        __riscv_vsub_vv_i8m1(v_xor, mask, vle8m1)
    );
}

template<> inline
vuint16m1_t __riscv_vabs(const vint16m1_t& v) {
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    vint16m1_t mask = __riscv_vsra_vx_i16m1(v, 15, vle16m1);
    vint16m1_t v_xor = __riscv_vxor_vv_i16m1(v, mask, vle16m1);
    return __riscv_vreinterpret_v_i16m1_u16m1(
        __riscv_vsub_vv_i16m1(v_xor, mask, vle16m1)
    );
}

template<> inline
vuint32m1_t __riscv_vabs(const vint32m1_t& v) {
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vint32m1_t mask = __riscv_vsra_vx_i32m1(v, 31, vle32m1);
    vint32m1_t v_xor = __riscv_vxor_vv_i32m1(v, mask, vle32m1);
    return __riscv_vreinterpret_v_i32m1_u32m1(
        __riscv_vsub_vv_i32m1(v_xor, mask, vle32m1)
    );
}
}

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

template <typename T, typename ST> inline
ST normInf_rvv(const T* src, int n, int& j);

template<> inline
int normInf_rvv(const int* src, int n, int& j) {
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle32m1; j += 2 * vle32m1) {
        vuint32m1_t v0 = __riscv_vabs<vint32m1_t, vuint32m1_t>(__riscv_vle32_v_i32m1(src + j, vle32m1));
        r0 = __riscv_vmaxu(r0, v0, vle32m1);

        vuint32m1_t v1 = __riscv_vabs<vint32m1_t, vuint32m1_t>(__riscv_vle32_v_i32m1(src + j + vle32m1, vle32m1));
        r1 = __riscv_vmaxu(r1, v1, vle32m1);
    }
    r0 = __riscv_vmaxu(r0, r1, vle32m1);
    return (int)__riscv_vmv_x(__riscv_vredmaxu(r0, __riscv_vmv_v_x_u32m1(0, vle32m1), vle32m1));
}

template <typename T, typename ST> inline
ST normL1_rvv(const T* src, int n, int& j);

template<> inline
int normL1_rvv(const schar* src, int n, int& j) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0, vle16m1);
    for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
        vuint8m1_t v0 = __riscv_vabs<vint8m1_t, vuint8m1_t>(__riscv_vle8_v_i8m1(src + j, vle8m1));
        vuint16m1_t u0 = __riscv_vwredsumu_tu(zero, v0, zero, vle8m1);
        r0 = __riscv_vwredsumu(u0, r0, vle16m1);

        vuint8m1_t v1 = __riscv_vabs<vint8m1_t, vuint8m1_t>(__riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1));
        vuint16m1_t u1 = __riscv_vwredsumu_tu(zero, v1, zero, vle8m1);
        r1 = __riscv_vwredsumu(u1, r1, vle16m1);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
int normL1_rvv(const ushort* src, int n, int& j) {
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle16m1; j += 2 * vle16m1) {
        vuint16m1_t v0 = __riscv_vle16_v_u16m1(src + j, vle16m1);
        r0 = __riscv_vwredsumu(v0, r0, vle16m1);

        vuint16m1_t v1 = __riscv_vle16_v_u16m1(src + j + vle16m1, vle16m1);
        r1 = __riscv_vwredsumu(v1, r1, vle16m1);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
int normL1_rvv(const short* src, int n, int& j) {
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle16m1; j += 2 * vle16m1) {
        vuint16m1_t v0 = __riscv_vabs<vint16m1_t, vuint16m1_t>(__riscv_vle16_v_i16m1(src + j, vle16m1));
        r0 = __riscv_vwredsumu(v0, r0, vle16m1);

        vuint16m1_t v1 = __riscv_vabs<vint16m1_t, vuint16m1_t>(__riscv_vle16_v_i16m1(src + j + vle16m1, vle16m1));
        r1 = __riscv_vwredsumu(v1, r1, vle16m1);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
double normL1_rvv(const double* src, int n, int& j) {
    const int vle64m1 = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) {
        vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1);
        v0 = __riscv_vfabs(v0, vle64m1);
        r0 = __riscv_vfadd(r0, v0, vle64m1);

        vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1);
        v1 = __riscv_vfabs(v1, vle64m1);
        r1 = __riscv_vfadd(r1, v1, vle64m1);
    }
    r0 = __riscv_vfadd(r0, r1, vle64m1);
    return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1));
}

template <typename T, typename ST> inline
ST normL2_rvv(const T* src, int n, int& j);

template<> inline
int normL2_rvv(const uchar* src, int n, int& j) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
        vuint8m1_t v0 = __riscv_vle8_v_u8m1(src + j, vle8m1);
        vuint16m2_t u0 = __riscv_vwmulu(v0, v0, vle8m1);
        r0 = __riscv_vwredsumu(u0, r0, vle16m1 * 2);

        vuint8m1_t v1 = __riscv_vle8_v_u8m1(src + j + vle8m1, vle8m1);
        vuint16m2_t u1 = __riscv_vwmulu(v1, v1, vle8m1);
        r1 = __riscv_vwredsumu(u1, r1, vle16m1 * 2);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
int normL2_rvv(const schar* src, int n, int& j) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vint32m1_t r0 = __riscv_vmv_v_x_i32m1(0, vle32m1);
    vint32m1_t r1 = __riscv_vmv_v_x_i32m1(0, vle32m1);
    for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
        vint8m1_t v0 = __riscv_vle8_v_i8m1(src + j, vle8m1);
        vint16m2_t u0 = __riscv_vwmul(v0, v0, vle8m1);
        r0 = __riscv_vwredsum(u0, r0, vle16m1 * 2);

        vint8m1_t v1 = __riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1);
        vint16m2_t u1 = __riscv_vwmul(v1, v1, vle8m1);
        r1 = __riscv_vwredsum(u1, r1, vle16m1 * 2);
    }
    return __riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
double normL2_rvv(const double* src, int n, int& j) {
    const int vle64m1 = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) {
        vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1);
        r0 = __riscv_vfmacc(r0, v0, v0, vle64m1);

        vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1);
        r1 = __riscv_vfmacc(r1, v1, v1, vle64m1);
    }
    r0 = __riscv_vfadd(r0, r1, vle64m1);
    return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1));
}

CV_CPU_OPTIMIZATION_NAMESPACE_END

} // cv::
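The __riscv_vabs helpers at the top of this file compute an unsigned absolute value with the classic shift/xor/subtract trick, since (at the time of this patch) RVV offers no dedicated integer abs intrinsic. A scalar sketch of the same identity, for reference only and not part of the file:

```cpp
// Scalar illustration of the trick used by __riscv_vabs: mask = x >> 31 is all ones
// for negative x and all zeros otherwise, so (x ^ mask) - mask negates x exactly when
// it is negative, without a branch.
#include <cstdint>
#include <cassert>

static inline uint32_t abs_via_sign_mask(int32_t x)
{
    int32_t mask = x >> 31;                  // arithmetic shift: 0 or -1
    return (uint32_t)((x ^ mask) - mask);    // conditional negate
}

int main()
{
    assert(abs_via_sign_mask(-5) == 5u);
    assert(abs_via_sign_mask(7) == 7u);
    return 0;
}
```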
modules/core/src/norm.simd.hpp (new file, 676 lines)
@@ -0,0 +1,676 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

#include "precomp.hpp"

#if CV_RVV
#include "norm.rvv1p0.hpp"
#endif

namespace cv {

using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int);

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

NormFunc getNormFunc(int normType, int depth);

#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

template <typename T, typename ST>
struct NormInf_SIMD {
    inline ST operator() (const T* src, int n) const {
        ST s = 0;
        for (int i = 0; i < n; i++) {
            s = std::max(s, (ST)cv_abs(src[i]));
        }
        return s;
    }
};

template <typename T, typename ST>
struct NormL1_SIMD {
    inline ST operator() (const T* src, int n) const {
        ST s = 0;
        for (int i = 0; i < n; i++) {
            s += cv_abs(src[i]);
        }
        return s;
    }
};

template <typename T, typename ST>
struct NormL2_SIMD {
    inline ST operator() (const T* src, int n) const {
        ST s = 0;
        for (int i = 0; i < n; i++) {
            ST v = src[i];
            s += v * v;
        }
        return s;
    }
};

#if (CV_SIMD || CV_SIMD_SCALABLE)

template<>
struct NormInf_SIMD<uchar, int> {
    int operator() (const uchar* src, int n) const {
        int j = 0;
        int s = 0;
        v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8();
        v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8();
        for (; j <= n - 4 * VTraits<v_uint8>::vlanes(); j += 4 * VTraits<v_uint8>::vlanes()) {
            r0 = v_max(r0, vx_load(src + j));
            r1 = v_max(r1, vx_load(src + j + VTraits<v_uint8>::vlanes()));
            r2 = v_max(r2, vx_load(src + j + 2 * VTraits<v_uint8>::vlanes()));
            r3 = v_max(r3, vx_load(src + j + 3 * VTraits<v_uint8>::vlanes()));
        }
        r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
        for (; j < n; j++) {
            s = std::max(s, (int)src[j]);
        }
        return std::max(s, (int)v_reduce_max(r0));
    }
};

template<>
struct NormInf_SIMD<schar, int> {
    int operator() (const schar* src, int n) const {
        int j = 0;
        int s = 0;
        v_uint8 r0 = vx_setzero_u8(), r1 = vx_setzero_u8();
        v_uint8 r2 = vx_setzero_u8(), r3 = vx_setzero_u8();
        for (; j <= n - 4 * VTraits<v_int8>::vlanes(); j += 4 * VTraits<v_int8>::vlanes()) {
            r0 = v_max(r0, v_abs(vx_load(src + j)));
            r1 = v_max(r1, v_abs(vx_load(src + j + VTraits<v_int8>::vlanes())));
            r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits<v_int8>::vlanes())));
            r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits<v_int8>::vlanes())));
        }
        r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
        for (; j < n; j++) {
            s = std::max(s, cv_abs(src[j]));
        }
        return std::max(s, saturate_cast<int>(v_reduce_max(r0)));
    }
};

template<>
struct NormInf_SIMD<ushort, int> {
    int operator() (const ushort* src, int n) const {
        int j = 0;
        int s = 0;
        v_uint16 d0 = vx_setzero_u16(), d1 = vx_setzero_u16();
        v_uint16 d2 = vx_setzero_u16(), d3 = vx_setzero_u16();
        for (; j <= n - 4 * VTraits<v_uint16>::vlanes(); j += 4 * VTraits<v_uint16>::vlanes()) {
            d0 = v_max(d0, vx_load(src + j));
            d1 = v_max(d1, vx_load(src + j + VTraits<v_uint16>::vlanes()));
            d2 = v_max(d2, vx_load(src + j + 2 * VTraits<v_uint16>::vlanes()));
            d3 = v_max(d3, vx_load(src + j + 3 * VTraits<v_uint16>::vlanes()));
        }
        d0 = v_max(d0, v_max(d1, v_max(d2, d3)));
        for (; j < n; j++) {
            s = std::max(s, (int)src[j]);
        }
        return std::max(s, (int)v_reduce_max(d0));
    }
};

template<>
struct NormInf_SIMD<short, int> {
    int operator() (const short* src, int n) const {
        int j = 0;
        int s = 0;
        v_uint16 d0 = vx_setzero_u16(), d1 = vx_setzero_u16();
        v_uint16 d2 = vx_setzero_u16(), d3 = vx_setzero_u16();
        for (; j <= n - 4 * VTraits<v_int16>::vlanes(); j += 4 * VTraits<v_int16>::vlanes()) {
            d0 = v_max(d0, v_abs(vx_load(src + j)));
            d1 = v_max(d1, v_abs(vx_load(src + j + VTraits<v_int16>::vlanes())));
            d2 = v_max(d2, v_abs(vx_load(src + j + 2 * VTraits<v_int16>::vlanes())));
            d3 = v_max(d3, v_abs(vx_load(src + j + 3 * VTraits<v_int16>::vlanes())));
        }
        d0 = v_max(d0, v_max(d1, v_max(d2, d3)));
        for (; j < n; j++) {
            s = std::max(s, saturate_cast<int>(cv_abs(src[j])));
        }
        return std::max(s, saturate_cast<int>(v_reduce_max(d0)));
    }
};

template<>
struct NormInf_SIMD<int, int> {
    int operator() (const int* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normInf_rvv<int, int>(src, n, j);
#else
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32();
        for (; j <= n - 4 * VTraits<v_int32>::vlanes(); j += 4 * VTraits<v_int32>::vlanes()) {
            r0 = v_max(r0, v_abs(vx_load(src + j)));
            r1 = v_max(r1, v_abs(vx_load(src + j + VTraits<v_int32>::vlanes())));
            r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits<v_int32>::vlanes())));
            r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits<v_int32>::vlanes())));
        }
        r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
        s = std::max(s, saturate_cast<int>(v_reduce_max(r0)));
#endif
        for (; j < n; j++) {
            s = std::max(s, cv_abs(src[j]));
        }
        return s;
    }
};

template<>
struct NormInf_SIMD<float, float> {
    float operator() (const float* src, int n) const {
        int j = 0;
        float s = 0.f;
        v_float32 r0 = vx_setzero_f32(), r1 = vx_setzero_f32();
        v_float32 r2 = vx_setzero_f32(), r3 = vx_setzero_f32();
        for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes()) {
            r0 = v_max(r0, v_abs(vx_load(src + j)));
            r1 = v_max(r1, v_abs(vx_load(src + j + VTraits<v_float32>::vlanes())));
            r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits<v_float32>::vlanes())));
            r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits<v_float32>::vlanes())));
        }
        r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
        for (; j < n; j++) {
            s = std::max(s, cv_abs(src[j]));
        }
        return std::max(s, v_reduce_max(r0));
    }
};

template<>
struct NormL1_SIMD<uchar, int> {
    int operator() (const uchar* src, int n) const {
        int j = 0;
        int s = 0;
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        v_uint8 one = vx_setall_u8(1);
        for (; j <= n - 2 * VTraits<v_uint8>::vlanes(); j += 2 * VTraits<v_uint8>::vlanes()) {
            v_uint8 v0 = vx_load(src + j);
            r0 = v_dotprod_expand_fast(v0, one, r0);

            v_uint8 v1 = vx_load(src + j + VTraits<v_uint8>::vlanes());
            r1 = v_dotprod_expand_fast(v1, one, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
        for (; j < n; j++) {
            s += src[j];
        }
        return s;
    }
};

template<>
struct NormL1_SIMD<schar, int> {
    int operator() (const schar* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL1_rvv<schar, int>(src, n, j);
#else
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        v_uint8 one = vx_setall_u8(1);
        for (; j <= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
            v_uint8 v0 = v_abs(vx_load(src + j));
            r0 = v_dotprod_expand_fast(v0, one, r0);

            v_uint8 v1 = v_abs(vx_load(src + j + VTraits<v_int8>::vlanes()));
            r1 = v_dotprod_expand_fast(v1, one, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
#endif
        for (; j < n; j++) {
            s += saturate_cast<int>(cv_abs(src[j]));
        }
        return s;
    }
};

template<>
struct NormL1_SIMD<ushort, int> {
    int operator() (const ushort* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL1_rvv<ushort, int>(src, n, j);
#else
        v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
        v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
        for (; j <= n - 2 * VTraits<v_uint16>::vlanes(); j += 2 * VTraits<v_uint16>::vlanes()) {
            v_uint16 v0 = vx_load(src + j);
            v_uint32 v00, v01;
            v_expand(v0, v00, v01);
            r00 = v_add(r00, v00);
            r01 = v_add(r01, v01);

            v_uint16 v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
            v_uint32 v10, v11;
            v_expand(v1, v10, v11);
            r10 = v_add(r10, v10);
            r11 = v_add(r11, v11);
        }
        s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            s += src[j];
        }
        return s;
    }
};

template<>
struct NormL1_SIMD<short, int> {
    int operator() (const short* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL1_rvv<short, int>(src, n, j);
#else
        v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
        v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
        for (; j <= n - 2 * VTraits<v_int16>::vlanes(); j += 2 * VTraits<v_int16>::vlanes()) {
            v_uint16 v0 = v_abs(vx_load(src + j));
            v_uint32 v00, v01;
            v_expand(v0, v00, v01);
            r00 = v_add(r00, v00);
            r01 = v_add(r01, v01);

            v_uint16 v1 = v_abs(vx_load(src + j + VTraits<v_int16>::vlanes()));
            v_uint32 v10, v11;
            v_expand(v1, v10, v11);
            r10 = v_add(r10, v10);
            r11 = v_add(r11, v11);
        }
        s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            s += saturate_cast<int>(cv_abs(src[j]));
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<uchar, int> {
    int operator() (const uchar* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL2_rvv<uchar, int>(src, n, j);
#else
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        for (; j <= n - 2 * VTraits<v_uint8>::vlanes(); j += 2 * VTraits<v_uint8>::vlanes()) {
            v_uint8 v0 = vx_load(src + j);
            r0 = v_dotprod_expand_fast(v0, v0, r0);

            v_uint8 v1 = vx_load(src + j + VTraits<v_uint8>::vlanes());
            r1 = v_dotprod_expand_fast(v1, v1, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
#endif
        for (; j < n; j++) {
            int v = saturate_cast<int>(src[j]);
            s += v * v;
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<schar, int> {
    int operator() (const schar* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL2_rvv<schar, int>(src, n, j);
#else
        v_int32 r0 = vx_setzero_s32(), r1 = vx_setzero_s32();
        for (; j <= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
            v_int8 v0 = vx_load(src + j);
            r0 = v_dotprod_expand_fast(v0, v0, r0);
            v_int8 v1 = vx_load(src + j + VTraits<v_int8>::vlanes());
            r1 = v_dotprod_expand_fast(v1, v1, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
#endif
        for (; j < n; j++) {
            int v = saturate_cast<int>(src[j]);
            s += v * v;
        }
        return s;
    }
};

#endif

#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)

template<>
struct NormInf_SIMD<double, double> {
    double operator() (const double* src, int n) const {
        int j = 0;
        double s = 0.f;
        v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
        v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
            r0 = v_max(r0, v_abs(vx_load(src + j)));
            r1 = v_max(r1, v_abs(vx_load(src + j + VTraits<v_float64>::vlanes())));
            r2 = v_max(r2, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
            r3 = v_max(r3, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
        }
        r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
        for (; j < n; j++) {
            s = std::max(s, cv_abs(src[j]));
        }
        // [TODO]: use v_reduce_max when it supports float64
        double t[VTraits<v_float64>::max_nlanes];
        vx_store(t, r0);
        for (int i = 0; i < VTraits<v_float64>::vlanes(); i++) {
            s = std::max(s, cv_abs(t[i]));
        }
        return s;
    }
};

template<>
struct NormL1_SIMD<int, double> {
    double operator() (const int* src, int n) const {
        int j = 0;
        double s = 0.f;
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
            v_float32 v0 = v_abs(v_cvt_f32(vx_load(src + j))), v1 = v_abs(v_cvt_f32(vx_load(src + j + VTraits<v_int32>::vlanes())));
            r00 = v_add(r00, v_cvt_f64(v0)); r01 = v_add(r01, v_cvt_f64_high(v0));
            r10 = v_add(r10, v_cvt_f64(v1)); r11 = v_add(r11, v_cvt_f64_high(v1));
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
        for (; j < n; j++) {
            s += cv_abs(src[j]);
        }
        return s;
    }
};

template<>
struct NormL1_SIMD<float, double> {
    double operator() (const float* src, int n) const {
        int j = 0;
        double s = 0.f;
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        v_float64 r20 = vx_setzero_f64(), r21 = vx_setzero_f64();
        v_float64 r30 = vx_setzero_f64(), r31 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes()) {
            v_float32 v0 = v_abs(vx_load(src + j)), v1 = v_abs(vx_load(src + j + VTraits<v_float32>::vlanes()));
            r00 = v_add(r00, v_cvt_f64(v0)); r01 = v_add(r01, v_cvt_f64_high(v0));
            r10 = v_add(r10, v_cvt_f64(v1)); r11 = v_add(r11, v_cvt_f64_high(v1));

            v_float32 v2 = v_abs(vx_load(src + j + 2 * VTraits<v_float32>::vlanes())), v3 = v_abs(vx_load(src + j + 3 * VTraits<v_float32>::vlanes()));
            r20 = v_add(r20, v_cvt_f64(v2)); r21 = v_add(r21, v_cvt_f64_high(v2));
            r30 = v_add(r30, v_cvt_f64(v3)); r31 = v_add(r31, v_cvt_f64_high(v3));
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
        s += v_reduce_sum(v_add(v_add(v_add(r20, r21), r30), r31));
        for (; j < n; j++) {
            s += cv_abs(src[j]);
        }
        return s;
    }
};

template<>
struct NormL1_SIMD<double, double> {
    double operator() (const double* src, int n) const {
        int j = 0;
        double s = 0.f;
#if CV_RVV // This is introduced to workaround the accuracy issue on ci
        s = normL1_rvv<double, double>(src, n, j);
#else
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
            r00 = v_add(r00, v_abs(vx_load(src + j)));
            r01 = v_add(r01, v_abs(vx_load(src + j + VTraits<v_float64>::vlanes())));
            r10 = v_add(r10, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
            r11 = v_add(r11, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            s += cv_abs(src[j]);
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<ushort, double> {
    double operator() (const ushort* src, int n) const {
        int j = 0;
        double s = 0.f;
        v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
        for (; j <= n - 2 * VTraits<v_uint16>::vlanes(); j += 2 * VTraits<v_uint16>::vlanes()) {
            v_uint16 v0 = vx_load(src + j);
            v_uint64 u0 = v_dotprod_expand_fast(v0, v0);
            r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(u0)));

            v_uint16 v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
            v_uint64 u1 = v_dotprod_expand_fast(v1, v1);
            r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(u1)));
        }
        s += v_reduce_sum(v_add(r0, r1));
        for (; j < n; j++) {
            double v = saturate_cast<double>(src[j]);
            s += v * v;
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<short, double> {
    double operator() (const short* src, int n) const {
        int j = 0;
        double s = 0.f;
        v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
        for (; j <= n - 2 * VTraits<v_int16>::vlanes(); j += 2 * VTraits<v_int16>::vlanes()) {
            v_int16 v0 = vx_load(src + j);
            r0 = v_add(r0, v_cvt_f64(v_dotprod_expand_fast(v0, v0)));

            v_int16 v1 = vx_load(src + j + VTraits<v_int16>::vlanes());
            r1 = v_add(r1, v_cvt_f64(v_dotprod_expand_fast(v1, v1)));
        }
        s += v_reduce_sum(v_add(r0, r1));
        for (; j < n; j++) {
            double v = saturate_cast<double>(src[j]);
            s += v * v;
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<int, double> {
    double operator() (const int* src, int n) const {
        int j = 0;
        double s = 0.f;
        v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
        for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
            v_int32 v0 = vx_load(src + j);
            r0 = v_dotprod_expand_fast(v0, v0, r0);

            v_int32 v1 = vx_load(src + j + VTraits<v_int32>::vlanes());
            r1 = v_dotprod_expand_fast(v1, v1, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
        for (; j < n; j++) {
            double v = src[j];
            s += v * v;
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<float, double> {
    double operator() (const float* src, int n) const {
        int j = 0;
        double s = 0.f;
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 2 * VTraits<v_float32>::vlanes(); j += 2 * VTraits<v_float32>::vlanes()) {
            v_float32 v0 = vx_load(src + j), v1 = vx_load(src + j + VTraits<v_float32>::vlanes());
            v_float64 v00 = v_cvt_f64(v0), v01 = v_cvt_f64_high(v0);
            v_float64 v10 = v_cvt_f64(v1), v11 = v_cvt_f64_high(v1);
            r00 = v_fma(v00, v00, r00); r01 = v_fma(v01, v01, r01);
            r10 = v_fma(v10, v10, r10); r11 = v_fma(v11, v11, r11);
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
        for (; j < n; j++) {
            double v = src[j];
            s += v * v;
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<double, double> {
    double operator() (const double* src, int n) const {
        int j = 0;
        double s = 0.f;
#if CV_RVV // This is introduced to workaround the accuracy issue on ci
        s = normL2_rvv<double, double>(src, n, j);
#else
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
            v_float64 v00 = vx_load(src + j);
            v_float64 v01 = vx_load(src + j + VTraits<v_float64>::vlanes());
            v_float64 v10 = vx_load(src + j + 2 * VTraits<v_float64>::vlanes());
            v_float64 v11 = vx_load(src + j + 3 * VTraits<v_float64>::vlanes());
            r00 = v_fma(v00, v00, r00); r01 = v_fma(v01, v01, r01);
            r10 = v_fma(v10, v10, r10); r11 = v_fma(v11, v11, r11);
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            double v = src[j];
            s += v * v;
        }
        return s;
    }
};

#endif

template<typename T, typename ST> int
normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) {
    ST result = *_result;
    if( !mask ) {
        NormInf_SIMD<T, ST> op;
        result = std::max(result, op(src, len*cn));
    } else {
        for( int i = 0; i < len; i++, src += cn ) {
            if( mask[i] ) {
                for( int k = 0; k < cn; k++ ) {
                    result = std::max(result, ST(cv_abs(src[k])));
                }
            }
        }
    }
    *_result = result;
    return 0;
}

template<typename T, typename ST> int
normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn) {
    ST result = *_result;
    if( !mask ) {
        NormL1_SIMD<T, ST> op;
        result += op(src, len*cn);
    } else {
        for( int i = 0; i < len; i++, src += cn ) {
            if( mask[i] ) {
                for( int k = 0; k < cn; k++ ) {
                    result += cv_abs(src[k]);
                }
            }
        }
    }
    *_result = result;
    return 0;
}

template<typename T, typename ST> int
normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn) {
    ST result = *_result;
    if( !mask ) {
        NormL2_SIMD<T, ST> op;
        result += op(src, len*cn);
    } else {
        for( int i = 0; i < len; i++, src += cn ) {
            if( mask[i] ) {
                for( int k = 0; k < cn; k++ ) {
                    T v = src[k];
                    result += (ST)v*v;
                }
            }
        }
    }
    *_result = result;
    return 0;
}

#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
    static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
{ CV_INSTRUMENT_REGION(); return norm##L##_(src, mask, r, len, cn); } \

#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
    CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
    CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
    CV_DEF_NORM_FUNC(L2, suffix, type, l2type)

CV_DEF_NORM_ALL(8u, uchar, int, int, int)
CV_DEF_NORM_ALL(8s, schar, int, int, int)
CV_DEF_NORM_ALL(16u, ushort, int, int, double)
CV_DEF_NORM_ALL(16s, short, int, int, double)
CV_DEF_NORM_ALL(32s, int, int, double, double)
CV_DEF_NORM_ALL(32f, float, float, double, double)
CV_DEF_NORM_ALL(64f, double, double, double, double)

NormFunc getNormFunc(int normType, int depth)
{
    CV_INSTRUMENT_REGION();
    static NormFunc normTab[3][8] =
    {
        {
            (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
            (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
        },
        {
            (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
            (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
        },
        {
            (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
            (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
        }
    };

    return normTab[normType][depth];
}

#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

CV_CPU_OPTIMIZATION_NAMESPACE_END

} // cv::
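As a quick sanity check of the kernels defined above, cv::norm can be compared against a naive scalar reference. This is a hedged stand-alone sketch and not the PR's actual accuracy test (those live in the core test suite and opencv_extra):

```cpp
// Stand-alone sanity sketch: compare cv::norm against a naive double-precision
// reference for the 8u and 32f cases covered by the SIMD specializations.
#include <opencv2/core.hpp>
#include <cassert>
#include <cmath>

static double refNormL2(const cv::Mat& m)
{
    double s = 0;
    for (int r = 0; r < m.rows; r++)
        for (int c = 0; c < m.cols; c++)
        {
            double v = m.depth() == CV_8U ? (double)m.at<uchar>(r, c)
                                          : (double)m.at<float>(r, c);
            s += v * v;
        }
    return std::sqrt(s);
}

int main()
{
    cv::Mat a(64, 64, CV_8UC1), b(64, 64, CV_32FC1);
    cv::randu(a, 0, 256);
    cv::randu(b, -1.0f, 1.0f);
    assert(std::abs(cv::norm(a, cv::NORM_L2) - refNormL2(a)) < 1e-6);
    assert(std::abs(cv::norm(b, cv::NORM_L2) - refNormL2(b)) < 1e-4);
    return 0;
}
```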