Merge pull request #26991 from fengyuentau:4x/core/norm2hal_rvv
core: improve norm of hal rvv #26991

Merge with https://github.com/opencv/opencv_extra/pull/1241

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV.
- [x] The PR is proposed to the proper branch.
- [ ] There is a reference to the original bug report and related work.
- [x] There is an accuracy test, a performance test and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake.
This commit is contained in: parent 0142231e4d, commit 8207549638
3rdparty/hal_rvv/hal_rvv_1p0/common.hpp (vendored, new file, 30 lines)
@@ -0,0 +1,30 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
#define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED

#include <riscv_vector.h>

namespace cv { namespace cv_hal_rvv { namespace custom_intrin {

#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \
    inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \
        _Tpvs mask = __riscv_vsra(v, shift, vl); \
        _Tpvs v_xor = __riscv_vxor(v, mask, vl); \
        return __riscv_vreinterpret_##suffix( \
            __riscv_vsub(v_xor, mask, vl) \
        ); \
    }

CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8)

}}} // cv::cv_hal_rvv::custom_intrin

#endif
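The `CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS` macro above implements absolute value with the branchless sign-mask trick, working around the lack of a dedicated RVV integer abs intrinsic (the TODO in the removed header further below makes the same point): an arithmetic right shift by the sign-bit position yields all ones for negative lanes and all zeros otherwise, and `(v ^ mask) - mask` then negates exactly the negative lanes before the result is reinterpreted as unsigned. A minimal scalar sketch of the same identity, for illustration only — the `scalar_abs_u32` helper below is hypothetical and not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical scalar counterpart of CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS:
// mask is 0 for non-negative v and all-ones for negative v, so
// (v ^ mask) - mask == |v|, returned as the unsigned magnitude.
static inline uint32_t scalar_abs_u32(int32_t v) {
    const int32_t mask = v >> 31;  // arithmetic shift: 0 or -1 (vsra in the macro)
    // Do the xor/sub in unsigned arithmetic so INT32_MIN does not overflow.
    return (static_cast<uint32_t>(v) ^ static_cast<uint32_t>(mask))
           - static_cast<uint32_t>(mask);
}

int main() {
    assert(scalar_abs_u32(5) == 5u);
    assert(scalar_abs_u32(-5) == 5u);
    assert(scalar_abs_u32(INT32_MIN) == 0x80000000u);  // magnitude wraps into unsigned
    return 0;
}
```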
3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp (vendored, 1528 lines)
File diff suppressed because it is too large.
@@ -14,7 +14,7 @@ typedef perf::TestBaseWithParam<Size_MatType_NormType_t> Size_MatType_NormType;
PERF_TEST_P(Size_MatType_NormType, norm,
            testing::Combine(
                testing::Values(TYPICAL_MAT_SIZES),
                testing::Values(TYPICAL_MAT_TYPES),
                testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1),
                testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2)
            )
)
@@ -36,7 +36,7 @@ PERF_TEST_P(Size_MatType_NormType, norm,
PERF_TEST_P(Size_MatType_NormType, norm_mask,
            testing::Combine(
                testing::Values(TYPICAL_MAT_SIZES),
                testing::Values(TYPICAL_MAT_TYPES),
                testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1),
                testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2)
            )
)
@@ -1,200 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copytright (C) 2025, SpaceMIT Inc., all rights reserved.

#include "opencv2/core/hal/intrin.hpp"

namespace cv {

namespace {

// [TODO] Drop this until rvv has dedicated intrinsics for abs on integers.
template<typename T, typename ST> inline ST __riscv_vabs(const T&);

template<> inline
vuint8m1_t __riscv_vabs(const vint8m1_t& v) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    vint8m1_t mask = __riscv_vsra_vx_i8m1(v, 7, vle8m1);
    vint8m1_t v_xor = __riscv_vxor_vv_i8m1(v, mask, vle8m1);
    return __riscv_vreinterpret_v_i8m1_u8m1(
        __riscv_vsub_vv_i8m1(v_xor, mask, vle8m1)
    );
}

template<> inline
vuint16m1_t __riscv_vabs(const vint16m1_t& v) {
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    vint16m1_t mask = __riscv_vsra_vx_i16m1(v, 15, vle16m1);
    vint16m1_t v_xor = __riscv_vxor_vv_i16m1(v, mask, vle16m1);
    return __riscv_vreinterpret_v_i16m1_u16m1(
        __riscv_vsub_vv_i16m1(v_xor, mask, vle16m1)
    );
}

template<> inline
vuint32m1_t __riscv_vabs(const vint32m1_t& v) {
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vint32m1_t mask = __riscv_vsra_vx_i32m1(v, 31, vle32m1);
    vint32m1_t v_xor = __riscv_vxor_vv_i32m1(v, mask, vle32m1);
    return __riscv_vreinterpret_v_i32m1_u32m1(
        __riscv_vsub_vv_i32m1(v_xor, mask, vle32m1)
    );
}
}

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

template <typename T, typename ST> inline
ST normInf_rvv(const T* src, int n, int& j);

template<> inline
int normInf_rvv(const int* src, int n, int& j) {
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle32m1; j += 2 * vle32m1) {
        vuint32m1_t v0 = __riscv_vabs<vint32m1_t, vuint32m1_t>(__riscv_vle32_v_i32m1(src + j, vle32m1));
        r0 = __riscv_vmaxu(r0, v0, vle32m1);

        vuint32m1_t v1 = __riscv_vabs<vint32m1_t, vuint32m1_t>(__riscv_vle32_v_i32m1(src + j + vle32m1, vle32m1));
        r1 = __riscv_vmaxu(r1, v1, vle32m1);
    }
    r0 = __riscv_vmaxu(r0, r1, vle32m1);
    return (int)__riscv_vmv_x(__riscv_vredmaxu(r0, __riscv_vmv_v_x_u32m1(0, vle32m1), vle32m1));
}

template <typename T, typename ST> inline
ST normL1_rvv(const T* src, int n, int& j);

template<> inline
int normL1_rvv(const schar* src, int n, int& j) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0, vle16m1);
    for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
        vuint8m1_t v0 = __riscv_vabs<vint8m1_t, vuint8m1_t>(__riscv_vle8_v_i8m1(src + j, vle8m1));
        vuint16m1_t u0 = __riscv_vwredsumu_tu(zero, v0, zero, vle8m1);
        r0 = __riscv_vwredsumu(u0, r0, vle16m1);

        vuint8m1_t v1 = __riscv_vabs<vint8m1_t, vuint8m1_t>(__riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1));
        vuint16m1_t u1 = __riscv_vwredsumu_tu(zero, v1, zero, vle8m1);
        r1 = __riscv_vwredsumu(u1, r1, vle16m1);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
int normL1_rvv(const ushort* src, int n, int& j) {
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle16m1; j += 2 * vle16m1) {
        vuint16m1_t v0 = __riscv_vle16_v_u16m1(src + j, vle16m1);
        r0 = __riscv_vwredsumu(v0, r0, vle16m1);

        vuint16m1_t v1 = __riscv_vle16_v_u16m1(src + j + vle16m1, vle16m1);
        r1 = __riscv_vwredsumu(v1, r1, vle16m1);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
int normL1_rvv(const short* src, int n, int& j) {
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j<= n - 2 * vle16m1; j += 2 * vle16m1) {
        vuint16m1_t v0 = __riscv_vabs<vint16m1_t, vuint16m1_t>(__riscv_vle16_v_i16m1(src + j, vle16m1));
        r0 = __riscv_vwredsumu(v0, r0, vle16m1);

        vuint16m1_t v1 = __riscv_vabs<vint16m1_t, vuint16m1_t>(__riscv_vle16_v_i16m1(src + j + vle16m1, vle16m1));
        r1 = __riscv_vwredsumu(v1, r1, vle16m1);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
double normL1_rvv(const double* src, int n, int& j) {
    const int vle64m1 = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) {
        vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1);
        v0 = __riscv_vfabs(v0, vle64m1);
        r0 = __riscv_vfadd(r0, v0, vle64m1);

        vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1);
        v1 = __riscv_vfabs(v1, vle64m1);
        r1 = __riscv_vfadd(r1, v1, vle64m1);
    }
    r0 = __riscv_vfadd(r0, r1, vle64m1);
    return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1));
}

template <typename T, typename ST> inline
ST normL2_rvv(const T* src, int n, int& j);

template<> inline
int normL2_rvv(const uchar* src, int n, int& j) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
        vuint8m1_t v0 = __riscv_vle8_v_u8m1(src + j, vle8m1);
        vuint16m2_t u0 = __riscv_vwmulu(v0, v0, vle8m1);
        r0 = __riscv_vwredsumu(u0, r0, vle16m1 * 2);

        vuint8m1_t v1 = __riscv_vle8_v_u8m1(src + j + vle8m1, vle8m1);
        vuint16m2_t u1 = __riscv_vwmulu(v1, v1, vle8m1);
        r1 = __riscv_vwredsumu(u1, r1, vle16m1 * 2);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
int normL2_rvv(const schar* src, int n, int& j) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vint32m1_t r0 = __riscv_vmv_v_x_i32m1(0, vle32m1);
    vint32m1_t r1 = __riscv_vmv_v_x_i32m1(0, vle32m1);
    for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
        vint8m1_t v0 = __riscv_vle8_v_i8m1(src + j, vle8m1);
        vint16m2_t u0 = __riscv_vwmul(v0, v0, vle8m1);
        r0 = __riscv_vwredsum(u0, r0, vle16m1 * 2);

        vint8m1_t v1 = __riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1);
        vint16m2_t u1 = __riscv_vwmul(v1, v1, vle8m1);
        r1 = __riscv_vwredsum(u1, r1, vle16m1 * 2);
    }
    return __riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
double normL2_rvv(const double* src, int n, int& j) {
    const int vle64m1 = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) {
        vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1);
        r0 = __riscv_vfmacc(r0, v0, v0, vle64m1);

        vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1);
        r1 = __riscv_vfmacc(r1, v1, v1, vle64m1);
    }
    r0 = __riscv_vfadd(r0, r1, vle64m1);
    return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1));
}

CV_CPU_OPTIMIZATION_NAMESPACE_END

} // cv::
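The deleted L1 kernels above lean on two-stage widening reductions: per-iteration 8-bit magnitudes are first folded into a 16-bit partial sum (`__riscv_vwredsumu_tu`) and that partial sum is then folded into a 32-bit accumulator (`__riscv_vwredsumu`), keeping each stage wide enough for its worst-case sum. A plain C++ illustration of why the extra width matters; the row length below is an arbitrary example, not a value from the patch:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Worst case for an L1 norm over 8-bit data: every element is 255.
    const std::size_t n = 1000;            // illustrative row length
    std::vector<uint8_t> row(n, 255);

    uint16_t sum16 = 0;                    // wraps: 1000 * 255 = 255000 > 65535
    uint32_t sum32 = 0;                    // exact up to ~16.8M elements of 255
    for (uint8_t v : row) {
        sum16 = static_cast<uint16_t>(sum16 + v);
        sum32 += v;
    }
    std::cout << "16-bit accumulator (wraps): " << sum16 << "\n";
    std::cout << "32-bit accumulator (exact): " << sum32 << "\n";
    return 0;
}
```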
@@ -4,10 +4,6 @@

#include "precomp.hpp"

#if CV_RVV
#include "norm.rvv1p0.hpp"
#endif

namespace cv {

using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int);
@@ -181,9 +177,6 @@ struct NormInf_SIMD<int, int> {
    int operator() (const int* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normInf_rvv<int, int>(src, n, j);
#else
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32();
        for (; j <= n - 4 * VTraits<v_int32>::vlanes(); j += 4 * VTraits<v_int32>::vlanes()) {
@@ -194,7 +187,6 @@ struct NormInf_SIMD<int, int> {
        }
        r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
        s = std::max(s, saturate_cast<int>(v_reduce_max(r0)));
#endif
        for (; j < n; j++) {
            s = std::max(s, cv_abs(src[j]));
        }
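The hunk above and the remaining hunks in this dispatch file all follow the same shape: when the RVV path is enabled it consumes the bulk of the array in multiples of the vector length and advances `j`, and the scalar loop after `#endif` finishes whatever remains, so both paths share a single tail. A schematic sketch of that contract; the `vector_body` helper below is hypothetical, for illustration only:

```cpp
#include <algorithm>
#include <cstdlib>
#include <iostream>

// Hypothetical sketch of the "vector body + shared scalar tail" contract used by
// the NormXX_SIMD functors: the helper processes as many full blocks as it can,
// advances j past them, and returns its partial result.
static int vector_body(const int* src, int n, int& j) {
    const int block = 8;                  // stand-in for the hardware vector length
    int s = 0;
    for (; j <= n - block; j += block)    // full blocks only; j stops at the tail
        for (int k = 0; k < block; ++k)
            s = std::max(s, std::abs(src[j + k]));
    return s;
}

static int norm_inf(const int* src, int n) {
    int j = 0;
    int s = vector_body(src, n, j);       // bulk of the data
    for (; j < n; j++)                    // shared scalar tail, as in the hunk above
        s = std::max(s, std::abs(src[j]));
    return s;
}

int main() {
    const int data[] = { 3, -7, 42, -5, 1, 0, -9, 8, 100, -101 };
    std::cout << norm_inf(data, 10) << "\n";  // prints 101
    return 0;
}
```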
@@ -250,9 +242,6 @@ struct NormL1_SIMD<schar, int> {
    int operator() (const schar* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL1_rvv<schar, int>(src, n, j);
#else
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        v_uint8 one = vx_setall_u8(1);
        for (; j<= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
@@ -263,7 +252,6 @@ struct NormL1_SIMD<schar, int> {
            r1 = v_dotprod_expand_fast(v1, one, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
#endif
        for (; j < n; j++) {
            s += saturate_cast<int>(cv_abs(src[j]));
        }
@@ -276,9 +264,6 @@ struct NormL1_SIMD<ushort, int> {
    int operator() (const ushort* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL1_rvv<ushort, int>(src, n, j);
#else
        v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
        v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
        for (; j<= n - 2 * VTraits<v_uint16>::vlanes(); j += 2 * VTraits<v_uint16>::vlanes()) {
@@ -295,7 +280,6 @@ struct NormL1_SIMD<ushort, int> {
            r11 = v_add(r11, v11);
        }
        s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            s += src[j];
        }
@@ -308,9 +292,6 @@ struct NormL1_SIMD<short, int> {
    int operator() (const short* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL1_rvv<short, int>(src, n, j);
#else
        v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
        v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
        for (; j<= n - 2 * VTraits<v_int16>::vlanes(); j += 2 * VTraits<v_int16>::vlanes()) {
@@ -327,7 +308,6 @@ struct NormL1_SIMD<short, int> {
            r11 = v_add(r11, v11);
        }
        s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            s += saturate_cast<int>(cv_abs(src[j]));
        }
@@ -340,9 +320,6 @@ struct NormL2_SIMD<uchar, int> {
    int operator() (const uchar* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL2_rvv<uchar, int>(src, n, j);
#else
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        for (; j <= n - 2 * VTraits<v_uint8>::vlanes(); j += 2 * VTraits<v_uint8>::vlanes()) {
            v_uint8 v0 = vx_load(src + j);
@@ -352,7 +329,6 @@ struct NormL2_SIMD<uchar, int> {
            r1 = v_dotprod_expand_fast(v1, v1, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
#endif
        for (; j < n; j++) {
            int v = saturate_cast<int>(src[j]);
            s += v * v;
@@ -366,9 +342,6 @@ struct NormL2_SIMD<schar, int> {
    int operator() (const schar* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL2_rvv<schar, int>(src, n, j);
#else
        v_int32 r0 = vx_setzero_s32(), r1 = vx_setzero_s32();
        for (; j <= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
            v_int8 v0 = vx_load(src + j);
@@ -377,7 +350,6 @@ struct NormL2_SIMD<schar, int> {
            r1 = v_dotprod_expand_fast(v1, v1, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
#endif
        for (; j < n; j++) {
            int v = saturate_cast<int>(src[j]);
            s += v * v;
@@ -825,31 +797,6 @@ struct NormL1_SIMD<float, double> {
    }
};

template<>
struct NormL1_SIMD<double, double> {
    double operator() (const double* src, int n) const {
        int j = 0;
        double s = 0.f;
#if CV_RVV // This is introduced to workaround the accuracy issue on ci
        s = normL1_rvv<double, double>(src, n, j);
#else
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
            r00 = v_add(r00, v_abs(vx_load(src + j                                   )));
            r01 = v_add(r01, v_abs(vx_load(src + j +     VTraits<v_float64>::vlanes())));
            r10 = v_add(r10, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
            r11 = v_add(r11, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            s += cv_abs(src[j]);
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<ushort, double> {
    double operator() (const ushort* src, int n) const {
@@ -941,14 +888,36 @@ struct NormL2_SIMD<float, double> {
    }
};

#endif

#if CV_SIMD_64F // CV_SIMD_SCALABLE_64F has accuracy problem with the following kernels on ci

template<>
struct NormL1_SIMD<double, double> {
    double operator() (const double* src, int n) const {
        int j = 0;
        double s = 0.f;
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
            r00 = v_add(r00, v_abs(vx_load(src + j                                   )));
            r01 = v_add(r01, v_abs(vx_load(src + j +     VTraits<v_float64>::vlanes())));
            r10 = v_add(r10, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
            r11 = v_add(r11, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
        for (; j < n; j++) {
            s += cv_abs(src[j]);
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<double, double> {
    double operator() (const double* src, int n) const {
        int j = 0;
        double s = 0.f;
#if CV_RVV // This is introduced to workaround the accuracy issue on ci
        s = normL2_rvv<double, double>(src, n, j);
#else
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
@@ -960,7 +929,6 @@ struct NormL2_SIMD<double, double> {
            r10 = v_fma(v10, v10, r10); r11 = v_fma(v11, v11, r11);
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            double v = src[j];
            s += v * v;
@@ -1362,7 +1330,9 @@ CV_DEF_NORM_ALL(64f, double, double, double, double)
NormFunc getNormFunc(int normType, int depth)
{
    CV_INSTRUMENT_REGION();
    static NormFunc normTab[3][8] =

    // [FIXME] append 0's when merging to 5.x
    static NormFunc normTab[3][CV_DEPTH_MAX] =
    {
        {
            (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
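The final hunk widens the lookup table from a hard-coded 8 columns to `CV_DEPTH_MAX` and leaves a reminder to pad the rows with zeros when merging to 5.x; `getNormFunc` can then pick a kernel by norm type and element depth with a plain table index. A schematic sketch of that kind of dispatch table — all names, the row order and `DEPTH_MAX` below are illustrative assumptions, not the exact OpenCV definitions:

```cpp
// Illustrative function-pointer dispatch table: one row per norm type,
// one column per element depth, nullptr for unsupported combinations.
using KernelFn = int (*)(const unsigned char* data, int len);

constexpr int DEPTH_MAX = 8;   // stand-in for CV_DEPTH_MAX

static int normInf_8u_stub(const unsigned char*, int) { return 0; }

static KernelFn kernelTab[3][DEPTH_MAX] = {
    { normInf_8u_stub },       // remaining depths default to nullptr
    { nullptr },
    { nullptr },
};

static KernelFn getKernel(int normRow, int depth) {
    return kernelTab[normRow][depth];   // caller must check for nullptr
}

int main() {
    unsigned char buf[4] = { 1, 2, 3, 4 };
    KernelFn fn = getKernel(0, 0);      // row 0, depth 0 -> the 8u stub
    return fn ? fn(buf, 4) : -1;
}
```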