Merge pull request #26991 from fengyuentau:4x/core/norm2hal_rvv
core: improve norm of hal rvv #26991

Merge with https://github.com/opencv/opencv_extra/pull/1241

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV.
- [x] The PR is proposed to the proper branch.
- [ ] There is a reference to the original bug report and related work.
- [x] There is an accuracy test, a performance test and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake.
This commit is contained in: parent 0142231e4d, commit 8207549638
3rdparty/hal_rvv/hal_rvv_1p0/common.hpp (vendored, new file, 30 lines)
@@ -0,0 +1,30 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
#define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED

#include <riscv_vector.h>

namespace cv { namespace cv_hal_rvv { namespace custom_intrin {

#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \
    inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \
        _Tpvs mask = __riscv_vsra(v, shift, vl); \
        _Tpvs v_xor = __riscv_vxor(v, mask, vl); \
        return __riscv_vreinterpret_##suffix( \
            __riscv_vsub(v_xor, mask, vl) \
        ); \
    }

CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8)

}}} // cv::cv_hal_rvv::custom_intrin

#endif
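The `CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS` macro above implements absolute value with the branchless sign-mask trick, working around the lack of a dedicated RVV integer abs intrinsic (the TODO in the removed header further below makes the same point): an arithmetic right shift by the sign-bit position yields all ones for negative lanes and all zeros otherwise, and `(v ^ mask) - mask` then negates exactly the negative lanes before the result is reinterpreted as unsigned. A minimal scalar sketch of the same identity, for illustration only — the `scalar_abs_u32` helper below is hypothetical and not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical scalar counterpart of CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS:
// mask is 0 for non-negative v and all-ones for negative v, so
// (v ^ mask) - mask == |v|, returned as the unsigned magnitude.
static inline uint32_t scalar_abs_u32(int32_t v) {
    const int32_t mask = v >> 31;  // arithmetic shift: 0 or -1 (vsra in the macro)
    // Do the xor/sub in unsigned arithmetic so INT32_MIN does not overflow.
    return (static_cast<uint32_t>(v) ^ static_cast<uint32_t>(mask))
           - static_cast<uint32_t>(mask);
}

int main() {
    assert(scalar_abs_u32(5) == 5u);
    assert(scalar_abs_u32(-5) == 5u);
    assert(scalar_abs_u32(INT32_MIN) == 0x80000000u);  // magnitude wraps into unsigned
    return 0;
}
```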
3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp (vendored, 1528 lines)
File diff suppressed because it is too large.
@@ -14,7 +14,7 @@ typedef perf::TestBaseWithParam<Size_MatType_NormType_t> Size_MatType_NormType;
PERF_TEST_P(Size_MatType_NormType, norm,
            testing::Combine(
                testing::Values(TYPICAL_MAT_SIZES),
                testing::Values(TYPICAL_MAT_TYPES),
                testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1),
                testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2)
            )
)
@@ -36,7 +36,7 @@ PERF_TEST_P(Size_MatType_NormType, norm,
PERF_TEST_P(Size_MatType_NormType, norm_mask,
            testing::Combine(
                testing::Values(TYPICAL_MAT_SIZES),
                testing::Values(TYPICAL_MAT_TYPES),
                testing::Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1),
                testing::Values((int)NORM_INF, (int)NORM_L1, (int)NORM_L2)
            )
)
@@ -1,200 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copytright (C) 2025, SpaceMIT Inc., all rights reserved.

#include "opencv2/core/hal/intrin.hpp"

namespace cv {

namespace {

// [TODO] Drop this until rvv has dedicated intrinsics for abs on integers.
template<typename T, typename ST> inline ST __riscv_vabs(const T&);

template<> inline
vuint8m1_t __riscv_vabs(const vint8m1_t& v) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    vint8m1_t mask = __riscv_vsra_vx_i8m1(v, 7, vle8m1);
    vint8m1_t v_xor = __riscv_vxor_vv_i8m1(v, mask, vle8m1);
    return __riscv_vreinterpret_v_i8m1_u8m1(
        __riscv_vsub_vv_i8m1(v_xor, mask, vle8m1)
    );
}

template<> inline
vuint16m1_t __riscv_vabs(const vint16m1_t& v) {
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    vint16m1_t mask = __riscv_vsra_vx_i16m1(v, 15, vle16m1);
    vint16m1_t v_xor = __riscv_vxor_vv_i16m1(v, mask, vle16m1);
    return __riscv_vreinterpret_v_i16m1_u16m1(
        __riscv_vsub_vv_i16m1(v_xor, mask, vle16m1)
    );
}

template<> inline
vuint32m1_t __riscv_vabs(const vint32m1_t& v) {
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vint32m1_t mask = __riscv_vsra_vx_i32m1(v, 31, vle32m1);
    vint32m1_t v_xor = __riscv_vxor_vv_i32m1(v, mask, vle32m1);
    return __riscv_vreinterpret_v_i32m1_u32m1(
        __riscv_vsub_vv_i32m1(v_xor, mask, vle32m1)
    );
}
}

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

template <typename T, typename ST> inline
ST normInf_rvv(const T* src, int n, int& j);

template<> inline
int normInf_rvv(const int* src, int n, int& j) {
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle32m1; j += 2 * vle32m1) {
        vuint32m1_t v0 = __riscv_vabs<vint32m1_t, vuint32m1_t>(__riscv_vle32_v_i32m1(src + j, vle32m1));
        r0 = __riscv_vmaxu(r0, v0, vle32m1);

        vuint32m1_t v1 = __riscv_vabs<vint32m1_t, vuint32m1_t>(__riscv_vle32_v_i32m1(src + j + vle32m1, vle32m1));
        r1 = __riscv_vmaxu(r1, v1, vle32m1);
    }
    r0 = __riscv_vmaxu(r0, r1, vle32m1);
    return (int)__riscv_vmv_x(__riscv_vredmaxu(r0, __riscv_vmv_v_x_u32m1(0, vle32m1), vle32m1));
}

template <typename T, typename ST> inline
ST normL1_rvv(const T* src, int n, int& j);

template<> inline
int normL1_rvv(const schar* src, int n, int& j) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0, vle16m1);
    for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
        vuint8m1_t v0 = __riscv_vabs<vint8m1_t, vuint8m1_t>(__riscv_vle8_v_i8m1(src + j, vle8m1));
        vuint16m1_t u0 = __riscv_vwredsumu_tu(zero, v0, zero, vle8m1);
        r0 = __riscv_vwredsumu(u0, r0, vle16m1);

        vuint8m1_t v1 = __riscv_vabs<vint8m1_t, vuint8m1_t>(__riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1));
        vuint16m1_t u1 = __riscv_vwredsumu_tu(zero, v1, zero, vle8m1);
        r1 = __riscv_vwredsumu(u1, r1, vle16m1);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
int normL1_rvv(const ushort* src, int n, int& j) {
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle16m1; j += 2 * vle16m1) {
        vuint16m1_t v0 = __riscv_vle16_v_u16m1(src + j, vle16m1);
        r0 = __riscv_vwredsumu(v0, r0, vle16m1);

        vuint16m1_t v1 = __riscv_vle16_v_u16m1(src + j + vle16m1, vle16m1);
        r1 = __riscv_vwredsumu(v1, r1, vle16m1);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
int normL1_rvv(const short* src, int n, int& j) {
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j<= n - 2 * vle16m1; j += 2 * vle16m1) {
        vuint16m1_t v0 = __riscv_vabs<vint16m1_t, vuint16m1_t>(__riscv_vle16_v_i16m1(src + j, vle16m1));
        r0 = __riscv_vwredsumu(v0, r0, vle16m1);

        vuint16m1_t v1 = __riscv_vabs<vint16m1_t, vuint16m1_t>(__riscv_vle16_v_i16m1(src + j + vle16m1, vle16m1));
        r1 = __riscv_vwredsumu(v1, r1, vle16m1);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
double normL1_rvv(const double* src, int n, int& j) {
    const int vle64m1 = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) {
        vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1);
        v0 = __riscv_vfabs(v0, vle64m1);
        r0 = __riscv_vfadd(r0, v0, vle64m1);

        vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1);
        v1 = __riscv_vfabs(v1, vle64m1);
        r1 = __riscv_vfadd(r1, v1, vle64m1);
    }
    r0 = __riscv_vfadd(r0, r1, vle64m1);
    return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1));
}

template <typename T, typename ST> inline
ST normL2_rvv(const T* src, int n, int& j);

template<> inline
int normL2_rvv(const uchar* src, int n, int& j) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vuint32m1_t r0 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    vuint32m1_t r1 = __riscv_vmv_v_x_u32m1(0, vle32m1);
    for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
        vuint8m1_t v0 = __riscv_vle8_v_u8m1(src + j, vle8m1);
        vuint16m2_t u0 = __riscv_vwmulu(v0, v0, vle8m1);
        r0 = __riscv_vwredsumu(u0, r0, vle16m1 * 2);

        vuint8m1_t v1 = __riscv_vle8_v_u8m1(src + j + vle8m1, vle8m1);
        vuint16m2_t u1 = __riscv_vwmulu(v1, v1, vle8m1);
        r1 = __riscv_vwredsumu(u1, r1, vle16m1 * 2);
    }
    return (int)__riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
int normL2_rvv(const schar* src, int n, int& j) {
    const int vle8m1 = __riscv_vsetvlmax_e8m1();
    const int vle16m1 = __riscv_vsetvlmax_e16m1();
    const int vle32m1 = __riscv_vsetvlmax_e32m1();
    vint32m1_t r0 = __riscv_vmv_v_x_i32m1(0, vle32m1);
    vint32m1_t r1 = __riscv_vmv_v_x_i32m1(0, vle32m1);
    for (; j <= n - 2 * vle8m1; j += 2 * vle8m1) {
        vint8m1_t v0 = __riscv_vle8_v_i8m1(src + j, vle8m1);
        vint16m2_t u0 = __riscv_vwmul(v0, v0, vle8m1);
        r0 = __riscv_vwredsum(u0, r0, vle16m1 * 2);

        vint8m1_t v1 = __riscv_vle8_v_i8m1(src + j + vle8m1, vle8m1);
        vint16m2_t u1 = __riscv_vwmul(v1, v1, vle8m1);
        r1 = __riscv_vwredsum(u1, r1, vle16m1 * 2);
    }
    return __riscv_vmv_x(__riscv_vadd(r0, r1, vle32m1));
}

template<> inline
double normL2_rvv(const double* src, int n, int& j) {
    const int vle64m1 = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t r0 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    vfloat64m1_t r1 = __riscv_vfmv_v_f_f64m1(0.f, vle64m1);
    for (; j <= n - 2 * vle64m1; j += 2 * vle64m1) {
        vfloat64m1_t v0 = __riscv_vle64_v_f64m1(src + j, vle64m1);
        r0 = __riscv_vfmacc(r0, v0, v0, vle64m1);

        vfloat64m1_t v1 = __riscv_vle64_v_f64m1(src + j + vle64m1, vle64m1);
        r1 = __riscv_vfmacc(r1, v1, v1, vle64m1);
    }
    r0 = __riscv_vfadd(r0, r1, vle64m1);
    return __riscv_vfmv_f(__riscv_vfredusum(r0, __riscv_vfmv_v_f_f64m1(0.f, vle64m1), vle64m1));
}

CV_CPU_OPTIMIZATION_NAMESPACE_END

} // cv::
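The deleted L1 kernels above lean on two-stage widening reductions: per-iteration 8-bit magnitudes are first folded into a 16-bit partial sum (`__riscv_vwredsumu_tu`) and that partial sum is then folded into a 32-bit accumulator (`__riscv_vwredsumu`), keeping each stage wide enough for its worst-case sum. A plain C++ illustration of why the extra width matters; the row length below is an arbitrary example, not a value from the patch:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Worst case for an L1 norm over 8-bit data: every element is 255.
    const std::size_t n = 1000;            // illustrative row length
    std::vector<uint8_t> row(n, 255);

    uint16_t sum16 = 0;                    // wraps: 1000 * 255 = 255000 > 65535
    uint32_t sum32 = 0;                    // exact up to ~16.8M elements of 255
    for (uint8_t v : row) {
        sum16 = static_cast<uint16_t>(sum16 + v);
        sum32 += v;
    }
    std::cout << "16-bit accumulator (wraps): " << sum16 << "\n";
    std::cout << "32-bit accumulator (exact): " << sum32 << "\n";
    return 0;
}
```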
@@ -4,10 +4,6 @@

#include "precomp.hpp"

#if CV_RVV
#include "norm.rvv1p0.hpp"
#endif

namespace cv {

using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int);
@@ -181,9 +177,6 @@ struct NormInf_SIMD<int, int> {
    int operator() (const int* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normInf_rvv<int, int>(src, n, j);
#else
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        v_uint32 r2 = vx_setzero_u32(), r3 = vx_setzero_u32();
        for (; j <= n - 4 * VTraits<v_int32>::vlanes(); j += 4 * VTraits<v_int32>::vlanes()) {
@@ -194,7 +187,6 @@ struct NormInf_SIMD<int, int> {
        }
        r0 = v_max(r0, v_max(r1, v_max(r2, r3)));
        s = std::max(s, saturate_cast<int>(v_reduce_max(r0)));
#endif
        for (; j < n; j++) {
            s = std::max(s, cv_abs(src[j]));
        }
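The hunk above and the remaining hunks in this dispatch file all follow the same shape: when the RVV path is enabled it consumes the bulk of the array in multiples of the vector length and advances `j`, and the scalar loop after `#endif` finishes whatever remains, so both paths share a single tail. A schematic sketch of that contract; the `vector_body` helper below is hypothetical, for illustration only:

```cpp
#include <algorithm>
#include <cstdlib>
#include <iostream>

// Hypothetical sketch of the "vector body + shared scalar tail" contract used by
// the NormXX_SIMD functors: the helper processes as many full blocks as it can,
// advances j past them, and returns its partial result.
static int vector_body(const int* src, int n, int& j) {
    const int block = 8;                  // stand-in for the hardware vector length
    int s = 0;
    for (; j <= n - block; j += block)    // full blocks only; j stops at the tail
        for (int k = 0; k < block; ++k)
            s = std::max(s, std::abs(src[j + k]));
    return s;
}

static int norm_inf(const int* src, int n) {
    int j = 0;
    int s = vector_body(src, n, j);       // bulk of the data
    for (; j < n; j++)                    // shared scalar tail, as in the hunk above
        s = std::max(s, std::abs(src[j]));
    return s;
}

int main() {
    const int data[] = { 3, -7, 42, -5, 1, 0, -9, 8, 100, -101 };
    std::cout << norm_inf(data, 10) << "\n";  // prints 101
    return 0;
}
```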
@@ -250,9 +242,6 @@ struct NormL1_SIMD<schar, int> {
    int operator() (const schar* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL1_rvv<schar, int>(src, n, j);
#else
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        v_uint8 one = vx_setall_u8(1);
        for (; j<= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
@@ -263,7 +252,6 @@ struct NormL1_SIMD<schar, int> {
            r1 = v_dotprod_expand_fast(v1, one, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
#endif
        for (; j < n; j++) {
            s += saturate_cast<int>(cv_abs(src[j]));
        }
@@ -276,9 +264,6 @@ struct NormL1_SIMD<ushort, int> {
    int operator() (const ushort* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL1_rvv<ushort, int>(src, n, j);
#else
        v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
        v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
        for (; j<= n - 2 * VTraits<v_uint16>::vlanes(); j += 2 * VTraits<v_uint16>::vlanes()) {
@@ -295,7 +280,6 @@ struct NormL1_SIMD<ushort, int> {
            r11 = v_add(r11, v11);
        }
        s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            s += src[j];
        }
@@ -308,9 +292,6 @@ struct NormL1_SIMD<short, int> {
    int operator() (const short* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL1_rvv<short, int>(src, n, j);
#else
        v_uint32 r00 = vx_setzero_u32(), r01 = vx_setzero_u32();
        v_uint32 r10 = vx_setzero_u32(), r11 = vx_setzero_u32();
        for (; j<= n - 2 * VTraits<v_int16>::vlanes(); j += 2 * VTraits<v_int16>::vlanes()) {
@@ -327,7 +308,6 @@ struct NormL1_SIMD<short, int> {
            r11 = v_add(r11, v11);
        }
        s += (int)v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            s += saturate_cast<int>(cv_abs(src[j]));
        }
@@ -340,9 +320,6 @@ struct NormL2_SIMD<uchar, int> {
    int operator() (const uchar* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL2_rvv<uchar, int>(src, n, j);
#else
        v_uint32 r0 = vx_setzero_u32(), r1 = vx_setzero_u32();
        for (; j <= n - 2 * VTraits<v_uint8>::vlanes(); j += 2 * VTraits<v_uint8>::vlanes()) {
            v_uint8 v0 = vx_load(src + j);
@@ -352,7 +329,6 @@ struct NormL2_SIMD<uchar, int> {
            r1 = v_dotprod_expand_fast(v1, v1, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
#endif
        for (; j < n; j++) {
            int v = saturate_cast<int>(src[j]);
            s += v * v;
@@ -366,9 +342,6 @@ struct NormL2_SIMD<schar, int> {
    int operator() (const schar* src, int n) const {
        int j = 0;
        int s = 0;
#if CV_RVV
        s = normL2_rvv<schar, int>(src, n, j);
#else
        v_int32 r0 = vx_setzero_s32(), r1 = vx_setzero_s32();
        for (; j <= n - 2 * VTraits<v_int8>::vlanes(); j += 2 * VTraits<v_int8>::vlanes()) {
            v_int8 v0 = vx_load(src + j);
@@ -377,7 +350,6 @@ struct NormL2_SIMD<schar, int> {
            r1 = v_dotprod_expand_fast(v1, v1, r1);
        }
        s += v_reduce_sum(v_add(r0, r1));
#endif
        for (; j < n; j++) {
            int v = saturate_cast<int>(src[j]);
            s += v * v;
@@ -825,31 +797,6 @@ struct NormL1_SIMD<float, double> {
    }
};

template<>
struct NormL1_SIMD<double, double> {
    double operator() (const double* src, int n) const {
        int j = 0;
        double s = 0.f;
#if CV_RVV // This is introduced to workaround the accuracy issue on ci
        s = normL1_rvv<double, double>(src, n, j);
#else
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
            r00 = v_add(r00, v_abs(vx_load(src + j                                   )));
            r01 = v_add(r01, v_abs(vx_load(src + j +     VTraits<v_float64>::vlanes())));
            r10 = v_add(r10, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
            r11 = v_add(r11, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            s += cv_abs(src[j]);
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<ushort, double> {
    double operator() (const ushort* src, int n) const {
@@ -941,14 +888,36 @@ struct NormL2_SIMD<float, double> {
    }
};

#endif

#if CV_SIMD_64F // CV_SIMD_SCALABLE_64F has accuracy problem with the following kernels on ci

template<>
struct NormL1_SIMD<double, double> {
    double operator() (const double* src, int n) const {
        int j = 0;
        double s = 0.f;
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
            r00 = v_add(r00, v_abs(vx_load(src + j                                   )));
            r01 = v_add(r01, v_abs(vx_load(src + j +     VTraits<v_float64>::vlanes())));
            r10 = v_add(r10, v_abs(vx_load(src + j + 2 * VTraits<v_float64>::vlanes())));
            r11 = v_add(r11, v_abs(vx_load(src + j + 3 * VTraits<v_float64>::vlanes())));
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
        for (; j < n; j++) {
            s += cv_abs(src[j]);
        }
        return s;
    }
};

template<>
struct NormL2_SIMD<double, double> {
    double operator() (const double* src, int n) const {
        int j = 0;
        double s = 0.f;
#if CV_RVV // This is introduced to workaround the accuracy issue on ci
        s = normL2_rvv<double, double>(src, n, j);
#else
        v_float64 r00 = vx_setzero_f64(), r01 = vx_setzero_f64();
        v_float64 r10 = vx_setzero_f64(), r11 = vx_setzero_f64();
        for (; j <= n - 4 * VTraits<v_float64>::vlanes(); j += 4 * VTraits<v_float64>::vlanes()) {
@@ -960,7 +929,6 @@ struct NormL2_SIMD<double, double> {
            r10 = v_fma(v10, v10, r10); r11 = v_fma(v11, v11, r11);
        }
        s += v_reduce_sum(v_add(v_add(v_add(r00, r01), r10), r11));
#endif
        for (; j < n; j++) {
            double v = src[j];
            s += v * v;
@@ -1362,7 +1330,9 @@ CV_DEF_NORM_ALL(64f, double, double, double, double)
NormFunc getNormFunc(int normType, int depth)
{
    CV_INSTRUMENT_REGION();
    static NormFunc normTab[3][8] =

    // [FIXME] append 0's when merging to 5.x
    static NormFunc normTab[3][CV_DEPTH_MAX] =
    {
        {
            (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
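The final hunk widens the lookup table from a hard-coded 8 columns to `CV_DEPTH_MAX` and leaves a reminder to pad the rows with zeros when merging to 5.x; `getNormFunc` can then pick a kernel by norm type and element depth with a plain table index. A schematic sketch of that kind of dispatch table — all names, the row order and `DEPTH_MAX` below are illustrative assumptions, not the exact OpenCV definitions:

```cpp
// Illustrative function-pointer dispatch table: one row per norm type,
// one column per element depth, nullptr for unsupported combinations.
using KernelFn = int (*)(const unsigned char* data, int len);

constexpr int DEPTH_MAX = 8;   // stand-in for CV_DEPTH_MAX

static int normInf_8u_stub(const unsigned char*, int) { return 0; }

static KernelFn kernelTab[3][DEPTH_MAX] = {
    { normInf_8u_stub },       // remaining depths default to nullptr
    { nullptr },
    { nullptr },
};

static KernelFn getKernel(int normRow, int depth) {
    return kernelTab[normRow][depth];   // caller must check for nullptr
}

int main() {
    unsigned char buf[4] = { 1, 2, 3, 4 };
    KernelFn fn = getKernel(0, 0);      // row 0, depth 0 -> the 8u stub
    return fn ? fn(buf, 4) : -1;
}
```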