Merge pull request #27115 from fengyuentau:4x/hal_rvv/normDiff

core: refactored normDiff in hal_rvv and extended with support of more data types #27115 

Merge with https://github.com/opencv/opencv_extra/pull/1246.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Yuantao Feng 2025-03-25 12:59:59 +08:00 committed by GitHub
parent 7d87f3cda6
commit a2a2f37ebb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 1235 additions and 625 deletions

View File

@ -5,6 +5,7 @@
#ifndef OPENCV_HAL_RVV_HPP_INCLUDED
#define OPENCV_HAL_RVV_HPP_INCLUDED
#include "opencv2/core/base.hpp"
#include "opencv2/core/hal/interface.h"
#include "opencv2/imgproc/hal/interface.h"

View File

@ -1,6 +1,9 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
#define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
@ -9,6 +12,8 @@
namespace cv { namespace cv_hal_rvv { namespace custom_intrin {
#define CV_HAL_RVV_NOOP(a) (a)
#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \
inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \
_Tpvs mask = __riscv_vsra(v, shift, vl); \
@ -25,6 +30,23 @@ CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8)
#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \
inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \
return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \
}
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin)
}}} // cv::cv_hal_rvv::custom_intrin
#endif

View File

@ -166,7 +166,7 @@ struct NormL1_RVV<uchar, int> {
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e8m8(n - i);
auto v = __riscv_vle8_v_u8m8(src + i, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
}
return __riscv_vmv_x(s);
}
@ -181,7 +181,7 @@ struct NormL1_RVV<schar, int> {
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e8m8(n - i);
auto v = custom_intrin::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
}
return __riscv_vmv_x(s);
}
@ -1008,6 +1008,7 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas
sizeof(int), sizeof(float),
sizeof(int64_t), 0,
};
CV_Assert(elem_size_tab[depth]);
bool src_continuous = (src_step == width * elem_size_tab[depth] * cn || (src_step != width * elem_size_tab[depth] * cn && height == 1));
bool mask_continuous = (mask_step == static_cast<size_t>(width));

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,9 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#include "precomp.hpp"
@ -55,8 +58,7 @@ struct NormDiffInf_SIMD {
inline ST operator() (const T* src1, const T* src2, int n) const {
ST s = 0;
for (int i = 0; i < n; i++) {
ST v = ST(src1[i] - src2[i]);
s = std::max(s, (ST)cv_abs(v));
s = std::max(s, (ST)std::abs(src1[i] - src2[i]));
}
return s;
}
@ -67,8 +69,7 @@ struct NormDiffL1_SIMD {
inline ST operator() (const T* src1, const T* src2, int n) const {
ST s = 0;
for (int i = 0; i < n; i++) {
ST v = ST(src1[i] - src2[i]);
s += cv_abs(v);
s += std::abs(src1[i] - src2[i]);
}
return s;
}
@ -79,7 +80,7 @@ struct NormDiffL2_SIMD {
inline ST operator() (const T* src1, const T* src2, int n) const {
ST s = 0;
for (int i = 0; i < n; i++) {
ST v = ST(src1[i] - src2[i]);
ST v = (ST)src1[i] - (ST)src2[i];
s += v * v;
}
return s;
@ -383,8 +384,7 @@ struct NormDiffInf_SIMD<uchar, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -415,8 +415,7 @@ struct NormDiffInf_SIMD<schar, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -447,8 +446,7 @@ struct NormDiffInf_SIMD<ushort, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -479,8 +477,7 @@ struct NormDiffInf_SIMD<short, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -511,8 +508,7 @@ struct NormDiffInf_SIMD<int, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -534,8 +530,7 @@ struct NormDiffInf_SIMD<float, float> {
}
s = v_reduce_max(v_max(r0, r1));
for (; j < n; j++) {
float v = src1[j] - src2[j];
s = std::max(s, cv_abs(v));
s = std::max(s, (float)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -558,8 +553,7 @@ struct NormDiffL1_SIMD<uchar, int> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s += (int)cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -582,8 +576,7 @@ struct NormDiffL1_SIMD<schar, int> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
int v =src1[j] - src2[j];
s += (int)cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -622,8 +615,7 @@ struct NormDiffL1_SIMD<ushort, int> {
}
s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s += (int)cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -662,8 +654,7 @@ struct NormDiffL1_SIMD<short, int> {
}
s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s += (int)cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -687,7 +678,7 @@ struct NormDiffL2_SIMD<uchar, int> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
int v = saturate_cast<int>(src1[j] - src2[j]);
int v = (int)src1[j] - (int)src2[j];
s += v * v;
}
return s;
@ -712,7 +703,7 @@ struct NormDiffL2_SIMD<schar, int> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
int v = saturate_cast<int>(src1[j] - src2[j]);
int v = (int)src1[j] - (int)src2[j];
s += v * v;
}
return s;
@ -955,11 +946,10 @@ struct NormDiffInf_SIMD<double, double> {
double t[VTraits<v_float64>::max_nlanes];
vx_store(t, v_max(r0, r1));
for (int i = 0; i < VTraits<v_float64>::vlanes(); i++) {
s = std::max(s, cv_abs(t[i]));
s = std::max(s, std::abs(t[i]));
}
for (; j < n; j++) {
double v = src1[j] - src2[j];
s = std::max(s, cv_abs(v));
s = std::max(s, (double)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -971,21 +961,17 @@ struct NormDiffL1_SIMD<int, double> {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
for (; j <= n - VTraits<v_int32>::vlanes(); j += VTraits<v_int32>::vlanes()) {
v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02)));
r0 = v_add(r0, v_cvt_f64(v0)); r1 = v_add(r1, v_cvt_f64_high(v0));
v_int32 v11 = vx_load(src1 + j + VTraits<v_int32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int32>::vlanes());
v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12)));
r2 = v_add(r2, v_cvt_f64(v1)); r3 = v_add(r3, v_cvt_f64_high(v1));
v_uint32 v0 = v_abs(v_sub(v01, v02));
v_uint64 ev0, ev1;
v_expand(v0, ev0, ev1);
r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(ev0)));
r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(ev1)));
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -1010,8 +996,7 @@ struct NormDiffL1_SIMD<float, double> {
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -1033,8 +1018,7 @@ struct NormDiffL1_SIMD<double, double> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -1060,7 +1044,7 @@ struct NormDiffL2_SIMD<ushort, double> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = saturate_cast<double>(src1[j] - src2[j]);
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1087,7 +1071,7 @@ struct NormDiffL2_SIMD<short, double> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = saturate_cast<double>(src1[j] - src2[j]);
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1100,24 +1084,17 @@ struct NormDiffL2_SIMD<int, double> {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
for (; j <= n - VTraits<v_int32>::vlanes(); j += VTraits<v_int32>::vlanes()) {
v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02)));
v_float64 f00, f01;
f00 = v_cvt_f64(v0); f01 = v_cvt_f64_high(v0);
r0 = v_fma(f00, f00, r0); r1 = v_fma(f01, f01, r1);
v_int32 v11 = vx_load(src1 + j + VTraits<v_int32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int32>::vlanes());
v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12)));
v_float64 f10, f11;
f10 = v_cvt_f64(v1); f11 = v_cvt_f64_high(v1);
r2 = v_fma(f10, f10, r2); r3 = v_fma(f11, f11, r3);
v_uint32 v0 = v_absdiff(v01, v02);
v_uint64 ev0, ev1;
v_expand(v0, ev0, ev1);
v_float64 f0 = v_cvt_f64(v_reinterpret_as_s64(ev0)), f1 = v_cvt_f64(v_reinterpret_as_s64(ev1));
r0 = v_fma(f0, f0, r0); r1 = v_fma(f1, f1, r1);
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = src1[j] - src2[j];
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1145,7 +1122,7 @@ struct NormDiffL2_SIMD<float, double> {
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
double v = src1[j] - src2[j];
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1181,7 +1158,7 @@ struct NormDiffL2_SIMD<double, double> {
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
double v = src1[j] - src2[j];
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1297,7 +1274,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) {
if( mask[i] ) {
for( int k = 0; k < cn; k++ ) {
ST v = src1[k] - src2[k];
ST v = (ST)src1[k] - (ST)src2[k];
result += v*v;
}
}
@ -1331,7 +1308,6 @@ NormFunc getNormFunc(int normType, int depth)
{
CV_INSTRUMENT_REGION();
// [FIXME] append 0's when merging to 5.x
static NormFunc normTab[3][CV_DEPTH_MAX] =
{
{
@ -1353,7 +1329,7 @@ NormFunc getNormFunc(int normType, int depth)
NormDiffFunc getNormDiffFunc(int normType, int depth)
{
static NormDiffFunc normDiffTab[3][8] =
static NormDiffFunc normDiffTab[3][CV_DEPTH_MAX] =
{
{
(NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,