diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp index a0c1d0e52f..8287ebeda9 100644 --- a/3rdparty/hal_rvv/hal_rvv.hpp +++ b/3rdparty/hal_rvv/hal_rvv.hpp @@ -5,6 +5,7 @@ #ifndef OPENCV_HAL_RVV_HPP_INCLUDED #define OPENCV_HAL_RVV_HPP_INCLUDED +#include "opencv2/core/base.hpp" #include "opencv2/core/hal/interface.h" #include "opencv2/imgproc/hal/interface.h" diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/common.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/common.hpp index 8db03267e1..9fc01d2897 100644 --- a/3rdparty/hal_rvv/hal_rvv_1p0/common.hpp +++ b/3rdparty/hal_rvv/hal_rvv_1p0/common.hpp @@ -1,6 +1,9 @@ // This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Third party copyrights are property of their respective owners. #ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED #define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED @@ -9,6 +12,8 @@ namespace cv { namespace cv_hal_rvv { namespace custom_intrin { +#define CV_HAL_RVV_NOOP(a) (a) + #define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \ inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \ _Tpvs mask = __riscv_vsra(v, shift, vl); \ @@ -25,6 +30,23 @@ CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8) CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4) CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8) +#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \ + inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \ + return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \ + } + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin) + }}} // cv::cv_hal_rvv::custom_intrin #endif diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp index 9e79940390..c35c0a3bd5 100644 --- a/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp +++ b/3rdparty/hal_rvv/hal_rvv_1p0/norm.hpp @@ -166,7 +166,7 @@ struct NormL1_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e8m8(n - i); auto v = __riscv_vle8_v_u8m8(src + i, vl); - s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); } @@ -181,7 +181,7 @@ struct NormL1_RVV { 
for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e8m8(n - i); auto v = custom_intrin::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl); - s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); } @@ -1008,6 +1008,7 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas sizeof(int), sizeof(float), sizeof(int64_t), 0, }; + CV_Assert(elem_size_tab[depth]); bool src_continuous = (src_step == width * elem_size_tab[depth] * cn || (src_step != width * elem_size_tab[depth] * cn && height == 1)); bool mask_continuous = (mask_step == static_cast<size_t>(width)); diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/norm_diff.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/norm_diff.hpp index cfb9fba6c7..d70a50a987 100644 --- a/3rdparty/hal_rvv/hal_rvv_1p0/norm_diff.hpp +++ b/3rdparty/hal_rvv/hal_rvv_1p0/norm_diff.hpp @@ -1,606 +1,1216 @@ // This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. - +// // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Third party copyrights are property of their respective owners. #ifndef OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED #define OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED -#include <riscv_vector.h> +#include "common.hpp" namespace cv { namespace cv_hal_rvv { namespace norm_diff { #undef cv_hal_normDiff #define cv_hal_normDiff cv::cv_hal_rvv::norm_diff::normDiff -inline int normDiffInf_8UC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m8(); - auto vec_max = __riscv_vmv_v_x_u8m8(0, vlmax); +namespace { - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m8(width - j); - auto vec_src1 = __riscv_vle8_v_u8m8(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m8(src2_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m8(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_src = __riscv_vsub_vv_u8m8_m(bool_mask, __riscv_vmaxu_vv_u8m8_m(bool_mask, vec_src1, vec_src2, vl), - __riscv_vminu_vv_u8m8_m(bool_mask, vec_src1, vec_src2, vl), vl); - vec_max = __riscv_vmaxu_tumu(bool_mask, vec_max, vec_max, vec_src, vl); +template <typename T, typename ST> +struct NormDiffInf_RVV { + inline ST operator() (const T* src1, const T* src2, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + s = std::max(s, (ST)std::abs(src1[i] - src2[i])); + } + return s; + } +}; + +template <typename T, typename ST> +struct NormDiffL1_RVV { + inline ST operator() (const T* src1, const T* src2, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + s += std::abs(src1[i] - src2[i]); + } + return s; + } +}; + +template <typename T, typename ST> +struct NormDiffL2_RVV { + inline ST operator() (const T* src1, const T* src2, int n) const { + ST s = 0; + for (int i = 0; i < n; i++) { + ST v = (ST)src1[i] - (ST)src2[i]; + s += v * v; + } + return s; + } +}; + +template<> +struct NormDiffInf_RVV<uchar, int> { + int operator() (const uchar* src1, const uchar* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e8m8(); + auto s = __riscv_vmv_v_x_u8m8(0, vlmax);
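+ // Strip-mining loop: vsetvl clamps vl to the elements that remain, so the final iteration handles the tail.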
+ int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m8(n - i); + auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); + auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vmaxu_tu(s, s, v, vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); + } +}; + +template<> +struct NormDiffInf_RVV<schar, int> { + int operator() (const schar* src1, const schar* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e8m8(); + auto s = __riscv_vmv_v_x_u8m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m8(n - i); + auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl); + auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vmaxu_tu(s, s, v, vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); + } +}; + +template<> +struct NormDiffInf_RVV<ushort, int> { + int operator() (const ushort* src1, const ushort* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e16m8(); + auto s = __riscv_vmv_v_x_u16m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m8(n - i); + auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl); + auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vmaxu_tu(s, s, v, vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); + } +}; + +template<> +struct NormDiffInf_RVV<short, int> { + int operator() (const short* src1, const short* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e16m8(); + auto s = __riscv_vmv_v_x_u16m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m8(n - i); + auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl); + auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vmaxu_tu(s, s, v, vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); + } +}; + +template<> +struct NormDiffInf_RVV<int, int> { + int operator() (const int* src1, const int* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e32m8(); + auto s = __riscv_vmv_v_x_u32m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m8(n - i); + auto v1 = __riscv_vle32_v_i32m8(src1 + i, vl); + auto v2 = __riscv_vle32_v_i32m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vmaxu_tu(s, s, v, vl); + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct NormDiffInf_RVV<float, float> { + float operator() (const float* src1, const float* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e32m8(); + auto s = __riscv_vfmv_v_f_f32m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m8(n - i); + auto v1 = __riscv_vle32_v_f32m8(src1 + i, vl); + auto v2 = __riscv_vle32_v_f32m8(src2 + i, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl); + s = __riscv_vfmax_tu(s, s, v, vl); + } + return __riscv_vfmv_f(__riscv_vfredmax(s, __riscv_vfmv_s_f_f32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct NormDiffInf_RVV<double, double> { + double operator() (const double* src1, const double* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = 
__riscv_vsetvl_e64m8(n - i); + auto v1 = __riscv_vle64_v_f64m8(src1 + i, vl); + auto v2 = __riscv_vle64_v_f64m8(src2 + i, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl); + s = __riscv_vfmax_tu(s, s, v, vl); + } + return __riscv_vfmv_f(__riscv_vfredmax(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormDiffL1_RVV<uchar, int> { + int operator() (const uchar* src1, const uchar* src2, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + auto zero = __riscv_vmv_v_x_u16m1(0, __riscv_vsetvlmax_e16m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m8(n - i); + auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); + auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormDiffL1_RVV<schar, int> { + int operator() (const schar* src1, const schar* src2, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + auto zero = __riscv_vmv_v_x_u16m1(0, __riscv_vsetvlmax_e16m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m8(n - i); + auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl); + auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormDiffL1_RVV<ushort, int> { + int operator() (const ushort* src1, const ushort* src2, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m8(n - i); + auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl); + auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vwredsumu(v, s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormDiffL1_RVV<short, int> { + int operator() (const short* src1, const short* src2, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m8(n - i); + auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl); + auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vwredsumu(v, s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormDiffL1_RVV<int, double> { + double operator() (const int* src1, const int* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m4(n - i); + auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl); + auto v2 = __riscv_vle32_v_i32m4(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormDiffL1_RVV<float, double> { + double operator() (const float* src1, const float* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m4(n - i); + auto v1 = __riscv_vle32_v_f32m4(src1 + i, vl); + auto v2 = __riscv_vle32_v_f32m4(src2 + i, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl);
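+ // Note: the f32 absolute differences are widened to f64 (vfwcvt_f) before accumulation to limit rounding error.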
+ s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormDiffL1_RVV<double, double> { + double operator() (const double* src1, const double* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e64m8(n - i); + auto v1 = __riscv_vle64_v_f64m8(src1 + i, vl); + auto v2 = __riscv_vle64_v_f64m8(src2 + i, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl); + s = __riscv_vfadd_tu(s, s, v, vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormDiffL2_RVV<uchar, int> { + int operator() (const uchar* src1, const uchar* src2, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m4(n - i); + auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl); + auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormDiffL2_RVV<schar, int> { + int operator() (const schar* src1, const schar* src2, int n) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e8m4(n - i); + auto v1 = __riscv_vle8_v_i8m4(src1 + i, vl); + auto v2 = __riscv_vle8_v_i8m4(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl); + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct NormDiffL2_RVV<ushort, double> { + double operator() (const ushort* src1, const ushort* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e16m2(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + auto v1 = __riscv_vle16_v_u16m2(src1 + i, vl); + auto v2 = __riscv_vle16_v_u16m2(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v_mul = __riscv_vwmulu(v, v, vl); + s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormDiffL2_RVV<short, double> { + double operator() (const short* src1, const short* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e16m2(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + auto v1 = __riscv_vle16_v_i16m2(src1 + i, vl); + auto v2 = __riscv_vle16_v_i16m2(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v_mul = __riscv_vwmulu(v, v, vl); + s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormDiffL2_RVV<int, double> { + double operator() (const int* src1, const int* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m4(n - i); + auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl); + auto v2 = __riscv_vle32_v_i32m4(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v_mul = __riscv_vwmulu(v, v, vl); + s = 
__riscv_vfadd_tu(s, s, __riscv_vfcvt_f(v_mul, vl), vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormDiffL2_RVV<float, double> { + double operator() (const float* src1, const float* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m4(n - i); + auto v1 = __riscv_vle32_v_f32m4(src1 + i, vl); + auto v2 = __riscv_vle32_v_f32m4(src2 + i, vl); + auto v = __riscv_vfsub(v1, v2, vl); + auto v_mul = __riscv_vfwmul(v, v, vl); + s = __riscv_vfadd_tu(s, s, v_mul, vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct NormDiffL2_RVV<double, double> { + double operator() (const double* src1, const double* src2, int n) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + int vl; + for (int i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e64m8(n - i); + auto v1 = __riscv_vle64_v_f64m8(src1 + i, vl); + auto v2 = __riscv_vle64_v_f64m8(src2 + i, vl); + auto v = __riscv_vfsub(v1, v2, vl); + auto v_mul = __riscv_vfmul(v, v, vl); + s = __riscv_vfadd_tu(s, s, v_mul, vl); + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +// Norm with mask + +template <typename T, typename ST> +struct MaskedNormDiffInf_RVV { + inline ST operator() (const T* src1, const T* src2, const uchar* mask, int len, int cn) const { + ST s = 0; + for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + s = std::max(s, (ST)std::abs(src1[k] - src2[k])); + } } } + return s; } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m8(width - j); - auto vec_src1 = __riscv_vle8_v_u8m8(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m8(src2_row + j, vl); - auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl); - vec_max = __riscv_vmaxu_tu(vec_max, vec_max, vec_src, vl); +}; + +template <typename T, typename ST> +struct MaskedNormDiffL1_RVV { + inline ST operator() (const T* src1, const T* src2, const uchar* mask, int len, int cn) const { + ST s = 0; + for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + s += std::abs(src1[k] - src2[k]); + } } } + return s; } - } - auto sc_max = __riscv_vmv_s_x_u8m1(0, vlmax); - sc_max = __riscv_vredmaxu(vec_max, sc_max, vlmax); - *result = __riscv_vmv_x(sc_max); +}; - return CV_HAL_ERROR_OK; +template <typename T, typename ST> +struct MaskedNormDiffL2_RVV { + inline ST operator() (const T* src1, const T* src2, const uchar* mask, int len, int cn) const { + ST s = 0; + for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) { + if( mask[i] ) { + for( int k = 0; k < cn; k++ ) { + ST v = (ST)src1[k] - (ST)src2[k]; + s += v * v; + } + } + } + return s; + } +}; + +template<> +struct MaskedNormDiffInf_RVV<uchar, int> { + int operator() (const uchar* src1, const uchar* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e8m8(); + auto s = __riscv_vmv_v_x_u8m8(0, vlmax); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); + auto v2 = __riscv_vle8_v_u8m8(src2 + 
i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, v, vl); + } + } else if (cn == 4) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m2(len - i); + auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4); + auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); + s = __riscv_vmaxu_tumu(b, s, s, v, vl * 4); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); + auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, v, vl); + } + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffInf_RVV<schar, int> { + int operator() (const schar* src1, const schar* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e8m8(); + auto s = __riscv_vmv_v_x_u8m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); + auto v2 = __riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, v, vl); + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffInf_RVV<ushort, int> { + int operator() (const ushort* src1, const ushort* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e16m8(); + auto s = __riscv_vmv_v_x_u16m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m8(len - i); + auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); + auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, v, vl); + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffInf_RVV<short, int> { + int operator() (const short* src1, const short* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e16m8(); + auto s = __riscv_vmv_v_x_u16m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m8(len - i); + auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl); + auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl);
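+ // Strided (vlse) loads gather one channel at a time; the stride is cn elements.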
+ auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, v, vl); + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffInf_RVV<int, int> { + int operator() (const int* src1, const int* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m8(); + auto s = __riscv_vmv_v_x_u32m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m8(len - i); + auto v1 = __riscv_vlse32_v_i32m8(src1 + cn * i + cn_index, sizeof(int) * cn, vl); + auto v2 = __riscv_vlse32_v_i32m8(src2 + cn * i + cn_index, sizeof(int) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vmaxu_tumu(b, s, s, v, vl); + } + } + return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffInf_RVV<float, float> { + float operator() (const float* src1, const float* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m8(); + auto s = __riscv_vfmv_v_f_f32m8(0, vlmax); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m8(len - i); + auto v1 = __riscv_vle32_v_f32m8(src1 + i, vl); + auto v2 = __riscv_vle32_v_f32m8(src2 + i, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfmax_tumu(b, s, s, v, vl); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m8(len - i); + auto v1 = __riscv_vlse32_v_f32m8(src1 + cn * i + cn_index, sizeof(float) * cn, vl); + auto v2 = __riscv_vlse32_v_f32m8(src2 + cn * i + cn_index, sizeof(float) * cn, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfmax_tumu(b, s, s, v, vl); + } + } + } + return __riscv_vfmv_f(__riscv_vfredmax(s, __riscv_vfmv_s_f_f32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffInf_RVV<double, double> { + double operator() (const double* src1, const double* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e64m8(len - i); + auto v1 = __riscv_vlse64_v_f64m8(src1 + cn * i + cn_index, sizeof(double) * cn, vl); + auto v2 = __riscv_vlse64_v_f64m8(src2 + cn * i + cn_index, sizeof(double) * cn, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfmax_tumu(b, s, s, __riscv_vfabs(v, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredmax(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffL1_RVV<uchar, int> { + int operator() (const uchar* src1, const uchar* src2, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + auto zero = __riscv_vmv_v_x_u16m1(0, __riscv_vsetvlmax_e16m1()); + if (cn == 1) { + int vl;
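+ // cn == 1 and cn == 4 take contiguous fast paths; the cn == 4 path below expands each mask byte to four lanes.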
+ for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); + auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); + } + } else if (cn == 4) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m2(len - i); + auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4); + auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto m = __riscv_vle8_v_u8m2(mask + i, vl); + auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); + s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl * 4), s, __riscv_vsetvlmax_e16m1()); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); + auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); + } + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormDiffL1_RVV<schar, int> { + int operator() (const schar* src1, const schar* src2, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + auto zero = __riscv_vmv_v_x_u16m1(0, __riscv_vsetvlmax_e16m1()); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m8(len - i); + auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); + auto v2 = __riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m8(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormDiffL1_RVV<ushort, int> { + int operator() (const ushort* src1, const ushort* src2, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i); + auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); + auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu_tum(b, s, v, s, vl); + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormDiffL1_RVV<short, int> { + int operator() (const short* src1, const short* src2, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i);
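+ // vsetvl_e8m4 and e16m8 yield the same element count, so the u8m4 mask load lines up with the u16m8 data.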
+ auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl); + auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu_tum(b, s, v, s, vl); + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormDiffL1_RVV<int, double> { + double operator() (const int* src1, const int* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl); + auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffL1_RVV<float, double> { + double operator() (const float* src1, const float* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v1 = __riscv_vle32_v_f32m4(src1 + i, vl); + auto v2 = __riscv_vle32_v_f32m4(src2 + i, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v, vl), vl); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v1 = __riscv_vlse32_v_f32m4(src1 + cn * i + cn_index, sizeof(float) * cn, vl); + auto v2 = __riscv_vlse32_v_f32m4(src2 + cn * i + cn_index, sizeof(float) * cn, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v, vl), vl); + } + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffL1_RVV<double, double> { + double operator() (const double* src1, const double* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e64m8(len - i); + auto v1 = __riscv_vlse64_v_f64m8(src1 + cn * i + cn_index, sizeof(double) * cn, vl); + auto v2 = __riscv_vlse64_v_f64m8(src2 + cn * i + cn_index, sizeof(double) * cn, vl); + auto v = __riscv_vfabs(__riscv_vfsub(v1, v2, vl), vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfabs(v, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffL2_RVV<uchar, int> {
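+ // Squared u8 differences are widened to u16 by vwmulu, then reduced into a u32 scalar accumulator.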
+ int operator() (const uchar* src1, const uchar* src2, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i); + auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl); + auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); + } + } else if (cn == 4) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m1(len - i); + auto v1 = __riscv_vle8_v_u8m4(src1 + i * 4, vl * 4); + auto v2 = __riscv_vle8_v_u8m4(src2 + i * 4, vl * 4); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(__riscv_vreinterpret_u8m4(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); + s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl * 4), s, vl * 4); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i); + auto v1 = __riscv_vlse8_v_u8m4(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); + auto v2 = __riscv_vlse8_v_u8m4(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); + } + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormDiffL2_RVV<schar, int> { + int operator() (const schar* src1, const schar* src2, const uchar* mask, int len, int cn) const { + auto s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1()); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e8m4(len - i); + auto v1 = __riscv_vlse8_v_i8m4(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); + auto v2 = __riscv_vlse8_v_i8m4(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m4(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); + } + } + return __riscv_vmv_x(s); + } +}; + +template<> +struct MaskedNormDiffL2_RVV<ushort, double> { + double operator() (const ushort* src1, const ushort* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e16m2(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m2(len - i); + auto v1 = __riscv_vlse16_v_u16m2(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); + auto v2 = __riscv_vlse16_v_u16m2(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vwmulu(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v_mul, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffL2_RVV<short, double> { + double operator() (const short* src1, const short* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e16m2(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl;
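+ // e16m2 sources widen to u32m4 products and f64m8 sums, staying within the register budget.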
+ for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m2(len - i); + auto v1 = __riscv_vlse16_v_i16m2(src1 + cn * i + cn_index, sizeof(short) * cn, vl); + auto v2 = __riscv_vlse16_v_i16m2(src2 + cn * i + cn_index, sizeof(short) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vwmulu(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v_mul, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffL2_RVV<int, double> { + double operator() (const int* src1, const int* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e16m2(len - i); + auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl); + auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl); + auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vwmulu(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfcvt_f(b, v_mul, vl), vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffL2_RVV<float, double> { + double operator() (const float* src1, const float* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e32m4(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + if (cn == 1) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v1 = __riscv_vle32_v_f32m4(src1 + i, vl); + auto v2 = __riscv_vle32_v_f32m4(src2 + i, vl); + auto v = __riscv_vfsub(v1, v2, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vfwmul(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, v_mul, vl); + } + } else { + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e32m4(len - i); + auto v1 = __riscv_vlse32_v_f32m4(src1 + cn * i + cn_index, sizeof(float) * cn, vl); + auto v2 = __riscv_vlse32_v_f32m4(src2 + cn * i + cn_index, sizeof(float) * cn, vl); + auto v = __riscv_vfsub(v1, v2, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vfwmul(b, v, v, vl); + s = __riscv_vfadd_tumu(b, s, s, v_mul, vl); + } + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<> +struct MaskedNormDiffL2_RVV<double, double> { + double operator() (const double* src1, const double* src2, const uchar* mask, int len, int cn) const { + int vlmax = __riscv_vsetvlmax_e64m8(); + auto s = __riscv_vfmv_v_f_f64m8(0, vlmax); + for (int cn_index = 0; cn_index < cn; cn_index++) { + int vl; + for (int i = 0; i < len; i += vl) { + vl = __riscv_vsetvl_e64m8(len - i); + auto v1 = __riscv_vlse64_v_f64m8(src1 + cn * i + cn_index, sizeof(double) * cn, vl); + auto v2 = __riscv_vlse64_v_f64m8(src2 + cn * i + cn_index, sizeof(double) * cn, vl); + auto v = __riscv_vfsub(v1, v2, vl); + auto m = __riscv_vle8_v_u8m1(mask + i, vl); + auto b = __riscv_vmsne(m, 0, vl); + auto v_mul = __riscv_vfmul(b, v, v, vl);
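+ // _tumu add: masked-off and tail lanes of s keep their previous values across iterations.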
+ s = __riscv_vfadd_tumu(b, s, s, v_mul, vl); + } + } + return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); + } +}; + +template<typename T, typename ST> int +normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormDiffInf_RVV<T, ST> op; + result = std::max(result, op(src1, src2, len*cn)); + } else { + MaskedNormDiffInf_RVV<T, ST> op; + result = std::max(result, op(src1, src2, mask, len, cn)); + } + *_result = result; + return 0; } -inline int normDiffL1_8UC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m2(); - auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_src = __riscv_vsub_vv_u8m2_m(bool_mask, __riscv_vmaxu_vv_u8m2_m(bool_mask, vec_src1, vec_src2, vl), - __riscv_vminu_vv_u8m2_m(bool_mask, vec_src1, vec_src2, vl), vl); - auto vec_zext = __riscv_vzext_vf4_u32m8_m(bool_mask, vec_src, vl); - vec_sum = __riscv_vadd_tumu(bool_mask, vec_sum, vec_sum, vec_zext, vl); - } - } +template<typename T, typename ST> int +normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormDiffL1_RVV<T, ST> op; + result += op(src1, src2, len*cn); + } else { + MaskedNormDiffL1_RVV<T, ST> op; + result += op(src1, src2, mask, len, cn); } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl); - auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl); - auto vec_zext = __riscv_vzext_vf4(vec_src, vl); - vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); - } - } - } - auto sc_sum = __riscv_vmv_s_x_u32m1(0, vlmax); - sc_sum = __riscv_vredsum(vec_sum, sc_sum, vlmax); - *result = __riscv_vmv_x(sc_sum); - - return CV_HAL_ERROR_OK; + *_result = result; + return 0; } -inline int normDiffL2Sqr_8UC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m2(); - auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - int cnt = 0; - auto reduce = [&](int vl) { - if ((cnt += vl) < (1 << 16)) - return; - cnt = vl; - for (int i = 0; i < vlmax; i++) - { - *result += __riscv_vmv_x(vec_sum); - vec_sum = __riscv_vslidedown(vec_sum, 1, vlmax); - } - vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - }; - - *result = 0; - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = 
__riscv_vsetvl_e8m2(width - j); - reduce(vl); - - auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_src = __riscv_vsub_vv_u8m2_m(bool_mask, __riscv_vmaxu_vv_u8m2_m(bool_mask, vec_src1, vec_src2, vl), - __riscv_vminu_vv_u8m2_m(bool_mask, vec_src1, vec_src2, vl), vl); - auto vec_mul = __riscv_vwmulu_vv_u16m4_m(bool_mask, vec_src, vec_src, vl); - auto vec_zext = __riscv_vzext_vf2_u32m8_m(bool_mask, vec_mul, vl); - vec_sum = __riscv_vadd_tumu(bool_mask, vec_sum, vec_sum, vec_zext, vl); - } - } +template<typename T, typename ST> int +normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) { + ST result = *_result; + if( !mask ) { + NormDiffL2_RVV<T, ST> op; + result += op(src1, src2, len*cn); + } else { + MaskedNormDiffL2_RVV<T, ST> op; + result += op(src1, src2, mask, len, cn); } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - reduce(vl); - - auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl); - auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl); - auto vec_mul = __riscv_vwmulu(vec_src, vec_src, vl); - auto vec_zext = __riscv_vzext_vf2(vec_mul, vl); - vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); - } - } - } - reduce(1 << 16); - - return CV_HAL_ERROR_OK; + *_result = result; + return 0; } -inline int normDiffInf_8UC4(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m8(); - auto vec_max = __riscv_vmv_v_x_u8m8(0, vlmax); +#define CV_HAL_RVV_DEF_NORM_DIFF_FUNC(L, suffix, type, ntype) \ + static int normDiff##L##_##suffix(const type* src1, const type* src2, const uchar* mask, ntype* r, int len, int cn) \ + { return normDiff##L##_(src1, src2, mask, r, len, cn); } - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - const uchar* mask_row = mask + i * mask_step; - int vl, vlm; - for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm) - { - vl = __riscv_vsetvl_e8m8(width * 4 - j); - vlm = __riscv_vsetvl_e8m2(width - jm); - auto vec_src1 = __riscv_vle8_v_u8m8(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m8(src2_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m2(mask_row + jm, vlm); - auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm); - auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m8(vec_mask_ext), 0, vl); - auto vec_src = __riscv_vsub_vv_u8m8_m(bool_mask_ext, __riscv_vmaxu_vv_u8m8_m(bool_mask_ext, vec_src1, vec_src2, vl), - __riscv_vminu_vv_u8m8_m(bool_mask_ext, vec_src1, vec_src2, vl), vl); - vec_max = __riscv_vmaxu_tumu(bool_mask_ext, vec_max, vec_max, vec_src, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - int vl; - for (int j = 0; j < width * 4; j += vl) - { - vl = __riscv_vsetvl_e8m8(width * 4 - j); - auto vec_src1 = __riscv_vle8_v_u8m8(src1_row + j, vl); - auto vec_src2 = 
__riscv_vle8_v_u8m8(src2_row + j, vl); - auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl); - vec_max = __riscv_vmaxu_tu(vec_max, vec_max, vec_src, vl); - } - } - } - auto sc_max = __riscv_vmv_s_x_u8m1(0, vlmax); - sc_max = __riscv_vredmaxu(vec_max, sc_max, vlmax); - *result = __riscv_vmv_x(sc_max); +#define CV_HAL_RVV_DEF_NORM_DIFF_ALL(suffix, type, inftype, l1type, l2type) \ + CV_HAL_RVV_DEF_NORM_DIFF_FUNC(Inf, suffix, type, inftype) \ + CV_HAL_RVV_DEF_NORM_DIFF_FUNC(L1, suffix, type, l1type) \ + CV_HAL_RVV_DEF_NORM_DIFF_FUNC(L2, suffix, type, l2type) + +CV_HAL_RVV_DEF_NORM_DIFF_ALL(8u, uchar, int, int, int) +CV_HAL_RVV_DEF_NORM_DIFF_ALL(8s, schar, int, int, int) +CV_HAL_RVV_DEF_NORM_DIFF_ALL(16u, ushort, int, int, double) +CV_HAL_RVV_DEF_NORM_DIFF_ALL(16s, short, int, int, double) +CV_HAL_RVV_DEF_NORM_DIFF_ALL(32s, int, int, double, double) +CV_HAL_RVV_DEF_NORM_DIFF_ALL(32f, float, float, double, double) +CV_HAL_RVV_DEF_NORM_DIFF_ALL(64f, double, double, double, double) + +#undef CV_HAL_RVV_DEF_NORM_DIFF_ALL +#undef CV_HAL_RVV_DEF_NORM_DIFF_FUNC - return CV_HAL_ERROR_OK; -} - -inline int normDiffL1_8UC4(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m2(); - auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - const uchar* mask_row = mask + i * mask_step; - int vl, vlm; - for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm) - { - vl = __riscv_vsetvl_e8m2(width * 4 - j); - vlm = __riscv_vsetvl_e8mf2(width - jm); - auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8mf2(mask_row + jm, vlm); - auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm); - auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m2(vec_mask_ext), 0, vl); - auto vec_src = __riscv_vsub_vv_u8m2_m(bool_mask_ext, __riscv_vmaxu_vv_u8m2_m(bool_mask_ext, vec_src1, vec_src2, vl), - __riscv_vminu_vv_u8m2_m(bool_mask_ext, vec_src1, vec_src2, vl), vl); - auto vec_zext = __riscv_vzext_vf4_u32m8_m(bool_mask_ext, vec_src, vl); - vec_sum = __riscv_vadd_tumu(bool_mask_ext, vec_sum, vec_sum, vec_zext, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - int vl; - for (int j = 0; j < width * 4; j += vl) - { - vl = __riscv_vsetvl_e8m2(width * 4 - j); - auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl); - auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl); - auto vec_zext = __riscv_vzext_vf4(vec_src, vl); - vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); - } - } - } - auto sc_sum = __riscv_vmv_s_x_u32m1(0, vlmax); - sc_sum = __riscv_vredsum(vec_sum, sc_sum, vlmax); - *result = __riscv_vmv_x(sc_sum); - - return CV_HAL_ERROR_OK; -} - -inline int normDiffL2Sqr_8UC4(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e8m2(); - auto vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); 
- int cnt = 0; - auto reduce = [&](int vl) { - if ((cnt += vl) < (1 << 16)) - return; - cnt = vl; - for (int i = 0; i < vlmax; i++) - { - *result += __riscv_vmv_x(vec_sum); - vec_sum = __riscv_vslidedown(vec_sum, 1, vlmax); - } - vec_sum = __riscv_vmv_v_x_u32m8(0, vlmax); - }; - - *result = 0; - if (mask) - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - const uchar* mask_row = mask + i * mask_step; - int vl, vlm; - for (int j = 0, jm = 0; j < width * 4; j += vl, jm += vlm) - { - vl = __riscv_vsetvl_e8m2(width * 4 - j); - vlm = __riscv_vsetvl_e8mf2(width - jm); - reduce(vl); - - auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8mf2(mask_row + jm, vlm); - auto vec_mask_ext = __riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(vec_mask, 1, vlm), vlm), 0x01010101, vlm); - auto bool_mask_ext = __riscv_vmsne(__riscv_vreinterpret_u8m2(vec_mask_ext), 0, vl); - auto vec_src = __riscv_vsub_vv_u8m2_m(bool_mask_ext, __riscv_vmaxu_vv_u8m2_m(bool_mask_ext, vec_src1, vec_src2, vl), - __riscv_vminu_vv_u8m2_m(bool_mask_ext, vec_src1, vec_src2, vl), vl); - auto vec_mul = __riscv_vwmulu_vv_u16m4_m(bool_mask_ext, vec_src, vec_src, vl); - auto vec_zext = __riscv_vzext_vf2_u32m8_m(bool_mask_ext, vec_mul, vl); - vec_sum = __riscv_vadd_tumu(bool_mask_ext, vec_sum, vec_sum, vec_zext, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const uchar* src1_row = src1 + i * src1_step; - const uchar* src2_row = src2 + i * src2_step; - int vl; - for (int j = 0; j < width * 4; j += vl) - { - vl = __riscv_vsetvl_e8m2(width * 4 - j); - reduce(vl); - - auto vec_src1 = __riscv_vle8_v_u8m2(src1_row + j, vl); - auto vec_src2 = __riscv_vle8_v_u8m2(src2_row + j, vl); - auto vec_src = __riscv_vsub(__riscv_vmaxu(vec_src1, vec_src2, vl), __riscv_vminu(vec_src1, vec_src2, vl), vl); - auto vec_mul = __riscv_vwmulu(vec_src, vec_src, vl); - auto vec_zext = __riscv_vzext_vf2(vec_mul, vl); - vec_sum = __riscv_vadd_tu(vec_sum, vec_sum, vec_zext, vl); - } - } - } - reduce(1 << 16); - - return CV_HAL_ERROR_OK; -} - -inline int normDiffInf_32FC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e32m8(); - auto vec_max = __riscv_vfmv_v_f_f32m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const float* src1_row = reinterpret_cast<const float*>(src1 + i * src1_step); - const float* src2_row = reinterpret_cast<const float*>(src2 + i * src2_step); - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m8(width - j); - auto vec_src1 = __riscv_vle32_v_f32m8(src1_row + j, vl); - auto vec_src2 = __riscv_vle32_v_f32m8(src2_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m2(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_src = __riscv_vfsub_vv_f32m8_m(bool_mask, vec_src1, vec_src2, vl); - auto vec_abs = __riscv_vfabs_v_f32m8_m(bool_mask, vec_src, vl); - vec_max = __riscv_vfmax_tumu(bool_mask, vec_max, vec_max, vec_abs, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const float* src1_row = reinterpret_cast<const float*>(src1 + i * src1_step); - const float* src2_row = reinterpret_cast<const float*>(src2 + i * src2_step); - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m8(width - j); - auto vec_src1 = 
__riscv_vle32_v_f32m8(src1_row + j, vl); - auto vec_src2 = __riscv_vle32_v_f32m8(src2_row + j, vl); - auto vec_src = __riscv_vfsub(vec_src1, vec_src2, vl); - auto vec_abs = __riscv_vfabs(vec_src, vl); - vec_max = __riscv_vfmax_tu(vec_max, vec_max, vec_abs, vl); - } - } - } - auto sc_max = __riscv_vfmv_s_f_f32m1(0, vlmax); - sc_max = __riscv_vfredmax(vec_max, sc_max, vlmax); - *result = __riscv_vfmv_f(sc_max); - - return CV_HAL_ERROR_OK; -} - -inline int normDiffL1_32FC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e32m4(); - auto vec_sum = __riscv_vfmv_v_f_f64m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const float* src1_row = reinterpret_cast<const float*>(src1 + i * src1_step); - const float* src2_row = reinterpret_cast<const float*>(src2 + i * src2_step); - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto vec_src1 = __riscv_vle32_v_f32m4(src1_row + j, vl); - auto vec_src2 = __riscv_vle32_v_f32m4(src2_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m1(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_src = __riscv_vfsub_vv_f32m4_m(bool_mask, vec_src1, vec_src2, vl); - auto vec_abs = __riscv_vfabs_v_f32m4_m(bool_mask, vec_src, vl); - auto vec_fext = __riscv_vfwcvt_f_f_v_f64m8_m(bool_mask, vec_abs, vl); - vec_sum = __riscv_vfadd_tumu(bool_mask, vec_sum, vec_sum, vec_fext, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const float* src1_row = reinterpret_cast<const float*>(src1 + i * src1_step); - const float* src2_row = reinterpret_cast<const float*>(src2 + i * src2_step); - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto vec_src1 = __riscv_vle32_v_f32m4(src1_row + j, vl); - auto vec_src2 = __riscv_vle32_v_f32m4(src2_row + j, vl); - auto vec_src = __riscv_vfsub(vec_src1, vec_src2, vl); - auto vec_abs = __riscv_vfabs(vec_src, vl); - auto vec_fext = __riscv_vfwcvt_f_f_v_f64m8(vec_abs, vl); - vec_sum = __riscv_vfadd_tu(vec_sum, vec_sum, vec_fext, vl); - } - } - } - auto sc_sum = __riscv_vfmv_s_f_f64m1(0, vlmax); - sc_sum = __riscv_vfredosum(vec_sum, sc_sum, vlmax); - *result = __riscv_vfmv_f(sc_sum); - - return CV_HAL_ERROR_OK; -} - -inline int normDiffL2Sqr_32FC1(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, double* result) -{ - int vlmax = __riscv_vsetvlmax_e32m4(); - auto vec_sum = __riscv_vfmv_v_f_f64m8(0, vlmax); - - if (mask) - { - for (int i = 0; i < height; i++) - { - const float* src1_row = reinterpret_cast<const float*>(src1 + i * src1_step); - const float* src2_row = reinterpret_cast<const float*>(src2 + i * src2_step); - const uchar* mask_row = mask + i * mask_step; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto vec_src1 = __riscv_vle32_v_f32m4(src1_row + j, vl); - auto vec_src2 = __riscv_vle32_v_f32m4(src2_row + j, vl); - auto vec_mask = __riscv_vle8_v_u8m1(mask_row + j, vl); - auto bool_mask = __riscv_vmsne(vec_mask, 0, vl); - auto vec_src = __riscv_vfsub_vv_f32m4_m(bool_mask, vec_src1, vec_src2, vl); - auto vec_mul = __riscv_vfwmul_vv_f64m8_m(bool_mask, vec_src, vec_src, vl); - vec_sum = __riscv_vfadd_tumu(bool_mask, vec_sum, vec_sum, vec_mul, vl); - } - } - } - else - { - for (int i = 0; i < height; i++) - { - const float* src1_row = reinterpret_cast<const float*>(src1 + i * 
-            const float* src2_row = reinterpret_cast<const float*>(src2 + i * src2_step);
-            int vl;
-            for (int j = 0; j < width; j += vl)
-            {
-                vl = __riscv_vsetvl_e32m4(width - j);
-                auto vec_src1 = __riscv_vle32_v_f32m4(src1_row + j, vl);
-                auto vec_src2 = __riscv_vle32_v_f32m4(src2_row + j, vl);
-                auto vec_src = __riscv_vfsub(vec_src1, vec_src2, vl);
-                auto vec_mul = __riscv_vfwmul(vec_src, vec_src, vl);
-                vec_sum = __riscv_vfadd_tu(vec_sum, vec_sum, vec_mul, vl);
-            }
-        }
-    }
-    auto sc_sum = __riscv_vfmv_s_f_f64m1(0, vlmax);
-    sc_sum = __riscv_vfredosum(vec_sum, sc_sum, vlmax);
-    *result = __riscv_vfmv_f(sc_sum);
-
-    return CV_HAL_ERROR_OK;
 }
 
+using NormDiffFunc = int (*)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
 
 inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, int width, int height, int type, int norm_type, double* result)
 {
-    if (!result)
-        return CV_HAL_ERROR_OK;
+    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
 
-    int ret;
-    switch (type)
-    {
-    case CV_8UC1:
-        switch (norm_type & ~NORM_RELATIVE)
-        {
-        case NORM_INF:
-            ret = normDiffInf_8UC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            break;
-        case NORM_L1:
-            ret = normDiffL1_8UC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            break;
-        case NORM_L2SQR:
-            ret = normDiffL2Sqr_8UC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            break;
-        case NORM_L2:
-            ret = normDiffL2Sqr_8UC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            *result = std::sqrt(*result);
-            break;
-        default:
-            ret = CV_HAL_ERROR_NOT_IMPLEMENTED;
-        }
-        break;
-    case CV_8UC4:
-        switch (norm_type & ~NORM_RELATIVE)
-        {
-        case NORM_INF:
-            ret = normDiffInf_8UC4(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            break;
-        case NORM_L1:
-            ret = normDiffL1_8UC4(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            break;
-        case NORM_L2SQR:
-            ret = normDiffL2Sqr_8UC4(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            break;
-        case NORM_L2:
-            ret = normDiffL2Sqr_8UC4(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            *result = std::sqrt(*result);
-            break;
-        default:
-            ret = CV_HAL_ERROR_NOT_IMPLEMENTED;
-        }
-        break;
-    case CV_32FC1:
-        switch (norm_type & ~NORM_RELATIVE)
-        {
-        case NORM_INF:
-            ret = normDiffInf_32FC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            break;
-        case NORM_L1:
-            ret = normDiffL1_32FC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            break;
-        case NORM_L2SQR:
-            ret = normDiffL2Sqr_32FC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            break;
-        case NORM_L2:
-            ret = normDiffL2Sqr_32FC1(src1, src1_step, src2, src2_step, mask, mask_step, width, height, result);
-            *result = std::sqrt(*result);
-            break;
-        default:
-            ret = CV_HAL_ERROR_NOT_IMPLEMENTED;
-        }
-        break;
-    default:
-        ret = CV_HAL_ERROR_NOT_IMPLEMENTED;
-    }
+    bool relative = norm_type & NORM_RELATIVE;
+    norm_type &= ~NORM_RELATIVE;
+
+    if (result == nullptr || depth == CV_16F || (norm_type > NORM_L2SQR && !relative)) {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
 
-    if(ret == CV_HAL_ERROR_OK && (norm_type & NORM_RELATIVE))
+    // [FIXME] append 0's when merging to 5.x
+    static NormDiffFunc norm_diff_tab[3][CV_DEPTH_MAX] = {
+        {
+            (NormDiffFunc)(normDiffInf_8u), (NormDiffFunc)(normDiffInf_8s),
+            (NormDiffFunc)(normDiffInf_16u), (NormDiffFunc)(normDiffInf_16s),
+            (NormDiffFunc)(normDiffInf_32s), (NormDiffFunc)(normDiffInf_32f),
+            (NormDiffFunc)(normDiffInf_64f), 0,
+        },
+        {
+            (NormDiffFunc)(normDiffL1_8u), (NormDiffFunc)(normDiffL1_8s),
+            (NormDiffFunc)(normDiffL1_16u), (NormDiffFunc)(normDiffL1_16s),
+            (NormDiffFunc)(normDiffL1_32s), (NormDiffFunc)(normDiffL1_32f),
+            (NormDiffFunc)(normDiffL1_64f), 0,
+        },
+        {
+            (NormDiffFunc)(normDiffL2_8u), (NormDiffFunc)(normDiffL2_8s),
+            (NormDiffFunc)(normDiffL2_16u), (NormDiffFunc)(normDiffL2_16s),
+            (NormDiffFunc)(normDiffL2_32s), (NormDiffFunc)(normDiffL2_32f),
+            (NormDiffFunc)(normDiffL2_64f), 0,
+        },
+    };
+
+    static const size_t elem_size_tab[CV_DEPTH_MAX] = {
+        sizeof(uchar), sizeof(schar),
+        sizeof(ushort), sizeof(short),
+        sizeof(int), sizeof(float),
+        sizeof(int64_t), 0,
+    };
+    CV_Assert(elem_size_tab[depth]);
+
+    bool src_continuous = (src1_step == width * elem_size_tab[depth] * cn || (src1_step != width * elem_size_tab[depth] * cn && height == 1));
+    src_continuous &= (src2_step == width * elem_size_tab[depth] * cn || (src2_step != width * elem_size_tab[depth] * cn && height == 1));
+    bool mask_continuous = (mask_step == width);
+    size_t nplanes = 1;
+    size_t size = width * height;
+    if ((mask && (!src_continuous || !mask_continuous)) || !src_continuous) {
+        nplanes = height;
+        size = width;
+    }
+
+    NormDiffFunc func = norm_diff_tab[norm_type >> 1][depth];
+    if (func == nullptr) {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // Handle overflow
+    union {
+        double d;
+        float f;
+        unsigned u;
+    } res;
+    res.d = 0;
+    if ((norm_type == NORM_L1 && depth <= CV_16S) ||
+        ((norm_type == NORM_L2 || norm_type == NORM_L2SQR) && depth <= CV_8S)) {
+        const size_t esz = elem_size_tab[depth] * cn;
+        const int total = (int)size;
+        const int intSumBlockSize = (norm_type == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+        const int blockSize = std::min(total, intSumBlockSize);
+        int isum = 0;
+        int count = 0;
+        auto _src1 = src1, _src2 = src2;
+        auto _mask = mask;
+        for (size_t i = 0; i < nplanes; i++) {
+            if ((mask && (!src_continuous || !mask_continuous)) || !src_continuous) {
+                _src1 = src1 + src1_step * i;
+                _src2 = src2 + src2_step * i;
+                _mask = mask + mask_step * i;
+            }
+            for (int j = 0; j < total; j += blockSize) {
+                int bsz = std::min(total - j, blockSize);
+                func(_src1, _src2, _mask, (uchar*)&isum, bsz, cn);
+                count += bsz;
+                if (count + blockSize >= intSumBlockSize || (i + 1 >= nplanes && j + bsz >= total)) {
+                    res.d += isum;
+                    isum = 0;
+                    count = 0;
+                }
+                _src1 += bsz * esz;
+                _src2 += bsz * esz;
+                if (mask) {
+                    _mask += bsz;
+                }
+            }
+        }
+    } else {
+        auto _src1 = src1, _src2 = src2;
+        auto _mask = mask;
+        for (size_t i = 0; i < nplanes; i++) {
+            if ((mask && (!src_continuous || !mask_continuous)) || !src_continuous) {
+                _src1 = src1 + src1_step * i;
+                _src2 = src2 + src2_step * i;
+                _mask = mask + mask_step * i;
+            }
+            func(_src1, _src2, _mask, (uchar*)&res, (int)size, cn);
+        }
+    }
+
+    if (norm_type == NORM_INF) {
+        if (depth == CV_64F) {
+            *result = res.d;
+        } else if (depth == CV_32F) {
+            *result = res.f;
+        } else {
+            *result = res.u;
+        }
+    } else if (norm_type == NORM_L2) {
+        *result = std::sqrt(res.d);
+    } else {
+        *result = res.d;
+    }
+
+    if(relative)
     {
         double result_;
-        ret = cv::cv_hal_rvv::norm::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type & ~NORM_RELATIVE, &result_);
+        int ret = cv::cv_hal_rvv::norm::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type, &result_);
         if(ret == CV_HAL_ERROR_OK) {
             *result /= result_ + DBL_EPSILON;
         }
     }
-    return ret;
+    return CV_HAL_ERROR_OK;
 }
 
 }}}
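The rewritten normDiff above accumulates the L1/L2 sums for small integer depths in an int and periodically flushes the partial sum into a double. The block sizes keep the int accumulator safe: for L1, 255 * 2^23 and 65535 * 2^15 both stay below 2^31, and for L2 on 8-bit data, 255^2 * 2^15 does as well. A standalone scalar sketch of the same idea (illustrative names, not part of the patch):

```cpp
// Sketch of the blockwise accumulation used by normDiff above, reduced to
// scalar C++. An int can hold at most 2^23 summands of |a - b| <= 255
// (255 * 2^23 < 2^31), so the integer partial sum is flushed into a
// double before the next block could overflow it.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

static double l1DiffBlockwise(const uint8_t* a, const uint8_t* b, size_t total)
{
    const size_t blockSize = std::min<size_t>(total, size_t(1) << 23);
    double sum = 0.0;
    for (size_t j = 0; j < total; j += blockSize) {
        const size_t bsz = std::min(blockSize, total - j);
        int isum = 0;                       // safe: 255 * 2^23 < INT_MAX
        for (size_t k = 0; k < bsz; k++)
            isum += std::abs(int(a[j + k]) - int(b[j + k]));
        sum += isum;                        // flush before the next block
    }
    return sum;
}

int main()
{
    std::vector<uint8_t> a(size_t(1) << 24, 255), b(a.size(), 0);
    // 2^24 elements of 255: the true sum (4278190080) no longer fits in int.
    std::printf("%.0f\n", l1DiffBlockwise(a.data(), b.data(), a.size()));
    return 0;
}
```

Without the per-block flush, the 2^24-element example would silently overflow a plain int accumulator.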
#include "precomp.hpp" @@ -55,8 +58,7 @@ struct NormDiffInf_SIMD { inline ST operator() (const T* src1, const T* src2, int n) const { ST s = 0; for (int i = 0; i < n; i++) { - ST v = ST(src1[i] - src2[i]); - s = std::max(s, (ST)cv_abs(v)); + s = std::max(s, (ST)std::abs(src1[i] - src2[i])); } return s; } @@ -67,8 +69,7 @@ struct NormDiffL1_SIMD { inline ST operator() (const T* src1, const T* src2, int n) const { ST s = 0; for (int i = 0; i < n; i++) { - ST v = ST(src1[i] - src2[i]); - s += cv_abs(v); + s += std::abs(src1[i] - src2[i]); } return s; } @@ -79,7 +80,7 @@ struct NormDiffL2_SIMD { inline ST operator() (const T* src1, const T* src2, int n) const { ST s = 0; for (int i = 0; i < n; i++) { - ST v = ST(src1[i] - src2[i]); + ST v = (ST)src1[i] - (ST)src2[i]; s += v * v; } return s; @@ -383,8 +384,7 @@ struct NormDiffInf_SIMD { } s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); for (; j < n; j++) { - int v = src1[j] - src2[j]; - s = std::max(s, (int)cv_abs(v)); + s = std::max(s, (int)std::abs(src1[j] - src2[j])); } return s; } @@ -415,8 +415,7 @@ struct NormDiffInf_SIMD { } s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); for (; j < n; j++) { - int v = src1[j] - src2[j]; - s = std::max(s, (int)cv_abs(v)); + s = std::max(s, (int)std::abs(src1[j] - src2[j])); } return s; } @@ -447,8 +446,7 @@ struct NormDiffInf_SIMD { } s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); for (; j < n; j++) { - int v = src1[j] - src2[j]; - s = std::max(s, (int)cv_abs(v)); + s = std::max(s, (int)std::abs(src1[j] - src2[j])); } return s; } @@ -479,8 +477,7 @@ struct NormDiffInf_SIMD { } s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); for (; j < n; j++) { - int v = src1[j] - src2[j]; - s = std::max(s, (int)cv_abs(v)); + s = std::max(s, (int)std::abs(src1[j] - src2[j])); } return s; } @@ -511,8 +508,7 @@ struct NormDiffInf_SIMD { } s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3)); for (; j < n; j++) { - int v = src1[j] - src2[j]; - s = std::max(s, (int)cv_abs(v)); + s = std::max(s, (int)std::abs(src1[j] - src2[j])); } return s; } @@ -534,8 +530,7 @@ struct NormDiffInf_SIMD { } s = v_reduce_max(v_max(r0, r1)); for (; j < n; j++) { - float v = src1[j] - src2[j]; - s = std::max(s, cv_abs(v)); + s = std::max(s, (float)std::abs(src1[j] - src2[j])); } return s; } @@ -558,8 +553,7 @@ struct NormDiffL1_SIMD { } s += v_reduce_sum(v_add(r0, r1)); for (; j < n; j++) { - int v = src1[j] - src2[j]; - s += (int)cv_abs(v); + s += std::abs(src1[j] - src2[j]); } return s; } @@ -582,8 +576,7 @@ struct NormDiffL1_SIMD { } s += v_reduce_sum(v_add(r0, r1)); for (; j < n; j++) { - int v =src1[j] - src2[j]; - s += (int)cv_abs(v); + s += std::abs(src1[j] - src2[j]); } return s; } @@ -622,8 +615,7 @@ struct NormDiffL1_SIMD { } s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3)); for (; j < n; j++) { - int v = src1[j] - src2[j]; - s += (int)cv_abs(v); + s += std::abs(src1[j] - src2[j]); } return s; } @@ -662,8 +654,7 @@ struct NormDiffL1_SIMD { } s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3)); for (; j < n; j++) { - int v = src1[j] - src2[j]; - s += (int)cv_abs(v); + s += std::abs(src1[j] - src2[j]); } return s; } @@ -687,7 +678,7 @@ struct NormDiffL2_SIMD { } s += v_reduce_sum(v_add(r0, r1)); for (; j < n; j++) { - int v = saturate_cast(src1[j] - src2[j]); + int v = (int)src1[j] - (int)src2[j]; s += v * v; } return s; @@ -712,7 +703,7 @@ struct NormDiffL2_SIMD { } s += v_reduce_sum(v_add(r0, r1)); for (; j < n; j++) { - int v = saturate_cast(src1[j] - src2[j]); 
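The cv_abs and saturate_cast removals in the hunks above lean on integer promotion: operands narrower than int are promoted before the subtraction, so the difference is computed exactly in int and std::abs can be applied to it directly. A short standard-C++ demonstration of that promotion:

```cpp
// Integral operands narrower than int are promoted to int before the
// subtraction, so uint8_t differences cannot wrap around.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <type_traits>

int main()
{
    uint8_t a = 0, b = 255;
    static_assert(std::is_same<decltype(a - b), int>::value,
                  "operands are promoted to int before the subtraction");
    std::printf("%d\n", std::abs(a - b)); // prints 255, not 1
    return 0;
}
```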
@@ -712,7 +703,7 @@ struct NormDiffL2_SIMD {
         }
         s += v_reduce_sum(v_add(r0, r1));
         for (; j < n; j++) {
-            int v = saturate_cast<int>(src1[j] - src2[j]);
+            int v = (int)src1[j] - (int)src2[j];
             s += v * v;
         }
         return s;
@@ -955,11 +946,10 @@ struct NormDiffInf_SIMD {
         double t[VTraits<v_float64>::max_nlanes];
         vx_store(t, v_max(r0, r1));
         for (int i = 0; i < VTraits<v_float64>::vlanes(); i++) {
-            s = std::max(s, cv_abs(t[i]));
+            s = std::max(s, std::abs(t[i]));
         }
         for (; j < n; j++) {
-            double v = src1[j] - src2[j];
-            s = std::max(s, cv_abs(v));
+            s = std::max(s, (double)std::abs(src1[j] - src2[j]));
         }
         return s;
     }
@@ -971,21 +961,17 @@ struct NormDiffL1_SIMD {
         int j = 0;
         double s = 0.f;
         v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
-        v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
-        for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
+        for (; j <= n - VTraits<v_int32>::vlanes(); j += VTraits<v_int32>::vlanes()) {
             v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
-            v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02)));
-            r0 = v_add(r0, v_cvt_f64(v0)); r1 = v_add(r1, v_cvt_f64_high(v0));
-
-            v_int32 v11 = vx_load(src1 + j + VTraits<v_int32>::vlanes()),
-                    v12 = vx_load(src2 + j + VTraits<v_int32>::vlanes());
-            v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12)));
-            r2 = v_add(r2, v_cvt_f64(v1)); r3 = v_add(r3, v_cvt_f64_high(v1));
+            v_uint32 v0 = v_abs(v_sub(v01, v02));
+            v_uint64 ev0, ev1;
+            v_expand(v0, ev0, ev1);
+            r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(ev0)));
+            r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(ev1)));
         }
-        s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
+        s += v_reduce_sum(v_add(r0, r1));
         for (; j < n; j++) {
-            double v = src1[j] - src2[j];
-            s += cv_abs(v);
+            s += std::abs(src1[j] - src2[j]);
         }
         return s;
     }
@@ -1010,8 +996,7 @@ struct NormDiffL1_SIMD {
         }
         s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
         for (; j < n; j++) {
-            double v = src1[j] - src2[j];
-            s += cv_abs(v);
+            s += std::abs(src1[j] - src2[j]);
        }
         return s;
     }
@@ -1033,8 +1018,7 @@ struct NormDiffL1_SIMD {
         }
         s += v_reduce_sum(v_add(r0, r1));
         for (; j < n; j++) {
-            double v = src1[j] - src2[j];
-            s += cv_abs(v);
+            s += std::abs(src1[j] - src2[j]);
         }
         return s;
     }
@@ -1060,7 +1044,7 @@ struct NormDiffL2_SIMD {
         }
         s += v_reduce_sum(v_add(r0, r1));
         for (; j < n; j++) {
-            double v = saturate_cast<double>(src1[j] - src2[j]);
+            double v = (double)src1[j] - (double)src2[j];
             s += v * v;
         }
         return s;
@@ -1087,7 +1071,7 @@ struct NormDiffL2_SIMD {
         }
         s += v_reduce_sum(v_add(r0, r1));
         for (; j < n; j++) {
-            double v = saturate_cast<double>(src1[j] - src2[j]);
+            double v = (double)src1[j] - (double)src2[j];
             s += v * v;
         }
         return s;
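The reworked int32 L1 kernel above, and the L2 kernel that follows, stop routing |a - b| through float32, whose 24-bit significand rounds differences above 2^24; instead the difference stays exact in u32, is expanded to u64 lanes, and only then converted to f64, which represents every u32 value exactly. A scalar illustration of the precision gap (standalone, not from the patch):

```cpp
// Why the kernels here widen through 64-bit integers instead of float:
// a float's 24-bit significand rounds large int32 differences, while the
// u64 route keeps them exact before the f64 accumulation.
#include <cstdint>
#include <cstdio>

int main()
{
    int32_t a = 100000001, b = -100000000;
    uint32_t d = uint32_t(int64_t(a) - int64_t(b)); // |a - b| = 200000001, exact in u32
    std::printf("exact   : %u\n", d);
    std::printf("via f32 : %.0f\n", double(float(d)));    // 200000000, rounded
    std::printf("via u64 : %.0f\n", double(uint64_t(d))); // 200000001, exact
    return 0;
}
```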
@@ -1100,24 +1084,17 @@ struct NormDiffL2_SIMD {
         int j = 0;
         double s = 0.f;
         v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
-        v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
-        for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
+        for (; j <= n - VTraits<v_int32>::vlanes(); j += VTraits<v_int32>::vlanes()) {
             v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
-            v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02)));
-            v_float64 f00, f01;
-            f00 = v_cvt_f64(v0); f01 = v_cvt_f64_high(v0);
-            r0 = v_fma(f00, f00, r0); r1 = v_fma(f01, f01, r1);
-
-            v_int32 v11 = vx_load(src1 + j + VTraits<v_int32>::vlanes()),
-                    v12 = vx_load(src2 + j + VTraits<v_int32>::vlanes());
-            v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12)));
-            v_float64 f10, f11;
-            f10 = v_cvt_f64(v1); f11 = v_cvt_f64_high(v1);
-            r2 = v_fma(f10, f10, r2); r3 = v_fma(f11, f11, r3);
+            v_uint32 v0 = v_absdiff(v01, v02);
+            v_uint64 ev0, ev1;
+            v_expand(v0, ev0, ev1);
+            v_float64 f0 = v_cvt_f64(v_reinterpret_as_s64(ev0)), f1 = v_cvt_f64(v_reinterpret_as_s64(ev1));
+            r0 = v_fma(f0, f0, r0); r1 = v_fma(f1, f1, r1);
         }
-        s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
+        s += v_reduce_sum(v_add(r0, r1));
         for (; j < n; j++) {
-            double v = src1[j] - src2[j];
+            double v = (double)src1[j] - (double)src2[j];
             s += v * v;
         }
         return s;
@@ -1145,7 +1122,7 @@ struct NormDiffL2_SIMD {
         }
         s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
         for (; j < n; j++) {
-            double v = src1[j] - src2[j];
+            double v = (double)src1[j] - (double)src2[j];
             s += v * v;
         }
         return s;
@@ -1181,7 +1158,7 @@ struct NormDiffL2_SIMD {
         }
         s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
         for (; j < n; j++) {
-            double v = src1[j] - src2[j];
+            double v = (double)src1[j] - (double)src2[j];
             s += v * v;
         }
         return s;
@@ -1297,7 +1274,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
     for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) {
         if( mask[i] ) {
             for( int k = 0; k < cn; k++ ) {
-                ST v = src1[k] - src2[k];
+                ST v = (ST)src1[k] - (ST)src2[k];
                 result += v*v;
             }
         }
@@ -1331,7 +1308,6 @@ NormFunc getNormFunc(int normType, int depth)
 {
     CV_INSTRUMENT_REGION();
 
-    // [FIXME] append 0's when merging to 5.x
     static NormFunc normTab[3][CV_DEPTH_MAX] =
     {
         {
@@ -1353,7 +1329,7 @@ NormFunc getNormFunc(int normType, int depth)
 NormDiffFunc getNormDiffFunc(int normType, int depth)
 {
-    static NormDiffFunc normDiffTab[3][8] =
+    static NormDiffFunc normDiffTab[3][CV_DEPTH_MAX] =
     {
         {
             (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,