Merge pull request #27115 from fengyuentau:4x/hal_rvv/normDiff

core: refactored normDiff in hal_rvv and extended with support of more data types #27115 

Merge with https://github.com/opencv/opencv_extra/pull/1246.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Yuantao Feng 2025-03-25 12:59:59 +08:00 committed by GitHub
parent 7d87f3cda6
commit a2a2f37ebb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 1235 additions and 625 deletions

View File

@ -5,6 +5,7 @@
#ifndef OPENCV_HAL_RVV_HPP_INCLUDED
#define OPENCV_HAL_RVV_HPP_INCLUDED
#include "opencv2/core/base.hpp"
#include "opencv2/core/hal/interface.h"
#include "opencv2/imgproc/hal/interface.h"

View File

@ -1,6 +1,9 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
#define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
@ -9,6 +12,8 @@
namespace cv { namespace cv_hal_rvv { namespace custom_intrin {
#define CV_HAL_RVV_NOOP(a) (a)
#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \
inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \
_Tpvs mask = __riscv_vsra(v, shift, vl); \
@ -25,6 +30,23 @@ CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8)
#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \
inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \
return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \
}
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin)
}}} // cv::cv_hal_rvv::custom_intrin
#endif

View File

@ -166,7 +166,7 @@ struct NormL1_RVV<uchar, int> {
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e8m8(n - i);
auto v = __riscv_vle8_v_u8m8(src + i, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
}
return __riscv_vmv_x(s);
}
@ -181,7 +181,7 @@ struct NormL1_RVV<schar, int> {
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e8m8(n - i);
auto v = custom_intrin::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
}
return __riscv_vmv_x(s);
}
@ -1008,6 +1008,7 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas
sizeof(int), sizeof(float),
sizeof(int64_t), 0,
};
CV_Assert(elem_size_tab[depth]);
bool src_continuous = (src_step == width * elem_size_tab[depth] * cn || (src_step != width * elem_size_tab[depth] * cn && height == 1));
bool mask_continuous = (mask_step == static_cast<size_t>(width));

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,9 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#include "precomp.hpp"
@ -55,8 +58,7 @@ struct NormDiffInf_SIMD {
inline ST operator() (const T* src1, const T* src2, int n) const {
ST s = 0;
for (int i = 0; i < n; i++) {
ST v = ST(src1[i] - src2[i]);
s = std::max(s, (ST)cv_abs(v));
s = std::max(s, (ST)std::abs(src1[i] - src2[i]));
}
return s;
}
@ -67,8 +69,7 @@ struct NormDiffL1_SIMD {
inline ST operator() (const T* src1, const T* src2, int n) const {
ST s = 0;
for (int i = 0; i < n; i++) {
ST v = ST(src1[i] - src2[i]);
s += cv_abs(v);
s += std::abs(src1[i] - src2[i]);
}
return s;
}
@ -79,7 +80,7 @@ struct NormDiffL2_SIMD {
inline ST operator() (const T* src1, const T* src2, int n) const {
ST s = 0;
for (int i = 0; i < n; i++) {
ST v = ST(src1[i] - src2[i]);
ST v = (ST)src1[i] - (ST)src2[i];
s += v * v;
}
return s;
@ -383,8 +384,7 @@ struct NormDiffInf_SIMD<uchar, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -415,8 +415,7 @@ struct NormDiffInf_SIMD<schar, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -447,8 +446,7 @@ struct NormDiffInf_SIMD<ushort, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -479,8 +477,7 @@ struct NormDiffInf_SIMD<short, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -511,8 +508,7 @@ struct NormDiffInf_SIMD<int, int> {
}
s = (int)v_reduce_max(v_max(v_max(v_max(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s = std::max(s, (int)cv_abs(v));
s = std::max(s, (int)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -534,8 +530,7 @@ struct NormDiffInf_SIMD<float, float> {
}
s = v_reduce_max(v_max(r0, r1));
for (; j < n; j++) {
float v = src1[j] - src2[j];
s = std::max(s, cv_abs(v));
s = std::max(s, (float)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -558,8 +553,7 @@ struct NormDiffL1_SIMD<uchar, int> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s += (int)cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -582,8 +576,7 @@ struct NormDiffL1_SIMD<schar, int> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
int v =src1[j] - src2[j];
s += (int)cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -622,8 +615,7 @@ struct NormDiffL1_SIMD<ushort, int> {
}
s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s += (int)cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -662,8 +654,7 @@ struct NormDiffL1_SIMD<short, int> {
}
s += (int)v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
int v = src1[j] - src2[j];
s += (int)cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -687,7 +678,7 @@ struct NormDiffL2_SIMD<uchar, int> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
int v = saturate_cast<int>(src1[j] - src2[j]);
int v = (int)src1[j] - (int)src2[j];
s += v * v;
}
return s;
@ -712,7 +703,7 @@ struct NormDiffL2_SIMD<schar, int> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
int v = saturate_cast<int>(src1[j] - src2[j]);
int v = (int)src1[j] - (int)src2[j];
s += v * v;
}
return s;
@ -955,11 +946,10 @@ struct NormDiffInf_SIMD<double, double> {
double t[VTraits<v_float64>::max_nlanes];
vx_store(t, v_max(r0, r1));
for (int i = 0; i < VTraits<v_float64>::vlanes(); i++) {
s = std::max(s, cv_abs(t[i]));
s = std::max(s, std::abs(t[i]));
}
for (; j < n; j++) {
double v = src1[j] - src2[j];
s = std::max(s, cv_abs(v));
s = std::max(s, (double)std::abs(src1[j] - src2[j]));
}
return s;
}
@ -971,21 +961,17 @@ struct NormDiffL1_SIMD<int, double> {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
for (; j <= n - VTraits<v_int32>::vlanes(); j += VTraits<v_int32>::vlanes()) {
v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02)));
r0 = v_add(r0, v_cvt_f64(v0)); r1 = v_add(r1, v_cvt_f64_high(v0));
v_int32 v11 = vx_load(src1 + j + VTraits<v_int32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int32>::vlanes());
v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12)));
r2 = v_add(r2, v_cvt_f64(v1)); r3 = v_add(r3, v_cvt_f64_high(v1));
v_uint32 v0 = v_abs(v_sub(v01, v02));
v_uint64 ev0, ev1;
v_expand(v0, ev0, ev1);
r0 = v_add(r0, v_cvt_f64(v_reinterpret_as_s64(ev0)));
r1 = v_add(r1, v_cvt_f64(v_reinterpret_as_s64(ev1)));
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -1010,8 +996,7 @@ struct NormDiffL1_SIMD<float, double> {
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -1033,8 +1018,7 @@ struct NormDiffL1_SIMD<double, double> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = src1[j] - src2[j];
s += cv_abs(v);
s += std::abs(src1[j] - src2[j]);
}
return s;
}
@ -1060,7 +1044,7 @@ struct NormDiffL2_SIMD<ushort, double> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = saturate_cast<double>(src1[j] - src2[j]);
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1087,7 +1071,7 @@ struct NormDiffL2_SIMD<short, double> {
}
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = saturate_cast<double>(src1[j] - src2[j]);
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1100,24 +1084,17 @@ struct NormDiffL2_SIMD<int, double> {
int j = 0;
double s = 0.f;
v_float64 r0 = vx_setzero_f64(), r1 = vx_setzero_f64();
v_float64 r2 = vx_setzero_f64(), r3 = vx_setzero_f64();
for (; j <= n - 2 * VTraits<v_int32>::vlanes(); j += 2 * VTraits<v_int32>::vlanes()) {
for (; j <= n - VTraits<v_int32>::vlanes(); j += VTraits<v_int32>::vlanes()) {
v_int32 v01 = vx_load(src1 + j), v02 = vx_load(src2 + j);
v_float32 v0 = v_abs(v_cvt_f32(v_sub(v01, v02)));
v_float64 f00, f01;
f00 = v_cvt_f64(v0); f01 = v_cvt_f64_high(v0);
r0 = v_fma(f00, f00, r0); r1 = v_fma(f01, f01, r1);
v_int32 v11 = vx_load(src1 + j + VTraits<v_int32>::vlanes()),
v12 = vx_load(src2 + j + VTraits<v_int32>::vlanes());
v_float32 v1 = v_abs(v_cvt_f32(v_sub(v11, v12)));
v_float64 f10, f11;
f10 = v_cvt_f64(v1); f11 = v_cvt_f64_high(v1);
r2 = v_fma(f10, f10, r2); r3 = v_fma(f11, f11, r3);
v_uint32 v0 = v_absdiff(v01, v02);
v_uint64 ev0, ev1;
v_expand(v0, ev0, ev1);
v_float64 f0 = v_cvt_f64(v_reinterpret_as_s64(ev0)), f1 = v_cvt_f64(v_reinterpret_as_s64(ev1));
r0 = v_fma(f0, f0, r0); r1 = v_fma(f1, f1, r1);
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
s += v_reduce_sum(v_add(r0, r1));
for (; j < n; j++) {
double v = src1[j] - src2[j];
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1145,7 +1122,7 @@ struct NormDiffL2_SIMD<float, double> {
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
double v = src1[j] - src2[j];
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1181,7 +1158,7 @@ struct NormDiffL2_SIMD<double, double> {
}
s += v_reduce_sum(v_add(v_add(v_add(r0, r1), r2), r3));
for (; j < n; j++) {
double v = src1[j] - src2[j];
double v = (double)src1[j] - (double)src2[j];
s += v * v;
}
return s;
@ -1297,7 +1274,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) {
if( mask[i] ) {
for( int k = 0; k < cn; k++ ) {
ST v = src1[k] - src2[k];
ST v = (ST)src1[k] - (ST)src2[k];
result += v*v;
}
}
@ -1331,7 +1308,6 @@ NormFunc getNormFunc(int normType, int depth)
{
CV_INSTRUMENT_REGION();
// [FIXME] append 0's when merging to 5.x
static NormFunc normTab[3][CV_DEPTH_MAX] =
{
{
@ -1353,7 +1329,7 @@ NormFunc getNormFunc(int normType, int depth)
NormDiffFunc getNormDiffFunc(int normType, int depth)
{
static NormDiffFunc normDiffTab[3][8] =
static NormDiffFunc normDiffTab[3][CV_DEPTH_MAX] =
{
{
(NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,