Merge pull request #27201 from fengyuentau:4x/hal_rvv/dotprod

HAL: implemented cv_hal_dotProduct in hal_rvv #27201

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Yuantao Feng 2025-04-21 14:05:52 +08:00 committed by GitHub
parent b5d38ea4cb
commit 11e46cda86
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 235 additions and 0 deletions

View File

@ -47,6 +47,7 @@
#include "hal_rvv_1p0/sqrt.hpp" // core #include "hal_rvv_1p0/sqrt.hpp" // core
#include "hal_rvv_1p0/copy_mask.hpp" // core #include "hal_rvv_1p0/copy_mask.hpp" // core
#include "hal_rvv_1p0/div.hpp" // core #include "hal_rvv_1p0/div.hpp" // core
#include "hal_rvv_1p0/dotprod.hpp" // core
#include "hal_rvv_1p0/moments.hpp" // imgproc #include "hal_rvv_1p0/moments.hpp" // imgproc
#include "hal_rvv_1p0/filter.hpp" // imgproc #include "hal_rvv_1p0/filter.hpp" // imgproc

234
3rdparty/hal_rvv/hal_rvv_1p0/dotprod.hpp vendored Normal file
View File

@ -0,0 +1,234 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED
#define OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED
#include <riscv_vector.h>
#include <algorithm>
namespace cv { namespace cv_hal_rvv { namespace dotprod {
#undef cv_hal_dotProduct
#define cv_hal_dotProduct cv::cv_hal_rvv::dotprod::dotprod
namespace {
double dotProd_8u(const uchar *a, const uchar *b, int len) {
constexpr int block_size0 = (1 << 15);
double r = 0;
int i = 0;
while (i < len) {
int block_size = std::min(block_size0, len - i);
vuint32m1_t s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1());
int vl;
for (int j = 0; j < block_size; j += vl) {
vl = __riscv_vsetvl_e8m4(block_size - j);
auto va = __riscv_vle8_v_u8m4(a + j, vl);
auto vb = __riscv_vle8_v_u8m4(b + j, vl);
s = __riscv_vwredsumu(__riscv_vwmulu(va, vb, vl), s, vl);
}
r += (double)__riscv_vmv_x(s);
i += block_size;
a += block_size;
b += block_size;
}
return r;
}
double dotProd_8s(const schar *a, const schar *b, int len) {
constexpr int block_size0 = (1 << 14);
double r = 0;
int i = 0;
while (i < len) {
int block_size = std::min(block_size0, len - i);
vint32m1_t s = __riscv_vmv_v_x_i32m1(0, __riscv_vsetvlmax_e32m1());
int vl;
for (int j = 0; j < block_size; j += vl) {
vl = __riscv_vsetvl_e8m4(block_size - j);
auto va = __riscv_vle8_v_i8m4(a + j, vl);
auto vb = __riscv_vle8_v_i8m4(b + j, vl);
s = __riscv_vwredsum(__riscv_vwmul(va, vb, vl), s, vl);
}
r += (double)__riscv_vmv_x(s);
i += block_size;
a += block_size;
b += block_size;
}
return r;
}
double dotProd_16u(const ushort *a, const ushort *b, int len) {
constexpr int block_size0 = (1 << 24);
double r = 0;
int i = 0;
while (i < len) {
int block_size = std::min(block_size0, len - i);
vuint64m1_t s = __riscv_vmv_v_x_u64m1(0, __riscv_vsetvlmax_e64m1());
int vl;
for (int j = 0; j < block_size; j += vl) {
vl = __riscv_vsetvl_e16m4(block_size - j);
auto va = __riscv_vle16_v_u16m4(a + j, vl);
auto vb = __riscv_vle16_v_u16m4(b + j, vl);
s = __riscv_vwredsumu(__riscv_vwmulu(va, vb, vl), s, vl);
}
r += (double)__riscv_vmv_x(s);
i += block_size;
a += block_size;
b += block_size;
}
return r;
}
double dotProd_16s(const short *a, const short *b, int len) {
constexpr int block_size0 = (1 << 24);
double r = 0;
int i = 0;
while (i < len) {
int block_size = std::min(block_size0, len - i);
vint64m1_t s = __riscv_vmv_v_x_i64m1(0, __riscv_vsetvlmax_e64m1());
int vl;
for (int j = 0; j < block_size; j += vl) {
vl = __riscv_vsetvl_e16m4(block_size - j);
auto va = __riscv_vle16_v_i16m4(a + j, vl);
auto vb = __riscv_vle16_v_i16m4(b + j, vl);
s = __riscv_vwredsum(__riscv_vwmul(va, vb, vl), s, vl);
}
r += (double)__riscv_vmv_x(s);
i += block_size;
a += block_size;
b += block_size;
}
return r;
}
double dotProd_32s(const int *a, const int *b, int len) {
double r = 0;
vfloat64m8_t s = __riscv_vfmv_v_f_f64m8(0.f, __riscv_vsetvlmax_e64m8());
int vl;
for (int j = 0; j < len; j += vl) {
vl = __riscv_vsetvl_e32m4(len - j);
auto va = __riscv_vle32_v_i32m4(a + j, vl);
auto vb = __riscv_vle32_v_i32m4(b + j, vl);
s = __riscv_vfadd(s, __riscv_vfcvt_f(__riscv_vwmul(va, vb, vl), vl), vl);
}
r = __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_v_f_f64m1(0.f, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m8()));
return r;
}
double dotProd_32f(const float *a, const float *b, int len) {
constexpr int block_size0 = (1 << 11);
double r = 0.f;
int i = 0;
while (i < len) {
int block_size = std::min(block_size0, len - i);
vfloat32m4_t s = __riscv_vfmv_v_f_f32m4(0.f, __riscv_vsetvlmax_e32m4());
int vl;
for (int j = 0; j < block_size; j += vl) {
vl = __riscv_vsetvl_e32m4(block_size - j);
auto va = __riscv_vle32_v_f32m4(a + j, vl);
auto vb = __riscv_vle32_v_f32m4(b + j, vl);
s = __riscv_vfmacc(s, va, vb, vl);
}
r += (double)__riscv_vfmv_f(__riscv_vfredusum(s, __riscv_vfmv_v_f_f32m1(0.f, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m4()));
i += block_size;
a += block_size;
b += block_size;
}
return r;
}
} // anonymous
using DotProdFunc = double (*)(const uchar *a, const uchar *b, int len);
inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step,
int width, int height, int type, double *dot_val) {
int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
static DotProdFunc dotprod_tab[CV_DEPTH_MAX] = {
(DotProdFunc)dotProd_8u, (DotProdFunc)dotProd_8s,
(DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
(DotProdFunc)dotProd_32s, (DotProdFunc)dotProd_32f,
nullptr, nullptr
};
DotProdFunc func = dotprod_tab[depth];
if (func == nullptr) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
static const size_t elem_size_tab[CV_DEPTH_MAX] = {
sizeof(uchar), sizeof(schar),
sizeof(ushort), sizeof(short),
sizeof(int), sizeof(float),
sizeof(int64_t), 0,
};
CV_Assert(elem_size_tab[depth]);
bool a_continuous = (a_step == width * elem_size_tab[depth] * cn);
bool b_continuous = (b_step == width * elem_size_tab[depth] * cn);
size_t nplanes = 1;
size_t len = width * height;
if (!a_continuous || !b_continuous) {
nplanes = height;
len = width;
}
len *= cn;
double r = 0;
auto _a = a_data;
auto _b = b_data;
for (size_t i = 0; i < nplanes; i++) {
if (!a_continuous || !b_continuous) {
_a = a_data + a_step * i;
_b = b_data + b_step * i;
}
r += func(_a, _b, len);
}
*dot_val = r;
return CV_HAL_ERROR_OK;
}
}}} // cv::cv_hal_rvv::dotprod
#endif // OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED