mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
Merge pull request #27201 from fengyuentau:4x/hal_rvv/dotprod
HAL: implemented cv_hal_dotProduct in hal_rvv #27201 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
b5d38ea4cb
commit
11e46cda86
1
3rdparty/hal_rvv/hal_rvv.hpp
vendored
1
3rdparty/hal_rvv/hal_rvv.hpp
vendored
@ -47,6 +47,7 @@
|
||||
#include "hal_rvv_1p0/sqrt.hpp" // core
|
||||
#include "hal_rvv_1p0/copy_mask.hpp" // core
|
||||
#include "hal_rvv_1p0/div.hpp" // core
|
||||
#include "hal_rvv_1p0/dotprod.hpp" // core
|
||||
|
||||
#include "hal_rvv_1p0/moments.hpp" // imgproc
|
||||
#include "hal_rvv_1p0/filter.hpp" // imgproc
|
||||
|
234
3rdparty/hal_rvv/hal_rvv_1p0/dotprod.hpp
vendored
Normal file
234
3rdparty/hal_rvv/hal_rvv_1p0/dotprod.hpp
vendored
Normal file
@ -0,0 +1,234 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
//
|
||||
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
|
||||
|
||||
#ifndef OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
#include <algorithm>
|
||||
|
||||
namespace cv { namespace cv_hal_rvv { namespace dotprod {
|
||||
|
||||
#undef cv_hal_dotProduct
|
||||
#define cv_hal_dotProduct cv::cv_hal_rvv::dotprod::dotprod
|
||||
|
||||
namespace {
|
||||
|
||||
double dotProd_8u(const uchar *a, const uchar *b, int len) {
|
||||
constexpr int block_size0 = (1 << 15);
|
||||
|
||||
double r = 0;
|
||||
int i = 0;
|
||||
while (i < len) {
|
||||
int block_size = std::min(block_size0, len - i);
|
||||
|
||||
vuint32m1_t s = __riscv_vmv_v_x_u32m1(0, __riscv_vsetvlmax_e32m1());
|
||||
int vl;
|
||||
for (int j = 0; j < block_size; j += vl) {
|
||||
vl = __riscv_vsetvl_e8m4(block_size - j);
|
||||
|
||||
auto va = __riscv_vle8_v_u8m4(a + j, vl);
|
||||
auto vb = __riscv_vle8_v_u8m4(b + j, vl);
|
||||
|
||||
s = __riscv_vwredsumu(__riscv_vwmulu(va, vb, vl), s, vl);
|
||||
}
|
||||
r += (double)__riscv_vmv_x(s);
|
||||
|
||||
i += block_size;
|
||||
a += block_size;
|
||||
b += block_size;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
double dotProd_8s(const schar *a, const schar *b, int len) {
|
||||
constexpr int block_size0 = (1 << 14);
|
||||
|
||||
double r = 0;
|
||||
int i = 0;
|
||||
while (i < len) {
|
||||
int block_size = std::min(block_size0, len - i);
|
||||
|
||||
vint32m1_t s = __riscv_vmv_v_x_i32m1(0, __riscv_vsetvlmax_e32m1());
|
||||
int vl;
|
||||
for (int j = 0; j < block_size; j += vl) {
|
||||
vl = __riscv_vsetvl_e8m4(block_size - j);
|
||||
|
||||
auto va = __riscv_vle8_v_i8m4(a + j, vl);
|
||||
auto vb = __riscv_vle8_v_i8m4(b + j, vl);
|
||||
|
||||
s = __riscv_vwredsum(__riscv_vwmul(va, vb, vl), s, vl);
|
||||
}
|
||||
r += (double)__riscv_vmv_x(s);
|
||||
|
||||
i += block_size;
|
||||
a += block_size;
|
||||
b += block_size;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
double dotProd_16u(const ushort *a, const ushort *b, int len) {
|
||||
constexpr int block_size0 = (1 << 24);
|
||||
|
||||
double r = 0;
|
||||
int i = 0;
|
||||
while (i < len) {
|
||||
int block_size = std::min(block_size0, len - i);
|
||||
|
||||
vuint64m1_t s = __riscv_vmv_v_x_u64m1(0, __riscv_vsetvlmax_e64m1());
|
||||
int vl;
|
||||
for (int j = 0; j < block_size; j += vl) {
|
||||
vl = __riscv_vsetvl_e16m4(block_size - j);
|
||||
|
||||
auto va = __riscv_vle16_v_u16m4(a + j, vl);
|
||||
auto vb = __riscv_vle16_v_u16m4(b + j, vl);
|
||||
|
||||
s = __riscv_vwredsumu(__riscv_vwmulu(va, vb, vl), s, vl);
|
||||
}
|
||||
r += (double)__riscv_vmv_x(s);
|
||||
|
||||
i += block_size;
|
||||
a += block_size;
|
||||
b += block_size;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
double dotProd_16s(const short *a, const short *b, int len) {
|
||||
constexpr int block_size0 = (1 << 24);
|
||||
|
||||
double r = 0;
|
||||
int i = 0;
|
||||
while (i < len) {
|
||||
int block_size = std::min(block_size0, len - i);
|
||||
|
||||
vint64m1_t s = __riscv_vmv_v_x_i64m1(0, __riscv_vsetvlmax_e64m1());
|
||||
int vl;
|
||||
for (int j = 0; j < block_size; j += vl) {
|
||||
vl = __riscv_vsetvl_e16m4(block_size - j);
|
||||
|
||||
auto va = __riscv_vle16_v_i16m4(a + j, vl);
|
||||
auto vb = __riscv_vle16_v_i16m4(b + j, vl);
|
||||
|
||||
s = __riscv_vwredsum(__riscv_vwmul(va, vb, vl), s, vl);
|
||||
}
|
||||
r += (double)__riscv_vmv_x(s);
|
||||
|
||||
i += block_size;
|
||||
a += block_size;
|
||||
b += block_size;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
double dotProd_32s(const int *a, const int *b, int len) {
|
||||
double r = 0;
|
||||
|
||||
vfloat64m8_t s = __riscv_vfmv_v_f_f64m8(0.f, __riscv_vsetvlmax_e64m8());
|
||||
int vl;
|
||||
for (int j = 0; j < len; j += vl) {
|
||||
vl = __riscv_vsetvl_e32m4(len - j);
|
||||
|
||||
auto va = __riscv_vle32_v_i32m4(a + j, vl);
|
||||
auto vb = __riscv_vle32_v_i32m4(b + j, vl);
|
||||
|
||||
s = __riscv_vfadd(s, __riscv_vfcvt_f(__riscv_vwmul(va, vb, vl), vl), vl);
|
||||
}
|
||||
r = __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_v_f_f64m1(0.f, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m8()));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
double dotProd_32f(const float *a, const float *b, int len) {
|
||||
constexpr int block_size0 = (1 << 11);
|
||||
|
||||
double r = 0.f;
|
||||
int i = 0;
|
||||
while (i < len) {
|
||||
int block_size = std::min(block_size0, len - i);
|
||||
|
||||
vfloat32m4_t s = __riscv_vfmv_v_f_f32m4(0.f, __riscv_vsetvlmax_e32m4());
|
||||
int vl;
|
||||
for (int j = 0; j < block_size; j += vl) {
|
||||
vl = __riscv_vsetvl_e32m4(block_size - j);
|
||||
|
||||
auto va = __riscv_vle32_v_f32m4(a + j, vl);
|
||||
auto vb = __riscv_vle32_v_f32m4(b + j, vl);
|
||||
|
||||
s = __riscv_vfmacc(s, va, vb, vl);
|
||||
}
|
||||
r += (double)__riscv_vfmv_f(__riscv_vfredusum(s, __riscv_vfmv_v_f_f32m1(0.f, __riscv_vsetvlmax_e32m1()), __riscv_vsetvlmax_e32m4()));
|
||||
|
||||
i += block_size;
|
||||
a += block_size;
|
||||
b += block_size;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
} // anonymous
|
||||
|
||||
using DotProdFunc = double (*)(const uchar *a, const uchar *b, int len);
|
||||
inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step,
|
||||
int width, int height, int type, double *dot_val) {
|
||||
int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
||||
|
||||
static DotProdFunc dotprod_tab[CV_DEPTH_MAX] = {
|
||||
(DotProdFunc)dotProd_8u, (DotProdFunc)dotProd_8s,
|
||||
(DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
|
||||
(DotProdFunc)dotProd_32s, (DotProdFunc)dotProd_32f,
|
||||
nullptr, nullptr
|
||||
};
|
||||
DotProdFunc func = dotprod_tab[depth];
|
||||
if (func == nullptr) {
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static const size_t elem_size_tab[CV_DEPTH_MAX] = {
|
||||
sizeof(uchar), sizeof(schar),
|
||||
sizeof(ushort), sizeof(short),
|
||||
sizeof(int), sizeof(float),
|
||||
sizeof(int64_t), 0,
|
||||
};
|
||||
CV_Assert(elem_size_tab[depth]);
|
||||
|
||||
bool a_continuous = (a_step == width * elem_size_tab[depth] * cn);
|
||||
bool b_continuous = (b_step == width * elem_size_tab[depth] * cn);
|
||||
size_t nplanes = 1;
|
||||
size_t len = width * height;
|
||||
if (!a_continuous || !b_continuous) {
|
||||
nplanes = height;
|
||||
len = width;
|
||||
}
|
||||
len *= cn;
|
||||
|
||||
double r = 0;
|
||||
auto _a = a_data;
|
||||
auto _b = b_data;
|
||||
for (size_t i = 0; i < nplanes; i++) {
|
||||
if (!a_continuous || !b_continuous) {
|
||||
_a = a_data + a_step * i;
|
||||
_b = b_data + b_step * i;
|
||||
}
|
||||
r += func(_a, _b, len);
|
||||
}
|
||||
*dot_val = r;
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}}} // cv::cv_hal_rvv::dotprod
|
||||
|
||||
#endif // OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED
|
||||
|
Loading…
Reference in New Issue
Block a user