Merge pull request #26892 from amane-ame:solve_hal_rvv

Add RISC-V HAL implementation for cv::solve #26892

This patch implements the `cv_hal_LU`/`cv_hal_Cholesky`/`cv_hal_SVD`/`cv_hal_QR` functions in RVV_HAL using native intrinsics, optimizing the performance of `cv::solve` for the methods `DECOMP_LU`/`DECOMP_SVD`/`DECOMP_CHOLESKY`/`DECOMP_QR` and data types `32FC1`/`64FC1`.

Tested on MUSE-PI (Spacemit X60) for both gcc 14.2 and clang 20.0.

```
$ ./opencv_test_core --gtest_filter="*Solve*:*SVD*:*Cholesky*"
$ ./opencv_perf_core --gtest_filter="*SolveTest*" --perf_min_samples=100 --perf_force_samples=100
```

The tail of the perf table is shown below since the table is too long.

View the full perf table here: [hal_rvv_solve.pdf](https://github.com/user-attachments/files/18725067/hal_rvv_solve.pdf)

<img width="1078" alt="Untitled" src="https://github.com/user-attachments/assets/c01d849c-f000-4bcc-bfe0-a302d6605d9e" />

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
天音あめ 2025-03-07 16:14:09 +08:00 committed by GitHub
parent bb525fe91d
commit 00956d5c15
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 814 additions and 0 deletions

View File

@ -32,6 +32,10 @@
#include "hal_rvv_1p0/split.hpp" // core
#include "hal_rvv_1p0/flip.hpp" // core
#include "hal_rvv_1p0/lut.hpp" // core
#include "hal_rvv_1p0/lu.hpp" // core
#include "hal_rvv_1p0/cholesky.hpp" // core
#include "hal_rvv_1p0/qr.hpp" // core
#include "hal_rvv_1p0/svd.hpp" // core
#include "hal_rvv_1p0/pyramids.hpp" // imgproc
#endif

View File

@ -0,0 +1,142 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED
#define OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv { namespace cholesky {
#undef cv_hal_Cholesky32f
#define cv_hal_Cholesky32f cv::cv_hal_rvv::cholesky::Cholesky
#undef cv_hal_Cholesky64f
#define cv_hal_Cholesky64f cv::cv_hal_rvv::cholesky::Cholesky
template<typename T> struct rvv;
template<> struct rvv<float>
{
    // Single-precision dispatch: routes the generic helper names used by the
    // Cholesky template onto the e32 / LMUL=4 RVV intrinsics (m1 for the
    // scalar seed of the reduction).
    static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); }
    static inline size_t vsetvl(size_t avl) { return __riscv_vsetvl_e32m4(avl); }
    static inline vfloat32m4_t vfmv_v_f(float x, size_t vl) { return __riscv_vfmv_v_f_f32m4(x, vl); }
    static inline vfloat32m1_t vfmv_s_f(float x, size_t vl) { return __riscv_vfmv_s_f_f32m1(x, vl); }
    static inline vfloat32m4_t vle(const float* ptr, size_t vl) { return __riscv_vle32_v_f32m4(ptr, vl); }
    static inline vfloat32m4_t vlse(const float* ptr, ptrdiff_t stride, size_t vl) { return __riscv_vlse32_v_f32m4(ptr, stride, vl); }
    static inline void vsse(float* ptr, ptrdiff_t stride, vfloat32m4_t v, size_t vl) { __riscv_vsse32(ptr, stride, v, vl); }
};
template<> struct rvv<double>
{
    // Double-precision dispatch: same helper surface as the float
    // specialization, mapped onto the e64 / LMUL=4 RVV intrinsics.
    static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); }
    static inline size_t vsetvl(size_t avl) { return __riscv_vsetvl_e64m4(avl); }
    static inline vfloat64m4_t vfmv_v_f(double x, size_t vl) { return __riscv_vfmv_v_f_f64m4(x, vl); }
    static inline vfloat64m1_t vfmv_s_f(double x, size_t vl) { return __riscv_vfmv_s_f_f64m1(x, vl); }
    static inline vfloat64m4_t vle(const double* ptr, size_t vl) { return __riscv_vle64_v_f64m4(ptr, vl); }
    static inline vfloat64m4_t vlse(const double* ptr, ptrdiff_t stride, size_t vl) { return __riscv_vlse64_v_f64m4(ptr, stride, vl); }
    static inline void vsse(double* ptr, ptrdiff_t stride, vfloat64m4_t v, size_t vl) { __riscv_vsse64(ptr, stride, v, vl); }
};
// the algorithm is copied from core/src/matrix_decomp.cpp,
// in the function template static int cv::CholImpl
//
// In-place Cholesky factorization of the m x m matrix src1, optionally
// followed by solving src1 * X = src2 for an m x n right-hand side.
// src1_step/src2_step are row strides in BYTES on entry (converted to
// element counts below). *info is set to false when a pivot is not positive
// (matrix not positive definite), true otherwise; the return value is always
// CV_HAL_ERROR_OK. During factorization the diagonal of src1 temporarily
// holds 1/sqrt(pivot); it is inverted back before returning.
template<typename T>
inline int Cholesky(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n, bool* info)
{
int i, j, k;
double s;
src1_step /= sizeof(src1[0]); // bytes -> elements
src2_step /= sizeof(src2[0]);
int vlmax = rvv<T>::vsetvlmax(), vl;
for( i = 0; i < m; i++ )
{
for( j = 0; j < i; j++ )
{
// s = src1[i][j] - dot(row i[0..j), row j[0..j)); partial products are
// accumulated per lane (tail-undisturbed vfmacc) and folded with an
// ordered reduction so the result is reproducible.
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for( k = 0; k < j; k += vl )
{
vl = rvv<T>::vsetvl(j - k);
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + k, vl);
auto vec_src2 = rvv<T>::vle(src1 + j * src1_step + k, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
}
s = src1[i*src1_step + j] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
// the diagonal currently stores 1/sqrt(pivot), so this is a multiply
src1[i*src1_step + j] = (T)(s*src1[j*src1_step + j]);
}
// diagonal pivot: s = src1[i][i] - ||row i[0..i)||^2  (j == i here)
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for( k = 0; k < j; k += vl )
{
vl = rvv<T>::vsetvl(j - k);
auto vec_src = rvv<T>::vle(src1 + i * src1_step + k, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl);
}
s = src1[i*src1_step + i] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
if( s < std::numeric_limits<T>::epsilon() )
{
// not positive definite
*info = false;
return CV_HAL_ERROR_OK;
}
src1[i*src1_step + i] = (T)(1./std::sqrt(s));
}
if (!src2)
{
// factorization only: invert the diagonal back (1/(1/sqrt(s)) == sqrt(s))
// using a strided gather/scatter along src1's main diagonal
for( i = 0; i < m; i += vl )
{
vl = rvv<T>::vsetvl(m - i);
auto vec_src = rvv<T>::vlse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vl);
vec_src = __riscv_vfrdiv(vec_src, 1, vl);
rvv<T>::vsse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vec_src, vl);
}
*info = true;
return CV_HAL_ERROR_OK;
}
// forward substitution; columns of src2 are read with stride loads
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for( k = 0; k < i; k += vl )
{
vl = rvv<T>::vsetvl(i - k);
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + k, vl);
auto vec_src2 = rvv<T>::vlse(src2 + k * src2_step + j, sizeof(T) * src2_step, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
}
s = src2[i*src2_step + j] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
src2[i*src2_step + j] = (T)(s*src1[i*src1_step + i]);
}
}
// back substitution (both operands gathered with stride loads)
for( i = m-1; i >= 0; i-- )
{
for( j = 0; j < n; j++ )
{
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for( k = i + 1; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src1 = rvv<T>::vlse(src1 + k * src1_step + i, sizeof(T) * src1_step, vl);
auto vec_src2 = rvv<T>::vlse(src2 + k * src2_step + j, sizeof(T) * src2_step, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
}
s = src2[i*src2_step + j] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
src2[i*src2_step + j] = (T)(s*src1[i*src1_step + i]);
}
}
// restore the diagonal of src1 (see comment at the top)
for( i = 0; i < m; i += vl )
{
vl = rvv<T>::vsetvl(m - i);
auto vec_src = rvv<T>::vlse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vl);
vec_src = __riscv_vfrdiv(vec_src, 1, vl);
rvv<T>::vsse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vec_src, vl);
}
*info = true;
return CV_HAL_ERROR_OK;
}
}}}
#endif

189
3rdparty/hal_rvv/hal_rvv_1p0/lu.hpp vendored Normal file
View File

@ -0,0 +1,189 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_LU_HPP_INCLUDED
#define OPENCV_HAL_RVV_LU_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv { namespace lu {
#undef cv_hal_LU32f
#define cv_hal_LU32f cv::cv_hal_rvv::lu::LU
#undef cv_hal_LU64f
#define cv_hal_LU64f cv::cv_hal_rvv::lu::LU
template<typename T> struct rvv;
template<> struct rvv<float>
{
    // Single-precision dispatch for the LU kernel: e32 / LMUL=4 intrinsics,
    // with the m1 form only for seeding the ordered reduction.
    static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); }
    static inline size_t vsetvl(size_t avl) { return __riscv_vsetvl_e32m4(avl); }
    static inline vfloat32m4_t vfmv_v_f(float x, size_t vl) { return __riscv_vfmv_v_f_f32m4(x, vl); }
    static inline vfloat32m1_t vfmv_s_f(float x, size_t vl) { return __riscv_vfmv_s_f_f32m1(x, vl); }
    static inline vfloat32m4_t vle(const float* ptr, size_t vl) { return __riscv_vle32_v_f32m4(ptr, vl); }
    static inline vfloat32m4_t vlse(const float* ptr, ptrdiff_t stride, size_t vl) { return __riscv_vlse32_v_f32m4(ptr, stride, vl); }
    static inline void vse(float* ptr, vfloat32m4_t v, size_t vl) { __riscv_vse32(ptr, v, vl); }
};
template<> struct rvv<double>
{
    // Double-precision dispatch for the LU kernel: e64 / LMUL=4 intrinsics.
    static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); }
    static inline size_t vsetvl(size_t avl) { return __riscv_vsetvl_e64m4(avl); }
    static inline vfloat64m4_t vfmv_v_f(double x, size_t vl) { return __riscv_vfmv_v_f_f64m4(x, vl); }
    static inline vfloat64m1_t vfmv_s_f(double x, size_t vl) { return __riscv_vfmv_s_f_f64m1(x, vl); }
    static inline vfloat64m4_t vle(const double* ptr, size_t vl) { return __riscv_vle64_v_f64m4(ptr, vl); }
    static inline vfloat64m4_t vlse(const double* ptr, ptrdiff_t stride, size_t vl) { return __riscv_vlse64_v_f64m4(ptr, stride, vl); }
    static inline void vse(double* ptr, vfloat64m4_t v, size_t vl) { __riscv_vse64(ptr, v, vl); }
};
// the algorithm is copied from core/src/matrix_decomp.cpp,
// in the function template static int cv::LUImpl
//
// In-place LU factorization with partial (row) pivoting of the m x m matrix
// src1; when src2 is non-null, the m x n system src1 * X = src2 is solved in
// place as well. src1_step/src2_step are row strides in BYTES on entry.
// *info receives the sign of the row permutation (+1/-1), or 0 when a pivot
// falls below eps (singular matrix). Always returns CV_HAL_ERROR_OK for
// float/double instantiations.
// NOTE(review): the typeid() comparisons below need RTTI and <typeinfo>
// (currently pulled in transitively); a compile-time std::is_same check
// would avoid both — confirm against the project's build flags.
template<typename T>
inline int LU(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n, int* info)
{
// singularity threshold, matching the scalar cv::LUImpl constants
T eps;
if( typeid(T) == typeid(float) )
eps = FLT_EPSILON*10;
else if( typeid(T) == typeid(double) )
eps = DBL_EPSILON*100;
else
return CV_HAL_ERROR_NOT_IMPLEMENTED;
int i, j, k, p = 1; // p tracks the permutation sign
src1_step /= sizeof(src1[0]); // bytes -> elements
src2_step /= sizeof(src2[0]);
int vlmax = rvv<T>::vsetvlmax(), vl;
if( src2 )
{
for( i = 0; i < m; i++ )
{
// partial pivoting: find the row with the largest |entry| in column i
k = i;
for( j = i+1; j < m; j++ )
if( std::abs(src1[j*src1_step + i]) > std::abs(src1[k*src1_step + i]) )
k = j;
if( std::abs(src1[k*src1_step + i]) < eps )
{
*info = 0; // singular
return CV_HAL_ERROR_OK;
}
if( k != i )
{
// swap rows i and k of src1 (from column i on) and of src2
for( j = i; j < m; j += vl )
{
vl = rvv<T>::vsetvl(m - j);
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + j, vl);
auto vec_src2 = rvv<T>::vle(src1 + k * src1_step + j, vl);
rvv<T>::vse(src1 + k * src1_step + j, vec_src1, vl);
rvv<T>::vse(src1 + i * src1_step + j, vec_src2, vl);
}
for( j = 0; j < n; j += vl )
{
vl = rvv<T>::vsetvl(n - j);
auto vec_src1 = rvv<T>::vle(src2 + i * src2_step + j, vl);
auto vec_src2 = rvv<T>::vle(src2 + k * src2_step + j, vl);
rvv<T>::vse(src2 + k * src2_step + j, vec_src1, vl);
rvv<T>::vse(src2 + i * src2_step + j, vec_src2, vl);
}
p = -p; // each swap flips the permutation sign
}
// eliminate column i below the pivot: row_j += alpha * row_i
T d = -1/src1[i*src1_step + i];
for( j = i+1; j < m; j++ )
{
T alpha = src1[j*src1_step + i]*d;
for( k = i+1; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src = rvv<T>::vle(src1 + i * src1_step + k, vl);
auto vec_dst = rvv<T>::vle(src1 + j * src1_step + k, vl);
vec_dst = __riscv_vfmacc(vec_dst, alpha, vec_src, vl);
rvv<T>::vse(src1 + j * src1_step + k, vec_dst, vl);
}
// apply the same row update to the right-hand side
for( k = 0; k < n; k += vl )
{
vl = rvv<T>::vsetvl(n - k);
auto vec_src = rvv<T>::vle(src2 + i * src2_step + k, vl);
auto vec_dst = rvv<T>::vle(src2 + j * src2_step + k, vl);
vec_dst = __riscv_vfmacc(vec_dst, alpha, vec_src, vl);
rvv<T>::vse(src2 + j * src2_step + k, vec_dst, vl);
}
}
}
// back substitution on the upper-triangular system; src2 columns are
// gathered with stride loads, dot products folded with an ordered
// reduction for reproducibility
for( i = m-1; i >= 0; i-- )
for( j = 0; j < n; j++ )
{
T s = src2[i*src2_step + j];
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for( k = i+1; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + k, vl);
auto vec_src2 = rvv<T>::vlse(src2 + k * src2_step + j, sizeof(T) * src2_step, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
}
s -= __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
src2[i*src2_step + j] = s/src1[i*src1_step + i];
}
}
else
{
// factorization only (no right-hand side): same pivoting/elimination
for( i = 0; i < m; i++ )
{
k = i;
for( j = i+1; j < m; j++ )
if( std::abs(src1[j*src1_step + i]) > std::abs(src1[k*src1_step + i]) )
k = j;
if( std::abs(src1[k*src1_step + i]) < eps )
{
*info = 0;
return CV_HAL_ERROR_OK;
}
if( k != i )
{
for( j = i; j < m; j += vl )
{
vl = rvv<T>::vsetvl(m - j);
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + j, vl);
auto vec_src2 = rvv<T>::vle(src1 + k * src1_step + j, vl);
rvv<T>::vse(src1 + k * src1_step + j, vec_src1, vl);
rvv<T>::vse(src1 + i * src1_step + j, vec_src2, vl);
}
p = -p;
}
T d = -1/src1[i*src1_step + i];
for( j = i+1; j < m; j++ )
{
T alpha = src1[j*src1_step + i]*d;
for( k = i+1; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src = rvv<T>::vle(src1 + i * src1_step + k, vl);
auto vec_dst = rvv<T>::vle(src1 + j * src1_step + k, vl);
vec_dst = __riscv_vfmacc(vec_dst, alpha, vec_src, vl);
rvv<T>::vse(src1 + j * src1_step + k, vec_dst, vl);
}
}
}
}
*info = p; // permutation sign; 0 was returned earlier for singular input
return CV_HAL_ERROR_OK;
}
}}}
#endif

194
3rdparty/hal_rvv/hal_rvv_1p0/qr.hpp vendored Normal file
View File

@ -0,0 +1,194 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_QR_HPP_INCLUDED
#define OPENCV_HAL_RVV_QR_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv { namespace qr {
#undef cv_hal_QR32f
#define cv_hal_QR32f cv::cv_hal_rvv::qr::QR
#undef cv_hal_QR64f
#define cv_hal_QR64f cv::cv_hal_rvv::qr::QR
template<typename T> struct rvv;
template<> struct rvv<float>
{
    // Single-precision dispatch for the QR kernel: e32 / LMUL=4 intrinsics,
    // covering unit-stride and strided loads/stores plus reduction seeding.
    static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); }
    static inline size_t vsetvl(size_t avl) { return __riscv_vsetvl_e32m4(avl); }
    static inline vfloat32m4_t vfmv_v_f(float x, size_t vl) { return __riscv_vfmv_v_f_f32m4(x, vl); }
    static inline vfloat32m1_t vfmv_s_f(float x, size_t vl) { return __riscv_vfmv_s_f_f32m1(x, vl); }
    static inline vfloat32m4_t vle(const float* ptr, size_t vl) { return __riscv_vle32_v_f32m4(ptr, vl); }
    static inline vfloat32m4_t vlse(const float* ptr, ptrdiff_t stride, size_t vl) { return __riscv_vlse32_v_f32m4(ptr, stride, vl); }
    static inline void vse(float* ptr, vfloat32m4_t v, size_t vl) { __riscv_vse32(ptr, v, vl); }
    static inline void vsse(float* ptr, ptrdiff_t stride, vfloat32m4_t v, size_t vl) { __riscv_vsse32(ptr, stride, v, vl); }
};
template<> struct rvv<double>
{
    // Double-precision dispatch for the QR kernel: e64 / LMUL=4 intrinsics.
    static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); }
    static inline size_t vsetvl(size_t avl) { return __riscv_vsetvl_e64m4(avl); }
    static inline vfloat64m4_t vfmv_v_f(double x, size_t vl) { return __riscv_vfmv_v_f_f64m4(x, vl); }
    static inline vfloat64m1_t vfmv_s_f(double x, size_t vl) { return __riscv_vfmv_s_f_f64m1(x, vl); }
    static inline vfloat64m4_t vle(const double* ptr, size_t vl) { return __riscv_vle64_v_f64m4(ptr, vl); }
    static inline vfloat64m4_t vlse(const double* ptr, ptrdiff_t stride, size_t vl) { return __riscv_vlse64_v_f64m4(ptr, stride, vl); }
    static inline void vse(double* ptr, vfloat64m4_t v, size_t vl) { __riscv_vse64(ptr, v, vl); }
    static inline void vsse(double* ptr, ptrdiff_t stride, vfloat64m4_t v, size_t vl) { __riscv_vsse64(ptr, stride, v, vl); }
};
// the algorithm is copied from core/src/matrix_decomp.cpp,
// in the function template static int cv::QRImpl
//
// Householder QR factorization of the m x n matrix src1 (row stride in
// BYTES on entry); when src2 is non-null, the reflectors are applied to the
// m x k right-hand side src2 and the triangular system is back-substituted
// in place. dst (or an internal buffer when dst == NULL) receives one scale
// factor per column. *info is 0 if a diagonal entry of R falls below eps,
// 1 otherwise; the return value is always CV_HAL_ERROR_OK for float/double.
// NOTE(review): the typeid() comparisons require RTTI and <typeinfo>
// (currently transitive includes); std::is_same would decide this at
// compile time — confirm build flags.
template<typename T>
inline int QR(T* src1, size_t src1_step, int m, int n, int k, T* src2, size_t src2_step, T* dst, int* info)
{
T eps;
if (typeid(T) == typeid(float))
eps = FLT_EPSILON*10;
else if (typeid(T) == typeid(double))
eps = DBL_EPSILON*400;
else
return CV_HAL_ERROR_NOT_IMPLEMENTED;
src1_step /= sizeof(T); // bytes -> elements
src2_step /= sizeof(T);
// buffer layout: m entries for the Householder vector (val), then n scale
// slots used as dst when the caller passed none; the m == 0 arm allocates
// just enough (0 or 1) for a caller-less degenerate case
size_t buf_size = m ? m + n : dst != NULL;
std::vector<T> buffer(buf_size);
T* val = buffer.data();
if (dst == NULL)
dst = val + m;
int vlmax = rvv<T>::vsetvlmax(), vl;
for (int l = 0; l < n; l++)
{
//generate val
// copy column l (rows l..m-1) into val via stride loads, accumulating
// its squared norm in the same pass
int vlSize = m - l;
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for (int i = 0; i < vlSize; i += vl)
{
vl = rvv<T>::vsetvl(vlSize - i);
auto vec_src = rvv<T>::vlse(src1 + (l + i) * src1_step + l, sizeof(T) * src1_step, vl);
rvv<T>::vse(val + i, vec_src, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl);
}
T vlNorm = __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
// shift val[0] by +-||column|| (sign chosen to avoid cancellation),
// then normalize the reflector by its updated norm
T tmpV = val[0];
val[0] = val[0] + (val[0] >= 0 ? 1 : -1) * std::sqrt(vlNorm);
vlNorm = std::sqrt(vlNorm + val[0] * val[0] - tmpV*tmpV);
for (int i = 0; i < vlSize; i += vl)
{
vl = rvv<T>::vsetvl(vlSize - i);
auto vec_src = rvv<T>::vle(val + i, vl);
vec_src = __riscv_vfdiv(vec_src, vlNorm, vl);
rvv<T>::vse(val + i, vec_src, vl);
}
//multiply A_l*val
// apply the reflector to the trailing columns: col_j -= 2*(val . col_j)*val
for (int j = l; j < n; j++)
{
vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for (int i = l; i < m; i += vl)
{
vl = rvv<T>::vsetvl(m - i);
auto vec_src1 = rvv<T>::vle(val + i - l, vl);
auto vec_src2 = rvv<T>::vlse(src1 + i * src1_step + j, sizeof(T) * src1_step, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
}
T v_lA = 2 * __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
for (int i = l; i < m; i += vl)
{
vl = rvv<T>::vsetvl(m - i);
auto vec_src1 = rvv<T>::vle(val + i - l, vl);
auto vec_src2 = rvv<T>::vlse(src1 + i * src1_step + j, sizeof(T) * src1_step, vl);
vec_src2 = __riscv_vfnmsac(vec_src2, v_lA, vec_src1, vl);
rvv<T>::vsse(src1 + i * src1_step + j, sizeof(T) * src1_step, vec_src2, vl);
}
}
//save val and factors
// store val[0]^2 as the column's scale factor and pack the rest of the
// reflector (divided by val[0]) below the diagonal of column l
dst[l] = val[0] * val[0];
for (int i = 1; i < vlSize; i += vl)
{
vl = rvv<T>::vsetvl(vlSize - i);
auto vec_src = rvv<T>::vle(val + i, vl);
vec_src = __riscv_vfdiv(vec_src, val[0], vl);
rvv<T>::vsse(src1 + (l + i) * src1_step + l, sizeof(T) * src1_step, vec_src, vl);
}
}
if (src2)
{
//generate new rhs
for (int l = 0; l < n; l++)
{
//unpack val
// rebuild the reflector from below-diagonal storage (val[0] is 1)
val[0] = (T)1;
for (int j = 1; j < m - l; j += vl)
{
vl = rvv<T>::vsetvl(m - l - j);
auto vec_src = rvv<T>::vlse(src1 + (j + l) * src1_step + l, sizeof(T) * src1_step, vl);
rvv<T>::vse(val + j, vec_src, vl);
}
//h_l*x
// apply the reflector to each of the k right-hand-side columns,
// rescaled by the stored factor dst[l]
for (int j = 0; j < k; j++)
{
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for (int i = l; i < m; i += vl)
{
vl = rvv<T>::vsetvl(m - i);
auto vec_src1 = rvv<T>::vle(val + i - l, vl);
auto vec_src2 = rvv<T>::vlse(src2 + i * src2_step + j, sizeof(T) * src2_step, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
}
T v_lB = 2 * dst[l] * __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
for (int i = l; i < m; i += vl)
{
vl = rvv<T>::vsetvl(m - i);
auto vec_src1 = rvv<T>::vle(val + i - l, vl);
auto vec_src2 = rvv<T>::vlse(src2 + i * src2_step + j, sizeof(T) * src2_step, vl);
vec_src2 = __riscv_vfnmsac(vec_src2, v_lB, vec_src1, vl);
rvv<T>::vsse(src2 + i * src2_step + j, sizeof(T) * src2_step, vec_src2, vl);
}
}
}
//do back substitution
for (int i = n - 1; i >= 0; i--)
{
for (int j = n - 1; j > i; j--)
{
// subtract the already-solved rows scaled by R[i][j]
for (int p = 0; p < k; p += vl)
{
vl = rvv<T>::vsetvl(k - p);
auto vec_src1 = rvv<T>::vle(src2 + i * src2_step + p, vl);
auto vec_src2 = rvv<T>::vle(src2 + j * src2_step + p, vl);
vec_src1 = __riscv_vfnmsac(vec_src1, src1[i*src1_step + j], vec_src2, vl);
rvv<T>::vse(src2 + i * src2_step + p, vec_src1, vl);
}
}
if (std::abs(src1[i*src1_step + i]) < eps)
{
*info = 0; // effectively singular R
return CV_HAL_ERROR_OK;
}
for (int p = 0; p < k; p += vl)
{
vl = rvv<T>::vsetvl(k - p);
auto vec_src = rvv<T>::vle(src2 + i * src2_step + p, vl);
vec_src = __riscv_vfdiv(vec_src, src1[i*src1_step + i], vl);
rvv<T>::vse(src2 + i * src2_step + p, vec_src, vl);
}
}
}
*info = 1;
return CV_HAL_ERROR_OK;
}
}}}
#endif

285
3rdparty/hal_rvv/hal_rvv_1p0/svd.hpp vendored Normal file
View File

@ -0,0 +1,285 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_SVD_HPP_INCLUDED
#define OPENCV_HAL_RVV_SVD_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv { namespace svd {
#undef cv_hal_SVD32f
#define cv_hal_SVD32f cv::cv_hal_rvv::svd::SVD
#undef cv_hal_SVD64f
#define cv_hal_SVD64f cv::cv_hal_rvv::svd::SVD
template<typename T> struct rvv;
template<> struct rvv<float>
{
    // Single-precision dispatch for the Jacobi SVD kernel: e32 / LMUL=4
    // intrinsics (unit-stride only; this kernel needs no strided access).
    static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); }
    static inline size_t vsetvl(size_t avl) { return __riscv_vsetvl_e32m4(avl); }
    static inline vfloat32m4_t vfmv_v_f(float x, size_t vl) { return __riscv_vfmv_v_f_f32m4(x, vl); }
    static inline vfloat32m1_t vfmv_s_f(float x, size_t vl) { return __riscv_vfmv_s_f_f32m1(x, vl); }
    static inline vfloat32m4_t vle(const float* ptr, size_t vl) { return __riscv_vle32_v_f32m4(ptr, vl); }
    static inline void vse(float* ptr, vfloat32m4_t v, size_t vl) { __riscv_vse32(ptr, v, vl); }
};
template<> struct rvv<double>
{
    // Double-precision dispatch for the Jacobi SVD kernel: e64 / LMUL=4
    // intrinsics.
    static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); }
    static inline size_t vsetvl(size_t avl) { return __riscv_vsetvl_e64m4(avl); }
    static inline vfloat64m4_t vfmv_v_f(double x, size_t vl) { return __riscv_vfmv_v_f_f64m4(x, vl); }
    static inline vfloat64m1_t vfmv_s_f(double x, size_t vl) { return __riscv_vfmv_s_f_f64m1(x, vl); }
    static inline vfloat64m4_t vle(const double* ptr, size_t vl) { return __riscv_vle64_v_f64m4(ptr, vl); }
    static inline void vse(double* ptr, vfloat64m4_t v, size_t vl) { __riscv_vse64(ptr, v, vl); }
};
// the algorithm is copied from core/src/lapack.cpp,
// in the function template static void cv::JacobiSVDImpl_
//
// One-sided Jacobi SVD. src holds n rows of length m (row stride in BYTES
// on entry); singular values are written to w (descending). When flags
// request U/V, src is overwritten with left singular vectors and vt
// (row stride vt_step, bytes) receives V^T. The unnamed u/u_step parameters
// of the HAL signature are unused here because U is returned in-place in
// src (CV_HAL_SVD_MODIFY_A is required by the supported flag combinations).
// Only the three flag combinations checked below are handled; anything else
// returns CV_HAL_ERROR_NOT_IMPLEMENTED.
// NOTE(review): the typeid() comparisons require RTTI and <typeinfo>
// (currently transitive includes); std::is_same would resolve this at
// compile time — confirm build flags.
// NOTE(review): like the scalar reference, zero-singular-vector
// regeneration uses rand() without seeding here.
template<typename T>
inline int SVD(T* src, size_t src_step, T* w, T*, size_t, T* vt, size_t vt_step, int m, int n, int flags)
{
// thresholds matching cv::JacobiSVDImpl_
T minval, eps;
if( typeid(T) == typeid(float) )
{
minval = FLT_MIN;
eps = FLT_EPSILON*2;
}
else if( typeid(T) == typeid(double) )
{
minval = DBL_MIN;
eps = DBL_EPSILON*10;
}
else
return CV_HAL_ERROR_NOT_IMPLEMENTED;
// n1 = number of left singular vectors to (re)normalize at the end
int n1;
if( flags == CV_HAL_SVD_NO_UV )
n1 = 0;
else if( flags == (CV_HAL_SVD_SHORT_UV | CV_HAL_SVD_MODIFY_A) )
n1 = n;
else if( flags == (CV_HAL_SVD_FULL_UV | CV_HAL_SVD_MODIFY_A) )
n1 = m;
else
return CV_HAL_ERROR_NOT_IMPLEMENTED;
std::vector<double> Wbuf(n);
double* W = Wbuf.data(); // squared row norms, kept in double for accuracy
int i, j, k, iter, max_iter = std::max(m, 30);
T c, s;
double sd;
src_step /= sizeof(src[0]); // bytes -> elements
vt_step /= sizeof(vt[0]);
int vlmax = rvv<T>::vsetvlmax(), vl;
// initialize W with squared row norms and vt with the identity
for( i = 0; i < n; i++ )
{
for( k = 0, sd = 0; k < m; k++ )
{
T t = src[i*src_step + k];
sd += (double)t*t;
}
W[i] = sd;
if( vt )
{
for( k = 0; k < n; k++ )
vt[i*vt_step + k] = 0;
vt[i*vt_step + i] = 1;
}
}
// Jacobi sweeps: rotate row pairs (i, j) until no pair changes
for( iter = 0; iter < max_iter; iter++ )
{
bool changed = false;
for( i = 0; i < n-1; i++ )
for( j = i+1; j < n; j++ )
{
T *Ai = src + i*src_step, *Aj = src + j*src_step;
double a = W[i], p = 0, b = W[j];
// p = dot(row i, row j), ordered reduction for reproducibility
auto vec_sum1 = rvv<T>::vfmv_v_f(0, vlmax);
for( k = 0; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src1 = rvv<T>::vle(Ai + k, vl);
auto vec_src2 = rvv<T>::vle(Aj + k, vl);
vec_sum1 = __riscv_vfmacc_tu(vec_sum1, vec_src1, vec_src2, vl);
}
p = __riscv_vfmv_f(__riscv_vfredosum(vec_sum1, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
if( std::abs(p) <= eps*std::sqrt((double)a*b) )
continue; // rows already (numerically) orthogonal
// compute the rotation (c, s); branch keeps the computation stable
p *= 2;
double beta = a - b, gamma = hypot((double)p, beta);
if( beta < 0 )
{
double delta = (gamma - beta)*0.5;
s = (T)std::sqrt(delta/gamma);
c = (T)(p/(gamma*s*2));
}
else
{
c = (T)std::sqrt((gamma + beta)/(gamma*2));
s = (T)(p/(gamma*c*2));
}
// apply the rotation to rows i and j, recomputing their squared
// norms exactly in the same pass (more accurate than updating W)
vec_sum1 = rvv<T>::vfmv_v_f(0, vlmax);
auto vec_sum2 = rvv<T>::vfmv_v_f(0, vlmax);
for( k = 0; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src1 = rvv<T>::vle(Ai + k, vl);
auto vec_src2 = rvv<T>::vle(Aj + k, vl);
auto vec_t0 = __riscv_vfadd(__riscv_vfmul(vec_src1, c, vl), __riscv_vfmul(vec_src2, s, vl), vl);
auto vec_t1 = __riscv_vfsub(__riscv_vfmul(vec_src2, c, vl), __riscv_vfmul(vec_src1, s, vl), vl);
rvv<T>::vse(Ai + k, vec_t0, vl);
rvv<T>::vse(Aj + k, vec_t1, vl);
vec_sum1 = __riscv_vfmacc_tu(vec_sum1, vec_t0, vec_t0, vl);
vec_sum2 = __riscv_vfmacc_tu(vec_sum2, vec_t1, vec_t1, vl);
}
W[i] = __riscv_vfmv_f(__riscv_vfredosum(vec_sum1, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
W[j] = __riscv_vfmv_f(__riscv_vfredosum(vec_sum2, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
changed = true;
if( vt )
{
// same rotation applied to the corresponding rows of V^T
T *Vi = vt + i*vt_step, *Vj = vt + j*vt_step;
for( k = 0; k < n; k += vl )
{
vl = rvv<T>::vsetvl(n - k);
auto vec_src1 = rvv<T>::vle(Vi + k, vl);
auto vec_src2 = rvv<T>::vle(Vj + k, vl);
auto vec_t0 = __riscv_vfadd(__riscv_vfmul(vec_src1, c, vl), __riscv_vfmul(vec_src2, s, vl), vl);
auto vec_t1 = __riscv_vfsub(__riscv_vfmul(vec_src2, c, vl), __riscv_vfmul(vec_src1, s, vl), vl);
rvv<T>::vse(Vi + k, vec_t0, vl);
rvv<T>::vse(Vj + k, vec_t1, vl);
}
}
}
if( !changed )
break;
}
// final singular values: sqrt of the exact squared row norms
for( i = 0; i < n; i++ )
{
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for( k = 0; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src = rvv<T>::vle(src + i * src_step + k, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl);
}
W[i] = std::sqrt(__riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax)));
}
// selection sort into descending order, swapping rows of src and vt along
for( i = 0; i < n-1; i++ )
{
j = i;
for( k = i+1; k < n; k++ )
{
if( W[j] < W[k] )
j = k;
}
if( i != j )
{
std::swap(W[i], W[j]);
if( vt )
{
for( k = 0; k < m; k++ )
std::swap(src[i*src_step + k], src[j*src_step + k]);
for( k = 0; k < n; k++ )
std::swap(vt[i*vt_step + k], vt[j*vt_step + k]);
}
}
}
for( i = 0; i < n; i++ )
w[i] = (T)W[i];
if( !vt )
return CV_HAL_ERROR_OK;
// normalize the first n1 rows of src into unit left singular vectors
for( i = 0; i < n1; i++ )
{
sd = i < n ? W[i] : 0;
for( int ii = 0; ii < 100 && sd <= minval; ii++ )
{
// if we got a zero singular value, then in order to get the corresponding left singular vector
// we generate a random vector, project it to the previously computed left singular vectors,
// subtract the projection and normalize the difference.
const T val0 = (T)(1./m);
for( k = 0; k < m; k++ )
{
T val = (rand() & 256) != 0 ? val0 : -val0;
src[i*src_step + k] = val;
}
for( iter = 0; iter < 2; iter++ )
{
for( j = 0; j < i; j++ )
{
// sd = dot(row i, row j); subtract the projection onto row j
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for( k = 0; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src1 = rvv<T>::vle(src + i * src_step + k, vl);
auto vec_src2 = rvv<T>::vle(src + j * src_step + k, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
}
sd = __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for( k = 0; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src1 = rvv<T>::vle(src + i * src_step + k, vl);
auto vec_src2 = rvv<T>::vle(src + j * src_step + k, vl);
vec_src1 = __riscv_vfnmsac(vec_src1, sd, vec_src2, vl);
rvv<T>::vse(src + i * src_step + k, vec_src1, vl);
vec_sum = __riscv_vfadd_tu(vec_sum, vec_sum, __riscv_vfabs(vec_src1, vl), vl);
}
// rescale by the L1 norm to keep the iteration well-conditioned
T asum = __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
asum = asum > eps*100 ? 1/asum : 0;
for( k = 0; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src = rvv<T>::vle(src + i * src_step + k, vl);
vec_src = __riscv_vfmul(vec_src, asum, vl);
rvv<T>::vse(src + i * src_step + k, vec_src, vl);
}
}
}
// recompute sd as the 2-norm of the regenerated row i; the scalar
// reference accumulates t*t over row i here. (BUGFIX: the previous
// code loaded row j, whose index is stale when i == 0.)
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
for( k = 0; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src = rvv<T>::vle(src + i * src_step + k, vl);
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl);
}
sd = std::sqrt(__riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax)));
}
// scale row i by 1/sd to make it a unit vector (0 if sd is negligible)
s = (T)(sd > minval ? 1/sd : 0.);
for( k = 0; k < m; k += vl )
{
vl = rvv<T>::vsetvl(m - k);
auto vec_src = rvv<T>::vle(src + i * src_step + k, vl);
vec_src = __riscv_vfmul(vec_src, s, vl);
rvv<T>::vse(src + i * src_step + k, vec_src, vl);
}
}
return CV_HAL_ERROR_OK;
}
}}}
#endif