mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
Merge pull request #26892 from amane-ame:solve_hal_rvv
Add RISC-V HAL implementation for cv::solve #26892 This patch implements `cv_hal_LU/cv_hal_Cholesky/cv_hal_SVD/cv_hal_QR` function in RVV_HAL using native intrinsics, optimizing the performance for `cv::solve` with method `DECOMP_LU/DECOMP_SVD/DECOMP_CHOLESKY/DECOMP_QR` and data types `32FC1/64FC1`. Tested on MUSE-PI (Spacemit X60) for both gcc 14.2 and clang 20.0. ``` $ ./opencv_test_core --gtest_filter="*Solve*:*SVD*:*Cholesky*" $ ./opencv_perf_core --gtest_filter="*SolveTest*" --perf_min_samples=100 --perf_force_samples=100 ``` The tail of the perf table is shown below since the table is too long. View the full perf table here: [hal_rvv_solve.pdf](https://github.com/user-attachments/files/18725067/hal_rvv_solve.pdf) <img width="1078" alt="Untitled" src="https://github.com/user-attachments/assets/c01d849c-f000-4bcc-bfe0-a302d6605d9e" /> ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
bb525fe91d
commit
00956d5c15
4
3rdparty/hal_rvv/hal_rvv.hpp
vendored
4
3rdparty/hal_rvv/hal_rvv.hpp
vendored
@ -32,6 +32,10 @@
|
||||
#include "hal_rvv_1p0/split.hpp" // core
|
||||
#include "hal_rvv_1p0/flip.hpp" // core
|
||||
#include "hal_rvv_1p0/lut.hpp" // core
|
||||
#include "hal_rvv_1p0/lu.hpp" // core
|
||||
#include "hal_rvv_1p0/cholesky.hpp" // core
|
||||
#include "hal_rvv_1p0/qr.hpp" // core
|
||||
#include "hal_rvv_1p0/svd.hpp" // core
|
||||
|
||||
#include "hal_rvv_1p0/pyramids.hpp" // imgproc
|
||||
#endif
|
||||
|
142
3rdparty/hal_rvv/hal_rvv_1p0/cholesky.hpp
vendored
Normal file
142
3rdparty/hal_rvv/hal_rvv_1p0/cholesky.hpp
vendored
Normal file
@ -0,0 +1,142 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
#ifndef OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
namespace cv { namespace cv_hal_rvv { namespace cholesky {
|
||||
|
||||
#undef cv_hal_Cholesky32f
|
||||
#define cv_hal_Cholesky32f cv::cv_hal_rvv::cholesky::Cholesky
|
||||
#undef cv_hal_Cholesky64f
|
||||
#define cv_hal_Cholesky64f cv::cv_hal_rvv::cholesky::Cholesky
|
||||
|
||||
template<typename T> struct rvv;
|
||||
|
||||
template<> struct rvv<float>
|
||||
{
|
||||
static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); }
|
||||
static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e32m4(a); }
|
||||
static inline vfloat32m4_t vfmv_v_f(float a, size_t b) { return __riscv_vfmv_v_f_f32m4(a, b); }
|
||||
static inline vfloat32m1_t vfmv_s_f(float a, size_t b) { return __riscv_vfmv_s_f_f32m1(a, b); }
|
||||
static inline vfloat32m4_t vle(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); }
|
||||
static inline vfloat32m4_t vlse(const float* a, ptrdiff_t b, size_t c) { return __riscv_vlse32_v_f32m4(a, b, c); }
|
||||
static inline void vsse(float* a, ptrdiff_t b, vfloat32m4_t c, size_t d) { __riscv_vsse32(a, b, c, d); }
|
||||
};
|
||||
|
||||
template<> struct rvv<double>
|
||||
{
|
||||
static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); }
|
||||
static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e64m4(a); }
|
||||
static inline vfloat64m4_t vfmv_v_f(double a, size_t b) { return __riscv_vfmv_v_f_f64m4(a, b); }
|
||||
static inline vfloat64m1_t vfmv_s_f(double a, size_t b) { return __riscv_vfmv_s_f_f64m1(a, b); }
|
||||
static inline vfloat64m4_t vle(const double* a, size_t b) { return __riscv_vle64_v_f64m4(a, b); }
|
||||
static inline vfloat64m4_t vlse(const double* a, ptrdiff_t b, size_t c) { return __riscv_vlse64_v_f64m4(a, b, c); }
|
||||
static inline void vsse(double* a, ptrdiff_t b, vfloat64m4_t c, size_t d) { __riscv_vsse64(a, b, c, d); }
|
||||
};
|
||||
|
||||
// the algorithm is copied from core/src/matrix_decomp.cpp,
|
||||
// in the function template static int cv::CholImpl
|
||||
template<typename T>
|
||||
inline int Cholesky(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n, bool* info)
|
||||
{
|
||||
int i, j, k;
|
||||
double s;
|
||||
src1_step /= sizeof(src1[0]);
|
||||
src2_step /= sizeof(src2[0]);
|
||||
|
||||
int vlmax = rvv<T>::vsetvlmax(), vl;
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
for( j = 0; j < i; j++ )
|
||||
{
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = 0; k < j; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(j - k);
|
||||
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + k, vl);
|
||||
auto vec_src2 = rvv<T>::vle(src1 + j * src1_step + k, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
|
||||
}
|
||||
s = src1[i*src1_step + j] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
src1[i*src1_step + j] = (T)(s*src1[j*src1_step + j]);
|
||||
}
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = 0; k < j; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(j - k);
|
||||
auto vec_src = rvv<T>::vle(src1 + i * src1_step + k, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl);
|
||||
}
|
||||
s = src1[i*src1_step + i] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
if( s < std::numeric_limits<T>::epsilon() )
|
||||
{
|
||||
*info = false;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
src1[i*src1_step + i] = (T)(1./std::sqrt(s));
|
||||
}
|
||||
|
||||
if (!src2)
|
||||
{
|
||||
for( i = 0; i < m; i += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - i);
|
||||
auto vec_src = rvv<T>::vlse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vl);
|
||||
vec_src = __riscv_vfrdiv(vec_src, 1, vl);
|
||||
rvv<T>::vsse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vec_src, vl);
|
||||
}
|
||||
*info = true;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
for( j = 0; j < n; j++ )
|
||||
{
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = 0; k < i; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(i - k);
|
||||
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + k, vl);
|
||||
auto vec_src2 = rvv<T>::vlse(src2 + k * src2_step + j, sizeof(T) * src2_step, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
|
||||
}
|
||||
s = src2[i*src2_step + j] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
src2[i*src2_step + j] = (T)(s*src1[i*src1_step + i]);
|
||||
}
|
||||
}
|
||||
|
||||
for( i = m-1; i >= 0; i-- )
|
||||
{
|
||||
for( j = 0; j < n; j++ )
|
||||
{
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = i + 1; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src1 = rvv<T>::vlse(src1 + k * src1_step + i, sizeof(T) * src1_step, vl);
|
||||
auto vec_src2 = rvv<T>::vlse(src2 + k * src2_step + j, sizeof(T) * src2_step, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
|
||||
}
|
||||
s = src2[i*src2_step + j] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
src2[i*src2_step + j] = (T)(s*src1[i*src1_step + i]);
|
||||
}
|
||||
}
|
||||
for( i = 0; i < m; i += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - i);
|
||||
auto vec_src = rvv<T>::vlse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vl);
|
||||
vec_src = __riscv_vfrdiv(vec_src, 1, vl);
|
||||
rvv<T>::vsse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vec_src, vl);
|
||||
}
|
||||
|
||||
*info = true;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}}}
|
||||
|
||||
#endif
|
189
3rdparty/hal_rvv/hal_rvv_1p0/lu.hpp
vendored
Normal file
189
3rdparty/hal_rvv/hal_rvv_1p0/lu.hpp
vendored
Normal file
@ -0,0 +1,189 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
#ifndef OPENCV_HAL_RVV_LU_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_LU_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
namespace cv { namespace cv_hal_rvv { namespace lu {
|
||||
|
||||
#undef cv_hal_LU32f
|
||||
#define cv_hal_LU32f cv::cv_hal_rvv::lu::LU
|
||||
#undef cv_hal_LU64f
|
||||
#define cv_hal_LU64f cv::cv_hal_rvv::lu::LU
|
||||
|
||||
template<typename T> struct rvv;
|
||||
|
||||
template<> struct rvv<float>
|
||||
{
|
||||
static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); }
|
||||
static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e32m4(a); }
|
||||
static inline vfloat32m4_t vfmv_v_f(float a, size_t b) { return __riscv_vfmv_v_f_f32m4(a, b); }
|
||||
static inline vfloat32m1_t vfmv_s_f(float a, size_t b) { return __riscv_vfmv_s_f_f32m1(a, b); }
|
||||
static inline vfloat32m4_t vle(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); }
|
||||
static inline vfloat32m4_t vlse(const float* a, ptrdiff_t b, size_t c) { return __riscv_vlse32_v_f32m4(a, b, c); }
|
||||
static inline void vse(float* a, vfloat32m4_t b, size_t c) { __riscv_vse32(a, b, c); }
|
||||
};
|
||||
|
||||
template<> struct rvv<double>
|
||||
{
|
||||
static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); }
|
||||
static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e64m4(a); }
|
||||
static inline vfloat64m4_t vfmv_v_f(double a, size_t b) { return __riscv_vfmv_v_f_f64m4(a, b); }
|
||||
static inline vfloat64m1_t vfmv_s_f(double a, size_t b) { return __riscv_vfmv_s_f_f64m1(a, b); }
|
||||
static inline vfloat64m4_t vle(const double* a, size_t b) { return __riscv_vle64_v_f64m4(a, b); }
|
||||
static inline vfloat64m4_t vlse(const double* a, ptrdiff_t b, size_t c) { return __riscv_vlse64_v_f64m4(a, b, c); }
|
||||
static inline void vse(double* a, vfloat64m4_t b, size_t c) { __riscv_vse64(a, b, c); }
|
||||
};
|
||||
|
||||
// the algorithm is copied from core/src/matrix_decomp.cpp,
|
||||
// in the function template static int cv::LUImpl
|
||||
template<typename T>
|
||||
inline int LU(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n, int* info)
|
||||
{
|
||||
T eps;
|
||||
if( typeid(T) == typeid(float) )
|
||||
eps = FLT_EPSILON*10;
|
||||
else if( typeid(T) == typeid(double) )
|
||||
eps = DBL_EPSILON*100;
|
||||
else
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
int i, j, k, p = 1;
|
||||
src1_step /= sizeof(src1[0]);
|
||||
src2_step /= sizeof(src2[0]);
|
||||
|
||||
int vlmax = rvv<T>::vsetvlmax(), vl;
|
||||
if( src2 )
|
||||
{
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
k = i;
|
||||
|
||||
for( j = i+1; j < m; j++ )
|
||||
if( std::abs(src1[j*src1_step + i]) > std::abs(src1[k*src1_step + i]) )
|
||||
k = j;
|
||||
|
||||
if( std::abs(src1[k*src1_step + i]) < eps )
|
||||
{
|
||||
*info = 0;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
if( k != i )
|
||||
{
|
||||
for( j = i; j < m; j += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - j);
|
||||
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + j, vl);
|
||||
auto vec_src2 = rvv<T>::vle(src1 + k * src1_step + j, vl);
|
||||
rvv<T>::vse(src1 + k * src1_step + j, vec_src1, vl);
|
||||
rvv<T>::vse(src1 + i * src1_step + j, vec_src2, vl);
|
||||
}
|
||||
for( j = 0; j < n; j += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(n - j);
|
||||
auto vec_src1 = rvv<T>::vle(src2 + i * src2_step + j, vl);
|
||||
auto vec_src2 = rvv<T>::vle(src2 + k * src2_step + j, vl);
|
||||
rvv<T>::vse(src2 + k * src2_step + j, vec_src1, vl);
|
||||
rvv<T>::vse(src2 + i * src2_step + j, vec_src2, vl);
|
||||
}
|
||||
p = -p;
|
||||
}
|
||||
|
||||
T d = -1/src1[i*src1_step + i];
|
||||
|
||||
for( j = i+1; j < m; j++ )
|
||||
{
|
||||
T alpha = src1[j*src1_step + i]*d;
|
||||
|
||||
for( k = i+1; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src = rvv<T>::vle(src1 + i * src1_step + k, vl);
|
||||
auto vec_dst = rvv<T>::vle(src1 + j * src1_step + k, vl);
|
||||
vec_dst = __riscv_vfmacc(vec_dst, alpha, vec_src, vl);
|
||||
rvv<T>::vse(src1 + j * src1_step + k, vec_dst, vl);
|
||||
}
|
||||
for( k = 0; k < n; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(n - k);
|
||||
auto vec_src = rvv<T>::vle(src2 + i * src2_step + k, vl);
|
||||
auto vec_dst = rvv<T>::vle(src2 + j * src2_step + k, vl);
|
||||
vec_dst = __riscv_vfmacc(vec_dst, alpha, vec_src, vl);
|
||||
rvv<T>::vse(src2 + j * src2_step + k, vec_dst, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for( i = m-1; i >= 0; i-- )
|
||||
for( j = 0; j < n; j++ )
|
||||
{
|
||||
T s = src2[i*src2_step + j];
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = i+1; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + k, vl);
|
||||
auto vec_src2 = rvv<T>::vlse(src2 + k * src2_step + j, sizeof(T) * src2_step, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
|
||||
}
|
||||
s -= __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
src2[i*src2_step + j] = s/src1[i*src1_step + i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
k = i;
|
||||
|
||||
for( j = i+1; j < m; j++ )
|
||||
if( std::abs(src1[j*src1_step + i]) > std::abs(src1[k*src1_step + i]) )
|
||||
k = j;
|
||||
|
||||
if( std::abs(src1[k*src1_step + i]) < eps )
|
||||
{
|
||||
*info = 0;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
if( k != i )
|
||||
{
|
||||
for( j = i; j < m; j += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - j);
|
||||
auto vec_src1 = rvv<T>::vle(src1 + i * src1_step + j, vl);
|
||||
auto vec_src2 = rvv<T>::vle(src1 + k * src1_step + j, vl);
|
||||
rvv<T>::vse(src1 + k * src1_step + j, vec_src1, vl);
|
||||
rvv<T>::vse(src1 + i * src1_step + j, vec_src2, vl);
|
||||
}
|
||||
p = -p;
|
||||
}
|
||||
|
||||
T d = -1/src1[i*src1_step + i];
|
||||
|
||||
for( j = i+1; j < m; j++ )
|
||||
{
|
||||
T alpha = src1[j*src1_step + i]*d;
|
||||
|
||||
for( k = i+1; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src = rvv<T>::vle(src1 + i * src1_step + k, vl);
|
||||
auto vec_dst = rvv<T>::vle(src1 + j * src1_step + k, vl);
|
||||
vec_dst = __riscv_vfmacc(vec_dst, alpha, vec_src, vl);
|
||||
rvv<T>::vse(src1 + j * src1_step + k, vec_dst, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*info = p;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}}}
|
||||
|
||||
#endif
|
194
3rdparty/hal_rvv/hal_rvv_1p0/qr.hpp
vendored
Normal file
194
3rdparty/hal_rvv/hal_rvv_1p0/qr.hpp
vendored
Normal file
@ -0,0 +1,194 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
#ifndef OPENCV_HAL_RVV_QR_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_QR_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
namespace cv { namespace cv_hal_rvv { namespace qr {
|
||||
|
||||
#undef cv_hal_QR32f
|
||||
#define cv_hal_QR32f cv::cv_hal_rvv::qr::QR
|
||||
#undef cv_hal_QR64f
|
||||
#define cv_hal_QR64f cv::cv_hal_rvv::qr::QR
|
||||
|
||||
template<typename T> struct rvv;
|
||||
|
||||
template<> struct rvv<float>
|
||||
{
|
||||
static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); }
|
||||
static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e32m4(a); }
|
||||
static inline vfloat32m4_t vfmv_v_f(float a, size_t b) { return __riscv_vfmv_v_f_f32m4(a, b); }
|
||||
static inline vfloat32m1_t vfmv_s_f(float a, size_t b) { return __riscv_vfmv_s_f_f32m1(a, b); }
|
||||
static inline vfloat32m4_t vle(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); }
|
||||
static inline vfloat32m4_t vlse(const float* a, ptrdiff_t b, size_t c) { return __riscv_vlse32_v_f32m4(a, b, c); }
|
||||
static inline void vse(float* a, vfloat32m4_t b, size_t c) { __riscv_vse32(a, b, c); }
|
||||
static inline void vsse(float* a, ptrdiff_t b, vfloat32m4_t c, size_t d) { __riscv_vsse32(a, b, c, d); }
|
||||
};
|
||||
|
||||
template<> struct rvv<double>
|
||||
{
|
||||
static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); }
|
||||
static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e64m4(a); }
|
||||
static inline vfloat64m4_t vfmv_v_f(double a, size_t b) { return __riscv_vfmv_v_f_f64m4(a, b); }
|
||||
static inline vfloat64m1_t vfmv_s_f(double a, size_t b) { return __riscv_vfmv_s_f_f64m1(a, b); }
|
||||
static inline vfloat64m4_t vle(const double* a, size_t b) { return __riscv_vle64_v_f64m4(a, b); }
|
||||
static inline vfloat64m4_t vlse(const double* a, ptrdiff_t b, size_t c) { return __riscv_vlse64_v_f64m4(a, b, c); }
|
||||
static inline void vse(double* a, vfloat64m4_t b, size_t c) { __riscv_vse64(a, b, c); }
|
||||
static inline void vsse(double* a, ptrdiff_t b, vfloat64m4_t c, size_t d) { __riscv_vsse64(a, b, c, d); }
|
||||
};
|
||||
|
||||
// the algorithm is copied from core/src/matrix_decomp.cpp,
|
||||
// in the function template static int cv::QRImpl
|
||||
template<typename T>
|
||||
inline int QR(T* src1, size_t src1_step, int m, int n, int k, T* src2, size_t src2_step, T* dst, int* info)
|
||||
{
|
||||
T eps;
|
||||
if (typeid(T) == typeid(float))
|
||||
eps = FLT_EPSILON*10;
|
||||
else if (typeid(T) == typeid(double))
|
||||
eps = DBL_EPSILON*400;
|
||||
else
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
src1_step /= sizeof(T);
|
||||
src2_step /= sizeof(T);
|
||||
|
||||
size_t buf_size = m ? m + n : dst != NULL;
|
||||
std::vector<T> buffer(buf_size);
|
||||
T* val = buffer.data();
|
||||
if (dst == NULL)
|
||||
dst = val + m;
|
||||
|
||||
int vlmax = rvv<T>::vsetvlmax(), vl;
|
||||
for (int l = 0; l < n; l++)
|
||||
{
|
||||
//generate val
|
||||
int vlSize = m - l;
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for (int i = 0; i < vlSize; i += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(vlSize - i);
|
||||
auto vec_src = rvv<T>::vlse(src1 + (l + i) * src1_step + l, sizeof(T) * src1_step, vl);
|
||||
rvv<T>::vse(val + i, vec_src, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl);
|
||||
}
|
||||
T vlNorm = __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
T tmpV = val[0];
|
||||
val[0] = val[0] + (val[0] >= 0 ? 1 : -1) * std::sqrt(vlNorm);
|
||||
vlNorm = std::sqrt(vlNorm + val[0] * val[0] - tmpV*tmpV);
|
||||
for (int i = 0; i < vlSize; i += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(vlSize - i);
|
||||
auto vec_src = rvv<T>::vle(val + i, vl);
|
||||
vec_src = __riscv_vfdiv(vec_src, vlNorm, vl);
|
||||
rvv<T>::vse(val + i, vec_src, vl);
|
||||
}
|
||||
//multiply A_l*val
|
||||
for (int j = l; j < n; j++)
|
||||
{
|
||||
vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for (int i = l; i < m; i += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - i);
|
||||
auto vec_src1 = rvv<T>::vle(val + i - l, vl);
|
||||
auto vec_src2 = rvv<T>::vlse(src1 + i * src1_step + j, sizeof(T) * src1_step, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
|
||||
}
|
||||
T v_lA = 2 * __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
|
||||
for (int i = l; i < m; i += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - i);
|
||||
auto vec_src1 = rvv<T>::vle(val + i - l, vl);
|
||||
auto vec_src2 = rvv<T>::vlse(src1 + i * src1_step + j, sizeof(T) * src1_step, vl);
|
||||
vec_src2 = __riscv_vfnmsac(vec_src2, v_lA, vec_src1, vl);
|
||||
rvv<T>::vsse(src1 + i * src1_step + j, sizeof(T) * src1_step, vec_src2, vl);
|
||||
}
|
||||
}
|
||||
|
||||
//save val and factors
|
||||
dst[l] = val[0] * val[0];
|
||||
for (int i = 1; i < vlSize; i += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(vlSize - i);
|
||||
auto vec_src = rvv<T>::vle(val + i, vl);
|
||||
vec_src = __riscv_vfdiv(vec_src, val[0], vl);
|
||||
rvv<T>::vsse(src1 + (l + i) * src1_step + l, sizeof(T) * src1_step, vec_src, vl);
|
||||
}
|
||||
}
|
||||
|
||||
if (src2)
|
||||
{
|
||||
//generate new rhs
|
||||
for (int l = 0; l < n; l++)
|
||||
{
|
||||
//unpack val
|
||||
val[0] = (T)1;
|
||||
for (int j = 1; j < m - l; j += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - l - j);
|
||||
auto vec_src = rvv<T>::vlse(src1 + (j + l) * src1_step + l, sizeof(T) * src1_step, vl);
|
||||
rvv<T>::vse(val + j, vec_src, vl);
|
||||
}
|
||||
|
||||
//h_l*x
|
||||
for (int j = 0; j < k; j++)
|
||||
{
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for (int i = l; i < m; i += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - i);
|
||||
auto vec_src1 = rvv<T>::vle(val + i - l, vl);
|
||||
auto vec_src2 = rvv<T>::vlse(src2 + i * src2_step + j, sizeof(T) * src2_step, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
|
||||
}
|
||||
T v_lB = 2 * dst[l] * __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
|
||||
for (int i = l; i < m; i += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - i);
|
||||
auto vec_src1 = rvv<T>::vle(val + i - l, vl);
|
||||
auto vec_src2 = rvv<T>::vlse(src2 + i * src2_step + j, sizeof(T) * src2_step, vl);
|
||||
vec_src2 = __riscv_vfnmsac(vec_src2, v_lB, vec_src1, vl);
|
||||
rvv<T>::vsse(src2 + i * src2_step + j, sizeof(T) * src2_step, vec_src2, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
//do back substitution
|
||||
for (int i = n - 1; i >= 0; i--)
|
||||
{
|
||||
for (int j = n - 1; j > i; j--)
|
||||
{
|
||||
for (int p = 0; p < k; p += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(k - p);
|
||||
auto vec_src1 = rvv<T>::vle(src2 + i * src2_step + p, vl);
|
||||
auto vec_src2 = rvv<T>::vle(src2 + j * src2_step + p, vl);
|
||||
vec_src1 = __riscv_vfnmsac(vec_src1, src1[i*src1_step + j], vec_src2, vl);
|
||||
rvv<T>::vse(src2 + i * src2_step + p, vec_src1, vl);
|
||||
}
|
||||
}
|
||||
if (std::abs(src1[i*src1_step + i]) < eps)
|
||||
{
|
||||
*info = 0;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
for (int p = 0; p < k; p += vl)
|
||||
{
|
||||
vl = rvv<T>::vsetvl(k - p);
|
||||
auto vec_src = rvv<T>::vle(src2 + i * src2_step + p, vl);
|
||||
vec_src = __riscv_vfdiv(vec_src, src1[i*src1_step + i], vl);
|
||||
rvv<T>::vse(src2 + i * src2_step + p, vec_src, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*info = 1;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}}}
|
||||
|
||||
#endif
|
285
3rdparty/hal_rvv/hal_rvv_1p0/svd.hpp
vendored
Normal file
285
3rdparty/hal_rvv/hal_rvv_1p0/svd.hpp
vendored
Normal file
@ -0,0 +1,285 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
#ifndef OPENCV_HAL_RVV_SVD_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_SVD_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
namespace cv { namespace cv_hal_rvv { namespace svd {
|
||||
|
||||
#undef cv_hal_SVD32f
|
||||
#define cv_hal_SVD32f cv::cv_hal_rvv::svd::SVD
|
||||
#undef cv_hal_SVD64f
|
||||
#define cv_hal_SVD64f cv::cv_hal_rvv::svd::SVD
|
||||
|
||||
template<typename T> struct rvv;
|
||||
|
||||
template<> struct rvv<float>
|
||||
{
|
||||
static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); }
|
||||
static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e32m4(a); }
|
||||
static inline vfloat32m4_t vfmv_v_f(float a, size_t b) { return __riscv_vfmv_v_f_f32m4(a, b); }
|
||||
static inline vfloat32m1_t vfmv_s_f(float a, size_t b) { return __riscv_vfmv_s_f_f32m1(a, b); }
|
||||
static inline vfloat32m4_t vle(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); }
|
||||
static inline void vse(float* a, vfloat32m4_t b, size_t c) { __riscv_vse32(a, b, c); }
|
||||
};
|
||||
|
||||
template<> struct rvv<double>
|
||||
{
|
||||
static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); }
|
||||
static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e64m4(a); }
|
||||
static inline vfloat64m4_t vfmv_v_f(double a, size_t b) { return __riscv_vfmv_v_f_f64m4(a, b); }
|
||||
static inline vfloat64m1_t vfmv_s_f(double a, size_t b) { return __riscv_vfmv_s_f_f64m1(a, b); }
|
||||
static inline vfloat64m4_t vle(const double* a, size_t b) { return __riscv_vle64_v_f64m4(a, b); }
|
||||
static inline void vse(double* a, vfloat64m4_t b, size_t c) { __riscv_vse64(a, b, c); }
|
||||
};
|
||||
|
||||
// the algorithm is copied from core/src/lapack.cpp,
|
||||
// in the function template static void cv::JacobiSVDImpl_
|
||||
template<typename T>
|
||||
inline int SVD(T* src, size_t src_step, T* w, T*, size_t, T* vt, size_t vt_step, int m, int n, int flags)
|
||||
{
|
||||
T minval, eps;
|
||||
if( typeid(T) == typeid(float) )
|
||||
{
|
||||
minval = FLT_MIN;
|
||||
eps = FLT_EPSILON*2;
|
||||
}
|
||||
else if( typeid(T) == typeid(double) )
|
||||
{
|
||||
minval = DBL_MIN;
|
||||
eps = DBL_EPSILON*10;
|
||||
}
|
||||
else
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
int n1;
|
||||
if( flags == CV_HAL_SVD_NO_UV )
|
||||
n1 = 0;
|
||||
else if( flags == (CV_HAL_SVD_SHORT_UV | CV_HAL_SVD_MODIFY_A) )
|
||||
n1 = n;
|
||||
else if( flags == (CV_HAL_SVD_FULL_UV | CV_HAL_SVD_MODIFY_A) )
|
||||
n1 = m;
|
||||
else
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
std::vector<double> Wbuf(n);
|
||||
double* W = Wbuf.data();
|
||||
int i, j, k, iter, max_iter = std::max(m, 30);
|
||||
T c, s;
|
||||
double sd;
|
||||
src_step /= sizeof(src[0]);
|
||||
vt_step /= sizeof(vt[0]);
|
||||
|
||||
int vlmax = rvv<T>::vsetvlmax(), vl;
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
for( k = 0, sd = 0; k < m; k++ )
|
||||
{
|
||||
T t = src[i*src_step + k];
|
||||
sd += (double)t*t;
|
||||
}
|
||||
W[i] = sd;
|
||||
|
||||
if( vt )
|
||||
{
|
||||
for( k = 0; k < n; k++ )
|
||||
vt[i*vt_step + k] = 0;
|
||||
vt[i*vt_step + i] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
for( iter = 0; iter < max_iter; iter++ )
|
||||
{
|
||||
bool changed = false;
|
||||
|
||||
for( i = 0; i < n-1; i++ )
|
||||
for( j = i+1; j < n; j++ )
|
||||
{
|
||||
T *Ai = src + i*src_step, *Aj = src + j*src_step;
|
||||
double a = W[i], p = 0, b = W[j];
|
||||
|
||||
auto vec_sum1 = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = 0; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src1 = rvv<T>::vle(Ai + k, vl);
|
||||
auto vec_src2 = rvv<T>::vle(Aj + k, vl);
|
||||
vec_sum1 = __riscv_vfmacc_tu(vec_sum1, vec_src1, vec_src2, vl);
|
||||
}
|
||||
p = __riscv_vfmv_f(__riscv_vfredosum(vec_sum1, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
|
||||
if( std::abs(p) <= eps*std::sqrt((double)a*b) )
|
||||
continue;
|
||||
|
||||
p *= 2;
|
||||
double beta = a - b, gamma = hypot((double)p, beta);
|
||||
if( beta < 0 )
|
||||
{
|
||||
double delta = (gamma - beta)*0.5;
|
||||
s = (T)std::sqrt(delta/gamma);
|
||||
c = (T)(p/(gamma*s*2));
|
||||
}
|
||||
else
|
||||
{
|
||||
c = (T)std::sqrt((gamma + beta)/(gamma*2));
|
||||
s = (T)(p/(gamma*c*2));
|
||||
}
|
||||
|
||||
vec_sum1 = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
auto vec_sum2 = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = 0; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src1 = rvv<T>::vle(Ai + k, vl);
|
||||
auto vec_src2 = rvv<T>::vle(Aj + k, vl);
|
||||
auto vec_t0 = __riscv_vfadd(__riscv_vfmul(vec_src1, c, vl), __riscv_vfmul(vec_src2, s, vl), vl);
|
||||
auto vec_t1 = __riscv_vfsub(__riscv_vfmul(vec_src2, c, vl), __riscv_vfmul(vec_src1, s, vl), vl);
|
||||
rvv<T>::vse(Ai + k, vec_t0, vl);
|
||||
rvv<T>::vse(Aj + k, vec_t1, vl);
|
||||
vec_sum1 = __riscv_vfmacc_tu(vec_sum1, vec_t0, vec_t0, vl);
|
||||
vec_sum2 = __riscv_vfmacc_tu(vec_sum2, vec_t1, vec_t1, vl);
|
||||
}
|
||||
W[i] = __riscv_vfmv_f(__riscv_vfredosum(vec_sum1, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
W[j] = __riscv_vfmv_f(__riscv_vfredosum(vec_sum2, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
|
||||
changed = true;
|
||||
|
||||
if( vt )
|
||||
{
|
||||
T *Vi = vt + i*vt_step, *Vj = vt + j*vt_step;
|
||||
for( k = 0; k < n; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(n - k);
|
||||
auto vec_src1 = rvv<T>::vle(Vi + k, vl);
|
||||
auto vec_src2 = rvv<T>::vle(Vj + k, vl);
|
||||
auto vec_t0 = __riscv_vfadd(__riscv_vfmul(vec_src1, c, vl), __riscv_vfmul(vec_src2, s, vl), vl);
|
||||
auto vec_t1 = __riscv_vfsub(__riscv_vfmul(vec_src2, c, vl), __riscv_vfmul(vec_src1, s, vl), vl);
|
||||
rvv<T>::vse(Vi + k, vec_t0, vl);
|
||||
rvv<T>::vse(Vj + k, vec_t1, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
if( !changed )
|
||||
break;
|
||||
}
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = 0; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src = rvv<T>::vle(src + i * src_step + k, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl);
|
||||
}
|
||||
W[i] = std::sqrt(__riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax)));
|
||||
}
|
||||
|
||||
for( i = 0; i < n-1; i++ )
|
||||
{
|
||||
j = i;
|
||||
for( k = i+1; k < n; k++ )
|
||||
{
|
||||
if( W[j] < W[k] )
|
||||
j = k;
|
||||
}
|
||||
if( i != j )
|
||||
{
|
||||
std::swap(W[i], W[j]);
|
||||
if( vt )
|
||||
{
|
||||
for( k = 0; k < m; k++ )
|
||||
std::swap(src[i*src_step + k], src[j*src_step + k]);
|
||||
|
||||
for( k = 0; k < n; k++ )
|
||||
std::swap(vt[i*vt_step + k], vt[j*vt_step + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
w[i] = (T)W[i];
|
||||
|
||||
if( !vt )
|
||||
return CV_HAL_ERROR_OK;
|
||||
|
||||
for( i = 0; i < n1; i++ )
|
||||
{
|
||||
sd = i < n ? W[i] : 0;
|
||||
|
||||
for( int ii = 0; ii < 100 && sd <= minval; ii++ )
|
||||
{
|
||||
// if we got a zero singular value, then in order to get the corresponding left singular vector
|
||||
// we generate a random vector, project it to the previously computed left singular vectors,
|
||||
// subtract the projection and normalize the difference.
|
||||
const T val0 = (T)(1./m);
|
||||
for( k = 0; k < m; k++ )
|
||||
{
|
||||
T val = (rand() & 256) != 0 ? val0 : -val0;
|
||||
src[i*src_step + k] = val;
|
||||
}
|
||||
for( iter = 0; iter < 2; iter++ )
|
||||
{
|
||||
for( j = 0; j < i; j++ )
|
||||
{
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = 0; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src1 = rvv<T>::vle(src + i * src_step + k, vl);
|
||||
auto vec_src2 = rvv<T>::vle(src + j * src_step + k, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
|
||||
}
|
||||
sd = __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
|
||||
vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = 0; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src1 = rvv<T>::vle(src + i * src_step + k, vl);
|
||||
auto vec_src2 = rvv<T>::vle(src + j * src_step + k, vl);
|
||||
vec_src1 = __riscv_vfnmsac(vec_src1, sd, vec_src2, vl);
|
||||
rvv<T>::vse(src + i * src_step + k, vec_src1, vl);
|
||||
vec_sum = __riscv_vfadd_tu(vec_sum, vec_sum, __riscv_vfabs(vec_src1, vl), vl);
|
||||
}
|
||||
T asum = __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax));
|
||||
asum = asum > eps*100 ? 1/asum : 0;
|
||||
for( k = 0; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src = rvv<T>::vle(src + i * src_step + k, vl);
|
||||
vec_src = __riscv_vfmul(vec_src, asum, vl);
|
||||
rvv<T>::vse(src + i * src_step + k, vec_src, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto vec_sum = rvv<T>::vfmv_v_f(0, vlmax);
|
||||
for( k = 0; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src1 = rvv<T>::vle(src + i * src_step + k, vl);
|
||||
auto vec_src2 = rvv<T>::vle(src + j * src_step + k, vl);
|
||||
vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl);
|
||||
}
|
||||
sd = std::sqrt(__riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv<T>::vfmv_s_f(0, vlmax), vlmax)));
|
||||
}
|
||||
|
||||
s = (T)(sd > minval ? 1/sd : 0.);
|
||||
for( k = 0; k < m; k += vl )
|
||||
{
|
||||
vl = rvv<T>::vsetvl(m - k);
|
||||
auto vec_src = rvv<T>::vle(src + i * src_step + k, vl);
|
||||
vec_src = __riscv_vfmul(vec_src, s, vl);
|
||||
rvv<T>::vse(src + i * src_step + k, vec_src, vl);
|
||||
}
|
||||
}
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}}}
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user