diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp index 3b0d596d9d..e372714db3 100644 --- a/3rdparty/hal_rvv/hal_rvv.hpp +++ b/3rdparty/hal_rvv/hal_rvv.hpp @@ -32,6 +32,10 @@ #include "hal_rvv_1p0/split.hpp" // core #include "hal_rvv_1p0/flip.hpp" // core #include "hal_rvv_1p0/lut.hpp" // core +#include "hal_rvv_1p0/lu.hpp" // core +#include "hal_rvv_1p0/cholesky.hpp" // core +#include "hal_rvv_1p0/qr.hpp" // core +#include "hal_rvv_1p0/svd.hpp" // core #include "hal_rvv_1p0/pyramids.hpp" // imgproc #endif diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/cholesky.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/cholesky.hpp new file mode 100644 index 0000000000..aaf0d863f3 --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/cholesky.hpp @@ -0,0 +1,142 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#ifndef OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED +#define OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED + +#include + +namespace cv { namespace cv_hal_rvv { namespace cholesky { + +#undef cv_hal_Cholesky32f +#define cv_hal_Cholesky32f cv::cv_hal_rvv::cholesky::Cholesky +#undef cv_hal_Cholesky64f +#define cv_hal_Cholesky64f cv::cv_hal_rvv::cholesky::Cholesky + +template struct rvv; + +template<> struct rvv +{ + static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); } + static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e32m4(a); } + static inline vfloat32m4_t vfmv_v_f(float a, size_t b) { return __riscv_vfmv_v_f_f32m4(a, b); } + static inline vfloat32m1_t vfmv_s_f(float a, size_t b) { return __riscv_vfmv_s_f_f32m1(a, b); } + static inline vfloat32m4_t vle(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); } + static inline vfloat32m4_t vlse(const float* a, ptrdiff_t b, size_t c) { return __riscv_vlse32_v_f32m4(a, b, c); } + static inline void vsse(float* a, ptrdiff_t b, vfloat32m4_t c, size_t d) { __riscv_vsse32(a, b, c, d); } +}; + +template<> struct rvv +{ + static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); } + static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e64m4(a); } + static inline vfloat64m4_t vfmv_v_f(double a, size_t b) { return __riscv_vfmv_v_f_f64m4(a, b); } + static inline vfloat64m1_t vfmv_s_f(double a, size_t b) { return __riscv_vfmv_s_f_f64m1(a, b); } + static inline vfloat64m4_t vle(const double* a, size_t b) { return __riscv_vle64_v_f64m4(a, b); } + static inline vfloat64m4_t vlse(const double* a, ptrdiff_t b, size_t c) { return __riscv_vlse64_v_f64m4(a, b, c); } + static inline void vsse(double* a, ptrdiff_t b, vfloat64m4_t c, size_t d) { __riscv_vsse64(a, b, c, d); } +}; + +// the algorithm is copied from core/src/matrix_decomp.cpp, +// in the function template static int cv::CholImpl +template +inline int Cholesky(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n, bool* info) +{ + int i, j, k; + double s; + src1_step /= sizeof(src1[0]); + src2_step /= sizeof(src2[0]); + + int vlmax = rvv::vsetvlmax(), vl; + for( i = 0; i < m; i++ ) + { + for( j = 0; j < i; j++ ) + { + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for( k = 0; k < j; k += vl ) + { + vl = rvv::vsetvl(j - k); + auto vec_src1 = rvv::vle(src1 + i * src1_step + k, vl); + auto vec_src2 = rvv::vle(src1 + j * src1_step + k, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl); + } + s = src1[i*src1_step + j] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + src1[i*src1_step + j] = (T)(s*src1[j*src1_step + j]); + } + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for( k = 0; k < j; k += vl ) + { + vl = rvv::vsetvl(j - k); + auto vec_src = rvv::vle(src1 + i * src1_step + k, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl); + } + s = src1[i*src1_step + i] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + if( s < std::numeric_limits::epsilon() ) + { + *info = false; + return CV_HAL_ERROR_OK; + } + src1[i*src1_step + i] = (T)(1./std::sqrt(s)); + } + + if (!src2) + { + for( i = 0; i < m; i += vl ) + { + vl = rvv::vsetvl(m - i); + auto vec_src = rvv::vlse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vl); + vec_src = __riscv_vfrdiv(vec_src, 1, vl); + rvv::vsse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vec_src, vl); + } + *info = true; + return CV_HAL_ERROR_OK; + } + + for( i = 0; i < m; i++ ) + { + for( j = 0; j < n; j++ ) + { + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for( k = 0; k < i; k += vl ) + { + vl = rvv::vsetvl(i - k); + auto vec_src1 = rvv::vle(src1 + i * src1_step + k, vl); + auto vec_src2 = rvv::vlse(src2 + k * src2_step + j, sizeof(T) * src2_step, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl); + } + s = src2[i*src2_step + j] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + src2[i*src2_step + j] = (T)(s*src1[i*src1_step + i]); + } + } + + for( i = m-1; i >= 0; i-- ) + { + for( j = 0; j < n; j++ ) + { + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for( k = i + 1; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src1 = rvv::vlse(src1 + k * src1_step + i, sizeof(T) * src1_step, vl); + auto vec_src2 = rvv::vlse(src2 + k * src2_step + j, sizeof(T) * src2_step, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl); + } + s = src2[i*src2_step + j] - __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + src2[i*src2_step + j] = (T)(s*src1[i*src1_step + i]); + } + } + for( i = 0; i < m; i += vl ) + { + vl = rvv::vsetvl(m - i); + auto vec_src = rvv::vlse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vl); + vec_src = __riscv_vfrdiv(vec_src, 1, vl); + rvv::vsse(src1 + i * src1_step + i, sizeof(T) * (src1_step + 1), vec_src, vl); + } + + *info = true; + return CV_HAL_ERROR_OK; +} + +}}} + +#endif diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/lu.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/lu.hpp new file mode 100644 index 0000000000..d9988972da --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/lu.hpp @@ -0,0 +1,189 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#ifndef OPENCV_HAL_RVV_LU_HPP_INCLUDED +#define OPENCV_HAL_RVV_LU_HPP_INCLUDED + +#include + +namespace cv { namespace cv_hal_rvv { namespace lu { + +#undef cv_hal_LU32f +#define cv_hal_LU32f cv::cv_hal_rvv::lu::LU +#undef cv_hal_LU64f +#define cv_hal_LU64f cv::cv_hal_rvv::lu::LU + +template struct rvv; + +template<> struct rvv +{ + static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); } + static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e32m4(a); } + static inline vfloat32m4_t vfmv_v_f(float a, size_t b) { return __riscv_vfmv_v_f_f32m4(a, b); } + static inline vfloat32m1_t vfmv_s_f(float a, size_t b) { return __riscv_vfmv_s_f_f32m1(a, b); } + static inline vfloat32m4_t vle(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); } + static inline vfloat32m4_t vlse(const float* a, ptrdiff_t b, size_t c) { return __riscv_vlse32_v_f32m4(a, b, c); } + static inline void vse(float* a, vfloat32m4_t b, size_t c) { __riscv_vse32(a, b, c); } +}; + +template<> struct rvv +{ + static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); } + static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e64m4(a); } + static inline vfloat64m4_t vfmv_v_f(double a, size_t b) { return __riscv_vfmv_v_f_f64m4(a, b); } + static inline vfloat64m1_t vfmv_s_f(double a, size_t b) { return __riscv_vfmv_s_f_f64m1(a, b); } + static inline vfloat64m4_t vle(const double* a, size_t b) { return __riscv_vle64_v_f64m4(a, b); } + static inline vfloat64m4_t vlse(const double* a, ptrdiff_t b, size_t c) { return __riscv_vlse64_v_f64m4(a, b, c); } + static inline void vse(double* a, vfloat64m4_t b, size_t c) { __riscv_vse64(a, b, c); } +}; + +// the algorithm is copied from core/src/matrix_decomp.cpp, +// in the function template static int cv::LUImpl +template +inline int LU(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n, int* info) +{ + T eps; + if( typeid(T) == typeid(float) ) + eps = FLT_EPSILON*10; + else if( typeid(T) == typeid(double) ) + eps = DBL_EPSILON*100; + else + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + int i, j, k, p = 1; + src1_step /= sizeof(src1[0]); + src2_step /= sizeof(src2[0]); + + int vlmax = rvv::vsetvlmax(), vl; + if( src2 ) + { + for( i = 0; i < m; i++ ) + { + k = i; + + for( j = i+1; j < m; j++ ) + if( std::abs(src1[j*src1_step + i]) > std::abs(src1[k*src1_step + i]) ) + k = j; + + if( std::abs(src1[k*src1_step + i]) < eps ) + { + *info = 0; + return CV_HAL_ERROR_OK; + } + + if( k != i ) + { + for( j = i; j < m; j += vl ) + { + vl = rvv::vsetvl(m - j); + auto vec_src1 = rvv::vle(src1 + i * src1_step + j, vl); + auto vec_src2 = rvv::vle(src1 + k * src1_step + j, vl); + rvv::vse(src1 + k * src1_step + j, vec_src1, vl); + rvv::vse(src1 + i * src1_step + j, vec_src2, vl); + } + for( j = 0; j < n; j += vl ) + { + vl = rvv::vsetvl(n - j); + auto vec_src1 = rvv::vle(src2 + i * src2_step + j, vl); + auto vec_src2 = rvv::vle(src2 + k * src2_step + j, vl); + rvv::vse(src2 + k * src2_step + j, vec_src1, vl); + rvv::vse(src2 + i * src2_step + j, vec_src2, vl); + } + p = -p; + } + + T d = -1/src1[i*src1_step + i]; + + for( j = i+1; j < m; j++ ) + { + T alpha = src1[j*src1_step + i]*d; + + for( k = i+1; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src = rvv::vle(src1 + i * src1_step + k, vl); + auto vec_dst = rvv::vle(src1 + j * src1_step + k, vl); + vec_dst = __riscv_vfmacc(vec_dst, alpha, vec_src, vl); + rvv::vse(src1 + j * src1_step + k, vec_dst, vl); + } + for( k = 0; k < n; k += vl ) + { + vl = rvv::vsetvl(n - k); + auto vec_src = rvv::vle(src2 + i * src2_step + k, vl); + auto vec_dst = rvv::vle(src2 + j * src2_step + k, vl); + vec_dst = __riscv_vfmacc(vec_dst, alpha, vec_src, vl); + rvv::vse(src2 + j * src2_step + k, vec_dst, vl); + } + } + } + + for( i = m-1; i >= 0; i-- ) + for( j = 0; j < n; j++ ) + { + T s = src2[i*src2_step + j]; + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for( k = i+1; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src1 = rvv::vle(src1 + i * src1_step + k, vl); + auto vec_src2 = rvv::vlse(src2 + k * src2_step + j, sizeof(T) * src2_step, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl); + } + s -= __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + src2[i*src2_step + j] = s/src1[i*src1_step + i]; + } + } + else + { + for( i = 0; i < m; i++ ) + { + k = i; + + for( j = i+1; j < m; j++ ) + if( std::abs(src1[j*src1_step + i]) > std::abs(src1[k*src1_step + i]) ) + k = j; + + if( std::abs(src1[k*src1_step + i]) < eps ) + { + *info = 0; + return CV_HAL_ERROR_OK; + } + + if( k != i ) + { + for( j = i; j < m; j += vl ) + { + vl = rvv::vsetvl(m - j); + auto vec_src1 = rvv::vle(src1 + i * src1_step + j, vl); + auto vec_src2 = rvv::vle(src1 + k * src1_step + j, vl); + rvv::vse(src1 + k * src1_step + j, vec_src1, vl); + rvv::vse(src1 + i * src1_step + j, vec_src2, vl); + } + p = -p; + } + + T d = -1/src1[i*src1_step + i]; + + for( j = i+1; j < m; j++ ) + { + T alpha = src1[j*src1_step + i]*d; + + for( k = i+1; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src = rvv::vle(src1 + i * src1_step + k, vl); + auto vec_dst = rvv::vle(src1 + j * src1_step + k, vl); + vec_dst = __riscv_vfmacc(vec_dst, alpha, vec_src, vl); + rvv::vse(src1 + j * src1_step + k, vec_dst, vl); + } + } + } + } + + *info = p; + return CV_HAL_ERROR_OK; +} + +}}} + +#endif diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/qr.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/qr.hpp new file mode 100644 index 0000000000..37aa0dd8b3 --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/qr.hpp @@ -0,0 +1,194 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#ifndef OPENCV_HAL_RVV_QR_HPP_INCLUDED +#define OPENCV_HAL_RVV_QR_HPP_INCLUDED + +#include + +namespace cv { namespace cv_hal_rvv { namespace qr { + +#undef cv_hal_QR32f +#define cv_hal_QR32f cv::cv_hal_rvv::qr::QR +#undef cv_hal_QR64f +#define cv_hal_QR64f cv::cv_hal_rvv::qr::QR + +template struct rvv; + +template<> struct rvv +{ + static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); } + static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e32m4(a); } + static inline vfloat32m4_t vfmv_v_f(float a, size_t b) { return __riscv_vfmv_v_f_f32m4(a, b); } + static inline vfloat32m1_t vfmv_s_f(float a, size_t b) { return __riscv_vfmv_s_f_f32m1(a, b); } + static inline vfloat32m4_t vle(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); } + static inline vfloat32m4_t vlse(const float* a, ptrdiff_t b, size_t c) { return __riscv_vlse32_v_f32m4(a, b, c); } + static inline void vse(float* a, vfloat32m4_t b, size_t c) { __riscv_vse32(a, b, c); } + static inline void vsse(float* a, ptrdiff_t b, vfloat32m4_t c, size_t d) { __riscv_vsse32(a, b, c, d); } +}; + +template<> struct rvv +{ + static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); } + static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e64m4(a); } + static inline vfloat64m4_t vfmv_v_f(double a, size_t b) { return __riscv_vfmv_v_f_f64m4(a, b); } + static inline vfloat64m1_t vfmv_s_f(double a, size_t b) { return __riscv_vfmv_s_f_f64m1(a, b); } + static inline vfloat64m4_t vle(const double* a, size_t b) { return __riscv_vle64_v_f64m4(a, b); } + static inline vfloat64m4_t vlse(const double* a, ptrdiff_t b, size_t c) { return __riscv_vlse64_v_f64m4(a, b, c); } + static inline void vse(double* a, vfloat64m4_t b, size_t c) { __riscv_vse64(a, b, c); } + static inline void vsse(double* a, ptrdiff_t b, vfloat64m4_t c, size_t d) { __riscv_vsse64(a, b, c, d); } +}; + +// the algorithm is copied from core/src/matrix_decomp.cpp, +// in the function template static int cv::QRImpl +template +inline int QR(T* src1, size_t src1_step, int m, int n, int k, T* src2, size_t src2_step, T* dst, int* info) +{ + T eps; + if (typeid(T) == typeid(float)) + eps = FLT_EPSILON*10; + else if (typeid(T) == typeid(double)) + eps = DBL_EPSILON*400; + else + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + src1_step /= sizeof(T); + src2_step /= sizeof(T); + + size_t buf_size = m ? m + n : dst != NULL; + std::vector buffer(buf_size); + T* val = buffer.data(); + if (dst == NULL) + dst = val + m; + + int vlmax = rvv::vsetvlmax(), vl; + for (int l = 0; l < n; l++) + { + //generate val + int vlSize = m - l; + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for (int i = 0; i < vlSize; i += vl) + { + vl = rvv::vsetvl(vlSize - i); + auto vec_src = rvv::vlse(src1 + (l + i) * src1_step + l, sizeof(T) * src1_step, vl); + rvv::vse(val + i, vec_src, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl); + } + T vlNorm = __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + T tmpV = val[0]; + val[0] = val[0] + (val[0] >= 0 ? 1 : -1) * std::sqrt(vlNorm); + vlNorm = std::sqrt(vlNorm + val[0] * val[0] - tmpV*tmpV); + for (int i = 0; i < vlSize; i += vl) + { + vl = rvv::vsetvl(vlSize - i); + auto vec_src = rvv::vle(val + i, vl); + vec_src = __riscv_vfdiv(vec_src, vlNorm, vl); + rvv::vse(val + i, vec_src, vl); + } + //multiply A_l*val + for (int j = l; j < n; j++) + { + vec_sum = rvv::vfmv_v_f(0, vlmax); + for (int i = l; i < m; i += vl) + { + vl = rvv::vsetvl(m - i); + auto vec_src1 = rvv::vle(val + i - l, vl); + auto vec_src2 = rvv::vlse(src1 + i * src1_step + j, sizeof(T) * src1_step, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl); + } + T v_lA = 2 * __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + + for (int i = l; i < m; i += vl) + { + vl = rvv::vsetvl(m - i); + auto vec_src1 = rvv::vle(val + i - l, vl); + auto vec_src2 = rvv::vlse(src1 + i * src1_step + j, sizeof(T) * src1_step, vl); + vec_src2 = __riscv_vfnmsac(vec_src2, v_lA, vec_src1, vl); + rvv::vsse(src1 + i * src1_step + j, sizeof(T) * src1_step, vec_src2, vl); + } + } + + //save val and factors + dst[l] = val[0] * val[0]; + for (int i = 1; i < vlSize; i += vl) + { + vl = rvv::vsetvl(vlSize - i); + auto vec_src = rvv::vle(val + i, vl); + vec_src = __riscv_vfdiv(vec_src, val[0], vl); + rvv::vsse(src1 + (l + i) * src1_step + l, sizeof(T) * src1_step, vec_src, vl); + } + } + + if (src2) + { + //generate new rhs + for (int l = 0; l < n; l++) + { + //unpack val + val[0] = (T)1; + for (int j = 1; j < m - l; j += vl) + { + vl = rvv::vsetvl(m - l - j); + auto vec_src = rvv::vlse(src1 + (j + l) * src1_step + l, sizeof(T) * src1_step, vl); + rvv::vse(val + j, vec_src, vl); + } + + //h_l*x + for (int j = 0; j < k; j++) + { + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for (int i = l; i < m; i += vl) + { + vl = rvv::vsetvl(m - i); + auto vec_src1 = rvv::vle(val + i - l, vl); + auto vec_src2 = rvv::vlse(src2 + i * src2_step + j, sizeof(T) * src2_step, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl); + } + T v_lB = 2 * dst[l] * __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + + for (int i = l; i < m; i += vl) + { + vl = rvv::vsetvl(m - i); + auto vec_src1 = rvv::vle(val + i - l, vl); + auto vec_src2 = rvv::vlse(src2 + i * src2_step + j, sizeof(T) * src2_step, vl); + vec_src2 = __riscv_vfnmsac(vec_src2, v_lB, vec_src1, vl); + rvv::vsse(src2 + i * src2_step + j, sizeof(T) * src2_step, vec_src2, vl); + } + } + } + //do back substitution + for (int i = n - 1; i >= 0; i--) + { + for (int j = n - 1; j > i; j--) + { + for (int p = 0; p < k; p += vl) + { + vl = rvv::vsetvl(k - p); + auto vec_src1 = rvv::vle(src2 + i * src2_step + p, vl); + auto vec_src2 = rvv::vle(src2 + j * src2_step + p, vl); + vec_src1 = __riscv_vfnmsac(vec_src1, src1[i*src1_step + j], vec_src2, vl); + rvv::vse(src2 + i * src2_step + p, vec_src1, vl); + } + } + if (std::abs(src1[i*src1_step + i]) < eps) + { + *info = 0; + return CV_HAL_ERROR_OK; + } + for (int p = 0; p < k; p += vl) + { + vl = rvv::vsetvl(k - p); + auto vec_src = rvv::vle(src2 + i * src2_step + p, vl); + vec_src = __riscv_vfdiv(vec_src, src1[i*src1_step + i], vl); + rvv::vse(src2 + i * src2_step + p, vec_src, vl); + } + } + } + + *info = 1; + return CV_HAL_ERROR_OK; +} + +}}} + +#endif diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/svd.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/svd.hpp new file mode 100644 index 0000000000..c2d861e02b --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/svd.hpp @@ -0,0 +1,285 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#ifndef OPENCV_HAL_RVV_SVD_HPP_INCLUDED +#define OPENCV_HAL_RVV_SVD_HPP_INCLUDED + +#include + +namespace cv { namespace cv_hal_rvv { namespace svd { + +#undef cv_hal_SVD32f +#define cv_hal_SVD32f cv::cv_hal_rvv::svd::SVD +#undef cv_hal_SVD64f +#define cv_hal_SVD64f cv::cv_hal_rvv::svd::SVD + +template struct rvv; + +template<> struct rvv +{ + static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e32m4(); } + static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e32m4(a); } + static inline vfloat32m4_t vfmv_v_f(float a, size_t b) { return __riscv_vfmv_v_f_f32m4(a, b); } + static inline vfloat32m1_t vfmv_s_f(float a, size_t b) { return __riscv_vfmv_s_f_f32m1(a, b); } + static inline vfloat32m4_t vle(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); } + static inline void vse(float* a, vfloat32m4_t b, size_t c) { __riscv_vse32(a, b, c); } +}; + +template<> struct rvv +{ + static inline size_t vsetvlmax() { return __riscv_vsetvlmax_e64m4(); } + static inline size_t vsetvl(size_t a) { return __riscv_vsetvl_e64m4(a); } + static inline vfloat64m4_t vfmv_v_f(double a, size_t b) { return __riscv_vfmv_v_f_f64m4(a, b); } + static inline vfloat64m1_t vfmv_s_f(double a, size_t b) { return __riscv_vfmv_s_f_f64m1(a, b); } + static inline vfloat64m4_t vle(const double* a, size_t b) { return __riscv_vle64_v_f64m4(a, b); } + static inline void vse(double* a, vfloat64m4_t b, size_t c) { __riscv_vse64(a, b, c); } +}; + +// the algorithm is copied from core/src/lapack.cpp, +// in the function template static void cv::JacobiSVDImpl_ +template +inline int SVD(T* src, size_t src_step, T* w, T*, size_t, T* vt, size_t vt_step, int m, int n, int flags) +{ + T minval, eps; + if( typeid(T) == typeid(float) ) + { + minval = FLT_MIN; + eps = FLT_EPSILON*2; + } + else if( typeid(T) == typeid(double) ) + { + minval = DBL_MIN; + eps = DBL_EPSILON*10; + } + else + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + int n1; + if( flags == CV_HAL_SVD_NO_UV ) + n1 = 0; + else if( flags == (CV_HAL_SVD_SHORT_UV | CV_HAL_SVD_MODIFY_A) ) + n1 = n; + else if( flags == (CV_HAL_SVD_FULL_UV | CV_HAL_SVD_MODIFY_A) ) + n1 = m; + else + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + std::vector Wbuf(n); + double* W = Wbuf.data(); + int i, j, k, iter, max_iter = std::max(m, 30); + T c, s; + double sd; + src_step /= sizeof(src[0]); + vt_step /= sizeof(vt[0]); + + int vlmax = rvv::vsetvlmax(), vl; + for( i = 0; i < n; i++ ) + { + for( k = 0, sd = 0; k < m; k++ ) + { + T t = src[i*src_step + k]; + sd += (double)t*t; + } + W[i] = sd; + + if( vt ) + { + for( k = 0; k < n; k++ ) + vt[i*vt_step + k] = 0; + vt[i*vt_step + i] = 1; + } + } + + for( iter = 0; iter < max_iter; iter++ ) + { + bool changed = false; + + for( i = 0; i < n-1; i++ ) + for( j = i+1; j < n; j++ ) + { + T *Ai = src + i*src_step, *Aj = src + j*src_step; + double a = W[i], p = 0, b = W[j]; + + auto vec_sum1 = rvv::vfmv_v_f(0, vlmax); + for( k = 0; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src1 = rvv::vle(Ai + k, vl); + auto vec_src2 = rvv::vle(Aj + k, vl); + vec_sum1 = __riscv_vfmacc_tu(vec_sum1, vec_src1, vec_src2, vl); + } + p = __riscv_vfmv_f(__riscv_vfredosum(vec_sum1, rvv::vfmv_s_f(0, vlmax), vlmax)); + + if( std::abs(p) <= eps*std::sqrt((double)a*b) ) + continue; + + p *= 2; + double beta = a - b, gamma = hypot((double)p, beta); + if( beta < 0 ) + { + double delta = (gamma - beta)*0.5; + s = (T)std::sqrt(delta/gamma); + c = (T)(p/(gamma*s*2)); + } + else + { + c = (T)std::sqrt((gamma + beta)/(gamma*2)); + s = (T)(p/(gamma*c*2)); + } + + vec_sum1 = rvv::vfmv_v_f(0, vlmax); + auto vec_sum2 = rvv::vfmv_v_f(0, vlmax); + for( k = 0; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src1 = rvv::vle(Ai + k, vl); + auto vec_src2 = rvv::vle(Aj + k, vl); + auto vec_t0 = __riscv_vfadd(__riscv_vfmul(vec_src1, c, vl), __riscv_vfmul(vec_src2, s, vl), vl); + auto vec_t1 = __riscv_vfsub(__riscv_vfmul(vec_src2, c, vl), __riscv_vfmul(vec_src1, s, vl), vl); + rvv::vse(Ai + k, vec_t0, vl); + rvv::vse(Aj + k, vec_t1, vl); + vec_sum1 = __riscv_vfmacc_tu(vec_sum1, vec_t0, vec_t0, vl); + vec_sum2 = __riscv_vfmacc_tu(vec_sum2, vec_t1, vec_t1, vl); + } + W[i] = __riscv_vfmv_f(__riscv_vfredosum(vec_sum1, rvv::vfmv_s_f(0, vlmax), vlmax)); + W[j] = __riscv_vfmv_f(__riscv_vfredosum(vec_sum2, rvv::vfmv_s_f(0, vlmax), vlmax)); + + changed = true; + + if( vt ) + { + T *Vi = vt + i*vt_step, *Vj = vt + j*vt_step; + for( k = 0; k < n; k += vl ) + { + vl = rvv::vsetvl(n - k); + auto vec_src1 = rvv::vle(Vi + k, vl); + auto vec_src2 = rvv::vle(Vj + k, vl); + auto vec_t0 = __riscv_vfadd(__riscv_vfmul(vec_src1, c, vl), __riscv_vfmul(vec_src2, s, vl), vl); + auto vec_t1 = __riscv_vfsub(__riscv_vfmul(vec_src2, c, vl), __riscv_vfmul(vec_src1, s, vl), vl); + rvv::vse(Vi + k, vec_t0, vl); + rvv::vse(Vj + k, vec_t1, vl); + } + } + } + if( !changed ) + break; + } + + for( i = 0; i < n; i++ ) + { + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for( k = 0; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src = rvv::vle(src + i * src_step + k, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src, vec_src, vl); + } + W[i] = std::sqrt(__riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax))); + } + + for( i = 0; i < n-1; i++ ) + { + j = i; + for( k = i+1; k < n; k++ ) + { + if( W[j] < W[k] ) + j = k; + } + if( i != j ) + { + std::swap(W[i], W[j]); + if( vt ) + { + for( k = 0; k < m; k++ ) + std::swap(src[i*src_step + k], src[j*src_step + k]); + + for( k = 0; k < n; k++ ) + std::swap(vt[i*vt_step + k], vt[j*vt_step + k]); + } + } + } + + for( i = 0; i < n; i++ ) + w[i] = (T)W[i]; + + if( !vt ) + return CV_HAL_ERROR_OK; + + for( i = 0; i < n1; i++ ) + { + sd = i < n ? W[i] : 0; + + for( int ii = 0; ii < 100 && sd <= minval; ii++ ) + { + // if we got a zero singular value, then in order to get the corresponding left singular vector + // we generate a random vector, project it to the previously computed left singular vectors, + // subtract the projection and normalize the difference. + const T val0 = (T)(1./m); + for( k = 0; k < m; k++ ) + { + T val = (rand() & 256) != 0 ? val0 : -val0; + src[i*src_step + k] = val; + } + for( iter = 0; iter < 2; iter++ ) + { + for( j = 0; j < i; j++ ) + { + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for( k = 0; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src1 = rvv::vle(src + i * src_step + k, vl); + auto vec_src2 = rvv::vle(src + j * src_step + k, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl); + } + sd = __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + + vec_sum = rvv::vfmv_v_f(0, vlmax); + for( k = 0; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src1 = rvv::vle(src + i * src_step + k, vl); + auto vec_src2 = rvv::vle(src + j * src_step + k, vl); + vec_src1 = __riscv_vfnmsac(vec_src1, sd, vec_src2, vl); + rvv::vse(src + i * src_step + k, vec_src1, vl); + vec_sum = __riscv_vfadd_tu(vec_sum, vec_sum, __riscv_vfabs(vec_src1, vl), vl); + } + T asum = __riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax)); + asum = asum > eps*100 ? 1/asum : 0; + for( k = 0; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src = rvv::vle(src + i * src_step + k, vl); + vec_src = __riscv_vfmul(vec_src, asum, vl); + rvv::vse(src + i * src_step + k, vec_src, vl); + } + } + } + + auto vec_sum = rvv::vfmv_v_f(0, vlmax); + for( k = 0; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src1 = rvv::vle(src + i * src_step + k, vl); + auto vec_src2 = rvv::vle(src + j * src_step + k, vl); + vec_sum = __riscv_vfmacc_tu(vec_sum, vec_src1, vec_src2, vl); + } + sd = std::sqrt(__riscv_vfmv_f(__riscv_vfredosum(vec_sum, rvv::vfmv_s_f(0, vlmax), vlmax))); + } + + s = (T)(sd > minval ? 1/sd : 0.); + for( k = 0; k < m; k += vl ) + { + vl = rvv::vsetvl(m - k); + auto vec_src = rvv::vle(src + i * src_step + k, vl); + vec_src = __riscv_vfmul(vec_src, s, vl); + rvv::vse(src + i * src_step + k, vec_src, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +}}} + +#endif