From 673e4e20f02881aff08534d17acfb6370a31363a Mon Sep 17 00:00:00 2001
From: Zhangyin
Date: Mon, 24 Aug 2020 12:10:42 +0800
Subject: [PATCH] Added RISC-V backend of universal intrinsics

---
 .../include/opencv2/core/cv_cpu_dispatch.h | 1 +
 .../core/include/opencv2/core/hal/intrin.hpp | 2 +-
 .../include/opencv2/core/hal/intrin_rvv.hpp | 4265 +++++++++++------
 platforms/linux/riscv64-gcc.toolchain.cmake | 20 +
 4 files changed, 2765 insertions(+), 1523 deletions(-)
 create mode 100644 platforms/linux/riscv64-gcc.toolchain.cmake

diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index eb3f8693c2..1827dbc7c6 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -170,6 +170,7 @@
 #if defined CV_CPU_COMPILE_RVV
 # define CV_RVV 1
+# include <riscv_vector.h>
 #endif
 
 #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 0ffcb49cea..753cd21941 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -202,7 +202,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 # undef CV_RVV
 #endif
 
-#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD) && !defined(CV_FORCE_SIMD128_CPP)
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
 #define CV__SIMD_FORWARD 128
 #include "opencv2/core/hal/intrin_forward.hpp"
 #endif
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp
index eca787c7fd..cf2d066a1e 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp
@@ -5,306 +5,2310 @@
 #ifndef OPENCV_HAL_INTRIN_RVV_HPP
 #define OPENCV_HAL_INTRIN_RVV_HPP
 
-#include <limits>
-#include <cstring>
 #include <algorithm>
-#include "opencv2/core/saturate.hpp"
-
-#define CV_SIMD128_CPP 1
-#if defined(CV_FORCE_SIMD128_CPP) || defined(CV_DOXYGEN)
-#define CV_SIMD128 1
-#define CV_SIMD128_64F 1
-#endif
 
 namespace cv
 {
 
-#ifndef CV_DOXYGEN
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
-#endif
 
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
 
-template<typename _Tp, int n> struct v_reg
+//////////// Unsupported native intrinsics in C++ ////////////
+
+struct vuint8mf2_t
 {
-    typedef _Tp lane_type;
-    enum { nlanes = n };
-
-    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
-
-    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
-
-    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
-
-    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
-          _Tp s4, _Tp s5, _Tp s6, _Tp s7)
+    uchar val[8] = {0};
+    vuint8mf2_t() {}
+    vuint8mf2_t(const uchar* ptr)
     {
-        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
-        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
-    }
-
-    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
-          _Tp s4, _Tp s5, _Tp s6, _Tp s7,
-          _Tp s8, _Tp s9, _Tp s10, _Tp s11,
-          _Tp s12, _Tp s13, _Tp s14, _Tp s15)
-    {
-        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
-        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
-        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
-        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
-    }
-
-    v_reg() {}
-
-    v_reg(const v_reg<_Tp, n> & r)
-    {
-        for( int i = 0; i < n; i++ )
-            s[i] = r.s[i];
-    }
-    _Tp get0() const { return s[0]; }
-
-    _Tp get(const int i) const { return s[i]; }
-    v_reg<_Tp, n> high() const
-    {
-        v_reg<_Tp, n> c;
-        int i;
-        for( i = 0; i
< n/2; i++ ) + for (int i = 0; i < 8; ++i) { - c.s[i] = s[i+(n/2)]; - c.s[i+(n/2)] = 0; + val[i] = ptr[i]; } - return c; } - - static v_reg<_Tp, n> zero() +}; +struct vint8mf2_t +{ + schar val[8] = {0}; + vint8mf2_t() {} + vint8mf2_t(const schar* ptr) { - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = (_Tp)0; - return c; + for (int i = 0; i < 8; ++i) + { + val[i] = ptr[i]; + } } - - static v_reg<_Tp, n> all(_Tp s) +}; +struct vuint16mf2_t +{ + ushort val[4] = {0}; + vuint16mf2_t() {} + vuint16mf2_t(const ushort* ptr) { - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = s; - return c; + for (int i = 0; i < 4; ++i) + { + val[i] = ptr[i]; + } } - - template v_reg<_Tp2, n2> reinterpret_as() const +}; +struct vint16mf2_t +{ + short val[4] = {0}; + vint16mf2_t() {} + vint16mf2_t(const short* ptr) { - size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); - v_reg<_Tp2, n2> c; - std::memcpy(&c.s[0], &s[0], bytes); - return c; + for (int i = 0; i < 4; ++i) + { + val[i] = ptr[i]; + } } - - v_reg& operator=(const v_reg<_Tp, n> & r) +}; +struct vuint32mf2_t +{ + unsigned val[2] = {0}; + vuint32mf2_t() {} + vuint32mf2_t(const unsigned* ptr) { - for( int i = 0; i < n; i++ ) - s[i] = r.s[i]; - return *this; + val[0] = ptr[0]; + val[1] = ptr[1]; + } +}; +struct vint32mf2_t +{ + int val[2] = {0}; + vint32mf2_t() {} + vint32mf2_t(const int* ptr) + { + val[0] = ptr[0]; + val[1] = ptr[1]; + } +}; +struct vfloat32mf2_t +{ + float val[2] = {0}; + vfloat32mf2_t() {} + vfloat32mf2_t(const float* ptr) + { + val[0] = ptr[0]; + val[1] = ptr[1]; + } +}; +struct vuint64mf2_t +{ + uint64 val[1] = {0}; + vuint64mf2_t() {} + vuint64mf2_t(const uint64* ptr) + { + val[0] = ptr[0]; + } +}; +struct vint64mf2_t +{ + int64 val[1] = {0}; + vint64mf2_t() {} + vint64mf2_t(const int64* ptr) + { + val[0] = ptr[0]; + } +}; +struct vfloat64mf2_t +{ + double val[1] = {0}; + vfloat64mf2_t() {} + vfloat64mf2_t(const double* ptr) + { + val[0] = ptr[0]; + } +}; +struct vuint8mf4_t +{ + uchar val[4] = {0}; + vuint8mf4_t() {} + vuint8mf4_t(const uchar* ptr) + { + for (int i = 0; i < 4; ++i) + { + val[i] = ptr[i]; + } + } +}; +struct vint8mf4_t +{ + schar val[4] = {0}; + vint8mf4_t() {} + vint8mf4_t(const schar* ptr) + { + for (int i = 0; i < 4; ++i) + { + val[i] = ptr[i]; + } } - - _Tp s[n]; }; -typedef v_reg v_uint8x16; -typedef v_reg v_int8x16; -typedef v_reg v_uint16x8; -typedef v_reg v_int16x8; -typedef v_reg v_uint32x4; -typedef v_reg v_int32x4; -typedef v_reg v_float32x4; -typedef v_reg v_float64x2; -typedef v_reg v_uint64x2; -typedef v_reg v_int64x2; - -template CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - - -template CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator|(const 
v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); - -template CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a); - - -#ifndef CV_DOXYGEN - -#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \ -__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(short, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(int, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(int64, __VA_ARGS__)) \ - -#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \ -__CV_EXPAND(macro_name(float, __VA_ARGS__)) \ -__CV_EXPAND(macro_name(double, __VA_ARGS__)) \ - -#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \ -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \ - -#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \ -template inline \ -v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \ +inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr) \ { \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return c; \ + return _Tpvec(ptr); \ } \ -template inline \ -v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v) \ { \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return a; \ + for (int i = 0; i < n; ++i) \ + { \ + ptr[i] = v.val[i]; \ + } \ } -#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1) +OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1) -CV__HAL_INTRIN_IMPL_BIN_OP(+) -CV__HAL_INTRIN_IMPL_BIN_OP(-) -CV__HAL_INTRIN_IMPL_BIN_OP(*) -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /) -#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \ -template CV_INLINE \ -v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \ +inline _Tpwvec wcvt (_Tpvec v) \ { \ - v_reg<_Tp, n> c; \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = 
V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return c; \ -} \ -template CV_INLINE \ -v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return a; \ + _wTp tmp[n]; \ + for (int i = 0; i < n; ++i) \ + { \ + tmp[i] = (_wTp)v.val[i]; \ + } \ + vsetvlmax_e##width##m1(); \ + return vle##width##_v_##suffix##m1(tmp); \ } -#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */ +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2) +OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2) +inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base) +{ + return vuint8mf4_t(base); +} +inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base) +{ + return vint8mf4_t(base); +} -CV__HAL_INTRIN_IMPL_BIT_OP(&) -CV__HAL_INTRIN_IMPL_BIT_OP(|) -CV__HAL_INTRIN_IMPL_BIT_OP(^) +inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src) +{ + ushort tmp[4]; + for (int i = 0; i < 4; ++i) + { + tmp[i] = (ushort)src.val[i]; + } + return vle16_v_u16mf2(tmp); +} +inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src) +{ + short tmp[4]; + for (int i = 0; i < 4; ++i) + { + tmp[i] = (short)src.val[i]; + } + return vle16_v_i16mf2(tmp); +} -#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \ -template CV_INLINE \ -v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ -{ \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \ - return c; \ -} \ +//////////// Types //////////// -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~) +struct v_uint8x16 +{ + typedef uchar lane_type; + enum { nlanes = 16 }; + v_uint8x16() {} + explicit v_uint8x16(vuint8m1_t v) + { + vsetvlmax_e8m1(); + vse8_v_u8m1(val, v); + } + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vuint8m1_t() const + { + vsetvlmax_e8m1(); + return vle8_v_u8m1(val); + } + uchar get0() const + { + return val[0]; + } + + uchar val[16]; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(vint8m1_t v) + { + vsetvlmax_e8m1(); + vse8_v_i8m1(val, v); + } + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, 
schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vint8m1_t() const + { + vsetvlmax_e8m1(); + return vle8_v_i8m1(val); + } + schar get0() const + { + return val[0]; + } + + schar val[16]; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + enum { nlanes = 8 }; + + v_uint16x8() {} + explicit v_uint16x8(vuint16m1_t v) + { + vsetvlmax_e16m1(); + vse16_v_u16m1(val, v); + } + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vuint16m1_t() const + { + vsetvlmax_e16m1(); + return vle16_v_u16m1(val); + } + ushort get0() const + { + return val[0]; + } + + ushort val[8]; +}; + +struct v_int16x8 +{ + typedef short lane_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(vint16m1_t v) + { + vsetvlmax_e16m1(); + vse16_v_i16m1(val, v); + } + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vint16m1_t() const + { + vsetvlmax_e16m1(); + return vle16_v_i16m1(val); + } + short get0() const + { + return val[0]; + } + + short val[8]; +}; + +struct v_uint32x4 +{ + typedef unsigned lane_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(vuint32m1_t v) + { + vsetvlmax_e32m1(); + vse32_v_u32m1(val, v); + } + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + unsigned v[] = {v0, v1, v2, v3}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vuint32m1_t() const + { + vsetvlmax_e32m1(); + return vle32_v_u32m1(val); + } + unsigned get0() const + { + return val[0]; + } + + unsigned val[4]; +}; + +struct v_int32x4 +{ + typedef int lane_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(vint32m1_t v) + { + vsetvlmax_e32m1(); + vse32_v_i32m1(val, v); + } + v_int32x4(int v0, int v1, int v2, int v3) + { + int v[] = {v0, v1, v2, v3}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vint32m1_t() const + { + vsetvlmax_e32m1(); + return vle32_v_i32m1(val); + } + int get0() const + { + return val[0]; + } + int val[4]; +}; + +struct v_float32x4 +{ + typedef float lane_type; + enum { nlanes = 4 }; + + v_float32x4() {} + explicit v_float32x4(vfloat32m1_t v) + { + vsetvlmax_e32m1(); + vse32_v_f32m1(val, v); + } + v_float32x4(float v0, float v1, float v2, float v3) + { + float v[] = {v0, v1, v2, v3}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vfloat32m1_t() const + { + vsetvlmax_e32m1(); + return vle32_v_f32m1(val); + } + float get0() const + { + return val[0]; + } + float val[4]; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + enum { nlanes = 2 }; + + v_uint64x2() {} + explicit v_uint64x2(vuint64m1_t v) + { + vsetvlmax_e64m1(); + vse64_v_u64m1(val, v); + } + v_uint64x2(uint64 v0, uint64 v1) + { + uint64 v[] = {v0, v1}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vuint64m1_t() const + { + vsetvlmax_e64m1(); + return vle64_v_u64m1(val); + } + uint64 get0() const + { + return val[0]; + } + + uint64 val[2]; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2 }; + + v_int64x2() {} + explicit 
v_int64x2(vint64m1_t v) + { + vsetvlmax_e64m1(); + vse64_v_i64m1(val, v); + } + v_int64x2(int64 v0, int64 v1) + { + int64 v[] = {v0, v1}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vint64m1_t() const + { + vsetvlmax_e64m1(); + return vle64_v_i64m1(val); + } + int64 get0() const + { + return val[0]; + } + + int64 val[2]; +}; + +#if CV_SIMD128_64F +struct v_float64x2 +{ + typedef double lane_type; + enum { nlanes = 2 }; + + v_float64x2() {} + explicit v_float64x2(vfloat64m1_t v) + { + vsetvlmax_e64m1(); + vse64_v_f64m1(val, v); + } + v_float64x2(double v0, double v1) + { + double v[] = {v0, v1}; + for (int i = 0; i < nlanes; ++i) + { + val[i] = v[i]; + } + } + operator vfloat64m1_t() const + { + vsetvlmax_e64m1(); + return vle64_v_f64m1(val); + } + double get0() const + { + return val[0]; + } + + double val[2]; +}; #endif -#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ -template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ -{ \ - v_reg<_Tp2, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cfunc(a.s[i]); \ - return c; \ -} +//////////// Initial //////////// -#define OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(func, cfunc) \ -inline v_reg func(const v_reg& a) \ +#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, width, suffix1, suffix2) \ +inline v_##_Tpvec v_setzero_##suffix1() \ { \ - v_reg c; \ - for( int i = 0; i < 4; i++ ) \ - c.s[i] = cfunc(a.s[i]); \ - return c; \ + vsetvlmax_e##width##m1(); \ + return v_##_Tpvec(vzero_##suffix2##m1()); \ } \ -inline v_reg func(const v_reg& a) \ +inline v_##_Tpvec v_setall_##suffix1(_Tp v) \ { \ - v_reg c; \ - for( int i = 0; i < 2; i++ ) \ + vsetvlmax_e##width##m1(); \ + return v_##_Tpvec(vmv_v_x_##suffix2##m1(v)); \ +} + +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, 8, u8, u8) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, 8, s8, i8) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, 16, u16, u16) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, 16, s16, i16) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, 32, u32, u32) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, 32, s32, i32) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, 64, u64, u64) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, 64, s64, i64) + +#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, width, suffix) \ +inline v_##_Tpv v_setzero_##suffix() \ +{ \ + vsetvlmax_e##width##m1(); \ + return v_##_Tpv(vzero_##suffix##m1()); \ +} \ +inline v_##_Tpv v_setall_##suffix(_Tp v) \ +{ \ + vsetvlmax_e##width##m1(); \ + return v_##_Tpv(vfmv_v_f_##suffix##m1(v)); \ +} + +OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, 32, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, 64, f64) +#endif + +//////////// Reinterpret //////////// + +#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \ +inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; } + +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64) +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64) +#endif + +#define OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(_Tpvec1, _Tpvec2, _nTpvec1, _nTpvec2, suffix1, 
suffix2, nsuffix1, nsuffix2, width1, width2) \ +inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ +{ \ + vsetvlmax_e##width2##m1(); \ + return v_##_Tpvec1((_nTpvec1)vle##width2##_v_##nsuffix2##m1(v.val)); \ +} \ +inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ +{ \ + vsetvlmax_e##width1##m1(); \ + return v_##_Tpvec2((_nTpvec2)vle##width1##_v_##nsuffix1##m1(v.val)); \ +} + +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int8x16, vuint8m1_t, vint8m1_t, u8, s8, u8, i8, 8, 8) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int16x8, vuint16m1_t, vint16m1_t, u16, s16, u16, i16, 16, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int32x4, vuint32m1_t, vint32m1_t, u32, s32, u32, i32, 32, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float32x4, vuint32m1_t, vfloat32m1_t, u32, f32, u32, f32, 32, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float32x4, vint32m1_t, vfloat32m1_t, s32, f32, i32, f32, 32, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int64x2, vuint64m1_t, vint64m1_t, u64, s64, u64, i64, 64, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint16x8, vuint8m1_t, vuint16m1_t, u8, u16, u8, u16, 8, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint32x4, vuint8m1_t, vuint32m1_t, u8, u32, u8, u32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint64x2, vuint8m1_t, vuint64m1_t, u8, u64, u8, u64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint32x4, vuint16m1_t, vuint32m1_t, u16, u32, u16, u32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint64x2, vuint16m1_t, vuint64m1_t, u16, u64, u16, u64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, uint64x2, vuint32m1_t, vuint64m1_t, u32, u64, u32, u64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int16x8, vint8m1_t, vint16m1_t, s8, s16, i8, i16, 8, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int32x4, vint8m1_t, vint32m1_t, s8, s32, i8, i32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int64x2, vint8m1_t, vint64m1_t, s8, s64, i8, i64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int32x4, vint16m1_t, vint32m1_t, s16, s32, i16, i32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int64x2, vint16m1_t, vint64m1_t, s16, s64, i16, i64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, int64x2, vint32m1_t, vint64m1_t, s32, s64, i32, i64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int16x8, vuint8m1_t, vint16m1_t, u8, s16, u8, i16, 8, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int32x4, vuint8m1_t, vint32m1_t, u8, s32, u8, i32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int64x2, vuint8m1_t, vint64m1_t, u8, s64, u8, i64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int8x16, vuint16m1_t, vint8m1_t, u16, s8, u16, i8, 16, 8) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int32x4, vuint16m1_t, vint32m1_t, u16, s32, u16, i32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int64x2, vuint16m1_t, vint64m1_t, u16, s64, u16, i64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int8x16, vuint32m1_t, vint8m1_t, u32, s8, u32, i8, 32, 8) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int16x8, vuint32m1_t, vint16m1_t, u32, s16, u32, i16, 32, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int64x2, vuint32m1_t, vint64m1_t, u32, s64, u32, i64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int8x16, vuint64m1_t, vint8m1_t, u64, s8, u64, i8, 
64, 8) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int16x8, vuint64m1_t, vint16m1_t, u64, s16, u64, i16, 64, 16) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int32x4, vuint64m1_t, vint32m1_t, u64, s32, u64, i32, 64, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float32x4, vuint8m1_t, vfloat32m1_t, u8, f32, u8, f32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float32x4, vuint16m1_t, vfloat32m1_t, u16, f32, u16, f32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float32x4, vuint64m1_t, vfloat32m1_t, u64, f32, u64, f32, 64, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float32x4, vint8m1_t, vfloat32m1_t, s8, f32, i8, f32, 8, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float32x4, vint16m1_t, vfloat32m1_t, s16, f32, i16, f32, 16, 32) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float32x4, vint64m1_t, vfloat32m1_t, s64, f32, i64, f32, 64, 32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float64x2, vuint64m1_t, vfloat64m1_t, u64, f64, u64, f64, 64, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float64x2, vint64m1_t, vfloat64m1_t, s64, f64, i64, f64, 64, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float64x2, vuint8m1_t, vfloat64m1_t, u8, f64, u8, f64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float64x2, vuint16m1_t, vfloat64m1_t, u16, f64, u16, f64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float64x2, vuint32m1_t, vfloat64m1_t, u32, f64, u32, f64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float64x2, vint8m1_t, vfloat64m1_t, s8, f64, i8, f64, 8, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float64x2, vint16m1_t, vfloat64m1_t, s16, f64, i16, f64, 16, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float64x2, vint32m1_t, vfloat64m1_t, s32, f64, i32, f64, 32, 64) +OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(float32x4, float64x2, vfloat32m1_t, vfloat64m1_t, f32, f64, f32, f64, 32, 64) +#endif + +////////////// Extract ////////////// + +#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, suffix, width, vmv) \ +template \ +inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, s), b, _Tpvec::nlanes - s)); \ +} \ +template inline _Tp v_extract_n(_Tpvec v) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tp(vmv(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), v, i))); \ +} + + +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8x16, uchar, u8, 8, vmv_x_s_u8m1_u8) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8x16, schar, i8, 8, vmv_x_s_i8m1_i8) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16x8, ushort, u16, 16, vmv_x_s_u16m1_u16) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16x8, short, i16, 16, vmv_x_s_i16m1_i16) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32x4, uint, u32, 32, vmv_x_s_u32m1_u32) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32x4, int, i32, 32, vmv_x_s_i32m1_i32) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64x2, uint64, u64, 64, vmv_x_s_u64m1_u64) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64x2, int64, i64, 64, vmv_x_s_i64m1_i64) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32x4, float, f32, 32, vfmv_f_s_f32m1_f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64x2, double, f64, 64, vfmv_f_s_f64m1_f64) +#endif + +////////////// Load/Store ////////////// + +#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, width, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ \ + vsetvlmax_e8m1(); \ + return _Tpvec((_nTpvec)vle8_v_u8m1((uchar*)ptr)); \ 
+} \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vle##width##_v_##suffix##m1(ptr)); \ +} \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ \ + vsetvl_e##width##m1(hvl); \ + _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr)); \ + vsetvlmax_e##width##m1(); \ + return res; \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ \ + vsetvlmax_e8m1(); \ + vse8_v_u8m1((uchar*)ptr, vle8_v_u8m1((uchar*)a.val)); \ +} \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(ptr, a); \ +} \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(ptr, a); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(ptr, a); \ +} \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ \ + _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(tmp_ptr, a); \ + for(int i = 0; i < _Tpvec::nlanes/2; ++i) \ { \ - c.s[i] = cfunc(a.s[i]); \ - c.s[i + 2] = 0; \ + ptr[i] = tmp_ptr[i]; \ } \ - return c; \ -} - -OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) - -OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) - -OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, - typename V_TypeTraits<_Tp>::abs_type) - -OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_round, cvRound) - -OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_floor, cvFloor) - -OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_ceil, cvCeil) - -OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_trunc, int) - -#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \ -template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +} \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ { \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cfunc(a.s[i], b.s[i]); \ - return c; \ + _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \ + vsetvlmax_e##width##m1(); \ + vse##width##_v_##suffix##m1(tmp_ptr, a); \ + for(int i = 0; i < _Tpvec::nlanes/2; ++i) \ + { \ + ptr[i] = tmp_ptr[i+_Tpvec::nlanes/2]; \ + } \ } -#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \ -template inline _Tp func(const v_reg<_Tp, n>& a) \ +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 8, u8) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 8, i8) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 16, u16) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 16, i16) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 32, u32) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 32, i32) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 64, u64) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 64, i64) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 32, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 64, f64) +#endif + +inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7], + ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], 
ptr1[7] + }; + vsetvlmax_e8m1(); + return v_int8x16(vle8_v_i8m1(elems)); +} +inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1)); } + +inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3] + }; + vsetvlmax_e16m1(); + return v_int16x8(vle16_v_i16m1(elems)); +} +inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1)); } + +inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + ptr0[0], ptr0[1], ptr1[0], ptr1[1] + }; + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(elems)); +} +inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + ptr0[0], ptr0[1], ptr1[0], ptr1[1] + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} +inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1)); } + +inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1) +{ + int64 CV_DECL_ALIGNED(32) elems[2] = + { + ptr0[0], ptr1[0] + }; + vsetvlmax_e64m1(); + return v_int64x2(vle64_v_i64m1(elems)); +} +inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1)); } + +#if CV_SIMD128_64F +inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + ptr0[0], ptr1[0] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} +#endif + + +////////////// Lookup table access //////////////////// + +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[ 0]], + tab[idx[ 1]], + tab[idx[ 2]], + tab[idx[ 3]], + tab[idx[ 4]], + tab[idx[ 5]], + tab[idx[ 6]], + tab[idx[ 7]], + tab[idx[ 8]], + tab[idx[ 9]], + tab[idx[10]], + tab[idx[11]], + tab[idx[12]], + tab[idx[13]], + tab[idx[14]], + tab[idx[15]] + }; + vsetvlmax_e8m1(); + return v_int8x16(vle8_v_i8m1(elems)); +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[4]], + tab[idx[4] + 1], + tab[idx[5]], + tab[idx[5] + 1], + tab[idx[6]], + tab[idx[6] + 1], + tab[idx[7]], + tab[idx[7] + 1] + }; + vsetvlmax_e8m1(); + return v_int8x16(vle8_v_i8m1(elems)); +} +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[2] + 2], + tab[idx[2] + 3], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[3] + 2], + tab[idx[3] + 3] + }; + vsetvlmax_e8m1(); + return v_int8x16(vle8_v_i8m1(elems)); +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return 
v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]], + tab[idx[4]], + tab[idx[5]], + tab[idx[6]], + tab[idx[7]] + }; + vsetvlmax_e16m1(); + return v_int16x8(vle16_v_i16m1(elems)); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1] + }; + vsetvlmax_e16m1(); + return v_int16x8(vle16_v_i16m1(elems)); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3] + }; + vsetvlmax_e16m1(); + return v_int16x8(vle16_v_i16m1(elems)); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(elems)); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1] + }; + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(elems)); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(tab + idx[0])); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + int64_t CV_DECL_ALIGNED(32) elems[2] = + { + tab[idx[0]], + tab[idx[1]] + }; + vsetvlmax_e64m1(); + return v_int64x2(vle64_v_i64m1(elems)); +} +inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx) +{ + vsetvlmax_e64m1(); + return v_int64x2(vle64_v_i64m1(tab + idx[0])); +} +inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1] + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) +{ + 
vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(tab + idx[0])); +} + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[v_extract_n<0>(idxvec)], + tab[v_extract_n<1>(idxvec)], + tab[v_extract_n<2>(idxvec)], + tab[v_extract_n<3>(idxvec)] + }; + vsetvlmax_e32m1(); + return v_int32x4(vle32_v_i32m1(elems)); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + unsigned CV_DECL_ALIGNED(32) elems[4] = + { + tab[v_extract_n<0>(idxvec)], + tab[v_extract_n<1>(idxvec)], + tab[v_extract_n<2>(idxvec)], + tab[v_extract_n<3>(idxvec)] + }; + vsetvlmax_e32m1(); + return v_uint32x4(vle32_v_u32m1(elems)); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[v_extract_n<0>(idxvec)], + tab[v_extract_n<1>(idxvec)], + tab[v_extract_n<2>(idxvec)], + tab[v_extract_n<3>(idxvec)] + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); + y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[idx[0]], + tab[idx[1]] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} + +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) +{ + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(tab + idx[0])); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[v_extract_n<0>(idxvec)], + tab[v_extract_n<1>(idxvec)] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4] = {0}; + v_store_aligned(idx, idxvec); + + x = v_float64x2(tab[idx[0]], tab[idx[1]]); + y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); +} +#endif + +////////////// Pack boolean //////////////////// + +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + ushort CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_store(ptr, a); + v_store(ptr + 8, b); + vsetvlmax_e8m1(); + return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr), 0)); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + unsigned CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_store(ptr, a); + v_store(ptr + 4, b); + v_store(ptr + 8, c); + v_store(ptr + 12, d); + vsetvlmax_e8m1(); + return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr), 0), 0)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + uint64 CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_store(ptr, a); + v_store(ptr + 2, b); + v_store(ptr + 4, c); + v_store(ptr + 6, d); + v_store(ptr + 8, e); + v_store(ptr + 10, f); + v_store(ptr + 12, g); + v_store(ptr + 14, h); + vsetvlmax_e8m1(); + return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr), 0), 0), 0)); +} + +////////////// Arithmetics ////////////// +#define 
OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, width) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ { \ - _Tp c = a.s[0]; \ - for( int i = 1; i < n; i++ ) \ - c = cfunc(c, a.s[i]); \ - return c; \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(intrin(a, b)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + a = _Tpvec(intrin(a, b)); \ + return a; \ } -OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 64) +#endif -OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max) -OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min) +////////////// Bitwise logic ////////////// -OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) +#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, width) \ +OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, width) \ +OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, width) \ +OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, width) \ +inline _Tpvec operator ~ (const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vnot_v_##suffix##m1(a)); \ +} + +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 8) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 8) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 16) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, 
i16, 16) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 32) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 32) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 64) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 64) + +#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \ +inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +{ \ + vsetvlmax_e32m1(); \ + return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \ +} \ +inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ +{ \ + vsetvlmax_e32m1(); \ + a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \ + return a; \ +} + +OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1) +OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1) +OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1) + +inline v_float32x4 operator ~ (const v_float32x4& a) +{ + vsetvlmax_e32m1(); + return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a)))); +} + +#if CV_SIMD128_64F +#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \ +inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +{ \ + vsetvlmax_e64m1(); \ + return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \ +} \ +inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ +{ \ + vsetvlmax_e64m1(); \ + a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \ + return a; \ +} + +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1) +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1) +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1) + +inline v_float64x2 operator ~ (const v_float64x2& a) +{ + vsetvlmax_e64m1(); + return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a)))); +} +#endif + +////////////// Bitwise shifts ////////////// + +#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, width) \ +inline _Tpvec operator << (const _Tpvec& a, int n) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +inline _Tpvec operator >> (const _Tpvec& a, int n) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +template inline _Tpvec v_shl(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +template inline _Tpvec v_shr(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \ +} + +#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, width) \ +inline _Tpvec operator << (const _Tpvec& a, int n) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +inline _Tpvec operator >> (const _Tpvec& a, int n) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +template inline _Tpvec v_shl(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \ +} \ +template inline _Tpvec v_shr(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \ +} + +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 8) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 16) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 32) 
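// Illustrative usage sketch (not taken from the patch): how the shift and bitwise
// operators defined above combine with v_load/v_store in user code. The function
// name, the 0x0FFF mask and the scalar tail handling are assumptions for this example.
inline void shift_and_mask_u16(const ushort* src, ushort* dst, int len)
{
    const v_uint16x8 mask = v_setall_u16(0x0FFF);          // splat constant (vmv.v.x)
    int i = 0;
    for (; i <= len - v_uint16x8::nlanes; i += v_uint16x8::nlanes)
    {
        v_uint16x8 v = v_load(src + i);                    // 8 x u16 wide load
        v_store(dst + i, (v << 2) & mask);                 // vsll.vx then vand.vv on RVV
    }
    for (; i < len; ++i)                                   // scalar tail
        dst[i] = (ushort)((src[i] << 2) & 0x0FFF);
}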
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 64) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 8) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 16) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 32) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 64) + + +////////////// Comparison ////////////// + +#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, width) \ +inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \ +} + +#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, width) \ +inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \ +} + +#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, width) + +#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, width) + +#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, width) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, width) + + +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8) +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16) +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32) +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64) +OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64) +#endif + +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ return a == a; } + +#if CV_SIMD128_64F +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ return a == a; } +#endif + +////////////// Min/Max ////////////// + +#define 
OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, width) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(intrin(a, b)); \ +} + +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 32) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 64) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 64) +#endif + +////////////// Arithmetics wrap ////////////// + +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 8) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 16) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 16) + +////////////// Reduce ////////////// + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, wwidth, red) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + vsetvlmax_e##wwidth##m1(); \ + _nwTpvec zero = vzero_##wsuffix##m1(); \ + _nwTpvec res = vzero_##wsuffix##m1(); \ + res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero); \ + return (scalartype)(_wTpvec(res).get0()); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 32, wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 32, wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 64, wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 64, wredsum) 
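// Illustrative usage sketch (not taken from the patch): summing a uchar buffer with
// the widening reduction defined above; v_reduce_sum(v_uint8x16) maps to vwredsumu
// on RVV and returns an unsigned scalar. The helper name and tail loop are assumptions.
inline unsigned sum_u8(const uchar* src, int len)
{
    unsigned total = 0;
    int i = 0;
    for (; i <= len - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        total += v_reduce_sum(v_load(src + i));            // 16 lanes reduced per step
    for (; i < len; ++i)                                   // scalar tail
        total += src[i];
    return total;
}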
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 32, fredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 64, redsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 64, redsum) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 64, fredsum) +#endif + + +#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, width, red) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a)); \ + return scalartype(res.get0()); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 8, redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 8, redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 16, redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 16, redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 32, redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 32, redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 32, fredmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 8, redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 8, redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 16, redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 16, redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 32, redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 32, redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 32, fredmax) + + +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + v_reduce_sum(a), + v_reduce_sum(b), + v_reduce_sum(c), + v_reduce_sum(d) + }; + vsetvlmax_e32m1(); + return v_float32x4(vle32_v_f32m1(elems)); +} + +////////////// Square-Root ////////////// + +inline v_float32x4 v_sqrt(const v_float32x4& x) +{ + vsetvlmax_e32m1(); + return v_float32x4(vfsqrt_v_f32m1(x)); +} + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + v_float32x4 one = v_setall_f32(1.0f); + return one / v_sqrt(x); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_sqrt(const v_float64x2& x) +{ + vsetvlmax_e64m1(); + return v_float64x2(vfsqrt_v_f64m1(x)); +} + +inline v_float64x2 v_invsqrt(const v_float64x2& x) +{ + v_float64x2 one = v_setall_f64(1.0f); + return one / v_sqrt(x); +} +#endif + +inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + vsetvlmax_e32m1(); + v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b)); + return v_sqrt(x); +} + +inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + vsetvlmax_e32m1(); + return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b)); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + vsetvlmax_e64m1(); + v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b)); + return v_sqrt(x); +} + +inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + vsetvlmax_e64m1(); + return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b)); +} +#endif + +////////////// Multiply-Add ////////////// + +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + vsetvlmax_e32m1(); + return v_float32x4(vfmacc_vv_f32m1(c, a, b)); +} +inline v_int32x4 
v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + vsetvlmax_e32m1(); + return v_int32x4(vmacc_vv_i32m1(c, a, b)); +} + +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + vsetvlmax_e64m1(); + return v_float64x2(vfmacc_vv_f64m1(c, a, b)); +} + +inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_fma(a, b, c); +} +#endif + +////////////// Check all/any ////////////// + +#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, width) \ +inline bool v_check_all(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a), shift)); \ + return (v.val[0] | v.val[1]) == 0; \ +} \ +inline bool v_check_any(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift)); \ + return (v.val[0] | v.val[1]) != 0; \ +} + +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 8) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 16) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 32) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 64) + + +inline bool v_check_all(const v_int8x16& a) +{ return v_check_all(v_reinterpret_as_u8(a)); } +inline bool v_check_any(const v_int8x16& a) +{ return v_check_any(v_reinterpret_as_u8(a)); } + +inline bool v_check_all(const v_int16x8& a) +{ return v_check_all(v_reinterpret_as_u16(a)); } +inline bool v_check_any(const v_int16x8& a) +{ return v_check_any(v_reinterpret_as_u16(a)); } + +inline bool v_check_all(const v_int32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_int32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } + +inline bool v_check_all(const v_float32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_float32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } + +inline bool v_check_all(const v_int64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_int64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } + +#if CV_SIMD128_64F +inline bool v_check_all(const v_float64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_float64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } +#endif + +////////////// abs ////////////// + +#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \ +inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return v_max(a, b) - v_min(a, b); \ +} + +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff) +#endif +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs) + +#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width) \ +inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b)), 0)); \ +} + +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, 
v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 8) +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 16) +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 32) + +#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \ +inline _Tprvec v_abs(const _Tpvec& a) \ +{ \ + return v_absdiff(a, v_setzero_##suffix()); \ +} + +OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8) +OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16) +OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32) +OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64) +#endif + + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \ +inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return v_reduce_sum(v_absdiff(a, b)); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float) + +////////////// Select ////////////// + +#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, width) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(merge(ne(mask, 0), b, a)); \ +} + +OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 8) +OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 8) +OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 16) +OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 16) +OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 32) +OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 32) +OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 64) +#endif + +////////////// Rotate shift ////////////// + +#define OPENCV_HAL_IMPL_RVV_ROTATE_OP(_Tpvec, suffix, width) \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslideup_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ +{ return a; } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n), b, _Tpvec::nlanes - n)); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + vsetvlmax_e##width##m1(); \ + return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), b, _Tpvec::nlanes - n), a, n)); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ +{ CV_UNUSED(b); return a; } + + +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint8x16, u8, 8) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int8x16, i8, 8) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint16x8, u16, 16) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int16x8, i16, 
16) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint32x4, u32, 32) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int32x4, i32, 32) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float32x4, f32, 32) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint64x2, u64, 64) +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int64x2, i64, 64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float64x2, f64, 64) +#endif + +////////////// Convert to float ////////////// + +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + vsetvlmax_e32m1(); + return v_float32x4(vfcvt_f_x_v_f32m1(a)); +} + +#if CV_SIMD128_64F +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + double arr[4] = {a.val[0], a.val[1], 0, 0}; + vsetvlmax_e64m2(); + vfloat64m2_t tmp = vle64_v_f64m2(arr); + vsetvlmax_e32m1(); + return v_float32x4(vfncvt_f_f_w_f32m1(tmp)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]}; + vsetvlmax_e64m2(); + vfloat64m2_t tmp = vle64_v_f64m2(arr); + vsetvlmax_e32m1(); + return v_float32x4(vfncvt_f_f_w_f32m1(tmp)); +} + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ + double CV_DECL_ALIGNED(32) ptr[4] = {0}; + vsetvlmax_e64m2(); + vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a)); + double CV_DECL_ALIGNED(32) elems[2] = + { + ptr[0], ptr[1] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} + +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ + double CV_DECL_ALIGNED(32) ptr[4] = {0}; + vsetvlmax_e64m2(); + vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a)); + double CV_DECL_ALIGNED(32) elems[2] = + { + ptr[2], ptr[3] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + double CV_DECL_ALIGNED(32) ptr[4] = {0}; + vsetvlmax_e64m2(); + vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a)); + double CV_DECL_ALIGNED(32) elems[2] = + { + ptr[0], ptr[1] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} + +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ + double CV_DECL_ALIGNED(32) ptr[4] = {0}; + vsetvlmax_e64m2(); + vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a)); + double CV_DECL_ALIGNED(32) elems[2] = + { + ptr[2], ptr[3] + }; + vsetvlmax_e64m1(); + return v_float64x2(vle64_v_f64m1(elems)); +} + +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ + vsetvlmax_e64m1(); + return v_float64x2(vfcvt_f_x_v_f64m1(a)); +} +#endif + +////////////// Broadcast ////////////// + +#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \ +template inline _Tpvec v_broadcast_element(_Tpvec v) \ +{ \ + return v_setall_##suffix(v_extract_n(v)); \ +} + +OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64) +#endif + +////////////// Transpose4x4 ////////////// + +#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \ +inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \ + const v_##_Tpvec& a2, const v_##_Tpvec& a3, \ + v_##_Tpvec& b0, v_##_Tpvec& b1, \ + v_##_Tpvec& b2, v_##_Tpvec& b3) \ +{ \ + _Tp CV_DECL_ALIGNED(32) elems0[4] = \ + { \ + v_extract_n<0>(a0), \ + v_extract_n<0>(a1), \ + v_extract_n<0>(a2), \ + v_extract_n<0>(a3) \ + 
}; \ + b0 = v_load(elems0); \ + _Tp CV_DECL_ALIGNED(32) elems1[4] = \ + { \ + v_extract_n<1>(a0), \ + v_extract_n<1>(a1), \ + v_extract_n<1>(a2), \ + v_extract_n<1>(a3) \ + }; \ + b1 = v_load(elems1); \ + _Tp CV_DECL_ALIGNED(32) elems2[4] = \ + { \ + v_extract_n<2>(a0), \ + v_extract_n<2>(a1), \ + v_extract_n<2>(a2), \ + v_extract_n<2>(a3) \ + }; \ + b2 = v_load(elems2); \ + _Tp CV_DECL_ALIGNED(32) elems3[4] = \ + { \ + v_extract_n<3>(a0), \ + v_extract_n<3>(a1), \ + v_extract_n<3>(a2), \ + v_extract_n<3>(a3) \ + }; \ + b3 = v_load(elems3); \ +} + +OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32) +OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32) + +////////////// Reverse ////////////// + +#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, width, suffix) \ +inline _Tpvec v_reverse(const _Tpvec& a) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptra[_Tpvec::nlanes] = {0}; \ + v_store(ptra, a); \ + for (int i = 0; i < _Tpvec::nlanes; i++) \ + { \ + ptr[i] = ptra[_Tpvec::nlanes-i-1]; \ + } \ + return v_load(ptr); \ +} + +OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, 8, u8) +OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, 8, i8) +OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, 16, u16) +OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, 16, i16) +OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, 32, u32) +OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, 32, i32) +OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, 32, f32) +OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, 64, u64) +OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, 64, i64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, 64, f64) +#endif + +//////////// Value reordering //////////// + +#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \ + _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \ + v_store_low(lptr, a); \ + v_store_high(hptr, a); \ + b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \ + b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \ +} \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \ + v_store_low(lptr, a); \ + return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \ +} \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \ + v_store_high(hptr, a); \ + return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \ +} \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr))); \ +} + +OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1) +OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1) +OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1) +OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1) +OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1) +OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1) + +inline v_uint32x4 v_load_expand_q(const uchar* ptr) +{ + vsetvlmax_e32m1(); + return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr)))); +} + +inline v_int32x4 v_load_expand_q(const schar* ptr) +{ + vsetvlmax_e32m1(); + return 
v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr)))); +} + + +#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, shr) \ +inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \ +{ \ + _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \ + v_store(arr, a); \ + v_store(arr + _wTpvec::nlanes, b); \ + vsetvlmax_e##width##m2(); \ + return _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0)); \ +} \ +inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \ +{ \ + _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \ + v_store(arr, a); \ + v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \ + vsetvlmax_e##width##m2(); \ + v_store(ptr, _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0))); \ +} \ +template inline \ +_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \ +{ \ + _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \ + v_store(arr, a); \ + v_store(arr + _wTpvec::nlanes, b); \ + vsetvlmax_e##width##m2(); \ + return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n)); \ +} \ +template inline \ +void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \ +{ \ + _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \ + v_store(arr, a); \ + v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \ + vsetvlmax_e##width##m2(); \ + v_store(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n))); \ +} + +OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 16, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1) +OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 16, i16, vnclip_wx_i8m1, vnclip_wx_i8m1) +OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 32, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1) +OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 32, i32, vnclip_wx_i16m1, vnclip_wx_i16m1) +OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 64, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1) +OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 64, i64, vnclip_wx_i32m1, vnsra_wx_i32m1) + + +#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, cast) \ +inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \ +{ \ + _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \ + v_store(arr, a); \ + v_store(arr + _wTpvec::nlanes, b); \ + vsetvlmax_e##width##m2(); \ + return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0)); \ +} \ +inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \ +{ \ + _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \ + v_store(arr, a); \ + v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \ + vsetvlmax_e##width##m2(); \ + v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0))); \ +} \ +template inline \ +_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \ +{ \ + _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \ + v_store(arr, a); \ + v_store(arr + _wTpvec::nlanes, b); \ + vsetvlmax_e##width##m2(); \ + return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n)); \ +} \ +template inline \ +void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \ +{ \ + _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \ + v_store(arr, a); \ + v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \ + vsetvlmax_e##width##m2(); \ + v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n))); \ +} + +OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 16, i16, vnclipu_wx_u8m1, 
vreinterpret_v_i16m2_u16m2) +OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 32, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2) + + +#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, width, suffix) \ +inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra0[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptra1[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb0[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb1[v_##_Tpvec::nlanes] = {0}; \ + v_store(ptra0, a0); \ + v_store(ptra1, a1); \ + int i; \ + for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \ + { \ + ptrb0[i*2] = ptra0[i]; \ + ptrb0[i*2+1] = ptra1[i]; \ + } \ + for( ; i < v_##_Tpvec::nlanes; i++ ) \ + { \ + ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \ + ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \ + } \ + b0 = v_load(ptrb0); \ + b1 = v_load(ptrb1); \ +} \ +inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \ + v_store_low(ptra, a); \ + v_store_low(ptrb, b); \ + return v_load_halves(ptra, ptrb); \ +} \ +inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \ + v_store_high(ptra, a); \ + v_store_high(ptrb, b); \ + return v_load_halves(ptra, ptrb); \ +} \ +inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + c = v_combine_low(a, b); \ + d = v_combine_high(a, b); \ +} + +OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, 8, u8) +OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, 8, i8) +OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, 16, u16) +OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, 16, i16) +OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, 32, u32) +OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, 32, i32) +OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, 32, f32) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, 64, f64) +#endif + + +#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \ + int i, i2; \ + for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \ + { \ + ptra[i] = ptr[i2]; \ + ptrb[i] = ptr[i2+1]; \ + } \ + a = v_load(ptra); \ + b = v_load(ptrb); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \ + int i, i3; \ + for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \ + { \ + ptra[i] = ptr[i3]; \ + ptrb[i] = ptr[i3+1]; \ + ptrc[i] = ptr[i3+2]; \ + } \ + a = v_load(ptra); \ + b = v_load(ptrb); \ + c = v_load(ptrc); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ + v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \ + int i, i4; \ + for( i = i4 = 0; i < 
v_##_Tpvec::nlanes; i++, i4 += 4 ) \ + { \ + ptra[i] = ptr[i4]; \ + ptrb[i] = ptr[i4+1]; \ + ptrc[i] = ptr[i4+2]; \ + ptrd[i] = ptr[i4+3]; \ + } \ + a = v_load(ptra); \ + b = v_load(ptrb); \ + c = v_load(ptrc); \ + d = v_load(ptrd); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + int i, i2; \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \ + v_store(ptra, a); \ + v_store(ptrb, b); \ + for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \ + { \ + ptr[i2] = ptra[i]; \ + ptr[i2+1] = ptrb[i]; \ + } \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + int i, i3; \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \ + v_store(ptra, a); \ + v_store(ptrb, b); \ + v_store(ptrc, c); \ + for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \ + { \ + ptr[i3] = ptra[i]; \ + ptr[i3+1] = ptrb[i]; \ + ptr[i3+2] = ptrc[i]; \ + } \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, const v_##_Tpvec& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + int i, i4; \ + _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \ + v_store(ptra, a); \ + v_store(ptrb, b); \ + v_store(ptrc, c); \ + v_store(ptrd, d); \ + for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \ + { \ + ptr[i4] = ptra[i]; \ + ptr[i4+1] = ptrb[i]; \ + ptr[i4+2] = ptrc[i]; \ + ptr[i4+3] = ptrd[i]; \ + } \ +} \ +inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \ + v_store(ptrvec, vec); \ + for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \ + { \ + ptr[4*i ] = ptrvec[4*i ]; \ + ptr[4*i+1] = ptrvec[4*i+2]; \ + ptr[4*i+2] = ptrvec[4*i+1]; \ + ptr[4*i+3] = ptrvec[4*i+3]; \ + } \ + return v_load(ptr); \ +} \ +inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \ +{ \ + _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \ + v_store(ptrvec, vec); \ + for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \ + { \ + ptr[8*i ] = ptrvec[4*i ]; \ + ptr[8*i+1] = ptrvec[4*i+4]; \ + ptr[8*i+2] = ptrvec[4*i+1]; \ + ptr[8*i+3] = ptrvec[4*i+5]; \ + ptr[8*i+4] = ptrvec[4*i+2]; \ + ptr[8*i+5] = ptrvec[4*i+6]; \ + ptr[8*i+6] = ptrvec[4*i+3]; \ + ptr[8*i+7] = ptrvec[4*i+7]; \ + } \ + return v_load(ptr); \ +} + +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar, u8, 8) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar, i8, 8) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort, u16, 16) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short, i16, 16) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned, u32, 32) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int, i32, 32) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float, f32, 32) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64, u64, 64) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64, i64, 64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, 
double, f64, 64) +#endif + +//////////// PopCount //////////// static const unsigned char popCountTable[] = { @@ -325,1354 +2329,571 @@ static const unsigned char popCountTable[] = 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; -template -inline v_reg::abs_type, n> v_popcount(const v_reg<_Tp, n>& a) -{ - v_reg::abs_type, n> b = v_reg::abs_type, n>::zero(); - for (int i = 0; i < n*(int)sizeof(_Tp); i++) - b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]]; - return b; -} - -template -inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval ) -{ - for( int i = 0; i < n; i++ ) - { - minval.s[i] = std::min(a.s[i], b.s[i]); - maxval.s[i] = std::max(a.s[i], b.s[i]); - } -} - -#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ -template \ -inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \ +inline _rTpvec v_popcount(const _Tpvec& a) \ { \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \ - return c; \ + uchar CV_DECL_ALIGNED(32) ptra[16] = {0}; \ + v_store(ptra, v_reinterpret_as_u8(a)); \ + _rTp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \ + v_store(ptr, v_setzero_##suffix()); \ + for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \ + ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \ + return v_load(ptr); \ } -OPENCV_HAL_IMPL_CMP_OP(<) +OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8) +OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8) +OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16) +OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16) +OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32) +OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32) +OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64) +OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64) -OPENCV_HAL_IMPL_CMP_OP(>) +//////////// SignMask //////////// -OPENCV_HAL_IMPL_CMP_OP(<=) - -OPENCV_HAL_IMPL_CMP_OP(>=) - -OPENCV_HAL_IMPL_CMP_OP(==) - -OPENCV_HAL_IMPL_CMP_OP(!=) - -template -inline v_reg v_not_nan(const v_reg& a) -{ - typedef typename V_TypeTraits::int_type itype; - v_reg c; - for (int i = 0; i < n; i++) - c.s[i] = V_TypeTraits::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i])); - return c; -} -template -inline v_reg v_not_nan(const v_reg& a) -{ - typedef typename V_TypeTraits::int_type itype; - v_reg c; - for (int i = 0; i < n; i++) - c.s[i] = V_TypeTraits::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i])); - return c; -} - -#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \ -template \ -inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, width, shift) \ +inline int v_signmask(const _Tpvec& a) \ { \ - typedef _Tp2 rtype; \ - v_reg c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \ - return c; \ + int mask = 0; \ + vsetvlmax_e##width##m1(); \ + _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift)); \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + mask |= (int)(tmp.val[i]) << i; \ + return mask; \ } 
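For context, the v_signmask() implementation above is consumed by width-agnostic kernels that rely only on the universal-intrinsic API, so the RVV backend plugs in transparently once CV_RVV is enabled. A minimal usage sketch (illustrative only, not part of this patch; countAbove is a hypothetical helper) that counts elements above a threshold with the 128-bit path plus a scalar tail:

#include "opencv2/core/hal/intrin.hpp"

// Count how many src[i] exceed thresh; vector body + scalar tail.
static int countAbove(const float* src, int n, float thresh)
{
    int i = 0, cnt = 0;
#if CV_SIMD128
    const cv::v_float32x4 vthresh = cv::v_setall_f32(thresh);
    for( ; i <= n - cv::v_float32x4::nlanes; i += cv::v_float32x4::nlanes )
    {
        // the comparison yields an all-ones/all-zeros mask per lane;
        // v_signmask() compresses that to one bit per lane
        int mask = cv::v_signmask(cv::v_load(src + i) > vthresh);
        for( int b = mask; b; b &= b - 1 )   // count set bits
            cnt++;
    }
#endif
    for( ; i < n; i++ )
        cnt += src[i] > thresh ? 1 : 0;
    return cnt;
}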
-OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 8, 7) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 16, 15) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 32, 31) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 64, 63) -OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp) +inline int v_signmask(const v_int8x16& a) +{ return v_signmask(v_reinterpret_as_u8(a)); } +inline int v_signmask(const v_int16x8& a) +{ return v_signmask(v_reinterpret_as_u16(a)); } +inline int v_signmask(const v_int32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } +inline int v_signmask(const v_float32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } +inline int v_signmask(const v_int64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } +#if CV_SIMD128_64F +inline int v_signmask(const v_float64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } +#endif -OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp) -template inline T _absdiff(T a, T b) -{ - return a > b ? a - b : b - a; -} +//////////// Scan forward //////////// -template -inline v_reg::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b) -{ - typedef typename V_TypeTraits<_Tp>::abs_type rtype; - v_reg c; - const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0); - for( int i = 0; i < n; i++ ) - { - rtype ua = a.s[i] ^ mask; - rtype ub = b.s[i] ^ mask; - c.s[i] = _absdiff(ua, ub); - } - return c; -} - -inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) -{ - v_float32x4 c; - for( int i = 0; i < c.nlanes; i++ ) - c.s[i] = _absdiff(a.s[i], b.s[i]); - return c; -} - -inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) -{ - v_float64x2 c; - for( int i = 0; i < c.nlanes; i++ ) - c.s[i] = _absdiff(a.s[i], b.s[i]); - return c; -} - -template -inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++) - c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i])); - return c; -} - -template -inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = 1.f/std::sqrt(a.s[i]); - return c; -} - -template -inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]); - return c; -} - -template -inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i]; - return c; -} - -template -inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg<_Tp, n>& c) -{ - v_reg<_Tp, n> d; - for( int i = 0; i < n; i++ ) - d.s[i] = a.s[i]*b.s[i] + c.s[i]; - return d; -} - -template -inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg<_Tp, n>& c) -{ - return v_fma(a, b, c); -} - -template inline v_reg::w_type, n/2> -v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg c; - for( int i = 0; i < (n/2); i++ ) - c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1]; - return c; -} - -template inline v_reg::w_type, n/2> -v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg::w_type, n / 2>& c) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - 
v_reg s; - for( int i = 0; i < (n/2); i++ ) - s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i]; - return s; -} - -template inline v_reg::w_type, n/2> -v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ return v_dotprod(a, b); } - -template inline v_reg::w_type, n/2> -v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg::w_type, n / 2>& c) -{ return v_dotprod(a, b, c); } - -template inline v_reg::q_type, n/4> -v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg s; - for( int i = 0; i < (n/4); i++ ) - s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + - (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3]; - return s; -} - -template inline v_reg::q_type, n/4> -v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg::q_type, n / 4>& c) -{ - typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg s; - for( int i = 0; i < (n/4); i++ ) - s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + - (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i]; - return s; -} - -template inline v_reg::q_type, n/4> -v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ return v_dotprod_expand(a, b); } - -template inline v_reg::q_type, n/4> -v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg::q_type, n / 4>& c) -{ return v_dotprod_expand(a, b, c); } - -template inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - v_reg::w_type, n/2>& c, - v_reg::w_type, n/2>& d) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = (w_type)a.s[i]*b.s[i]; - d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)]; - } -} - -template inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg<_Tp, n> c; - for (int i = 0; i < n; i++) - c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8); - return c; -} - -template inline void v_hsum(const v_reg<_Tp, n>& a, - v_reg::w_type, n/2>& c) -{ - typedef typename V_TypeTraits<_Tp>::w_type w_type; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1]; - } -} - -#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ -template inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ +#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \ +inline int v_scan_forward(const _Tpvec& a) \ { \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = (_Tp)(a.s[i] shift_op imm); \ - return c; \ + _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \ + v_store(ptr, v_reinterpret_as_##suffix(a)); \ + for (int i = 0; i < _Tpvec::nlanes; i++) \ + if(int(ptr[i]) < 0) \ + return i; \ + return 0; \ } -OPENCV_HAL_IMPL_SHIFT_OP(<< ) +OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16) +OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32) +OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32) +OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64) +#if CV_SIMD128_64F 
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64) +#endif -OPENCV_HAL_IMPL_SHIFT_OP(>> ) +//////////// Pack triplets //////////// -#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \ -template inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \ +#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, _Tp) \ +inline _Tpvec v_pack_triplets(const _Tpvec& vec) \ { \ - v_reg<_Tp, n> b; \ - for (int i = 0; i < n; i++) \ + _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \ + _Tp CV_DECL_ALIGNED(32) ptrvec[_Tpvec::nlanes] = {0}; \ + v_store(ptrvec, vec); \ + for (int i = 0; i < _Tpvec::nlanes/4; i++) \ { \ - int sIndex = i opA imm; \ - if (0 <= sIndex && sIndex < n) \ - { \ - b.s[i] = a.s[sIndex]; \ - } \ - else \ - { \ - b.s[i] = 0; \ - } \ + ptr[3*i ] = ptrvec[4*i ]; \ + ptr[3*i+1] = ptrvec[4*i+2]; \ + ptr[3*i+2] = ptrvec[4*i+2]; \ } \ - return b; \ -} \ -template inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - v_reg<_Tp, n> c; \ - for (int i = 0; i < n; i++) \ - { \ - int aIndex = i opA imm; \ - int bIndex = i opA imm opB n; \ - if (0 <= bIndex && bIndex < n) \ - { \ - c.s[i] = b.s[bIndex]; \ - } \ - else if (0 <= aIndex && aIndex < n) \ - { \ - c.s[i] = a.s[aIndex]; \ - } \ - else \ - { \ - c.s[i] = 0; \ - } \ - } \ - return c; \ + return v_load(ptr); \ } -OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8x16, uchar) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8x16, schar) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16x8, ushort) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16x8, short) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32x4, int) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float) -OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -) -template inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) +////// FP16 support /////// + +#if CV_FP16 +inline v_float32x4 v_load_expand(const float16_t* ptr) { - typename V_TypeTraits<_Tp>::sum_type c = a.s[0]; - for( int i = 1; i < n; i++ ) - c += a.s[i]; - return c; + return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr))); } -inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, - const v_float32x4& c, const v_float32x4& d) +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) { - v_float32x4 r; - r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3]; - r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3]; - r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3]; - r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3]; - return r; + vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v)); +} +#else +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + const int N = 4; + float buf[N]; + for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i]; + return v_load(buf); } -template inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) { - typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]); - for (int i = 1; i < n; i++) - c += _absdiff(a.s[i], b.s[i]); - return c; + const int N = 4; + float buf[N]; + v_store(buf, v); + for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]); } - -template inline int v_signmask(const v_reg<_Tp, n>& a) -{ - int mask = 0; - for( int i = 0; i < n; i++ ) - mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i; - return mask; -} - -template inline int 
v_scan_forward(const v_reg<_Tp, n>& a) -{ - for (int i = 0; i < n; i++) - if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) - return i; - return 0; -} - -template inline bool v_check_all(const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 ) - return false; - return true; -} - -template inline bool v_check_any(const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 ) - return true; - return false; -} - -template inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, - const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - typedef V_TypeTraits<_Tp> Traits; - typedef typename Traits::int_type int_type; - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - { - int_type m = Traits::reinterpret_int(mask.s[i]); - CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc - c.s[i] = m ? a.s[i] : b.s[i]; - } - return c; -} - -template inline void v_expand(const v_reg<_Tp, n>& a, - v_reg::w_type, n/2>& b0, - v_reg::w_type, n/2>& b1) -{ - for( int i = 0; i < (n/2); i++ ) - { - b0.s[i] = a.s[i]; - b1.s[i] = a.s[i+(n/2)]; - } -} - -template -inline v_reg::w_type, n/2> -v_expand_low(const v_reg<_Tp, n>& a) -{ - v_reg::w_type, n/2> b; - for( int i = 0; i < (n/2); i++ ) - b.s[i] = a.s[i]; - return b; -} - -template -inline v_reg::w_type, n/2> -v_expand_high(const v_reg<_Tp, n>& a) -{ - v_reg::w_type, n/2> b; - for( int i = 0; i < (n/2); i++ ) - b.s[i] = a.s[i+(n/2)]; - return b; -} - -template inline v_reg::int_type, n> - v_reinterpret_as_int(const v_reg<_Tp, n>& a) -{ - v_reg::int_type, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]); - return c; -} - -template inline v_reg::uint_type, n> - v_reinterpret_as_uint(const v_reg<_Tp, n>& a) -{ - v_reg::uint_type, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]); - return c; -} - -template inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, - v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ) -{ - int i; - for( i = 0; i < n/2; i++ ) - { - b0.s[i*2] = a0.s[i]; - b0.s[i*2+1] = a1.s[i]; - } - for( ; i < n; i++ ) - { - b1.s[i*2-n] = a0.s[i]; - b1.s[i*2-n+1] = a1.s[i]; - } -} - -template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); #endif - return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); + +////////////// Rounding ////////////// + +inline v_int32x4 v_round(const v_float32x4& a) +{ + vsetvlmax_e32m1(); + return v_int32x4(vfcvt_x_f_v_i32m1(a)); } -template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) +inline v_int32x4 v_floor(const v_float32x4& a) { - CV_Assert(isAligned::nlanes128>)>(ptr)); - return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); + v_float32x4 ZP5 = v_setall_f32(0.5f); + v_float32x4 t = a - ZP5; + vsetvlmax_e32m1(); + return v_int32x4(vfcvt_x_f_v_i32m1(t)); } -template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) +inline v_int32x4 v_ceil(const v_float32x4& a) { -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); + v_float32x4 ZP5 = v_setall_f32(0.5f); + v_float32x4 t = a + ZP5; + vsetvlmax_e32m1(); + return v_int32x4(vfcvt_x_f_v_i32m1(t)); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ + vsetvlmax_e32m1(); + return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a)); +} +#if CV_SIMD128_64F +inline v_int32x4 v_round(const v_float64x2& a) +{ + double arr[4] = {a.val[0], 
a.val[1], 0, 0}; + vsetvlmax_e64m2(); + vfloat64m2_t tmp = vle64_v_f64m2(arr); + return v_int32x4(vfncvt_x_f_w_i32m1(tmp)); +} + +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]}; + vsetvlmax_e64m2(); + vfloat64m2_t tmp = vle64_v_f64m2(arr); + return v_int32x4(vfncvt_x_f_w_i32m1(tmp)); +} + +inline v_int32x4 v_floor(const v_float64x2& a) +{ + double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0}; + vsetvlmax_e64m2(); + vfloat64m2_t tmp = vle64_v_f64m2(arr); + return v_int32x4(vfncvt_x_f_w_i32m1(tmp)); +} + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ + double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0}; + vsetvlmax_e64m2(); + vfloat64m2_t tmp = vle64_v_f64m2(arr); + return v_int32x4(vfncvt_x_f_w_i32m1(tmp)); +} + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ + double arr[4] = {a.val[0], a.val[1], 0, 0}; + vsetvlmax_e64m2(); + vfloat64m2_t tmp = vle64_v_f64m2(arr); + return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp)); +} #endif - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for( int i = 0; i < c.nlanes/2; i++ ) - { - c.s[i] = ptr[i]; - } - return c; + + +//////// Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ + int CV_DECL_ALIGNED(32) ptr[8] = {0}; + v_int32x4 t1, t2; + vsetvlmax_e32m2(); + vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b)); + v_load_deinterleave(ptr, t1, t2); + return t1 + t2; +} +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + int CV_DECL_ALIGNED(32) ptr[8] = {0}; + v_int32x4 t1, t2; + vsetvlmax_e32m2(); + vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b)); + v_load_deinterleave(ptr, t1, t2); + return t1 + t2 + c; } -template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr) +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(loptr)); - CV_Assert(isAligned(hiptr)); + int64 CV_DECL_ALIGNED(32) ptr[4] = {0}; + v_int64x2 t1, t2; + vsetvlmax_e64m2(); + vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b)); + v_load_deinterleave(ptr, t1, t2); + return t1 + t2; +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + int64 CV_DECL_ALIGNED(32) ptr[4] = {0}; + v_int64x2 t1, t2; + vsetvlmax_e64m2(); + vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b)); + v_load_deinterleave(ptr, t1, t2); + return t1 + t2 + c; +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + unsigned CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_uint32x4 t1, t2, t3, t4; + vsetvlmax_e32m4(); + vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b)); + v_load_deinterleave(ptr, t1, t2, t3, t4); + return t1 + t2 + t3 + t4; +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, + const v_uint32x4& c) +{ + unsigned CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_uint32x4 t1, t2, t3, t4; + vsetvlmax_e32m4(); + vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b)); + v_load_deinterleave(ptr, t1, t2, t3, t4); + return t1 + t2 + t3 + t4 + c; +} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + int CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_int32x4 t1, t2, t3, t4; + vsetvlmax_e32m4(); + vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b)); + v_load_deinterleave(ptr, t1, t2, t3, t4); + return t1 + t2 + t3 + t4; +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, + const v_int32x4& c) +{ + int 
CV_DECL_ALIGNED(32) ptr[16] = {0}; + v_int32x4 t1, t2, t3, t4; + vsetvlmax_e32m4(); + vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b)); + v_load_deinterleave(ptr, t1, t2, t3, t4); + return t1 + t2 + t3 + t4 + c; +} + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + uint64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + v_uint64x2 t1, t2, t3, t4; + vsetvlmax_e64m4(); + vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b)); + v_load_deinterleave(ptr, t1, t2, t3, t4); + return t1 + t2 + t3 + t4; +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ + uint64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + v_uint64x2 t1, t2, t3, t4; + vsetvlmax_e64m4(); + vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b)); + v_load_deinterleave(ptr, t1, t2, t3, t4); + return t1 + t2 + t3 + t4 + c; +} + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + int64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + v_int64x2 t1, t2, t3, t4; + vsetvlmax_e64m4(); + vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b)); + v_load_deinterleave(ptr, t1, t2, t3, t4); + return t1 + t2 + t3 + t4; +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, + const v_int64x2& c) +{ + int64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + v_int64x2 t1, t2, t3, t4; + vsetvlmax_e64m4(); + vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b)); + v_load_deinterleave(ptr, t1, t2, t3, t4); + return t1 + t2 + t3 + t4 + c; +} + +// 32 >> 64f +#if CV_SIMD128_64F +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, + const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } #endif - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for( int i = 0; i < c.nlanes/2; i++ ) - { - c.s[i] = loptr[i]; - c.s[i+c.nlanes/2] = hiptr[i]; - } - return c; + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ + int CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e32m2(); + vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b)); + v_int32x4 t1 = v_load(ptr); + v_int32x4 t2 = v_load(ptr+4); + return t1 + t2; +} +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + int CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e32m2(); + vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b)); + v_int32x4 t1 = v_load(ptr); + v_int32x4 t2 = v_load(ptr+4); + return t1 + t2 + c; } -template -inline v_reg::w_type, V_TypeTraits<_Tp>::nlanes128 / 2> -v_load_expand(const _Tp* ptr) +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) { -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); + int64 CV_DECL_ALIGNED(32) ptr[4] = {0}; + vsetvlmax_e64m2(); + vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b)); + v_int64x2 t1 = v_load(ptr); + v_int64x2 t2 = v_load(ptr+2); + return t1 + t2; +} +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + int64 CV_DECL_ALIGNED(32) ptr[4] = {0}; + vsetvlmax_e64m2(); + vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b)); + v_int64x2 t1 = v_load(ptr); + v_int64x2 t2 = v_load(ptr+2); + return t1 + t2 + c; +} + + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ + unsigned CV_DECL_ALIGNED(32) ptr[16] = {0}; + vsetvlmax_e32m4(); + vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b)); + v_uint32x4 t1 = 
v_load(ptr); + v_uint32x4 t2 = v_load(ptr+4); + v_uint32x4 t3 = v_load(ptr+8); + v_uint32x4 t4 = v_load(ptr+12); + return t1 + t2 + t3 + t4; +} +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ + unsigned CV_DECL_ALIGNED(32) ptr[16] = {0}; + vsetvlmax_e32m4(); + vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b)); + v_uint32x4 t1 = v_load(ptr); + v_uint32x4 t2 = v_load(ptr+4); + v_uint32x4 t3 = v_load(ptr+8); + v_uint32x4 t4 = v_load(ptr+12); + return t1 + t2 + t3 + t4 + c; +} +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ + int CV_DECL_ALIGNED(32) ptr[16] = {0}; + vsetvlmax_e32m4(); + vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b)); + v_int32x4 t1 = v_load(ptr); + v_int32x4 t2 = v_load(ptr+4); + v_int32x4 t3 = v_load(ptr+8); + v_int32x4 t4 = v_load(ptr+12); + return t1 + t2 + t3 + t4; +} +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ + int CV_DECL_ALIGNED(32) ptr[16] = {0}; + vsetvlmax_e32m4(); + vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b)); + v_int32x4 t1 = v_load(ptr); + v_int32x4 t2 = v_load(ptr+4); + v_int32x4 t3 = v_load(ptr+8); + v_int32x4 t4 = v_load(ptr+12); + return t1 + t2 + t3 + t4 + c; +} + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ + uint64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e64m4(); + vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b)); + v_uint64x2 t1 = v_load(ptr); + v_uint64x2 t2 = v_load(ptr+2); + v_uint64x2 t3 = v_load(ptr+4); + v_uint64x2 t4 = v_load(ptr+6); + return t1 + t2 + t3 + t4; +} +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ + uint64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e64m4(); + vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b)); + v_uint64x2 t1 = v_load(ptr); + v_uint64x2 t2 = v_load(ptr+2); + v_uint64x2 t3 = v_load(ptr+4); + v_uint64x2 t4 = v_load(ptr+6); + return t1 + t2 + t3 + t4 + c; +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ + int64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e64m4(); + vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b)); + v_int64x2 t1 = v_load(ptr); + v_int64x2 t2 = v_load(ptr+2); + v_int64x2 t3 = v_load(ptr+4); + v_int64x2 t4 = v_load(ptr+6); + return t1 + t2 + t3 + t4; +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ + int64 CV_DECL_ALIGNED(32) ptr[8] = {0}; + vsetvlmax_e64m4(); + vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b)); + v_int64x2 t1 = v_load(ptr); + v_int64x2 t2 = v_load(ptr+2); + v_int64x2 t3 = v_load(ptr+4); + v_int64x2 t4 = v_load(ptr+6); + return t1 + t2 + t3 + t4 + c; +} + +// 32 >> 64f +#if CV_SIMD128_64F +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod_fast(a, b)); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } #endif - typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg::nlanes128> c; - for( int i = 0; i < c.nlanes; i++ ) - { - c.s[i] = ptr[i]; - } - return c; -} -template -inline v_reg::q_type, V_TypeTraits<_Tp>::nlanes128 / 4> -v_load_expand_q(const _Tp* ptr) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg::nlanes128> c; - for( 
int i = 0; i < c.nlanes; i++ ) - { - c.s[i] = ptr[i]; - } - return c; -} - -template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i2; - for( i = i2 = 0; i < n; i++, i2 += 2 ) - { - a.s[i] = ptr[i2]; - b.s[i] = ptr[i2+1]; - } -} - -template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i3; - for( i = i3 = 0; i < n; i++, i3 += 3 ) - { - a.s[i] = ptr[i3]; - b.s[i] = ptr[i3+1]; - c.s[i] = ptr[i3+2]; - } -} - -template -inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, - v_reg<_Tp, n>& d) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i4; - for( i = i4 = 0; i < n; i++, i4 += 4 ) - { - a.s[i] = ptr[i4]; - b.s[i] = ptr[i4+1]; - c.s[i] = ptr[i4+2]; - d.s[i] = ptr[i4+3]; - } -} - -template -inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i2; - for( i = i2 = 0; i < n; i++, i2 += 2 ) - { - ptr[i2] = a.s[i]; - ptr[i2+1] = b.s[i]; - } -} - -template -inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i3; - for( i = i3 = 0; i < n; i++, i3 += 3 ) - { - ptr[i3] = a.s[i]; - ptr[i3+1] = b.s[i]; - ptr[i3+2] = c.s[i]; - } -} - -template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, - const v_reg<_Tp, n>& d, - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - int i, i4; - for( i = i4 = 0; i < n; i++, i4 += 4 ) - { - ptr[i4] = a.s[i]; - ptr[i4+1] = b.s[i]; - ptr[i4+2] = c.s[i]; - ptr[i4+3] = d.s[i]; - } -} - -template -inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - for( int i = 0; i < n; i++ ) - ptr[i] = a.s[i]; -} - -template -inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - v_store(ptr, a); -} - -template -inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - for( int i = 0; i < (n/2); i++ ) - ptr[i] = a.s[i]; -} - -template -inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) -{ -#if CV_STRONG_ALIGNMENT - CV_Assert(isAligned(ptr)); -#endif - for( int i = 0; i < (n/2); i++ ) - ptr[i] = a.s[i+(n/2)]; -} - -template -inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - CV_Assert(isAligned)>(ptr)); - v_store(ptr, a); -} - -template -inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - CV_Assert(isAligned)>(ptr)); - v_store(ptr, a); -} - -template -inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/) -{ - CV_Assert(isAligned)>(ptr)); - v_store(ptr, a); -} - -template -inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = a.s[i]; - c.s[i+(n/2)] = b.s[i]; - } - return c; -} - -template -inline v_reg<_Tp, 
n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = a.s[i+(n/2)]; - c.s[i+(n/2)] = b.s[i+(n/2)]; - } - return c; -} - -template -inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) -{ - for( int i = 0; i < (n/2); i++ ) - { - low.s[i] = a.s[i]; - low.s[i+(n/2)] = b.s[i]; - high.s[i] = a.s[i+(n/2)]; - high.s[i+(n/2)] = b.s[i+(n/2)]; - } -} - -template -inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = a.s[n-i-1]; - return c; -} - -template -inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> r; - const int shift = n - s; - int i = 0; - for (; i < shift; ++i) - r.s[i] = a.s[i+s]; - for (; i < n; ++i) - r.s[i] = b.s[i-shift]; - return r; -} - -template -inline _Tp v_extract_n(const v_reg<_Tp, n>& v) -{ - CV_DbgAssert(s >= 0 && s < n); - return v.s[s]; -} - -template -inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a) -{ - CV_DbgAssert(i >= 0 && i < n); - return v_reg<_Tp, n>::all(a.s[i]); -} - -template inline v_reg v_round(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvRound(a.s[i]); - return c; -} - -template inline v_reg v_round(const v_reg& a, const v_reg& b) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvRound(a.s[i]); - c.s[i+n] = cvRound(b.s[i]); - } - return c; -} - -template inline v_reg v_floor(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvFloor(a.s[i]); - return c; -} - -template inline v_reg v_ceil(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvCeil(a.s[i]); - return c; -} - -template inline v_reg v_trunc(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (int)(a.s[i]); - return c; -} - -template inline v_reg v_round(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvRound(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_floor(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvFloor(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_ceil(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvCeil(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_trunc(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvCeil(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (float)a.s[i]; - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = (float)a.s[i]; - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a, const v_reg& b) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = (float)a.s[i]; - c.s[i+n] = (float)b.s[i]; - } - return c; -} - -CV_INLINE v_reg v_cvt_f64(const v_reg& a) -{ - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) -{ - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i + 2]; - return c; -} - -CV_INLINE v_reg v_cvt_f64(const v_reg& a) -{ - enum { n = 2 }; - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) -{ 
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (double)a.s[i + 2];
-    return c;
-}
-
-CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int64, 2>& a)
-{
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<int64, 2>& a)
-{
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-
-template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
-        c.s[i] = tab[idx[i]];
-    return c;
-}
-template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
-        c.s[i] = tab[idx[i / 2] + i % 2];
-    return c;
-}
-template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
-        c.s[i] = tab[idx[i / 4] + i % 4];
-    return c;
-}
-
-template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
-{
-    v_reg<unsigned, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
-{
-    v_reg<float, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
-{
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-
-inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-
-template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
-                                               v_reg<float, n>& x, v_reg<float, n>& y)
-{
-    for( int i = 0; i < n; i++ )
-    {
-        int j = idx.s[i];
-        x.s[i] = tab[j];
-        y.s[i] = tab[j+1];
-    }
-}
-
-template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
-                                               v_reg<double, n>& x, v_reg<double, n>& y)
-{
-    for( int i = 0; i < n; i++ )
-    {
-        int j = idx.s[i];
-        x.s[i] = tab[j];
-        y.s[i] = tab[j+1];
-    }
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
-{
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n/4; i++)
-    {
-        c.s[4*i  ] = vec.s[4*i  ];
-        c.s[4*i+1] = vec.s[4*i+2];
-        c.s[4*i+2] = vec.s[4*i+1];
-        c.s[4*i+3] = vec.s[4*i+3];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
-{
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n/8; i++)
-    {
-        c.s[8*i  ] = vec.s[8*i  ];
-        c.s[8*i+1] = vec.s[8*i+4];
-        c.s[8*i+2] = vec.s[8*i+1];
-        c.s[8*i+3] = vec.s[8*i+5];
-        c.s[8*i+4] = vec.s[8*i+2];
-        c.s[8*i+5] = vec.s[8*i+6];
-        c.s[8*i+6] = vec.s[8*i+3];
-        c.s[8*i+7] = vec.s[8*i+7];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
-{
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n/4; i++)
-    {
-        c.s[3*i  ] = vec.s[4*i  ];
-        c.s[3*i+1] = vec.s[4*i+1];
-        c.s[3*i+2] = vec.s[4*i+2];
-    }
-    return c;
-}
-
-template<typename _Tp>
-inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
-                            const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
-                            v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
-                            v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
-{
-    b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
-    b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
-    b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
-    b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
-}
-
-#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
-inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
-
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
-
-#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
-inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
-
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
-
-#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
-template<typename _Tp0, int n0> inline _Tpvec \
-    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
-{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
-
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
-
-#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
-{ return a << n; }
-
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
-
-#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
-{ return a >> n; }
-
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
-
-#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
-{ \
-    _Tpvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
-
-#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
-{ \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-    { \
-        c.s[i] = cast<_Tpn>(a.s[i]); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \
-    } \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast)
-
-#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
-{ \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-    { \
-        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-    } \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
-
-#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
-{ \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        ptr[i] = cast<_Tpn>(a.s[i]); \
-}
-
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
-
-#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
-{ \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-}
-
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
-
-template<typename _Tpm, typename _Tp, int n>
-inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    for (int i = 0; i < n; ++i)
-    {
-        mptr[i] = (_Tpm)a.s[i];
-        mptr[i + n] = (_Tpm)b.s[i];
-    }
-}
-
-
-
-inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
-{
-    v_uint8x16 mask;
-    _pack_b(mask.s, a, b);
-    return mask;
-}
-
-
-inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
-                           const v_uint32x4& c, const v_uint32x4& d)
-{
-    v_uint8x16 mask;
-    _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 8, c, d);
-    return mask;
-}
-
-inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
-                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
-                           const v_uint64x2& g, const v_uint64x2& h)
-{
-    v_uint8x16 mask;
-    _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 4, c, d);
-    _pack_b(mask.s + 8, e, f);
-    _pack_b(mask.s + 12, g, h);
-    return mask;
-}
 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3)
 {
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
+    vsetvlmax_e32m1();
+    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
+    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
+    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
+    res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3);
+    return v_float32x4(res);
 }
 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2,
-                               const v_float32x4& m3)
+                               const v_float32x4& a)
 {
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
+    vsetvlmax_e32m1();
+    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
+    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
+    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
+    return v_float32x4(res) + a;
+}
+
+#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width) \
+inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
+{ \
+    _Tpw CV_DECL_ALIGNED(32) ptr[_Tpwvec::nlanes*2] = {0}; \
+    vsetvlmax_e##width##m2(); \
+    vse##width##_v_##suffix##m2(ptr, wmul(a, b)); \
+    vsetvlmax_e##width##m1(); \
+    c = _Tpwvec(vle##width##_v_##suffix##m1(ptr)); \
+    d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes)); \
+}
+
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64)
+
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    vsetvlmax_e16m1();
+    return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b), 16));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vsetvlmax_e16m1();
+    return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b), 16));
 }
-inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
-{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
-inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
-{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
+//////// Saturating Multiply ////////
-inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
-{ return v_dotprod_expand(a, b); }
-inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
-{ return v_dotprod_expand(a, b, c); }
-
-////// FP16 support ///////
-
-inline v_reg<float, V_TypeTraits<float>::nlanes128>
-v_load_expand(const float16_t* ptr)
-{
-    v_reg<float, V_TypeTraits<float>::nlanes128> v;
-    for( int i = 0; i < v.nlanes; i++ )
-    {
-        v.s[i] = ptr[i];
-    }
-    return v;
+#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
+inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _wTpvec c, d; \
+    v_mul_expand(a, b, c, d); \
+    return v_pack(c, d); \
+} \
+inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a = a * b; \
+    return a; \
 }
-inline void
-v_pack_store(float16_t* ptr, const v_reg<float, V_TypeTraits<float>::nlanes128>& v)
-{
-    for( int i = 0; i < v.nlanes; i++ )
-    {
-        ptr[i] = float16_t(v.s[i]);
-    }
-}
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)
+
 inline void v_cleanup() {}
-
-#ifndef CV_DOXYGEN
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
-#endif
+
+
 }
 
 #endif
diff --git a/platforms/linux/riscv64-gcc.toolchain.cmake b/platforms/linux/riscv64-gcc.toolchain.cmake
new file mode 100644
index 0000000000..c46d62a360
--- /dev/null
+++ b/platforms/linux/riscv64-gcc.toolchain.cmake
@@ -0,0 +1,20 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+
+set(RISCV_GCC_INSTALL_ROOT /opt/RISCV CACHE PATH "Path to GCC for RISC-V cross compiler installation directory")
+set(CMAKE_SYSROOT ${RISCV_GCC_INSTALL_ROOT}/sysroot CACHE PATH "RISC-V sysroot")
+
+set(CMAKE_C_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-unknown-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-unknown-linux-gnu-g++)
+
+# Don't run the linker on compiler check
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+
+set(CMAKE_C_FLAGS "-march=rv64gcv_zvqmac ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zvqmac ${CMAKE_CXX_FLAGS}")
+
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT})
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
\ No newline at end of file
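
Note for reviewers (not part of the patch): the sketch below shows how ordinary code written against the universal intrinsics API picks this backend up automatically once OpenCV is cross-built with the new toolchain file, e.g. `cmake -DCMAKE_TOOLCHAIN_FILE=platforms/linux/riscv64-gcc.toolchain.cmake <opencv-src>`. The helper name, array arguments and loop are made up for illustration; only the intrinsics (v_load, v_mul_expand, v_store) come from the API this patch implements. With CV_RVV enabled the v_mul_expand() call below maps to the vwmulu_vv_u16m2-based implementation added above; on x86/ARM the same source uses the SSE/NEON backends.

    // Illustrative only -- not part of this patch.
    // Widening 8-bit x 8-bit -> 16-bit multiply via the universal intrinsics API.
    #include <opencv2/core/hal/intrin.hpp>

    static void mul_expand_u8(const uchar* a, const uchar* b, ushort* dst, int len)
    {
        int i = 0;
    #if CV_SIMD128
        for (; i + cv::v_uint8x16::nlanes <= len; i += cv::v_uint8x16::nlanes)
        {
            cv::v_uint8x16 va = cv::v_load(a + i), vb = cv::v_load(b + i);
            cv::v_uint16x8 lo, hi;
            cv::v_mul_expand(va, vb, lo, hi);   // low/high halves of the widened product
            cv::v_store(dst + i, lo);
            cv::v_store(dst + i + cv::v_uint16x8::nlanes, hi);
        }
    #endif
        for (; i < len; i++)                    // scalar tail
            dst[i] = (ushort)(a[i] * b[i]);
    }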