From a59a66a2c79d138f193bda2edc3dd7fbc28a8386 Mon Sep 17 00:00:00 2001 From: Liutong HAN Date: Thu, 24 Oct 2024 15:08:43 +0800 Subject: [PATCH] Use LMUL=2 in the RISC-V Vector (RVV) FP16 part. --- .../opencv2/core/hal/intrin_rvv_scalable.hpp | 899 +++++++++--------- modules/core/src/matmul.simd.hpp | 5 +- 2 files changed, 432 insertions(+), 472 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index 7628816605..33bd4acc67 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -37,21 +37,21 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #endif -using v_uint8 = vuint8m1_t; -using v_int8 = vint8m1_t; -using v_uint16 = vuint16m1_t; -using v_int16 = vint16m1_t; -using v_uint32 = vuint32m1_t; -using v_int32 = vint32m1_t; -using v_uint64 = vuint64m1_t; -using v_int64 = vint64m1_t; +using v_uint8 = vuint8m2_t; +using v_int8 = vint8m2_t; +using v_uint16 = vuint16m2_t; +using v_int16 = vint16m2_t; +using v_uint32 = vuint32m2_t; +using v_int32 = vint32m2_t; +using v_uint64 = vuint64m2_t; +using v_int64 = vint64m2_t; #if CV_SIMD_SCALABLE_FP16 -using v_float16 = vfloat16m1_t; +using v_float16 = vfloat16m2_t; #endif -using v_float32 = vfloat32m1_t; +using v_float32 = vfloat32m2_t; #if CV_SIMD_SCALABLE_64F -using v_float64 = vfloat64m1_t; +using v_float64 = vfloat64m2_t; #endif using uchar = unsigned char; @@ -177,11 +177,11 @@ inline double v_get0(const v_float64& v) \ #define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \ inline v_##_Tpvec v_setzero_##suffix1() \ { \ - return __riscv_vmv_v_x_##suffix2##m1(0, vl); \ + return __riscv_vmv_v_x_##suffix2##m2(0, vl); \ } \ inline v_##_Tpvec v_setall_##suffix1(_Tp v) \ { \ - return __riscv_vmv_v_x_##suffix2##m1(v, vl); \ + return __riscv_vmv_v_x_##suffix2##m2(v, vl); \ } \ template <> inline v_##_Tpvec v_setzero_() \ { \ @@ -204,11 +204,11 @@ OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits::vlane #define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \ inline v_##_Tpv v_setzero_##suffix() \ { \ - return __riscv_vfmv_v_f_##suffix##m1(0, vl); \ + return __riscv_vfmv_v_f_##suffix##m2(0, vl); \ } \ inline v_##_Tpv v_setall_##suffix(_Tp v) \ { \ - return __riscv_vfmv_v_f_##suffix##m1(v, vl); \ + return __riscv_vfmv_v_f_##suffix##m2(v, vl); \ } \ template <> inline v_##_Tpv v_setzero_() \ { \ @@ -222,15 +222,23 @@ template <> inline v_##_Tpv v_setall_(_Tp v) \ #if CV_SIMD_SCALABLE_FP16 inline v_float16 v_setzero_f16() { - return __riscv_vfmv_v_f_f16m1(0, VTraits::vlanes()); + return __riscv_vfmv_v_f_f16m2(0, VTraits::vlanes()); } inline v_float16 v_setall_f16(float v) // In some cases we may use v_setall_f16(1.0f) { - return __riscv_vfmv_v_f_f16m1((_Float16)v, VTraits::vlanes()); + return __riscv_vfmv_v_f_f16m2((_Float16)v, VTraits::vlanes()); } inline v_float16 v_setall_f16(hfloat v) { - return __riscv_vfmv_v_f_f16m1((_Float16)v, VTraits::vlanes()); + return __riscv_vfmv_v_f_f16m2((_Float16)v, VTraits::vlanes()); +} +template <> inline v_float16 v_setzero_() \ +{ \ + return v_setzero_f16(); \ +} \ +template <> inline v_float16 v_setall_(hfloat v) \ +{ \ + return v_setall_f16(v); \ } #endif OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits::vlanes()) @@ -263,11 +271,11 @@ OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64) #define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \ 
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ { \ - return v_##_Tpvec1(__riscv_vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\ + return v_##_Tpvec1(__riscv_vreinterpret_v_##nsuffix2##m2_##nsuffix1##m2(v));\ } \ inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ { \ - return v_##_Tpvec2(__riscv_vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\ + return v_##_Tpvec2(__riscv_vreinterpret_v_##nsuffix1##m2_##nsuffix2##m2(v));\ } OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8) @@ -301,11 +309,11 @@ OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64) #define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \ inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ { \ - return __riscv_vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(__riscv_vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v));\ + return __riscv_vreinterpret_v_##nsuffix1##width2##m2_##nsuffix1##width1##m2(__riscv_vreinterpret_v_##nsuffix2##width2##m2_##nsuffix1##width2##m2(v));\ } \ inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ { \ - return __riscv_vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(__riscv_vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v));\ + return __riscv_vreinterpret_v_##nsuffix1##width2##m2_##nsuffix2##width2##m2(__riscv_vreinterpret_v_##nsuffix1##width1##m2_##nsuffix1##width2##m2(v));\ } OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16) @@ -345,22 +353,22 @@ OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64 #if CV_SIMD_SCALABLE_FP16 inline v_float16 v_reinterpret_as_f16(const v_float64& v) \ { \ - return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_f64m1_u64m1(v)));\ + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vreinterpret_v_u64m2_u16m2(__riscv_vreinterpret_v_f64m2_u64m2(v)));\ } inline v_float64 v_reinterpret_as_f64(const v_float16& v) \ { \ - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u16m1_u64m1(__riscv_vreinterpret_v_f16m1_u16m1(v)));\ + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vreinterpret_v_u16m2_u64m2(__riscv_vreinterpret_v_f16m2_u16m2(v)));\ } #endif inline v_float32 v_reinterpret_as_f32(const v_float64& v) \ { \ - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(v)));\ + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vreinterpret_v_u64m2_u32m2(__riscv_vreinterpret_v_f64m2_u64m2(v)));\ } inline v_float64 v_reinterpret_as_f64(const v_float32& v) \ { \ - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(v)));\ + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vreinterpret_v_u32m2_u64m2(__riscv_vreinterpret_v_f32m2_u32m2(v)));\ } #endif @@ -432,23 +440,23 @@ OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits::vlanes()) #define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix) \ inline _Tpvec v_load(const _Tp* ptr) \ { \ - return __riscv_vle##width##_v_##suffix##m1(ptr, vl); \ + return __riscv_vle##width##_v_##suffix##m2(ptr, vl); \ } \ inline _Tpvec v_load_aligned(const _Tp* ptr) \ { \ - return __riscv_vle##width##_v_##suffix##m1(ptr, vl); \ + return __riscv_vle##width##_v_##suffix##m2(ptr, vl); \ } \ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode 
/*mode*/) \ { \ - __riscv_vse##width##_v_##suffix##m1(ptr, a, vl); \ + __riscv_vse##width##_v_##suffix##m2(ptr, a, vl); \ } \ inline _Tpvec v_load_low(const _Tp* ptr) \ { \ - return __riscv_vle##width##_v_##suffix##m1(ptr, hvl); \ + return __riscv_vle##width##_v_##suffix##m2(ptr, hvl); \ } \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { \ - return __riscv_vslideup(__riscv_vle##width##_v_##suffix##m1(ptr0, hvl), __riscv_vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); \ + return __riscv_vslideup(__riscv_vle##width##_v_##suffix##m2(ptr0, hvl), __riscv_vle##width##_v_##suffix##m2(ptr1, hvl), hvl, vl); \ } \ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { \ @@ -468,7 +476,7 @@ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ { \ - __riscv_vse##width(ptr, __riscv_vslidedown_vx_##suffix##m1(a, hvl, vl), hvl); \ + __riscv_vse##width(ptr, __riscv_vslidedown_vx_##suffix##m2(a, hvl, vl), hvl); \ } \ template \ _Tpvec v_load_##suffix(Targs... nScalars) \ @@ -479,23 +487,23 @@ _Tpvec v_load_##suffix(Targs... nScalars) \ #define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP_FP16(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix) \ inline _Tpvec v_load(const _Tp* ptr) \ { \ - return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, vl); \ + return __riscv_vle##width##_v_##suffix##m2((_Float16*)ptr, vl); \ } \ inline _Tpvec v_load_aligned(const _Tp* ptr) \ { \ - return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, vl); \ + return __riscv_vle##width##_v_##suffix##m2((_Float16*)ptr, vl); \ } \ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ { \ - __riscv_vse##width##_v_##suffix##m1((_Float16*)ptr, a, vl); \ + __riscv_vse##width##_v_##suffix##m2((_Float16*)ptr, a, vl); \ } \ inline _Tpvec v_load_low(const _Tp* ptr) \ { \ - return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, hvl); \ + return __riscv_vle##width##_v_##suffix##m2((_Float16*)ptr, hvl); \ } \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { \ - return __riscv_vslideup(__riscv_vle##width##_v_##suffix##m1((_Float16*)ptr0, hvl), __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr1, hvl), hvl, vl); \ + return __riscv_vslideup(__riscv_vle##width##_v_##suffix##m2((_Float16*)ptr0, hvl), __riscv_vle##width##_v_##suffix##m2((_Float16*)ptr1, hvl), hvl, vl); \ } \ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { \ @@ -515,24 +523,24 @@ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ { \ - __riscv_vse##width((_Float16*)ptr, __riscv_vslidedown_vx_##suffix##m1(a, hvl, vl), hvl); \ + __riscv_vse##width((_Float16*)ptr, __riscv_vslidedown_vx_##suffix##m2(a, hvl, vl), hvl); \ } -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, u8) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, i8) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits::vlanes() / 2, VTraits::vlanes(), 16, u16) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits::vlanes() / 2, VTraits::vlanes(), 16, i16) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits::vlanes() / 2, VTraits::vlanes(), 32, u32) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits::vlanes() / 2, VTraits::vlanes(), 32, i32) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, u64) 
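For context, not part of the patch itself: retyping the universal-intrinsic vectors from the *m1 to the *m2 RVV types means each v_xxx value now occupies a register group of two vector registers, the lane count reported by VTraits doubles, and every intrinsic call has to name the matching m2 variant. A minimal sketch of what the retargeted v_load/v_store pair boils down to for v_float32, assuming <riscv_vector.h> (RVV 1.0 intrinsics) and any VLEN >= 128; the helper names are illustrative, not OpenCV code:

#include <riscv_vector.h>
#include <cstddef>

using v_float32_sketch = vfloat32m2_t;            // LMUL=2 register group, as in the patch

static inline size_t vlanes_f32()                 // stands in for VTraits<v_float32>::vlanes()
{
    return __riscv_vsetvlmax_e32m2();             // 8 lanes at VLEN=128, 16 at VLEN=256, ...
}

static inline v_float32_sketch v_load_sketch(const float* ptr)
{
    return __riscv_vle32_v_f32m2(ptr, vlanes_f32());   // unit-stride load into an m2 group
}

static inline void v_store_sketch(float* ptr, v_float32_sketch a)
{
    __riscv_vse32_v_f32m2(ptr, a, vlanes_f32());       // unit-stride store of the m2 group
}

With VLEN=128 this gives 8 float lanes per vector instead of 4 (VLEN * LMUL / SEW = 128 * 2 / 32), which is the point of the change.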
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, i64) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m2_t, uchar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, u8) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m2_t, schar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, i8) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m2_t, ushort, VTraits::vlanes() / 2, VTraits::vlanes(), 16, u16) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m2_t, short, VTraits::vlanes() / 2, VTraits::vlanes(), 16, i16) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m2_t, unsigned int, VTraits::vlanes() / 2, VTraits::vlanes(), 32, u32) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m2_t, int, VTraits::vlanes() / 2, VTraits::vlanes(), 32, i32) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m2_t, uint64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, u64) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m2_t, int64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, i64) #if CV_SIMD_SCALABLE_FP16 -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP_FP16(v_float16, vfloat16m1_t, hfloat, VTraits::vlanes() /2 , VTraits::vlanes(), 16, f16) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP_FP16(v_float16, vfloat16m2_t, hfloat, VTraits::vlanes() /2 , VTraits::vlanes(), 16, f16) #endif -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits::vlanes() /2 , VTraits::vlanes(), 32, f32) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m2_t, float, VTraits::vlanes() /2 , VTraits::vlanes(), 32, f32) #if CV_SIMD_SCALABLE_64F -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits::vlanes() / 2, VTraits::vlanes(), 64, f64) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m2_t, double, VTraits::vlanes() / 2, VTraits::vlanes(), 64, f64) #endif ////////////// Lookup table access //////////////////// @@ -542,22 +550,22 @@ inline _Tpvec v_lut(const _Tp* tab, const int* idx) \ auto vidx = __riscv_vmul(__riscv_vreinterpret_u32##suffix(__riscv_vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ } +OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m8) #define OPENCV_HAL_IMPL_RVV_LUT_FP16(_Tpvec, _Tp, suffix) \ inline _Tpvec v_lut(const _Tp* tab, const int* idx) \ { \ auto vidx = __riscv_vmul(__riscv_vreinterpret_u32##suffix(__riscv_vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ return __riscv_vloxei32((_Float16*)tab, vidx, VTraits<_Tpvec>::vlanes()); \ } -OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4) -OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2) -OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1) -OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2) +OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m4) +OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m2) +OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, m1) #if CV_SIMD_SCALABLE_FP16 -OPENCV_HAL_IMPL_RVV_LUT_FP16(v_float16, hfloat, m2) +OPENCV_HAL_IMPL_RVV_LUT_FP16(v_float16, hfloat, m4) #endif -OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1) +OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m2) #if CV_SIMD_SCALABLE_64F -OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2) +OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, m1) #endif #define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, _TpCast, suffix1, suffix2, v_trunc) \ @@ -572,16 +580,16 @@ inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \ auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ return __riscv_vloxei32((_TpCast *)tab, vidx, 
VTraits<_Tpvec>::vlanes()); \ } -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, schar, m2, m4, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, short, m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, schar, m4, m8, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, short, m2, m4, OPENCV_HAL_NOP) #if CV_SIMD_SCALABLE_FP16 -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float16, hfloat, _Float16, m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float16, hfloat, _Float16, m2, m4, OPENCV_HAL_NOP) #endif -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, int, mf2, m1, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, float, mf2, m1, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, int64_t, mf2, m1, __riscv_vlmul_trunc_u32mf2) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, int, m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, float, m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, int64_t, m1, m2, __riscv_vlmul_trunc_u32m1) #if CV_SIMD_SCALABLE_64F -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, double, mf2, m1, __riscv_vlmul_trunc_u32mf2) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, double, m1, m2, __riscv_vlmul_trunc_u32m1) #endif @@ -607,18 +615,18 @@ inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \ auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ return __riscv_vloxei32((_TpCast *)tab, vidx, VTraits<_Tpvec>::vlanes()); \ } -OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, schar, m1, m2, m4, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, short, mf2 , m1, m2, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, int, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, schar, m2, m4, m8, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, short, m1 , m2, m4, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, int, m1, m2, m2, __riscv_vlmul_trunc_u32m1) #if CV_SIMD_SCALABLE_FP16 -OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float16, hfloat, _Float16, mf2 , m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float16, hfloat, _Float16, m1 , m2, m4, OPENCV_HAL_NOP) #endif -OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, float, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, float, m1, m2, m2, __riscv_vlmul_trunc_u32m1) #define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \ inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \ { \ - v_uint32 vidx_ = __riscv_vmul(__riscv_vreinterpret_u32m1(vidx), sizeof(_Tp), VTraits::vlanes()); \ + v_uint32 vidx_ = __riscv_vmul(__riscv_vreinterpret_u32m2(vidx), sizeof(_Tp), VTraits::vlanes()); \ return __riscv_vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \ } OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float) @@ -628,7 +636,7 @@ OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned) #if CV_SIMD_SCALABLE_64F inline v_float64 v_lut(const double* tab, const v_int32& vidx) \ { \ - vuint32mf2_t vidx_ = __riscv_vmul(__riscv_vlmul_trunc_u32mf2(__riscv_vreinterpret_u32m1(vidx)), sizeof(double), VTraits::vlanes()); \ + vuint32m1_t vidx_ = __riscv_vmul(__riscv_vlmul_trunc_u32m1(__riscv_vreinterpret_u32m2(vidx)), sizeof(double), VTraits::vlanes()); \ return __riscv_vloxei32(tab, vidx_, VTraits::vlanes()); \ } #endif @@ -649,24 +657,24 @@ inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinte ////////////// Pack boolean //////////////////// inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b) { - 
return __riscv_vnsrl(__riscv_vset(__riscv_vlmul_ext_v_u16m1_u16m2(a),1,b), 0, VTraits::vlanes()); + return __riscv_vnsrl(__riscv_vset(__riscv_vlmul_ext_v_u16m2_u16m4(a),1,b), 0, VTraits::vlanes()); } inline v_uint8 v_pack_b(const v_uint32& a, const v_uint32& b, const v_uint32& c, const v_uint32& d) { - return __riscv_vnsrl(__riscv_vnsrl(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vlmul_ext_u32m4(a),1,b),2,c),3,d), 0, VTraits::vlanes()), 0, VTraits::vlanes()); + return __riscv_vnsrl(__riscv_vnsrl(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vlmul_ext_u32m8(a),1,b),2,c),3,d), 0, VTraits::vlanes()), 0, VTraits::vlanes()); } inline v_uint8 v_pack_b(const v_uint64& a, const v_uint64& b, const v_uint64& c, const v_uint64& d, const v_uint64& e, const v_uint64& f, const v_uint64& g, const v_uint64& h) { - return __riscv_vnsrl(__riscv_vnsrl(__riscv_vnsrl( - __riscv_vset(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vlmul_ext_u64m8(a), - 1,b),2,c),3,d),4,e),5,f),6,g),7,h), - 0, VTraits::vlanes()), 0, VTraits::vlanes()), 0, VTraits::vlanes()); + vuint8m1_t t0 = __riscv_vnsrl(__riscv_vnsrl(__riscv_vnsrl(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vlmul_ext_u64m8(a),1,b),2,c),3,d), 0, VTraits::vlanes()), 0, VTraits::vlanes()), 0, VTraits::vlanes()); + vuint8m1_t t1 = __riscv_vnsrl(__riscv_vnsrl(__riscv_vnsrl(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vlmul_ext_u64m8(e),1,f),2,g),3,h), 0, VTraits::vlanes()), 0, VTraits::vlanes()), 0, VTraits::vlanes()); + + return __riscv_vset(__riscv_vlmul_ext_u8m2(t0), 1, t1); } ////////////// Arithmetics ////////////// @@ -748,15 +756,15 @@ OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, __riscv_vfmul) inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \ { \ _TpwvecM2 temp = wmul(a, b, VTraits<_Tpvec>::vlanes()); \ - c = __riscv_vget_##suffix##m1(temp, 0); \ - d = __riscv_vget_##suffix##m1(temp, 1); \ + c = __riscv_vget_##suffix##m2(temp, 0); \ + d = __riscv_vget_##suffix##m2(temp, 1); \ } -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8, v_uint16, vuint16m2_t, u16, __riscv_vwmulu) -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m2_t, i16, __riscv_vwmul) -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m2_t, u32, __riscv_vwmulu) -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m2_t, i32, __riscv_vwmul) -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m2_t, u64, __riscv_vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8, v_uint16, vuint16m4_t, u16, __riscv_vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m4_t, i16, __riscv_vwmul) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m4_t, u32, __riscv_vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m4_t, i32, __riscv_vwmul) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m4_t, u64, __riscv_vwmulu) inline v_int16 v_mul_hi(const v_int16& a, const v_int16& b) { @@ -830,7 +838,7 @@ OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits::vlanes()) #define OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(intrin) \ inline v_float16 intrin (const v_float16& a, const v_float16& b) \ { \ - return __riscv_vreinterpret_f16m1(intrin(__riscv_vreinterpret_i16m1(a), __riscv_vreinterpret_i16m1(b))); \ + return __riscv_vreinterpret_f16m2(intrin(__riscv_vreinterpret_i16m2(a), __riscv_vreinterpret_i16m2(b))); \ } OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_and) OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_or) @@ -838,14 +846,14 @@ OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_xor) inline v_float16 v_not (const v_float16& a) \ { \ - return 
__riscv_vreinterpret_f16m1(v_not(__riscv_vreinterpret_i16m1(a))); \ + return __riscv_vreinterpret_f16m2(v_not(__riscv_vreinterpret_i16m2(a))); \ } #endif #define OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(intrin) \ inline v_float32 intrin (const v_float32& a, const v_float32& b) \ { \ - return __riscv_vreinterpret_f32m1(intrin(__riscv_vreinterpret_i32m1(a), __riscv_vreinterpret_i32m1(b))); \ + return __riscv_vreinterpret_f32m2(intrin(__riscv_vreinterpret_i32m2(a), __riscv_vreinterpret_i32m2(b))); \ } OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_and) OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_or) @@ -853,14 +861,14 @@ OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_xor) inline v_float32 v_not (const v_float32& a) \ { \ - return __riscv_vreinterpret_f32m1(v_not(__riscv_vreinterpret_i32m1(a))); \ + return __riscv_vreinterpret_f32m2(v_not(__riscv_vreinterpret_i32m2(a))); \ } #if CV_SIMD_SCALABLE_64F #define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(intrin) \ inline v_float64 intrin (const v_float64& a, const v_float64& b) \ { \ - return __riscv_vreinterpret_f64m1(intrin(__riscv_vreinterpret_i64m1(a), __riscv_vreinterpret_i64m1(b))); \ + return __riscv_vreinterpret_f64m2(intrin(__riscv_vreinterpret_i64m2(a), __riscv_vreinterpret_i64m2(b))); \ } OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_and) OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_or) @@ -868,7 +876,7 @@ OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_xor) inline v_float64 v_not (const v_float64& a) \ { \ - return __riscv_vreinterpret_f64m1(v_not(__riscv_vreinterpret_i64m1(a))); \ + return __riscv_vreinterpret_f64m2(v_not(__riscv_vreinterpret_i64m2(a))); \ } #endif @@ -912,7 +920,7 @@ inline _Tpvec v_##op(const _Tpvec& a, const _Tpvec& b) \ { \ size_t VLEN = VTraits<_Tpvec>::vlanes(); \ uint64_t ones = -1; \ - return __riscv_vmerge(__riscv_vmv_v_x_##suffix##m1(0, VLEN), ones, intrin(a, b, VLEN), VLEN); \ + return __riscv_vmerge(__riscv_vmv_v_x_##suffix##m2(0, VLEN), ones, intrin(a, b, VLEN), VLEN); \ } #define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix) \ @@ -922,7 +930,7 @@ inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \ union { uint64_t u; VTraits<_Tpvec>::lane_type d; } ones; \ ones.u = -1; \ auto diff = intrin(a, b, VLEN); \ - auto z = __riscv_vfmv_v_f_##suffix##m1(0, VLEN); \ + auto z = __riscv_vfmv_v_f_##suffix##m2(0, VLEN); \ auto res = __riscv_vfmerge(z, ones.d, diff, VLEN); \ return _Tpvec(res); \ } //TODO @@ -934,7 +942,7 @@ inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \ union { uint64_t u; _Float16 d; } ones; \ ones.u = -1; \ auto diff = intrin(a, b, VLEN); \ - auto z = __riscv_vfmv_v_f_##suffix##m1(0, VLEN); \ + auto z = __riscv_vfmv_v_f_##suffix##m2(0, VLEN); \ auto res = __riscv_vfmerge(z, ones.d, diff, VLEN); \ return _Tpvec(res); \ } //TODO @@ -1035,77 +1043,18 @@ OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, __riscv_vfmax, VTraits #define OPENCV_HAL_IMPL_RVV_ZIP4(_Tpvec, _wTpvec, suffix, convert2u, convert) \ inline void v_zip4(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \ int vl = 4; \ - _wTpvec temp = __riscv_vreinterpret_##suffix##m2(convert2u( \ + _wTpvec temp = __riscv_vreinterpret_##suffix##m4(convert2u( \ __riscv_vor(__riscv_vzext_vf2(convert(a0), vl), \ - __riscv_vreinterpret_u64m2(__riscv_vslide1up(__riscv_vreinterpret_u32m2(__riscv_vzext_vf2(convert(a1), vl)), 0, vl*2)), \ + __riscv_vreinterpret_u64m4(__riscv_vslide1up(__riscv_vreinterpret_u32m4(__riscv_vzext_vf2(convert(a1), vl)), 0, vl*2)), \ vl))); \ - b0 = __riscv_vget_##suffix##m1(temp, 0); \ - b1 = __riscv_vget_##suffix##m1(__riscv_vrgather(temp, 
__riscv_vadd(__riscv_vid_v_u32m2(vl), 4, vl)/*{4,5,6,7} */, vl) ,0); \ + b0 = __riscv_vget_##suffix##m2(temp, 0); \ + b1 = __riscv_vget_##suffix##m2(__riscv_vrgather(temp, __riscv_vadd(__riscv_vid_v_u32m4(vl), 4, vl)/*{4,5,6,7} */, vl) ,0); \ } -OPENCV_HAL_IMPL_RVV_ZIP4(v_uint32, vuint32m2_t, u32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_ZIP4(v_int32, vint32m2_t, i32, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1) -OPENCV_HAL_IMPL_RVV_ZIP4(v_float32, vfloat32m2_t, f32, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1) +OPENCV_HAL_IMPL_RVV_ZIP4(v_uint32, vuint32m4_t, u32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_ZIP4(v_int32, vint32m4_t, i32, __riscv_vreinterpret_u32m4, __riscv_vreinterpret_u32m2) +OPENCV_HAL_IMPL_RVV_ZIP4(v_float32, vfloat32m4_t, f32, __riscv_vreinterpret_u32m4, __riscv_vreinterpret_u32m2) -#if 0 -// this is v_zip4 and v_tranpose4x4 for scalable VLEN, costs more instruction than current 128-bit only version. -inline void v_zip4(const v_float32& a0, const v_float32& a1, v_float32& b0, v_float32& b1) { - vuint64m1_t vid1 = __riscv_vid_v_u64m1(VTraits::vlanes()); - vuint16m1_t t1 = __riscv_vreinterpret_u16m1(vid1); - vuint16m1_t t2 = __riscv_vslide1up(t1, 0, VTraits::vlanes()); - vuint16m1_t t3 = __riscv_vslide1up(t2, 0, VTraits::vlanes()); - vuint16m1_t t4 = __riscv_vslide1up(t3, 0, VTraits::vlanes()); - t1 = __riscv_vor( - __riscv_vor(t1, t2, VTraits::vlanes()), - __riscv_vor(t3, t4, VTraits::vlanes()), - VTraits::vlanes() - ); - vuint32m2_t vidx0 = __riscv_vwmulu(t1, 4, VTraits::vlanes()); - vidx0 = __riscv_vadd(vidx0, __riscv_vid_v_u32m2(VTraits::vlanes()), VTraits::vlanes()); - vuint32m2_t vidx1 = __riscv_vadd(vidx0, 4, VTraits::vlanes()); - vfloat32m2_t temp = __riscv_vreinterpret_f32m2(__riscv_vreinterpret_u32m2( - __riscv_vor(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(a0), VTraits::vlanes()), - __riscv_vreinterpret_u64m2(__riscv_vslide1up(__riscv_vreinterpret_u32m2(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(a1), VTraits::vlanes())), 0, VTraits::vlanes()*2)), - VTraits::vlanes()))); - b0 = __riscv_vlmul_trunc_f32m1(__riscv_vrgather(temp, vidx0, VTraits::vlanes())); - b1 = __riscv_vlmul_trunc_f32m1(__riscv_vrgather(temp, vidx1, VTraits::vlanes())); -} - -inline void v_transpose4x4(const v_float32& a0, const v_float32& a1, const v_float32& a2, const v_float32& a3,\ - v_float32& b0, v_float32& b1, v_float32& b2, v_float32& b3) { \ - vuint64m2_t vid1 = __riscv_vid_v_u64m2(VTraits::vlanes()); - vuint16m2_t t1 = __riscv_vreinterpret_u16m2(vid1); - vuint16m2_t t2 = __riscv_vslide1up(t1, 0, VTraits::vlanes()); - vuint16m2_t t3 = __riscv_vslide1up(t2, 0, VTraits::vlanes()); - vuint16m2_t t4 = __riscv_vslide1up(t3, 0, VTraits::vlanes()); - t1 = __riscv_vor( - __riscv_vor(t1, t2, VTraits::vlanes()), - __riscv_vor(t3, t4, VTraits::vlanes()), - VTraits::vlanes() - ); - vuint16m2_t vidx0 = __riscv_vmul(t1, 12, VTraits::vlanes()); - vidx0 = __riscv_vadd(vidx0, __riscv_vid_v_u16m2(VTraits::vlanes()), VTraits::vlanes()); - vuint16m2_t vidx1 = __riscv_vadd(vidx0, 4, VTraits::vlanes()); - vuint16m2_t vidx2 = __riscv_vadd(vidx0, 8, VTraits::vlanes()); - vuint16m2_t vidx3 = __riscv_vadd(vidx0, 12, VTraits::vlanes()); - vuint32m2_t tempA = __riscv_vreinterpret_u32m2( \ - __riscv_vor(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(a0), VTraits::vlanes()), \ - __riscv_vreinterpret_u64m2(__riscv_vslide1up(__riscv_vreinterpret_u32m2(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(a2), VTraits::vlanes())), 0, VTraits::vlanes())), \ - VTraits::vlanes())); \ - 
vuint32m2_t tempB = __riscv_vreinterpret_u32m2( \ - __riscv_vor(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(a1), VTraits::vlanes()), \ - __riscv_vreinterpret_u64m2(__riscv_vslide1up(__riscv_vreinterpret_u32m2(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(a3), VTraits::vlanes())), 0, VTraits::vlanes())), \ - VTraits::vlanes())); \ - vfloat32m4_t temp = __riscv_vreinterpret_f32m4(__riscv_vreinterpret_u32m4( \ - __riscv_vor(__riscv_vzext_vf2(tempA, VTraits::vlanes()), \ - __riscv_vreinterpret_u64m4(__riscv_vslide1up(__riscv_vreinterpret_u32m4(__riscv_vzext_vf2(tempB, VTraits::vlanes())), 0, VTraits::vlanes())), \ - VTraits::vlanes()))); \ - b0 = __riscv_vlmul_trunc_f32m1(__riscv_vrgatherei16(temp, vidx0, VTraits::vlanes())); - b1 = __riscv_vlmul_trunc_f32m1(__riscv_vrgatherei16(temp, vidx1, VTraits::vlanes())); - b2 = __riscv_vlmul_trunc_f32m1(__riscv_vrgatherei16(temp, vidx2, VTraits::vlanes())); - b3 = __riscv_vlmul_trunc_f32m1(__riscv_vrgatherei16(temp, vidx3, VTraits::vlanes())); -} -#endif #define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, suffix) \ inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, const _Tpvec& a2, const _Tpvec& a3, _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) { \ @@ -1128,7 +1077,7 @@ inline scalartype v_reduce_sum(const _Tpvec& a) \ _nwTpvec zero = __riscv_vmv_v_x_##wsuffix##m1(0, vl); \ _nwTpvec res = __riscv_vmv_v_x_##wsuffix##m1(0, vl); \ res = __riscv_v##red(a, zero, vl); \ - return (scalartype)v_get0(res); \ + return (scalartype)__riscv_vmv_x(res); \ } OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8, v_uint16, vuint16m1_t, unsigned, u16, VTraits::vlanes(), wredsumu) OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8, v_int16, vint16m1_t, int, i16, VTraits::vlanes(), wredsum) @@ -1146,86 +1095,93 @@ inline scalartype v_reduce_sum(const _Tpvec& a) \ _nwTpvec zero = __riscv_vfmv_v_f_##wsuffix##m1(0, vl); \ _nwTpvec res = __riscv_vfmv_v_f_##wsuffix##m1(0, vl); \ res = __riscv_vfredusum(a, zero, vl); \ - return (scalartype)v_get0(res); \ + return (scalartype)__riscv_vfmv_f(res); \ } OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits::vlanes()) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, float, f64, VTraits::vlanes()) #endif -#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \ +#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, _nTpvec, func, scalartype, suffix, vl, red) \ inline scalartype v_reduce_##func(const _Tpvec& a) \ { \ - _Tpvec res = _Tpvec(__riscv_v##red(a, a, vl)); \ - return (scalartype)v_get0(res); \ + _nTpvec narrowM1 = __riscv_vlmul_trunc_##suffix##m1(a); \ + return (scalartype)__riscv_vmv_x(__riscv_v##red(a, narrowM1, vl)); \ } -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, min, uchar, u8, VTraits::vlanes(), redminu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, min, schar, i8, VTraits::vlanes(), redmin) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, min, ushort, u16, VTraits::vlanes(), redminu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, min, short, i16, VTraits::vlanes(), redmin) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, min, unsigned, u32, VTraits::vlanes(), redminu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, min, int, i32, VTraits::vlanes(), redmin) -OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, min, float, f32, VTraits::vlanes(), fredmin) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, max, uchar, u8, VTraits::vlanes(), redmaxu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, max, schar, i8, VTraits::vlanes(), redmax) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, max, ushort, u16, VTraits::vlanes(), redmaxu) 
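A note on the reworked reduction macro, not part of the patch: RVV reductions read an LMUL=2 source but take their scalar init operand from, and write their result to, an LMUL=1 register, which is why OPENCV_HAL_IMPL_RVV_REDUCE now narrows the input with __riscv_vlmul_trunc_* for the init operand and reads the scalar back with __riscv_vmv_x / __riscv_vfmv_f instead of v_get0. A hedged sketch of the same pattern for a 32-bit signed minimum, using the standard non-overloaded intrinsic names rather than OpenCV wrappers:

#include <riscv_vector.h>
#include <cstddef>

// Horizontal minimum over a vint32m2_t, mirroring the reworked reduce macro.
static inline int reduce_min_i32(vint32m2_t a)
{
    size_t vl = __riscv_vsetvlmax_e32m2();
    // vredmin takes its scalar init operand in an LMUL=1 register, so narrow
    // the input type for that operand only (element 0 of 'a' is a valid init).
    vint32m1_t init = __riscv_vlmul_trunc_v_i32m2_i32m1(a);
    vint32m1_t red  = __riscv_vredmin_vs_i32m2_i32m1(a, init, vl);
    return __riscv_vmv_x_s_i32m1_i32(red);        // element 0 holds the reduced value
}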
-OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits::vlanes(), redmax) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits::vlanes(), redmaxu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits::vlanes(), redmax) -OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits::vlanes(), fredmax) +#define OPENCV_HAL_IMPL_RVV_REDUCE_FP(_Tpvec, _nTpvec, func, scalartype, suffix, vl, red) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + _nTpvec narrowM1 = __riscv_vlmul_trunc_##suffix##m1(a); \ + return (scalartype)__riscv_vfmv_f(__riscv_v##red(a, narrowM1, vl)); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, vuint8m1_t, min, uchar, u8, VTraits::vlanes(), redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, vint8m1_t, min, schar, i8, VTraits::vlanes(), redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, vuint16m1_t, min, ushort, u16, VTraits::vlanes(), redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, vint16m1_t, min, short, i16, VTraits::vlanes(), redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, vuint32m1_t, min, unsigned, u32, VTraits::vlanes(), redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, vint32m1_t, min, int, i32, VTraits::vlanes(), redmin) +OPENCV_HAL_IMPL_RVV_REDUCE_FP(v_float32, vfloat32m1_t, min, float, f32, VTraits::vlanes(), fredmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, vuint8m1_t, max, uchar, u8, VTraits::vlanes(), redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, vint8m1_t, max, schar, i8, VTraits::vlanes(), redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, vuint16m1_t, max, ushort, u16, VTraits::vlanes(), redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, vint16m1_t, max, short, i16, VTraits::vlanes(), redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, vuint32m1_t, max, unsigned, u32, VTraits::vlanes(), redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, vint32m1_t, max, int, i32, VTraits::vlanes(), redmax) +OPENCV_HAL_IMPL_RVV_REDUCE_FP(v_float32, vfloat32m1_t, max, float, f32, VTraits::vlanes(), fredmax) #if CV_SIMD_SCALABLE_FP16 -OPENCV_HAL_IMPL_RVV_REDUCE(v_float16, max, hfloat, f16, VTraits::vlanes(), fredmax) -OPENCV_HAL_IMPL_RVV_REDUCE(v_float16, min, hfloat, f16, VTraits::vlanes(), fredmin) +OPENCV_HAL_IMPL_RVV_REDUCE_FP(v_float16, vfloat16m1_t, max, hfloat, f16, VTraits::vlanes(), fredmax) +OPENCV_HAL_IMPL_RVV_REDUCE_FP(v_float16, vfloat16m1_t, min, hfloat, f16, VTraits::vlanes(), fredmin) #endif inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b, const v_float32& c, const v_float32& d) { // 0000 1111 2222 3333 .... 
- vuint64m2_t vid1 = __riscv_vid_v_u64m2(VTraits::vlanes()); - vuint16m2_t t1 = __riscv_vreinterpret_u16m2(vid1); - vuint16m2_t t2 = __riscv_vslide1up(t1, 0, VTraits::vlanes()); - vuint16m2_t t3 = __riscv_vslide1up(t2, 0, VTraits::vlanes()); - vuint16m2_t t4 = __riscv_vslide1up(t3, 0, VTraits::vlanes()); + vuint64m4_t vid1 = __riscv_vid_v_u64m4(VTraits::vlanes()); + vuint16m4_t t1 = __riscv_vreinterpret_u16m4(vid1); + vuint16m4_t t2 = __riscv_vslide1up(t1, 0, VTraits::vlanes()); + vuint16m4_t t3 = __riscv_vslide1up(t2, 0, VTraits::vlanes()); + vuint16m4_t t4 = __riscv_vslide1up(t3, 0, VTraits::vlanes()); t1 = __riscv_vor( - __riscv_vor(t1, t2, VTraits::vlanes()), - __riscv_vor(t3, t4, VTraits::vlanes()), - VTraits::vlanes() + __riscv_vor(t1, t2, VTraits::vlanes()), + __riscv_vor(t3, t4, VTraits::vlanes()), + VTraits::vlanes() ); // index for transpose4X4 - vuint16m2_t vidx0 = __riscv_vmul(t1, 12, VTraits::vlanes()); - vidx0 = __riscv_vadd(vidx0, __riscv_vid_v_u16m2(VTraits::vlanes()), VTraits::vlanes()); - vuint16m2_t vidx1 = __riscv_vadd(vidx0, 4, VTraits::vlanes()); - vuint16m2_t vidx2 = __riscv_vadd(vidx0, 8, VTraits::vlanes()); - vuint16m2_t vidx3 = __riscv_vadd(vidx0, 12, VTraits::vlanes()); + vuint16m4_t vidx0 = __riscv_vmul(t1, 12, VTraits::vlanes()); + vidx0 = __riscv_vadd(vidx0, __riscv_vid_v_u16m4(VTraits::vlanes()), VTraits::vlanes()); + vuint16m4_t vidx1 = __riscv_vadd(vidx0, 4, VTraits::vlanes()); + vuint16m4_t vidx2 = __riscv_vadd(vidx0, 8, VTraits::vlanes()); + vuint16m4_t vidx3 = __riscv_vadd(vidx0, 12, VTraits::vlanes()); // zip - vuint32m2_t tempA = __riscv_vreinterpret_u32m2( \ - __riscv_vor(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(a), VTraits::vlanes()), \ - __riscv_vreinterpret_u64m2(__riscv_vslide1up(__riscv_vreinterpret_u32m2(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(c), VTraits::vlanes())), 0, VTraits::vlanes())), \ - VTraits::vlanes())); \ - vuint32m2_t tempB = __riscv_vreinterpret_u32m2( \ - __riscv_vor(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(b), VTraits::vlanes()), \ - __riscv_vreinterpret_u64m2(__riscv_vslide1up(__riscv_vreinterpret_u32m2(__riscv_vzext_vf2(__riscv_vreinterpret_u32m1(d), VTraits::vlanes())), 0, VTraits::vlanes())), \ - VTraits::vlanes())); \ - vfloat32m4_t temp = __riscv_vreinterpret_f32m4(__riscv_vreinterpret_u32m4( \ - __riscv_vor(__riscv_vzext_vf2(tempA, VTraits::vlanes()), \ - __riscv_vreinterpret_u64m4(__riscv_vslide1up(__riscv_vreinterpret_u32m4(__riscv_vzext_vf2(tempB, VTraits::vlanes())), 0, VTraits::vlanes())), \ - VTraits::vlanes()))); + vuint32m4_t tempA = __riscv_vreinterpret_u32m4( \ + __riscv_vor(__riscv_vzext_vf2(__riscv_vreinterpret_u32m2(a), VTraits::vlanes()), \ + __riscv_vreinterpret_u64m4(__riscv_vslide1up(__riscv_vreinterpret_u32m4(__riscv_vzext_vf2(__riscv_vreinterpret_u32m2(c), VTraits::vlanes())), 0, VTraits::vlanes())), \ + VTraits::vlanes())); \ + vuint32m4_t tempB = __riscv_vreinterpret_u32m4( \ + __riscv_vor(__riscv_vzext_vf2(__riscv_vreinterpret_u32m2(b), VTraits::vlanes()), \ + __riscv_vreinterpret_u64m4(__riscv_vslide1up(__riscv_vreinterpret_u32m4(__riscv_vzext_vf2(__riscv_vreinterpret_u32m2(d), VTraits::vlanes())), 0, VTraits::vlanes())), \ + VTraits::vlanes())); \ + vfloat32m8_t temp = __riscv_vreinterpret_f32m8(__riscv_vreinterpret_u32m8( \ + __riscv_vor(__riscv_vzext_vf2(tempA, VTraits::vlanes()), \ + __riscv_vreinterpret_u64m8(__riscv_vslide1up(__riscv_vreinterpret_u32m8(__riscv_vzext_vf2(tempB, VTraits::vlanes())), 0, VTraits::vlanes())), \ + VTraits::vlanes()))); // transpose - vfloat32m1_t b0 = 
__riscv_vlmul_trunc_f32m1(__riscv_vrgatherei16(temp, vidx0, VTraits::vlanes())); - vfloat32m1_t b1 = __riscv_vlmul_trunc_f32m1(__riscv_vrgatherei16(temp, vidx1, VTraits::vlanes())); - vfloat32m1_t b2 = __riscv_vlmul_trunc_f32m1(__riscv_vrgatherei16(temp, vidx2, VTraits::vlanes())); - vfloat32m1_t b3 = __riscv_vlmul_trunc_f32m1(__riscv_vrgatherei16(temp, vidx3, VTraits::vlanes())); + vfloat32m2_t b0 = __riscv_vlmul_trunc_f32m2(__riscv_vrgatherei16(temp, vidx0, VTraits::vlanes())); + vfloat32m2_t b1 = __riscv_vlmul_trunc_f32m2(__riscv_vrgatherei16(temp, vidx1, VTraits::vlanes())); + vfloat32m2_t b2 = __riscv_vlmul_trunc_f32m2(__riscv_vrgatherei16(temp, vidx2, VTraits::vlanes())); + vfloat32m2_t b3 = __riscv_vlmul_trunc_f32m2(__riscv_vrgatherei16(temp, vidx3, VTraits::vlanes())); // vector add v_float32 res = __riscv_vfadd( - __riscv_vfadd(b0, b1, VTraits::vlanes()), - __riscv_vfadd(b2, b3, VTraits::vlanes()), - VTraits::vlanes() + __riscv_vfadd(b0, b1, VTraits::vlanes()), + __riscv_vfadd(b2, b3, VTraits::vlanes()), + VTraits::vlanes() ); return res; } @@ -1294,7 +1250,7 @@ inline v_float16 v_muladd(const v_float16& a, const v_float16& b, const v_float1 #if CV_SIMD_SCALABLE_64F inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c) { - return __riscv_vfmacc_vv_f64m1(c, a, b, VTraits::vlanes()); + return __riscv_vfmacc_vv_f64m2(c, a, b, VTraits::vlanes()); } inline v_float64 v_muladd(const v_float64& a, const v_float64& b, const v_float64& c) @@ -1384,7 +1340,7 @@ OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16, absdiffs) #define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, width) \ inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ { \ - return __riscv_vnclipu(__riscv_vreinterpret_u##width##m2(__riscv_vwsub_vv(v_max(a, b), v_min(a, b), VTraits<_Tpvec>::vlanes())), 0, 0, VTraits<_Tpvec>::vlanes()); \ + return __riscv_vnclipu(__riscv_vreinterpret_u##width##m4(__riscv_vwsub_vv(v_max(a, b), v_min(a, b), VTraits<_Tpvec>::vlanes())), 0, 0, VTraits<_Tpvec>::vlanes()); \ } OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8, v_uint8, 16) @@ -1465,7 +1421,7 @@ template inline _Tpvec v_rotate_right(const _Tpvec& a) \ } \ template inline _Tpvec v_rotate_left(const _Tpvec& a) \ { \ - return __riscv_vslideup(__riscv_vmv_v_x_##suffix##m1(0, vl), a, n, vl); \ + return __riscv_vslideup(__riscv_vmv_v_x_##suffix##m2(0, vl), a, n, vl); \ } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ { return a; } \ @@ -1496,7 +1452,7 @@ template inline _Tpvec v_rotate_right(const _Tpvec& a) \ } \ template inline _Tpvec v_rotate_left(const _Tpvec& a) \ { \ - return __riscv_vslideup(__riscv_vfmv_v_f_##suffix##m1(0, vl), a, n, vl); \ + return __riscv_vslideup(__riscv_vfmv_v_f_##suffix##m2(0, vl), a, n, vl); \ } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ { return a; } \ @@ -1524,11 +1480,11 @@ OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64, VTraits::vlanes()) #if CV_SIMD_SCALABLE_FP16 inline v_float16 v_cvt_f16(const v_float32 &a) { - return __riscv_vfncvt_f(__riscv_vlmul_ext_f32m2(a), VTraits::vlanes()); + return __riscv_vfncvt_f(__riscv_vlmul_ext_f32m4(a), VTraits::vlanes()); } inline v_float16 v_cvt_f16(const v_float32 &a, const v_float32 &b) { - return __riscv_vfncvt_f(__riscv_vset(__riscv_vlmul_ext_f32m2(a),1,b), VTraits::vlanes()); + return __riscv_vfncvt_f(__riscv_vset(__riscv_vlmul_ext_f32m4(a),1,b), VTraits::vlanes()); } inline v_float16 v_cvt_f16(const v_int16 &a) { @@ -1536,48 +1492,48 @@ inline v_float16 v_cvt_f16(const v_int16 &a) } inline v_float32 v_cvt_f32(const 
v_float16 &a) { - return __riscv_vget_f32m1(__riscv_vfwcvt_f(a, VTraits::vlanes()), 0); + return __riscv_vget_f32m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 0); } inline v_float32 v_cvt_f32_high(const v_float16 &a) { - return __riscv_vget_f32m1(__riscv_vfwcvt_f(a, VTraits::vlanes()), 1); + return __riscv_vget_f32m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 1); } #endif inline v_float32 v_cvt_f32(const v_int32& a) { - return __riscv_vfcvt_f_x_v_f32m1(a, VTraits::vlanes()); + return __riscv_vfcvt_f_x_v_f32m2(a, VTraits::vlanes()); } #if CV_SIMD_SCALABLE_64F inline v_float32 v_cvt_f32(const v_float64& a) { - return __riscv_vfncvt_f(__riscv_vlmul_ext_f64m2(a), VTraits::vlanes()); + return __riscv_vfncvt_f(__riscv_vlmul_ext_f64m4(a), VTraits::vlanes()); } inline v_float32 v_cvt_f32(const v_float64& a, const v_float64& b) { - return __riscv_vfncvt_f(__riscv_vset(__riscv_vlmul_ext_f64m2(a),1,b), VTraits::vlanes()); + return __riscv_vfncvt_f(__riscv_vset(__riscv_vlmul_ext_f64m4(a),1,b), VTraits::vlanes()); } inline v_float64 v_cvt_f64(const v_int32& a) { - return __riscv_vget_f64m1(__riscv_vfwcvt_f(a, VTraits::vlanes()), 0); + return __riscv_vget_f64m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 0); } inline v_float64 v_cvt_f64_high(const v_int32& a) { - return __riscv_vget_f64m1(__riscv_vfwcvt_f(a, VTraits::vlanes()), 1); + return __riscv_vget_f64m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 1); } inline v_float64 v_cvt_f64(const v_float32& a) { - return __riscv_vget_f64m1(__riscv_vfwcvt_f(a, VTraits::vlanes()), 0); + return __riscv_vget_f64m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 0); } inline v_float64 v_cvt_f64_high(const v_float32& a) { - return __riscv_vget_f64m1(__riscv_vfwcvt_f(a, VTraits::vlanes()), 1); + return __riscv_vget_f64m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 1); } inline v_float64 v_cvt_f64(const v_int64& a) @@ -1610,7 +1566,7 @@ OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32) #define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, width) \ inline _Tpvec v_reverse(const _Tpvec& a) \ { \ - vuint##width##m1_t vidx = __riscv_vrsub(__riscv_vid_v_u##width##m1(VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()-1, VTraits<_Tpvec>::vlanes()); \ + vuint##width##m2_t vidx = __riscv_vrsub(__riscv_vid_v_u##width##m2(VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()-1, VTraits<_Tpvec>::vlanes()); \ return __riscv_vrgather(a, vidx, VTraits<_Tpvec>::vlanes()); \ } OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8) @@ -1635,79 +1591,79 @@ OPENCV_HAL_IMPL_RVV_REVERSE(v_float64, 64) inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ { \ _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \ - b0 = __riscv_vget_##suffix##m1(temp, 0); \ - b1 = __riscv_vget_##suffix##m1(temp, 1); \ + b0 = __riscv_vget_##suffix##m2(temp, 0); \ + b1 = __riscv_vget_##suffix##m2(temp, 1); \ } \ inline _Tpwvec v_expand_low(const _Tpvec& a) \ { \ _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \ - return __riscv_vget_##suffix##m1(temp, 0); \ + return __riscv_vget_##suffix##m2(temp, 0); \ } \ inline _Tpwvec v_expand_high(const _Tpvec& a) \ { \ _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \ - return __riscv_vget_##suffix##m1(temp, 1); \ + return __riscv_vget_##suffix##m2(temp, 1); \ } \ inline _Tpwvec v_load_expand(const _Tp* ptr) \ { \ - return cvt(__riscv_vle##width##_v_##suffix2##mf2(ptr, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \ + return cvt(__riscv_vle##width##_v_##suffix2##m1(ptr, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \ } -OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, 
v_uint8, 8, u16, u8, __riscv_vwcvtu_x) -OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, __riscv_vwcvt_x) -OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, __riscv_vwcvtu_x) -OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, __riscv_vwcvt_x) -OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, __riscv_vwcvtu_x) -OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, __riscv_vwcvt_x) +OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m4_t, v_uint8, 8, u16, u8, __riscv_vwcvtu_x) +OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m4_t, v_int8, 8, i16, i8, __riscv_vwcvt_x) +OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m4_t, v_uint16, 16, u32, u16, __riscv_vwcvtu_x) +OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m4_t, v_int16, 16, i32, i16, __riscv_vwcvt_x) +OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m4_t, v_uint32, 32, u64, u32, __riscv_vwcvtu_x) +OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m4_t, v_int32, 32, i64, i32, __riscv_vwcvt_x) inline v_uint32 v_load_expand_q(const uchar* ptr) { - return __riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vle8_v_u8mf4(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes()); + return __riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vle8_v_u8mf2(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes()); } inline v_int32 v_load_expand_q(const schar* ptr) { - return __riscv_vwcvt_x(__riscv_vwcvt_x(__riscv_vle8_v_i8mf4(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes()); + return __riscv_vwcvt_x(__riscv_vwcvt_x(__riscv_vle8_v_i8mf2(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes()); } #define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, hwidth, hsuffix, suffix, rshr, shr) \ inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \ { \ - return shr(__riscv_vset(__riscv_vlmul_ext_##suffix##m2(a), 1, b), 0, 0, VTraits<_Tpvec>::vlanes()); \ + return shr(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), 0, 0, VTraits<_Tpvec>::vlanes()); \ } \ inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \ { \ - __riscv_vse##hwidth##_v_##hsuffix##mf2(ptr, shr(a, 0, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, shr(a, 0, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ } \ template inline \ _Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b, int N = n) \ { \ - return rshr(__riscv_vset(__riscv_vlmul_ext_##suffix##m2(a), 1, b), N, 0, VTraits<_Tpvec>::vlanes()); \ + return rshr(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), N, 0, VTraits<_Tpvec>::vlanes()); \ } \ template inline \ void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a, int N = n) \ { \ - __riscv_vse##hwidth##_v_##hsuffix##mf2(ptr, rshr(a, N, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, rshr(a, N, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ } #define OPENCV_HAL_IMPL_RVV_PACK_32(_Tpvec, _Tp, _wTpvec, hwidth, hsuffix, suffix, rshr, shr) \ inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \ { \ - return shr(__riscv_vset(__riscv_vlmul_ext_##suffix##m2(a), 1, b), 0, VTraits<_Tpvec>::vlanes()); \ + return shr(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), 0, VTraits<_Tpvec>::vlanes()); \ } \ inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \ { \ - __riscv_vse##hwidth##_v_##hsuffix##mf2(ptr, shr(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ 
+ __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, shr(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ } \ template inline \ _Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b, int N = n) \ { \ - return rshr(__riscv_vset(__riscv_vlmul_ext_##suffix##m2(a), 1, b), N, 0, VTraits<_Tpvec>::vlanes()); \ + return rshr(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), N, 0, VTraits<_Tpvec>::vlanes()); \ } \ template inline \ void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a, int N = n) \ { \ - __riscv_vse##hwidth##_v_##hsuffix##mf2(ptr, rshr(a, N, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, rshr(a, N, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ } OPENCV_HAL_IMPL_RVV_PACK(v_uint8, uchar, v_uint16, 8, u8, u16, __riscv_vnclipu, __riscv_vnclipu) @@ -1720,25 +1676,25 @@ OPENCV_HAL_IMPL_RVV_PACK_32(v_int32, int, v_int64, 32, i32, i64, __riscv_vnclip, #define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, cast, hvl, vl) \ inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \ { \ - return __riscv_vnclipu(cast(__riscv_vmax(__riscv_vset(__riscv_vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), 0, 0, vl); \ + return __riscv_vnclipu(cast(__riscv_vmax(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), 0, vl)), 0, 0, vl); \ } \ inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \ { \ - __riscv_vse##hwidth##_v_##hsuffix##mf2(ptr, __riscv_vnclipu(__riscv_vreinterpret_u##width##m1(__riscv_vmax(a, 0, vl)), 0, 0, vl), hvl); \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, __riscv_vnclipu(__riscv_vreinterpret_u##width##m2(__riscv_vmax(a, 0, vl)), 0, 0, vl), hvl); \ } \ template inline \ _Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b, int n = N) \ { \ - return __riscv_vnclipu(cast(__riscv_vmax(__riscv_vset(__riscv_vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), n, 0, vl); \ + return __riscv_vnclipu(cast(__riscv_vmax(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), 0, vl)), n, 0, vl); \ } \ template inline \ void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \ { \ - __riscv_vse##hwidth##_v_##hsuffix##mf2(ptr, __riscv_vnclipu(__riscv_vreinterpret_u##width##m1(__riscv_vmax(a, 0, vl)), n, 0, vl), hvl); \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, __riscv_vnclipu(__riscv_vreinterpret_u##width##m2(__riscv_vmax(a, 0, vl)), n, 0, vl), hvl); \ } -OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, __riscv_vreinterpret_v_i16m2_u16m2, VTraits::vlanes(), VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, __riscv_vreinterpret_v_i32m2_u32m2, VTraits::vlanes(), VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, __riscv_vreinterpret_v_i16m4_u16m4, VTraits::vlanes(), VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, __riscv_vreinterpret_v_i32m4_u32m4, VTraits::vlanes(), VTraits::vlanes()) /* void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) @@ -1750,41 +1706,41 @@ OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, __ #define OPENCV_HAL_IMPL_RVV_ZIP(_Tpvec, _wTpvec, suffix, width, width2, convert2um2, convert2um1) \ inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \ - _wTpvec temp = __riscv_vreinterpret_##suffix##m2(convert2um2( \ + _wTpvec temp = __riscv_vreinterpret_##suffix##m4(convert2um2( \ 
__riscv_vor(__riscv_vzext_vf2(convert2um1(a0), VTraits<_Tpvec>::vlanes()*2), \ - __riscv_vreinterpret_u##width2##m2(__riscv_vslide1up(__riscv_vreinterpret_u##width##m2(__riscv_vzext_vf2(convert2um1(a1), VTraits<_Tpvec>::vlanes()*2)), 0, VTraits<_Tpvec>::vlanes()*2)), \ + __riscv_vreinterpret_u##width2##m4(__riscv_vslide1up(__riscv_vreinterpret_u##width##m4(__riscv_vzext_vf2(convert2um1(a1), VTraits<_Tpvec>::vlanes()*2)), 0, VTraits<_Tpvec>::vlanes()*2)), \ VTraits<_Tpvec>::vlanes()))); \ - b0 = __riscv_vget_##suffix##m1(temp, 0); \ - b1 = __riscv_vget_##suffix##m1(temp, 1); \ + b0 = __riscv_vget_##suffix##m2(temp, 0); \ + b1 = __riscv_vget_##suffix##m2(temp, 1); \ } -OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, __riscv_vreinterpret_u8m2, __riscv_vreinterpret_u8m1) -OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, __riscv_vreinterpret_u16m2, __riscv_vreinterpret_u16m1) +OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m4_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m4_t, i8, 8, 16, __riscv_vreinterpret_u8m4, __riscv_vreinterpret_u8m2) +OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m4_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m4_t, i16, 16, 32, __riscv_vreinterpret_u16m4, __riscv_vreinterpret_u16m2) #if CV_SIMD_SCALABLE_FP16 -OPENCV_HAL_IMPL_RVV_ZIP(v_float16, vfloat16m2_t, f16, 16, 32, __riscv_vreinterpret_u16m2, __riscv_vreinterpret_u16m1) +OPENCV_HAL_IMPL_RVV_ZIP(v_float16, vfloat16m4_t, f16, 16, 32, __riscv_vreinterpret_u16m4, __riscv_vreinterpret_u16m2) #endif -OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1) -OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1) +OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m4_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m4_t, i32, 32, 64, __riscv_vreinterpret_u32m4, __riscv_vreinterpret_u32m2) +OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m4_t, f32, 32, 64, __riscv_vreinterpret_u32m4, __riscv_vreinterpret_u32m2) #if CV_SIMD_SCALABLE_64F inline void v_zip(const v_float64& a0, const v_float64& a1, v_float64& b0, v_float64& b1) { \ - vuint16mf4_t idx0 = __riscv_vid_v_u16mf4(VTraits::vlanes()); - vuint16mf4_t idx1 = __riscv_vadd(idx0, VTraits::vlanes(), VTraits::vlanes()); - vuint16mf2_t idx = __riscv_vreinterpret_u16mf2(( \ + vuint16mf2_t idx0 = __riscv_vid_v_u16mf2(VTraits::vlanes()); + vuint16mf2_t idx1 = __riscv_vadd(idx0, VTraits::vlanes(), VTraits::vlanes()); + vuint16m1_t idx = __riscv_vreinterpret_u16m1(( \ __riscv_vor(__riscv_vzext_vf2(idx0, VTraits::vlanes()), \ - __riscv_vreinterpret_u32mf2(__riscv_vslide1up(__riscv_vreinterpret_u16mf2(__riscv_vzext_vf2(idx1, VTraits::vlanes())), 0, VTraits::vlanes())), \ + __riscv_vreinterpret_u32m1(__riscv_vslide1up(__riscv_vreinterpret_u16m1(__riscv_vzext_vf2(idx1, VTraits::vlanes())), 0, VTraits::vlanes())), \ VTraits::vlanes()))); #if 0 - vfloat64m2_t temp = __riscv_vcreate_v_f64m1_f64m2(a0, a1); + vfloat64m4_t temp = __riscv_vcreate_v_f64m2_f64m4(a0, a1); #else // TODO: clean up when RVV Intrinsic is frozen. 
- vfloat64m2_t temp = __riscv_vlmul_ext_f64m2(a0); + vfloat64m4_t temp = __riscv_vlmul_ext_f64m4(a0); temp = __riscv_vset(temp, 1, a1); #endif temp = __riscv_vrgatherei16(temp, idx, VTraits::vlanes()*2); - b0 = __riscv_vget_f64m1(temp, 0); \ - b1 = __riscv_vget_f64m1(temp, 1); \ + b0 = __riscv_vget_f64m2(temp, 0); \ + b1 = __riscv_vget_f64m2(temp, 1); \ } #endif @@ -1824,23 +1780,23 @@ OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64) #define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, _TpCast, suffix, width, hwidth, vl) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ { \ - a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*2, VTraits::vlanes()); \ - b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*2, VTraits::vlanes()); \ + a = __riscv_vlse##width##_v_##suffix##m2((_TpCast *)ptr , sizeof(_Tp)*2, VTraits::vlanes()); \ + b = __riscv_vlse##width##_v_##suffix##m2((_TpCast *)(ptr+1), sizeof(_Tp)*2, VTraits::vlanes()); \ }\ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ { \ - a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*3, VTraits::vlanes()); \ - b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*3, VTraits::vlanes()); \ - c = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+2), sizeof(_Tp)*3, VTraits::vlanes()); \ + a = __riscv_vlse##width##_v_##suffix##m2((_TpCast *)ptr , sizeof(_Tp)*3, VTraits::vlanes()); \ + b = __riscv_vlse##width##_v_##suffix##m2((_TpCast *)(ptr+1), sizeof(_Tp)*3, VTraits::vlanes()); \ + c = __riscv_vlse##width##_v_##suffix##m2((_TpCast *)(ptr+2), sizeof(_Tp)*3, VTraits::vlanes()); \ } \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ v_##_Tpvec& c, v_##_Tpvec& d) \ { \ \ - a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*4, VTraits::vlanes()); \ - b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*4, VTraits::vlanes()); \ - c = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+2), sizeof(_Tp)*4, VTraits::vlanes()); \ - d = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+3), sizeof(_Tp)*4, VTraits::vlanes()); \ + a = __riscv_vlse##width##_v_##suffix##m2((_TpCast *)ptr , sizeof(_Tp)*4, VTraits::vlanes()); \ + b = __riscv_vlse##width##_v_##suffix##m2((_TpCast *)(ptr+1), sizeof(_Tp)*4, VTraits::vlanes()); \ + c = __riscv_vlse##width##_v_##suffix##m2((_TpCast *)(ptr+2), sizeof(_Tp)*4, VTraits::vlanes()); \ + d = __riscv_vlse##width##_v_##suffix##m2((_TpCast *)(ptr+3), sizeof(_Tp)*4, VTraits::vlanes()); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ @@ -1896,8 +1852,8 @@ static uint64_t idx_interleave_quads[] = { \ #define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(_Tpvec, func) \ inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \ CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \ - vuint8m1_t vidx = __riscv_vundefined_u8m1();\ - vidx = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(idx_interleave_##func, 16)); \ + vuint8m2_t vidx = __riscv_vundefined_u8m2();\ + vidx = __riscv_vreinterpret_u8m2(__riscv_vle64_v_u64m2(idx_interleave_##func, 16)); \ return __riscv_vrgather(vec, vidx, VTraits::vlanes()); \ } OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, pairs) @@ -1908,8 +1864,8 @@ OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, quads) 
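Aside, not part of the patch: the interleaved load/store block above keeps the strided-access scheme and only switches the element vectors to m2, so de-interleaving still costs one vlse per channel. A small sketch of the two-channel uchar case, mirroring what the v_load_deinterleave overloads do under the new typing (the helper name is illustrative, not OpenCV API):

#include <riscv_vector.h>
#include <cstddef>

// De-interleave ABAB... bytes into separate A and B vectors with strided loads,
// the same pattern the v_load_deinterleave overloads use after the retyping.
static inline void load_deinterleave_u8(const unsigned char* ptr,
                                        vuint8m2_t& a, vuint8m2_t& b)
{
    size_t vl = __riscv_vsetvlmax_e8m2();
    a = __riscv_vlse8_v_u8m2(ptr,     2 /* byte stride */, vl);   // even-indexed bytes
    b = __riscv_vlse8_v_u8m2(ptr + 1, 2 /* byte stride */, vl);   // odd-indexed bytes
}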
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(_Tpvec, width, vzext_vfx, func) \ inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \ CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \ - vuint##width##m1_t vidx = __riscv_vundefined_u##width##m1();\ - vidx = __riscv_vget_u##width##m1(vzext_vfx(__riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(idx_interleave_##func, 16)), VTraits::vlanes()), 0); \ + vuint##width##m2_t vidx = __riscv_vundefined_u##width##m2();\ + vidx = __riscv_vget_u##width##m2(vzext_vfx(__riscv_vreinterpret_u8m2(__riscv_vle64_v_u64m2(idx_interleave_##func, 16)), VTraits::vlanes()), 0); \ return __riscv_vrgather(vec, vidx, VTraits<_Tpvec>::vlanes()); \ } @@ -1947,20 +1903,20 @@ static const unsigned char popCountTable[256] = }; #define OPENCV_HAL_IMPL_RVV_HADD(_Tpvec, _Tpvec2, _Tm2, width, width2, suffix, add) \ static inline _Tpvec2 v_hadd(_Tpvec a) { \ - vuint##width2##m1_t oneX2 = __riscv_vmv_v_x_u##width2##m1(1, VTraits::vlanes()); \ - vuint##width##m1_t one = __riscv_vreinterpret_u##width##m1(oneX2); \ + vuint##width2##m2_t oneX2 = __riscv_vmv_v_x_u##width2##m2(1, VTraits::vlanes()); \ + vuint##width##m2_t one = __riscv_vreinterpret_u##width##m2(oneX2); \ _Tm2 res = add(a, __riscv_vslide1down(a, 0, VTraits::vlanes()), VTraits::vlanes()); \ - return __riscv_vget_##suffix##m1(__riscv_vcompress(res, __riscv_vmseq(one, 1, VTraits::vlanes()), VTraits::vlanes()), 0); \ + return __riscv_vget_##suffix##m2(__riscv_vcompress(res, __riscv_vmseq(one, 1, VTraits::vlanes()), VTraits::vlanes()), 0); \ } -OPENCV_HAL_IMPL_RVV_HADD(v_uint8, v_uint16, vuint16m2_t, 8, 16, u16, __riscv_vwaddu_vv) -OPENCV_HAL_IMPL_RVV_HADD(v_uint16, v_uint32, vuint32m2_t, 16, 32, u32, __riscv_vwaddu_vv) -OPENCV_HAL_IMPL_RVV_HADD(v_uint32, v_uint64, vuint64m2_t, 32, 64, u64, __riscv_vwaddu_vv) -OPENCV_HAL_IMPL_RVV_HADD(v_int8, v_int16, vint16m2_t, 8, 16, i16, __riscv_vwadd_vv) -OPENCV_HAL_IMPL_RVV_HADD(v_int16, v_int32, vint32m2_t, 16, 32, i32, __riscv_vwadd_vv) -OPENCV_HAL_IMPL_RVV_HADD(v_int32, v_int64, vint64m2_t, 32, 64, i64, __riscv_vwadd_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_uint8, v_uint16, vuint16m4_t, 8, 16, u16, __riscv_vwaddu_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_uint16, v_uint32, vuint32m4_t, 16, 32, u32, __riscv_vwaddu_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_uint32, v_uint64, vuint64m4_t, 32, 64, u64, __riscv_vwaddu_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_int8, v_int16, vint16m4_t, 8, 16, i16, __riscv_vwadd_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_int16, v_int32, vint32m4_t, 16, 32, i32, __riscv_vwadd_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_int32, v_int64, vint64m4_t, 32, 64, i64, __riscv_vwadd_vv) -OPENCV_HAL_IMPL_RVV_HADD(vint32m2_t, v_int32, vint32m2_t, 16, 32, i32, __riscv_vadd) -OPENCV_HAL_IMPL_RVV_HADD(vint64m2_t, v_int64, vint64m2_t, 32, 64, i64, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_HADD(vint32m4_t, v_int32, vint32m4_t, 16, 32, i32, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_HADD(vint64m4_t, v_int64, vint64m4_t, 32, 64, i64, __riscv_vadd) inline v_uint8 v_popcount(const v_uint8& a) { @@ -1968,15 +1924,15 @@ inline v_uint8 v_popcount(const v_uint8& a) } inline v_uint16 v_popcount(const v_uint16& a) { - return v_hadd(v_popcount(__riscv_vreinterpret_u8m1(a))); + return v_hadd(v_popcount(__riscv_vreinterpret_u8m2(a))); } inline v_uint32 v_popcount(const v_uint32& a) { - return v_hadd(v_hadd(v_popcount(__riscv_vreinterpret_u8m1(a)))); + return v_hadd(v_hadd(v_popcount(__riscv_vreinterpret_u8m2(a)))); } inline v_uint64 v_popcount(const v_uint64& a) { - return 
v_hadd(v_hadd(v_hadd(v_popcount(__riscv_vreinterpret_u8m1(a))))); + return v_hadd(v_hadd(v_hadd(v_popcount(__riscv_vreinterpret_u8m2(a))))); } inline v_uint8 v_popcount(const v_int8& a) @@ -2057,23 +2013,23 @@ inline int v_scan_forward(const v_float64& a) #define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, v_trunc) \ inline _Tpvec v_pack_triplets(const _Tpvec& vec) { \ size_t vl = VTraits::vlanes(); \ - vuint32m1_t one = __riscv_vmv_v_x_u32m1(1, VTraits::vlanes()); \ - vuint8m1_t zero = __riscv_vmv_v_x_u8m1(0, vl); \ - vuint8m1_t mask = __riscv_vreinterpret_u8m1(one); \ + vuint32m2_t one = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t zero = __riscv_vmv_v_x_u8m2(0, vl); \ + vuint8m2_t mask = __riscv_vreinterpret_u8m2(one); \ return __riscv_vcompress(vec, __riscv_vmseq(v_trunc(__riscv_vslideup(zero, mask, 3, vl)), 0, vl), VTraits<_Tpvec>::vlanes()); \ } OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8, OPENCV_HAL_NOP) OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16, __riscv_vlmul_trunc_u8mf2) -OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16, __riscv_vlmul_trunc_u8mf2) -OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32, __riscv_vlmul_trunc_u8mf4) -OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32, __riscv_vlmul_trunc_u8mf4) -OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32, __riscv_vlmul_trunc_u8mf4) -OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint64, __riscv_vlmul_trunc_u8mf8) -OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int64, __riscv_vlmul_trunc_u8mf8) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16, __riscv_vlmul_trunc_u8m1) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16, __riscv_vlmul_trunc_u8m1) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32, __riscv_vlmul_trunc_u8mf2) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32, __riscv_vlmul_trunc_u8mf2) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32, __riscv_vlmul_trunc_u8mf2) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint64, __riscv_vlmul_trunc_u8mf4) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int64, __riscv_vlmul_trunc_u8mf4) #if CV_SIMD_SCALABLE_64F -OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float64, __riscv_vlmul_trunc_u8mf8) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float64, __riscv_vlmul_trunc_u8mf4) #endif @@ -2082,12 +2038,12 @@ OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float64, __riscv_vlmul_trunc_u8mf8) #if defined(__riscv_zfh) && __riscv_zfh inline v_float32 v_load_expand(const hfloat* ptr) { - return __riscv_vfwcvt_f(__riscv_vle16_v_f16mf2((_Float16*)ptr, VTraits::vlanes()) ,VTraits::vlanes());; + return __riscv_vfwcvt_f(__riscv_vle16_v_f16m1((_Float16*)ptr, VTraits::vlanes()) ,VTraits::vlanes());; } inline void v_pack_store(hfloat* ptr, const v_float32& v) { - __riscv_vse16_v_f16mf2((_Float16*)ptr, __riscv_vfncvt_f_f_w_f16mf2(v, VTraits::vlanes()), VTraits::vlanes()); + __riscv_vse16_v_f16m1((_Float16*)ptr, __riscv_vfncvt_f_f_w_f16m1(v, VTraits::vlanes()), VTraits::vlanes()); } #else inline v_float32 v_load_expand(const hfloat* ptr) @@ -2113,20 +2069,12 @@ inline v_int16 v_round(const v_float16& a) inline v_int16 v_floor(const v_float16& a) { -#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999 - return __riscv_vfcvt_x_f_v_i16m1_rm(a, 1 /*RNE, round-to-nearest-even*/, VTraits::vlanes()); -#else - return __riscv_vfcvt_x(vfsub(a, 0.5f - 1e-5, VTraits::vlanes()), VTraits::vlanes()); -#endif + return __riscv_vfcvt_x_f_v_i16m2_rm(a, 1 /*RNE, round-to-nearest-even*/, VTraits::vlanes()); } inline v_int16 v_ceil(const v_float16& a) { -#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999 - return __riscv_vfcvt_x_f_v_i16m1_rm(a, 3 /*ROD, 
round-to-odd*/, VTraits::vlanes()); -#else - return __riscv_vfcvt_x(vfadd(a, 0.5f - 1e-5, VTraits::vlanes()), VTraits::vlanes()); -#endif + return __riscv_vfcvt_x_f_v_i16m2_rm(a, 3 /*ROD, round-to-odd*/, VTraits::vlanes()); } inline v_int16 v_trunc(const v_float16& a) @@ -2158,29 +2106,29 @@ inline v_int32 v_trunc(const v_float32& a) #if CV_SIMD_SCALABLE_64F inline v_int32 v_round(const v_float64& a) { - return __riscv_vfncvt_x(__riscv_vlmul_ext_f64m2(a), VTraits::vlanes()); + return __riscv_vfncvt_x(__riscv_vlmul_ext_f64m4(a), VTraits::vlanes()); } inline v_int32 v_round(const v_float64& a, const v_float64& b) { // return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits::vlanes())), 1, b), VTraits::vlanes()); // Fix https://github.com/opencv/opencv/issues/24746 - return __riscv_vfncvt_x(__riscv_vset(__riscv_vlmul_ext_f64m2(a), 1, b), VTraits::vlanes()); + return __riscv_vfncvt_x(__riscv_vset(__riscv_vlmul_ext_f64m4(a), 1, b), VTraits::vlanes()); } inline v_int32 v_floor(const v_float64& a) { - return __riscv_vfncvt_x(__riscv_vlmul_ext_f64m2(__riscv_vfsub(a, 0.5f - 1e-6, VTraits::vlanes())), VTraits::vlanes()); + return __riscv_vfncvt_x(__riscv_vlmul_ext_f64m4(__riscv_vfsub(a, 0.5f - 1e-6, VTraits::vlanes())), VTraits::vlanes()); } inline v_int32 v_ceil(const v_float64& a) { - return __riscv_vfncvt_x(__riscv_vlmul_ext_f64m2(__riscv_vfadd(a, 0.5f - 1e-6, VTraits::vlanes())), VTraits::vlanes()); + return __riscv_vfncvt_x(__riscv_vlmul_ext_f64m4(__riscv_vfadd(a, 0.5f - 1e-6, VTraits::vlanes())), VTraits::vlanes()); } inline v_int32 v_trunc(const v_float64& a) { - return __riscv_vfncvt_rtz_x(__riscv_vlmul_ext_f64m2(a), VTraits::vlanes()); + return __riscv_vfncvt_rtz_x(__riscv_vlmul_ext_f64m4(a), VTraits::vlanes()); } #endif @@ -2189,154 +2137,154 @@ inline v_int32 v_trunc(const v_float64& a) // 16 >> 32 inline v_int32 v_dotprod(const v_int16& a, const v_int16& b) { - vint32m2_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); + vint32m4_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); return v_hadd(temp1); } inline v_int32 v_dotprod(const v_int16& a, const v_int16& b, const v_int32& c) { - vint32m2_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); + vint32m4_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); return __riscv_vadd(v_hadd(temp1), c, VTraits::vlanes()); } // 32 >> 64 inline v_int64 v_dotprod(const v_int32& a, const v_int32& b) { - vuint64m1_t one64 = __riscv_vmv_v_x_u64m1(1, VTraits::vlanes()); \ - vuint32m1_t one32 = __riscv_vreinterpret_u32m1(one64); \ - vbool32_t mask = __riscv_vmseq(one32, 1, VTraits::vlanes()); \ - vint64m2_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); \ - vint64m2_t temp2 = __riscv_vslide1down(temp1, 0, VTraits::vlanes()); - vint64m2_t res = __riscv_vadd(temp1, temp2, VTraits::vlanes()); + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint32m2_t one32 = __riscv_vreinterpret_u32m2(one64); \ + vbool16_t mask = __riscv_vmseq(one32, 1, VTraits::vlanes()); \ + vint64m4_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint64m4_t temp2 = __riscv_vslide1down(temp1, 0, VTraits::vlanes()); + vint64m4_t res = __riscv_vadd(temp1, temp2, VTraits::vlanes()); res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vlmul_trunc_i64m1(res); \ + return __riscv_vlmul_trunc_i64m2(res); \ } inline v_int64 v_dotprod(const v_int32& a, const v_int32& b, const v_int64& c) { - vuint64m1_t one64 = __riscv_vmv_v_x_u64m1(1, VTraits::vlanes()); \ - vuint32m1_t one32 = __riscv_vreinterpret_u32m1(one64); \ - vbool32_t mask = 
__riscv_vmseq(one32, 1, VTraits::vlanes()); \ - vint64m2_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); \ - vint64m2_t temp2 = __riscv_vslide1down(temp1, 0, VTraits::vlanes()); - vint64m2_t res = __riscv_vadd(temp1, temp2, VTraits::vlanes()); + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint32m2_t one32 = __riscv_vreinterpret_u32m2(one64); \ + vbool16_t mask = __riscv_vmseq(one32, 1, VTraits::vlanes()); \ + vint64m4_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint64m4_t temp2 = __riscv_vslide1down(temp1, 0, VTraits::vlanes()); + vint64m4_t res = __riscv_vadd(temp1, temp2, VTraits::vlanes()); res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vadd(__riscv_vlmul_trunc_i64m1(res), c, VTraits::vlanes()); \ + return __riscv_vadd(__riscv_vlmul_trunc_i64m2(res), c, VTraits::vlanes()); \ } // 8 >> 32 inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b) { - vuint32m1_t one32 = __riscv_vmv_v_x_u32m1(1, VTraits::vlanes()); \ - vuint8m1_t one8 = __riscv_vreinterpret_u8m1(one32); \ - vbool8_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ - vuint16m2_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ - vuint16m2_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); - vuint16m2_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); - vuint16m2_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); - vuint32m4_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + vuint32m2_t one32 = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t one8 = __riscv_vreinterpret_u8m2(one32); \ + vbool4_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ + vuint16m4_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ + vuint16m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vuint16m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vuint16m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vuint32m8_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vlmul_trunc_u32m1(res); + return __riscv_vlmul_trunc_u32m2(res); } inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b, const v_uint32& c) { - vuint32m1_t one32 = __riscv_vmv_v_x_u32m1(1, VTraits::vlanes()); \ - vuint8m1_t one8 = __riscv_vreinterpret_u8m1(one32); \ - vbool8_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ - vuint16m2_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ - vuint16m2_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); - vuint16m2_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); - vuint16m2_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); - vuint32m4_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + vuint32m2_t one32 = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t one8 = __riscv_vreinterpret_u8m2(one32); \ + vbool4_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ + vuint16m4_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ + vuint16m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vuint16m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vuint16m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vuint32m8_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); res = 
__riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vadd(__riscv_vlmul_trunc_u32m1(res), c, VTraits::vlanes()); + return __riscv_vadd(__riscv_vlmul_trunc_u32m2(res), c, VTraits::vlanes()); } inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b) { - vuint32m1_t one32 = __riscv_vmv_v_x_u32m1(1, VTraits::vlanes()); \ - vuint8m1_t one8 = __riscv_vreinterpret_u8m1(one32); \ - vbool8_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ - vint16m2_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ - vint16m2_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); - vint16m2_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); - vint16m2_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); - vint32m4_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + vuint32m2_t one32 = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t one8 = __riscv_vreinterpret_u8m2(one32); \ + vbool4_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ + vint16m4_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint16m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vint16m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vint16m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vint32m8_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vlmul_trunc_i32m1(res); + return __riscv_vlmul_trunc_i32m2(res); } inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b, const v_int32& c) { - vuint32m1_t one32 = __riscv_vmv_v_x_u32m1(1, VTraits::vlanes()); \ - vuint8m1_t one8 = __riscv_vreinterpret_u8m1(one32); \ - vbool8_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ - vint16m2_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ - vint16m2_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); - vint16m2_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); - vint16m2_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); - vint32m4_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + vuint32m2_t one32 = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t one8 = __riscv_vreinterpret_u8m2(one32); \ + vbool4_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ + vint16m4_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint16m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vint16m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vint16m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vint32m8_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vadd(__riscv_vlmul_trunc_i32m1(res), c, VTraits::vlanes()); + return __riscv_vadd(__riscv_vlmul_trunc_i32m2(res), c, VTraits::vlanes()); } // // 16 >> 64 inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b) { - vuint64m1_t one64 = __riscv_vmv_v_x_u64m1(1, VTraits::vlanes()); \ - vuint16m1_t one16 = __riscv_vreinterpret_u16m1(one64); \ - vbool16_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ - vuint32m2_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ - vuint32m2_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); - vuint32m2_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); - vuint32m2_t t3= 
__riscv_vslide1down(t2, 0, VTraits::vlanes()); - vuint64m4_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint16m2_t one16 = __riscv_vreinterpret_u16m2(one64); \ + vbool8_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ + vuint32m4_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ + vuint32m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vuint32m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vuint32m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vuint64m8_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vlmul_trunc_u64m1(res); + return __riscv_vlmul_trunc_u64m2(res); } inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b, const v_uint64& c) { - vuint64m1_t one64 = __riscv_vmv_v_x_u64m1(1, VTraits::vlanes()); \ - vuint16m1_t one16 = __riscv_vreinterpret_u16m1(one64); \ - vbool16_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ - vuint32m2_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ - vuint32m2_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); - vuint32m2_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); - vuint32m2_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); - vuint64m4_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint16m2_t one16 = __riscv_vreinterpret_u16m2(one64); \ + vbool8_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ + vuint32m4_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ + vuint32m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vuint32m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vuint32m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vuint64m8_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vadd(__riscv_vlmul_trunc_u64m1(res), c, VTraits::vlanes()); + return __riscv_vadd(__riscv_vlmul_trunc_u64m2(res), c, VTraits::vlanes()); } inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b) { - vuint64m1_t one64 = __riscv_vmv_v_x_u64m1(1, VTraits::vlanes()); \ - vuint16m1_t one16 = __riscv_vreinterpret_u16m1(one64); \ - vbool16_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ - vint32m2_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ - vint32m2_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); - vint32m2_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); - vint32m2_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); - vint64m4_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint16m2_t one16 = __riscv_vreinterpret_u16m2(one64); \ + vbool8_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ + vint32m4_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint32m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vint32m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vint32m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vint64m8_t res = 
__riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vlmul_trunc_i64m1(res); + return __riscv_vlmul_trunc_i64m2(res); } inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b, const v_int64& c) { - vuint64m1_t one64 = __riscv_vmv_v_x_u64m1(1, VTraits::vlanes()); \ - vuint16m1_t one16 = __riscv_vreinterpret_u16m1(one64); \ - vbool16_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ - vint32m2_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ - vint32m2_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); - vint32m2_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); - vint32m2_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); - vint64m4_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint16m2_t one16 = __riscv_vreinterpret_u16m2(one64); \ + vbool8_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ + vint32m4_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint32m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vint32m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vint32m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vint64m8_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ - return __riscv_vadd(__riscv_vlmul_trunc_i64m1(res), c, VTraits::vlanes()); + return __riscv_vadd(__riscv_vlmul_trunc_i64m2(res), c, VTraits::vlanes()); } // // 32 >> 64f @@ -2352,70 +2300,72 @@ inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b, // 16 >> 32 inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b) { - v_int32 zero = v_setzero_s32(); - return __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_i32m2(0, VTraits::vlanes()), 0, __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); } inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b, const v_int32& c) { - v_int32 zero = v_setzero_s32(); - return __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), __riscv_vredsum_tu(zero,c, zero, VTraits::vlanes()), VTraits::vlanes()); + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_i32m2(0, VTraits::vlanes()), 0, __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); } // 32 >> 64 inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b) { - v_int64 zero = v_setzero_s64(); - return __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_i64m2(0, VTraits::vlanes()), 0, __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); } inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b, const v_int64& c) { - v_int64 zero = v_setzero_s64(); - return __riscv_vadd(__riscv_vredsum_tu(zero,__riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes()) , __riscv_vredsum_tu(zero,c, zero, VTraits::vlanes()), 
VTraits::vlanes()); + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_i64m2(0, VTraits::vlanes()), 0, __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); } // 8 >> 32 inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b) { - v_uint32 zero = v_setzero_u32(); - return __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, VTraits::vlanes()); + auto res = __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_u32m2(0, VTraits::vlanes()), 0, res); } inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b, const v_uint32& c) { - v_uint32 zero = v_setzero_u32(); - return __riscv_vadd(__riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes()) , __riscv_vredsum_tu(zero, c, zero, VTraits::vlanes()), VTraits::vlanes()); + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, VTraits::vlanes()); + auto res = __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_u32m2(0, VTraits::vlanes()), 0, res), VTraits::vlanes()); } inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b) { - v_int32 zero = v_setzero_s32(); - return __riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_i32m2(0, VTraits::vlanes()), 0, __riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); } inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b, const v_int32& c) { - v_int32 zero = v_setzero_s32(); - return __riscv_vadd(__riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes()) , __riscv_vredsum_tu(zero,c, zero, VTraits::vlanes()), VTraits::vlanes()); + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_i32m2(0, VTraits::vlanes()), 0, __riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); } // 16 >> 64 inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b) { - v_uint64 zero = v_setzero_u64(); - return __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_u64m2(0, VTraits::vlanes()), 0, __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); } inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b, const v_uint64& c) { - v_uint64 zero = v_setzero_u64(); - return __riscv_vadd(__riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes()), __riscv_vredsum_tu(zero,c, zero, VTraits::vlanes()), VTraits::vlanes()); + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_u64m2(0, VTraits::vlanes()), 0, __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); } inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b) { - v_int64 zero = v_setzero_s64(); - return 
__riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_i64m2(0, VTraits::vlanes()), 0, __riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); } inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b, const v_int64& c) { - v_int64 zero = v_setzero_s64(); - return __riscv_vadd(__riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes()), __riscv_vredsum_tu(zero, c, zero, VTraits::vlanes()), VTraits::vlanes()); + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_i64m2(0, VTraits::vlanes()), 0, __riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); } // 32 >> 64f @@ -2433,15 +2383,15 @@ inline v_float16 v_matmul( const v_float16 &v, const v_float16 &m2, const v_float16 &m3, const v_float16 &m4, const v_float16 &m5, const v_float16 &m6, const v_float16 &m7) { - vfloat16m1_t res; - res = __riscv_vfmul_vf_f16m1(m0, (_Float16)v_extract_n(v, 0), VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 1), m1, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 2), m2, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 3), m3, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 4), m4, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 5), m5, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 6), m6, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 7), m7, VTraits::vlanes()); + vfloat16m2_t res; + res = __riscv_vfmul_vf_f16m2(m0, (_Float16)v_extract_n(v, 0), VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 1), m1, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 2), m2, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 3), m3, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 4), m4, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 5), m5, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 6), m6, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 7), m7, VTraits::vlanes()); return res; } inline v_float16 v_matmuladd( const v_float16 &v, @@ -2450,44 +2400,51 @@ inline v_float16 v_matmuladd( const v_float16 &v, const v_float16 &m4, const v_float16 &m5, const v_float16 &m6, const v_float16 &a) { - vfloat16m1_t res; - res = __riscv_vfmul_vf_f16m1(m0, (_Float16)v_extract_n(v, 0), VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 1), m1, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 2), m2, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 3), m3, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 4), m4, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 5), m5, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 6), m6, VTraits::vlanes()); + vfloat16m2_t res; + res = __riscv_vfmul_vf_f16m2(m0, (_Float16)v_extract_n(v, 0), VTraits::vlanes()); 
+ res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 1), m1, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 2), m2, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 3), m3, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 4), m4, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 5), m5, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f16m2(res, (_Float16)v_extract_n(v, 6), m6, VTraits::vlanes()); return __riscv_vfadd(res, a, VTraits::vlanes()); } #endif -inline v_float32 v_matmul(const v_float32& v, const v_float32& m0, - const v_float32& m1, const v_float32& m2, - const v_float32& m3) +inline v_float32 v_matmul(const v_float32& v, const v_float32& mat0, + const v_float32& mat1, const v_float32& mat2, + const v_float32& mat3) { - vfloat32m1_t res; - res = __riscv_vfmul_vf_f32m1(m0, v_extract_n(v, 0), VTraits::vlanes()); - res = __riscv_vfmacc_vf_f32m1(res, v_extract_n(v, 1), m1, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f32m1(res, v_extract_n(v, 2), m2, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f32m1(res, v_extract_n(v, 3), m3, VTraits::vlanes()); + vfloat32m2_t res; + res = __riscv_vfmul_vf_f32m2(mat0, v_extract_n(v, 0), VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v, 1), mat1, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v, 2), mat2, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v, 3), mat3, VTraits::vlanes()); return res; } // TODO: only 128 bit now. -inline v_float32 v_matmuladd(const v_float32& v, const v_float32& m0, - const v_float32& m1, const v_float32& m2, +inline v_float32 v_matmuladd(const v_float32& v, const v_float32& mat0, + const v_float32& mat1, const v_float32& mat2, const v_float32& a) { - vfloat32m1_t res = __riscv_vfmul_vf_f32m1(m0, v_extract_n(v,0), VTraits::vlanes()); - res = __riscv_vfmacc_vf_f32m1(res, v_extract_n(v,1), m1, VTraits::vlanes()); - res = __riscv_vfmacc_vf_f32m1(res, v_extract_n(v,2), m2, VTraits::vlanes()); + vfloat32m2_t res = __riscv_vfmul_vf_f32m2(mat0, v_extract_n(v,0), VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v,1), mat1, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v,2), mat2, VTraits::vlanes()); return __riscv_vfadd(res, a, VTraits::vlanes()); } inline void v_cleanup() {} #include "intrin_math.hpp" +#if CV_SIMD_SCALABLE_FP16 +inline v_float16 v_exp(const v_float16& x) { return v_exp_default_16f(x); } +inline v_float16 v_log(const v_float16& x) { return v_log_default_16f(x); } +inline void v_sincos(const v_float16& x, v_float16& s, v_float16& c) { v_sincos_default_16f(x, s, c); } +inline v_float16 v_sin(const v_float16& x) { return v_sin_default_16f(x); } +inline v_float16 v_cos(const v_float16& x) { return v_cos_default_16f(x); } +#endif inline v_float32 v_exp(const v_float32& x) { return v_exp_default_32f(x); } inline v_float32 v_log(const v_float32& x) { return v_log_default_32f(x); } inline void v_sincos(const v_float32& x, v_float32& s, v_float32& c) { v_sincos_default_32f(x, s, c); } diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp index ce3a48799e..6f77adc68b 100644 --- a/modules/core/src/matmul.simd.hpp +++ b/modules/core/src/matmul.simd.hpp @@ -1595,7 +1595,10 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, static void transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn 
)
{
-#if (CV_SIMD || CV_SIMD_SCALABLE) && !defined(__aarch64__) && !defined(_M_ARM64)
+// Disabled for RISC-V Vector (scalable), because:
+// 1. v_matmuladd for RVV only supports 128-bit vectors (it is not scalable), which fails the `Core_Transform.accuracy` test.
+// 2. Both gcc and clang can auto-vectorize this loop, with better performance than the Universal Intrinsics version.
+#if (CV_SIMD || CV_SIMD_SCALABLE) && !defined(__aarch64__) && !defined(_M_ARM64) && !(CV_TRY_RVV && CV_RVV)
    int x = 0;

    if( scn == 3 && dcn == 3 )
    {
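Note on point 2 of the comment above: the code below is a minimal illustrative sketch, not part of this patch and not the actual OpenCV fallback. The function name is made up, and it assumes the usual 3x(3+1) row-major matrix layout of cv::transform; it only shows the kind of scalar scn == 3, dcn == 3 loop that gcc and clang are expected to auto-vectorize on RVV targets.

// Hypothetical illustration only -- not part of this patch.
// Scalar form of the 3-channel transform with a 3x4 row-major matrix m:
// dst = m * [src0, src1, src2, 1]^T for each pixel.
static void transform_32f_scalar_3x3(const float* src, float* dst,
                                     const float* m, int len)
{
    for (int x = 0; x < len; x++, src += 3, dst += 3)
    {
        const float s0 = src[0], s1 = src[1], s2 = src[2];
        dst[0] = m[0]*s0 + m[1]*s1 + m[2]*s2  + m[3];
        dst[1] = m[4]*s0 + m[5]*s1 + m[6]*s2  + m[7];
        dst[2] = m[8]*s0 + m[9]*s1 + m[10]*s2 + m[11];
    }
}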