Optimize the v_lut for RVV.

This commit is contained in:
Liutong HAN 2023-11-23 15:06:04 +08:00
parent 2c1ec4245d
commit ce0516282a

View File

@ -448,29 +448,7 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_floa
// OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix)
// Generates table look-up functions that gather elements of type _Tp from `tab` into a
// _Tpvec. `suffix` is the LMUL suffix of the 32-bit index vector (vuint32<suffix>_t).
//
//   v_lut       - loads VTraits<_Tpvec>::vlanes() ints from idx, reinterprets them as
//                 unsigned, multiplies them by sizeof(_Tp) and uses the result as the index
//                 operand of the vloxei32 indexed load from tab.
//   v_lut_pairs - copies idx[i] and idx[i]+1 into a temporary std::vector<uint> for
//                 i in [0, VTraits<v_int16>::vlanes()), loads VTraits<_Tpvec>::vlanes()
//                 entries of it, then scales and gathers as v_lut does.
//   v_lut_quads - same scheme with idx[i] .. idx[i]+3 and VTraits<v_int32>::vlanes()
//                 loop iterations.
//
// NOTE(review): the pairs/quads loop bounds name v_int16 / v_int32 rather than _Tpvec, and
// each call heap-allocates the temporary vector.
// NOTE(review): v_lut_quads declares `vidx` twice - first with an explicit type built from
// idx_, then with `auto` built directly from idx. This looks like the removed and the added
// lines of a patch shown together; as written the expansion would not compile. v_lut_pairs
// and v_lut_quads are also generated by OPENCV_HAL_IMPL_RVV_LUT_PAIRS / _QUADS, so confirm
// against the real header which revision of this macro is intended.
#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
{ \
vuint32##suffix##_t vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
} \
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
{ \
std::vector<uint> idx_; \
for (int i = 0; i < VTraits<v_int16>::vlanes(); ++i) { \
idx_.push_back(idx[i]); \
idx_.push_back(idx[i]+1); \
} \
vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
} \
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
{ \
std::vector<uint> idx_; \
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i) { \
idx_.push_back(idx[i]); \
idx_.push_back(idx[i]+1); \
idx_.push_back(idx[i]+2); \
idx_.push_back(idx[i]+3); \
} \
vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
auto vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
@ -482,6 +460,55 @@ OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
#endif
// OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc)
// Generates v_lut_pairs(tab, idx) for one vector type.
//   _Tpvec / _Tp : result vector type and its element type
//   suffix1      : LMUL suffix of the u32 vector holding the VTraits<_Tpvec>::vlanes()/2
//                  indices read from idx
//   suffix2      : LMUL suffix of the u32 view of those indices after widening to 64 bits
//   v_trunc      : applied to that u32 view before it is used (OPENCV_HAL_NOP, or an LMUL
//                  truncation such as vlmul_trunc_u32mf2)
// For every index i read from idx, the result holds tab[i] and tab[i + 1] in adjacent lanes.
// The expanded index list is built in registers rather than through memory: i and i + 1 are
// zero-extended to 64 bits, the 64-bit vectors are viewed as u32 (value, 0, value, 0, ...),
// the i + 1 view is slid up by one 32-bit lane and OR-ed into the zero slots of the i view.
// The merged indices are multiplied by sizeof(_Tp) before the vloxei32 indexed load.
#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
{ \
    const auto lanes = VTraits<_Tpvec>::vlanes(); \
    /* idx is only read, so keep its const qualifier when viewing it as unsigned. */ \
    auto v0 = vle32_v_u32##suffix1(reinterpret_cast<const unsigned*>(idx), lanes/2); \
    auto v1 = vadd(v0, 1, lanes/2); \
    auto w0 = vwcvtu_x(v0, lanes/2); \
    auto w1 = vwcvtu_x(v1, lanes/2); \
    auto sh1 = vslide1up(v_trunc(vreinterpret_u32##suffix2(w1)),0, lanes); \
    auto vid = vor(sh1, v_trunc(vreinterpret_u32##suffix2(w0)), lanes); \
    auto vidx = vmul(vid, sizeof(_Tp), lanes); \
    return vloxei32(tab, vidx, lanes); \
}
// v_lut_pairs instantiations. Arguments: (vector type, element type, LMUL suffix of the
// loaded u32 index vector, LMUL suffix of its widened u32 view, truncation applied to that view).
// The 64-bit element types pass vlmul_trunc_u32mf2 instead of OPENCV_HAL_NOP
// (presumably because their indexed load needs a u32mf2 index vector - confirm against the
// vloxei32 overloads).
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, vlmul_trunc_u32mf2)
// The double-precision variant is only generated when CV_SIMD_SCALABLE_64F is enabled.
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, vlmul_trunc_u32mf2)
#endif
// OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc)
// Generates v_lut_quads(tab, idx) for one vector type.
//   _Tpvec / _Tp : result vector type and its element type
//   suffix0      : LMUL suffix of the u32 vector holding the VTraits<_Tpvec>::vlanes()/4
//                  indices read from idx
//   suffix1      : LMUL suffix of the u32 view after the first widening (vlanes()/2 lanes)
//   suffix2      : LMUL suffix of the u32 view after the second widening (vlanes() lanes)
//   v_trunc      : applied to the first-stage result before it is widened again
//                  (OPENCV_HAL_NOP, or an LMUL truncation such as vlmul_trunc_u32mf2)
// For every index i read from idx, the result holds tab[i], tab[i + 1], tab[i + 2] and
// tab[i + 3] in adjacent lanes. The index list is expanded in registers in two interleave
// stages, each one done by zero-extending to 64 bits, viewing the result as u32
// (value, 0, value, 0, ...), sliding one operand up by a 32-bit lane and OR-ing:
//   stage 1: (i, i + 2, ...) and (i + 1, i + 3, ...)
//   stage 2: (i, i + 1, i + 2, i + 3, ...)
// The merged indices are multiplied by sizeof(_Tp) before the vloxei32 indexed load.
#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
{ \
    const auto lanes = VTraits<_Tpvec>::vlanes(); \
    /* idx is only read, so keep its const qualifier when viewing it as unsigned. */ \
    auto v0 = vle32_v_u32##suffix0(reinterpret_cast<const unsigned*>(idx), lanes/4); \
    auto v1 = vadd(v0, 1, lanes/4); \
    auto v2 = vadd(v0, 2, lanes/4); \
    auto v3 = vadd(v0, 3, lanes/4); \
    auto w0 = vwcvtu_x(v0, lanes/4); \
    auto w1 = vwcvtu_x(v1, lanes/4); \
    auto w2 = vwcvtu_x(v2, lanes/4); \
    auto w3 = vwcvtu_x(v3, lanes/4); \
    /* stage 1: interleave (i, i+2) and (i+1, i+3) */ \
    auto sh2 = vslide1up(vreinterpret_u32##suffix1(w2),0, lanes/2); \
    auto sh3 = vslide1up(vreinterpret_u32##suffix1(w3),0, lanes/2); \
    auto vid0 = vor(sh2, vreinterpret_u32##suffix1(w0), lanes/2); \
    auto vid1 = vor(sh3, vreinterpret_u32##suffix1(w1), lanes/2); \
    /* stage 2: interleave the two stage-1 vectors into (i, i+1, i+2, i+3) */ \
    auto wid0 = vwcvtu_x(v_trunc(vid0), lanes/2); \
    auto wid1 = vwcvtu_x(v_trunc(vid1), lanes/2); \
    auto shwid1 = vslide1up(vreinterpret_u32##suffix2(wid1),0, lanes); \
    auto vid = vor(shwid1, vreinterpret_u32##suffix2(wid0), lanes); \
    auto vidx = vmul(vid, sizeof(_Tp), lanes); \
    return vloxei32(tab, vidx, lanes); \
}
// v_lut_quads instantiations. Arguments: (vector type, element type, LMUL suffix of the
// loaded u32 index vector, LMUL suffix after the first widening, LMUL suffix after the
// second widening, truncation applied between the two widenings).
// Only 8-, 16- and 32-bit element types are instantiated here; the 32-bit ones pass
// vlmul_trunc_u32mf2 where the narrower ones pass OPENCV_HAL_NOP.
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2 , m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, vlmul_trunc_u32mf2)
#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \
inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
{ \
@ -512,7 +539,6 @@ inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_rein
// Unsigned-table overloads: each forwards to the signed overload of the same element width
// and reinterprets the gathered vector as unsigned. The table pointer is only re-typed and
// keeps its const qualifier.
inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
// NOTE(review): no 64-bit v_lut_quads is instantiated through OPENCV_HAL_IMPL_RVV_LUT_QUADS;
// confirm that a v_lut_quads(const int64_t*, const int*) overload exists for this wrapper.
inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); }
////////////// Pack boolean ////////////////////
inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b)