diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/exp.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/exp.hpp
index ea7f0c67d4..60efee2b8b 100644
--- a/3rdparty/hal_rvv/hal_rvv_1p0/exp.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/exp.hpp
@@ -123,8 +123,8 @@ inline int exp32f(const float* src, float* dst, int _len)
         vl = __riscv_vsetvl_e32m4(len);
         auto x0 = __riscv_vle32_v_f32m4(src, vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_min_val, __riscv_vmflt(x0, detail::exp_min_val, vl), vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_max_val, __riscv_vmfgt(x0, detail::exp_max_val, vl), vl);
+        x0 = __riscv_vfmax(x0, detail::exp_min_val, vl);
+        x0 = __riscv_vfmin(x0, detail::exp_max_val, vl);
         x0 = __riscv_vfmul(x0, detail::exp_prescale, vl);
 
         auto xi = __riscv_vfcvt_rtz_x_f_v_i32m4(x0, vl);
@@ -133,8 +133,8 @@ inline int exp32f(const float* src, float* dst, int _len)
         auto t = __riscv_vsra(xi, detail::exp_scale, vl);
         t = __riscv_vadd(t, 127, vl);
-        t = __riscv_vmerge(t, 0, __riscv_vmslt(t, 0, vl), vl);
-        t = __riscv_vand(t, 255, vl);
+        t = __riscv_vmax(t, 0, vl);
+        t = __riscv_vmin(t, 255, vl);
         auto buf = __riscv_vreinterpret_f32m4(__riscv_vsll(t, 23, vl));
 
         auto _xi = __riscv_vreinterpret_u32m4(xi);
@@ -158,6 +158,7 @@ inline int exp64f(const double* src, double* dst, int _len)
 {
     size_t vl = __riscv_vsetvlmax_e64m4();
     // all vector registers are used up, so not load more constants
+    auto exp_a2 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a2, vl);
     auto exp_a3 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a3, vl);
     auto exp_a4 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a4, vl);
     auto exp_a5 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a5, vl);
@@ -166,8 +167,8 @@ inline int exp64f(const double* src, double* dst, int _len)
         vl = __riscv_vsetvl_e64m4(len);
         auto x0 = __riscv_vle64_v_f64m4(src, vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_min_val, __riscv_vmflt(x0, detail::exp_min_val, vl), vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_max_val, __riscv_vmfgt(x0, detail::exp_max_val, vl), vl);
+        x0 = __riscv_vfmax(x0, detail::exp_min_val, vl);
+        x0 = __riscv_vfmin(x0, detail::exp_max_val, vl);
         x0 = __riscv_vfmul(x0, detail::exp_prescale, vl);
 
         auto xi = __riscv_vfcvt_rtz_x_f_v_i64m4(x0, vl);
@@ -176,8 +177,8 @@ inline int exp64f(const double* src, double* dst, int _len)
         auto t = __riscv_vsra(xi, detail::exp_scale, vl);
         t = __riscv_vadd(t, 1023, vl);
-        t = __riscv_vmerge(t, 0, __riscv_vmslt(t, 0, vl), vl);
-        t = __riscv_vand(t, 2047, vl);
+        t = __riscv_vmax(t, 0, vl);
+        t = __riscv_vmin(t, 2047, vl);
         auto buf = __riscv_vreinterpret_f64m4(__riscv_vsll(t, 52, vl));
 
         auto _xi = __riscv_vreinterpret_u64m4(xi);
@@ -186,7 +187,7 @@ inline int exp64f(const double* src, double* dst, int _len)
         auto res = __riscv_vfmul(buf, tab_v, vl);
 
         auto xn = __riscv_vfadd(__riscv_vfmul(x0, detail::exp64f_a0, vl), detail::exp64f_a1, vl);
-        xn = __riscv_vfadd(__riscv_vfmul(x0, xn, vl), detail::exp64f_a2, vl);
+        xn = __riscv_vfmadd(xn, x0, exp_a2, vl);
         xn = __riscv_vfmadd(xn, x0, exp_a3, vl);
         xn = __riscv_vfmadd(xn, x0, exp_a4, vl);
         xn = __riscv_vfmadd(xn, x0, exp_a5, vl);
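All four clamp hunks above apply the same rewrite: a compare-plus-merge pair (`vmflt`/`vmfgt` + `vfmerge`, or `vmslt` + `vmerge`/`vand`) becomes a single `vfmax`/`vfmin` (`vmax`/`vmin`), saving a mask register and one instruction per bound; the new `exp_a2` broadcast likewise lets the a2 polynomial step use a fused `vfmadd` like the later coefficients. A minimal scalar sketch of why the clamp forms agree; `kMinVal`/`kMaxVal` are illustrative stand-ins, not the actual `detail::exp_min_val`/`detail::exp_max_val` constants:

```cpp
// Scalar model of the clamp rewrites in exp32f. kMinVal/kMaxVal are
// illustrative assumptions, not the real detail::exp_min_val/exp_max_val.
#include <algorithm>
#include <cassert>

static const float kMinVal = -87.3f;  // stand-in for detail::exp_min_val
static const float kMaxVal =  88.7f;  // stand-in for detail::exp_max_val

// Old float path: two compares plus two merges (vmflt/vmfgt + vfmerge).
static float clampMerge(float x)
{
    if (x < kMinVal) x = kMinVal;
    if (x > kMaxVal) x = kMaxVal;
    return x;
}

// New float path: vfmax + vfmin. Identical for ordered (non-NaN) inputs.
static float clampMinMax(float x)
{
    return std::min(std::max(x, kMinVal), kMaxVal);
}

// Old biased-exponent path: select-to-zero, then AND (wraps above 255).
static int expoMergeAnd(int t) { return (t < 0 ? 0 : t) & 255; }

// New path: vmax + vmin. Equivalent only because the earlier clamp of x0
// guarantees t <= 255 here; past 255 the AND would wrap, vmin saturates.
static int expoMinMax(int t) { return std::min(std::max(t, 0), 255); }

int main()
{
    for (float x = -200.f; x <= 200.f; x += 0.25f)
        assert(clampMerge(x) == clampMinMax(x));
    for (int t = -300; t <= 255; t++)
        assert(expoMergeAnd(t) == expoMinMax(t));
    return 0;
}
```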
diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/log.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/log.hpp
index 4d8399ee88..02c62f4400 100644
--- a/3rdparty/hal_rvv/hal_rvv_1p0/log.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/log.hpp
@@ -324,7 +324,7 @@ inline int log32f(const float* src, float* dst, int _len)
         tab_v = __riscv_vluxei32(detail::log_tab_32f, __riscv_vadd(idx, 4, vl), vl);
         auto buf_f = __riscv_vreinterpret_f32m4(buf_i);
         auto x0 = __riscv_vfmul(__riscv_vfsub(buf_f, 1.f, vl), tab_v, vl);
-        x0 = __riscv_vfsub(__riscv_vmseq(idx, (uint32_t)510 * 4, vl), x0, 1.f / 512, vl);
+        x0 = __riscv_vfsub_mu(__riscv_vmseq(idx, (uint32_t)510 * 4, vl), x0, x0, 1.f / 512, vl);
 
         auto res = __riscv_vfadd(__riscv_vfmul(x0, detail::log32f_a0, vl), detail::log32f_a1, vl);
         res = __riscv_vfmadd(res, x0, log_a2, vl);
@@ -361,7 +361,7 @@ inline int log64f(const double* src, double* dst, int _len)
         tab_v = __riscv_vluxei64(detail::log_tab_64f, __riscv_vadd(idx, 8, vl), vl);
         auto buf_f = __riscv_vreinterpret_f64m4(buf_i);
         auto x0 = __riscv_vfmul(__riscv_vfsub(buf_f, 1.0, vl), tab_v, vl);
-        x0 = __riscv_vfsub(__riscv_vmseq(idx, (uint64_t)510 * 8, vl), x0, 1. / 512, vl);
+        x0 = __riscv_vfsub_mu(__riscv_vmseq(idx, (uint64_t)510 * 8, vl), x0, x0, 1. / 512, vl);
 
         auto res = __riscv_vfadd(__riscv_vfmul(x0, detail::log64f_a0, vl), detail::log64f_a1, vl);
         res = __riscv_vfadd(__riscv_vfmul(x0, res, vl), detail::log64f_a2, vl);
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index 7912e1fdc4..85acd09d4b 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -509,6 +509,28 @@ inline v_float64 v_lut(const double* tab, const v_int32& vidx) \
 
 #endif
 
+// Strangely, __riscv_vluxseg2ei32 is slower (tested on Muse-Pi and CanMV K230)
+#define OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(_Tpvec, _Tp, suffix) \
+inline void v_lut_deinterleave(const _Tp* tab, const v_int32& vidx, _Tpvec& vx, _Tpvec& vy) \
+{ \
+    v_uint32 vidx_ = __riscv_vmul(__riscv_vreinterpret_u32m2(vidx), sizeof(_Tp), VTraits<v_uint32>::vlanes()); \
+    vx = __riscv_vluxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \
+    vy = __riscv_vluxei32(tab, __riscv_vadd(vidx_, sizeof(_Tp), VTraits<v_uint32>::vlanes()), VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(v_float32, float, f32)
+OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(v_int32, int, i32)
+OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(v_uint32, unsigned, u32)
+
+#if CV_SIMD_SCALABLE_64F
+inline void v_lut_deinterleave(const double* tab, const v_int32& vidx, v_float64& vx, v_float64& vy) \
+{ \
+    vuint32m1_t vidx_ = __riscv_vmul(__riscv_vlmul_trunc_u32m1(__riscv_vreinterpret_u32m2(vidx)), sizeof(double), VTraits<v_float64>::vlanes()); \
+    vx = __riscv_vluxei32(tab, vidx_, VTraits<v_float64>::vlanes()); \
+    vy = __riscv_vluxei32(tab, __riscv_vadd(vidx_, sizeof(double), VTraits<v_float64>::vlanes()), VTraits<v_float64>::vlanes()); \
+}
+#endif
+
 inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
 inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp
index f92234f140..41a3261c64 100644
--- a/modules/core/src/mathfuncs_core.simd.hpp
+++ b/modules/core/src/mathfuncs_core.simd.hpp
@@ -771,7 +771,7 @@ void log32f( const float *_x, float *y, int n )
     int i = 0;
     const int* x = (const int*)_x;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int VECSZ = VTraits<v_float32>::vlanes();
     const v_float32 vln2 = vx_setall_f32((float)ln_2);
     const v_float32 v1 = vx_setall_f32(1.f);
@@ -846,7 +846,7 @@ void log64f( const double *x, double *y, int n )
 
     int i = 0;
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const int VECSZ = VTraits<v_float64>::vlanes();
     const v_float64 vln2 = vx_setall_f64(ln_2);
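Two notes on the hunks above. The log.hpp fix swaps the plain masked `__riscv_vfsub(mask, op1, op2, vl)`, whose inactive lanes come back unspecified under the mask-agnostic policy, for the `_mu` (mask-undisturbed) form, where the extra `x0` operand supplies the value that inactive lanes keep. And the new `v_lut_deinterleave` issues two plain indexed loads instead of a segment load because, per the comment in the diff, `__riscv_vluxseg2ei32` measured slower on the boards tested. A scalar reference for what those two gathers compute (`lut_deinterleave_ref` is a hypothetical helper, not OpenCV API):

```cpp
// Scalar model of v_lut_deinterleave: lane i receives the interleaved pair
// (tab[vidx[i]], tab[vidx[i] + 1]). vluxei32 takes byte offsets, hence the
// multiply by sizeof(T) and the extra +sizeof(T) for the second element.
// lut_deinterleave_ref is a hypothetical reference helper, not OpenCV API.
#include <cstddef>

template <typename T>
void lut_deinterleave_ref(const T* tab, const int* vidx, size_t n, T* vx, T* vy)
{
    for (size_t i = 0; i < n; i++)
    {
        vx[i] = tab[vidx[i]];      // byte offset vidx[i] * sizeof(T)
        vy[i] = tab[vidx[i] + 1];  // byte offset vidx[i] * sizeof(T) + sizeof(T)
    }
}
```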
diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp
index 721cf70b6d..88d646b09f 100644
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -1099,7 +1099,7 @@ struct ExpOp : public BaseElemWiseOp
     }
     void getValueRange(int depth, double& minval, double& maxval)
    {
-        maxval = depth == CV_32F ? 50 : 100;
+        maxval = depth == CV_32F ? 80 : 700;
         minval = -maxval;
     }
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
diff --git a/modules/core/test/test_hal_core.cpp b/modules/core/test/test_hal_core.cpp
index a86016d44f..6e85cce99b 100644
--- a/modules/core/test/test_hal_core.cpp
+++ b/modules/core/test/test_hal_core.cpp
@@ -74,7 +74,8 @@ TEST_P(mathfuncs, accuracy)
     int n = 100;
 
     Mat src(1, n, depth), dst(1, n, depth), dst0(1, n, depth);
-    randu(src, 1, 10);
+    double maxval = depth == CV_32F ? 80 : 700;
+    randu(src, -maxval, maxval);
 
     switch (nfunc)
     {
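The widened test ranges (80 for CV_32F and 700 for CV_64F, up from 50/100 and [1, 10]) presumably sit just inside the overflow thresholds of exp, ln(FLT_MAX) ≈ 88.72 and ln(DBL_MAX) ≈ 709.78, so the tests now cover nearly the whole finite domain without producing infinities. A quick standalone check of that arithmetic:

```cpp
// Sanity-check the new test bounds against the overflow thresholds of exp.
// ln(FLT_MAX) ~ 88.72 and ln(DBL_MAX) ~ 709.78, so 80 and 700 keep every
// generated input finite while covering far more of the domain than before.
#include <cassert>
#include <cmath>

int main()
{
    assert(std::isfinite(std::exp(80.0f)));  // exp(80)  ~ 5.5e34  < FLT_MAX ~ 3.4e38
    assert(std::isfinite(std::exp(700.0)));  // exp(700) ~ 1.0e304 < DBL_MAX ~ 1.8e308
    assert(std::isinf(std::exp(89.0f)));     // just past ln(FLT_MAX): overflows to +inf
    assert(std::isinf(std::exp(710.0)));     // just past ln(DBL_MAX): overflows to +inf
    return 0;
}
```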