fix exp, log | enable ui for log | strengthen test

Co-authored-by: Liutong HAN <liutong2020@iscas.ac.cn>
commit 0fed1fa184 (parent 524d8ae01c)
Author: GenshinImpactStarts
Date:   2025-03-06 10:00:35 +00:00
6 changed files with 39 additions and 15 deletions

View File

@@ -123,8 +123,8 @@ inline int exp32f(const float* src, float* dst, int _len)
         vl = __riscv_vsetvl_e32m4(len);
         auto x0 = __riscv_vle32_v_f32m4(src, vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_min_val, __riscv_vmflt(x0, detail::exp_min_val, vl), vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_max_val, __riscv_vmfgt(x0, detail::exp_max_val, vl), vl);
+        x0 = __riscv_vfmax(x0, detail::exp_min_val, vl);
+        x0 = __riscv_vfmin(x0, detail::exp_max_val, vl);
         x0 = __riscv_vfmul(x0, detail::exp_prescale, vl);
         auto xi = __riscv_vfcvt_rtz_x_f_v_i32m4(x0, vl);
@@ -133,8 +133,8 @@ inline int exp32f(const float* src, float* dst, int _len)
         auto t = __riscv_vsra(xi, detail::exp_scale, vl);
         t = __riscv_vadd(t, 127, vl);
-        t = __riscv_vmerge(t, 0, __riscv_vmslt(t, 0, vl), vl);
-        t = __riscv_vand(t, 255, vl);
+        t = __riscv_vmax(t, 0, vl);
+        t = __riscv_vmin(t, 255, vl);
         auto buf = __riscv_vreinterpret_f32m4(__riscv_vsll(t, 23, vl));
         auto _xi = __riscv_vreinterpret_u32m4(xi);
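A scalar sketch of what the two exp32f hunks above change, hedged: the constant values and the NaN remark are my reading of the RVV fmax/fmin semantics, not something the commit states. __riscv_vfmax/__riscv_vfmin clamp in one instruction each and return the non-NaN operand when one input is NaN, whereas the old vfmerge clamp let NaN lanes pass through; likewise vmax/vmin saturate the biased exponent where vand(t, 255) would wrap it.

    #include <algorithm>
    #include <cmath>

    // Placeholders standing in for detail::exp_min_val / detail::exp_max_val
    // (assumed values, not the real constants).
    static const float exp_min_val = -88.0f;
    static const float exp_max_val =  88.0f;

    float clamp_input(float x)
    {
        x = std::fmax(x, exp_min_val);        // like __riscv_vfmax: a NaN lane becomes exp_min_val
        return std::fmin(x, exp_max_val);     // like __riscv_vfmin
    }

    int clamp_exponent(int t)
    {
        return std::min(std::max(t, 0), 255); // saturate; the old (t < 0 ? 0 : t) & 255 wrapped t > 255
    }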
@@ -158,6 +158,7 @@ inline int exp64f(const double* src, double* dst, int _len)
 {
     size_t vl = __riscv_vsetvlmax_e64m4();
     // all vector registers are used up, so not load more constants
+    auto exp_a2 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a2, vl);
     auto exp_a3 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a3, vl);
     auto exp_a4 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a4, vl);
     auto exp_a5 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a5, vl);
@@ -166,8 +167,8 @@ inline int exp64f(const double* src, double* dst, int _len)
         vl = __riscv_vsetvl_e64m4(len);
         auto x0 = __riscv_vle64_v_f64m4(src, vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_min_val, __riscv_vmflt(x0, detail::exp_min_val, vl), vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_max_val, __riscv_vmfgt(x0, detail::exp_max_val, vl), vl);
+        x0 = __riscv_vfmax(x0, detail::exp_min_val, vl);
+        x0 = __riscv_vfmin(x0, detail::exp_max_val, vl);
         x0 = __riscv_vfmul(x0, detail::exp_prescale, vl);
         auto xi = __riscv_vfcvt_rtz_x_f_v_i64m4(x0, vl);
@@ -176,8 +177,8 @@ inline int exp64f(const double* src, double* dst, int _len)
         auto t = __riscv_vsra(xi, detail::exp_scale, vl);
         t = __riscv_vadd(t, 1023, vl);
-        t = __riscv_vmerge(t, 0, __riscv_vmslt(t, 0, vl), vl);
-        t = __riscv_vand(t, 2047, vl);
+        t = __riscv_vmax(t, 0, vl);
+        t = __riscv_vmin(t, 2047, vl);
         auto buf = __riscv_vreinterpret_f64m4(__riscv_vsll(t, 52, vl));
         auto _xi = __riscv_vreinterpret_u64m4(xi);
@@ -186,7 +187,7 @@ inline int exp64f(const double* src, double* dst, int _len)
         auto res = __riscv_vfmul(buf, tab_v, vl);
         auto xn = __riscv_vfadd(__riscv_vfmul(x0, detail::exp64f_a0, vl), detail::exp64f_a1, vl);
-        xn = __riscv_vfadd(__riscv_vfmul(x0, xn, vl), detail::exp64f_a2, vl);
+        xn = __riscv_vfmadd(xn, x0, exp_a2, vl);
         xn = __riscv_vfmadd(xn, x0, exp_a3, vl);
         xn = __riscv_vfmadd(xn, x0, exp_a4, vl);
         xn = __riscv_vfmadd(xn, x0, exp_a5, vl);
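In scalar terms the changed polynomial step computes the same value but fused, which is why the extra exp_a2 broadcast is loaded above. A minimal sketch, assuming a2..a5 map onto detail::exp64f_a2..a5:

    #include <cmath>

    // Horner-style tail of the exp64f polynomial after the change: the a2 step now
    // uses an fma like the a3..a5 steps, instead of a separate multiply then add.
    double poly_tail(double x0, double xn,
                     double a2, double a3, double a4, double a5)
    {
        xn = std::fma(xn, x0, a2);   // was: xn = (x0 * xn) + a2, with an extra intermediate rounding
        xn = std::fma(xn, x0, a3);
        xn = std::fma(xn, x0, a4);
        xn = std::fma(xn, x0, a5);
        return xn;
    }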

View File

@@ -324,7 +324,7 @@ inline int log32f(const float* src, float* dst, int _len)
         tab_v = __riscv_vluxei32(detail::log_tab_32f, __riscv_vadd(idx, 4, vl), vl);
         auto buf_f = __riscv_vreinterpret_f32m4(buf_i);
         auto x0 = __riscv_vfmul(__riscv_vfsub(buf_f, 1.f, vl), tab_v, vl);
-        x0 = __riscv_vfsub(__riscv_vmseq(idx, (uint32_t)510 * 4, vl), x0, 1.f / 512, vl);
+        x0 = __riscv_vfsub_mu(__riscv_vmseq(idx, (uint32_t)510 * 4, vl), x0, x0, 1.f / 512, vl);
         auto res = __riscv_vfadd(__riscv_vfmul(x0, detail::log32f_a0, vl), detail::log32f_a1, vl);
         res = __riscv_vfmadd(res, x0, log_a2, vl);
@@ -361,7 +361,7 @@ inline int log64f(const double* src, double* dst, int _len)
         tab_v = __riscv_vluxei64(detail::log_tab_64f, __riscv_vadd(idx, 8, vl), vl);
         auto buf_f = __riscv_vreinterpret_f64m4(buf_i);
         auto x0 = __riscv_vfmul(__riscv_vfsub(buf_f, 1.0, vl), tab_v, vl);
-        x0 = __riscv_vfsub(__riscv_vmseq(idx, (uint64_t)510 * 8, vl), x0, 1. / 512, vl);
+        x0 = __riscv_vfsub_mu(__riscv_vmseq(idx, (uint64_t)510 * 8, vl), x0, x0, 1. / 512, vl);
         auto res = __riscv_vfadd(__riscv_vfmul(x0, detail::log64f_a0, vl), detail::log64f_a1, vl);
         res = __riscv_vfadd(__riscv_vfmul(x0, res, vl), detail::log64f_a2, vl);
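The log fix swaps the plain masked subtract for its _mu (mask-undisturbed) form, which takes an explicit merge source: with the old call, lanes whose index did not equal 510*4 (510*8 for doubles) could end up with unspecified values, while _mu keeps them equal to x0. A scalar model of the intended per-lane behaviour (the helper name is mine, not from the sources):

    // Only the lane hitting the last regular entry of log_tab_32f (byte offset 510 * 4)
    // gets the extra -1/512 correction; every other lane must keep x0, which is what
    // passing x0 as the maskedoff operand of vfsub_mu guarantees.
    inline float apply_last_entry_fixup(float x0, unsigned byte_idx)
    {
        return (byte_idx == 510u * 4u) ? x0 - 1.0f / 512 : x0;
    }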

View File

@@ -509,6 +509,28 @@ inline v_float64 v_lut(const double* tab, const v_int32& vidx) \
 #endif
+// Strangely, __riscv_vluxseg2ei32 is slower (tested on Muse-Pi and CanMV K230)
+#define OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(_Tpvec, _Tp, suffix) \
+inline void v_lut_deinterleave(const _Tp* tab, const v_int32& vidx, _Tpvec& vx, _Tpvec& vy) \
+{ \
+    v_uint32 vidx_ = __riscv_vmul(__riscv_vreinterpret_u32m2(vidx), sizeof(_Tp), VTraits<v_int32>::vlanes()); \
+    vx = __riscv_vluxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \
+    vy = __riscv_vluxei32(tab, __riscv_vadd(vidx_, sizeof(_Tp), VTraits<v_int32>::vlanes()), VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(v_float32, float, f32)
+OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(v_int32, int, i32)
+OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(v_uint32, unsigned, u32)
+#if CV_SIMD_SCALABLE_64F
+inline void v_lut_deinterleave(const double* tab, const v_int32& vidx, v_float64& vx, v_float64& vy) \
+{ \
+    vuint32m1_t vidx_ = __riscv_vmul(__riscv_vlmul_trunc_u32m1(__riscv_vreinterpret_u32m2(vidx)), sizeof(double), VTraits<v_float64>::vlanes()); \
+    vx = __riscv_vluxei32(tab, vidx_, VTraits<v_float64>::vlanes()); \
+    vy = __riscv_vluxei32(tab, __riscv_vadd(vidx_, sizeof(double), VTraits<v_int32>::vlanes()), VTraits<v_float64>::vlanes()); \
+}
+#endif
 inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
 inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
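The new v_lut_deinterleave overloads gather the pair tab[idx] and tab[idx + 1] per lane with two indexed loads (the in-code comment notes that the segment load __riscv_vluxseg2ei32 measured slower). A usage sketch, assuming the OpenCV universal-intrinsics headers are in scope; the table layout and names are illustrative, not from this commit:

    // Gather interleaved (even, odd) element pairs, e.g. a packed (value, delta) table.
    // Indices must address the even element of each pair (2 * i), because the
    // implementation fetches tab[idx[i]] into vx and tab[idx[i] + 1] into vy.
    static void gather_pairs(const float* tab,     // tab[2*i] and tab[2*i + 1] form one pair
                             const int* idxbuf,    // even element indices, e.g. 2*i
                             v_float32& v_even, v_float32& v_odd)
    {
        v_int32 vidx = vx_load(idxbuf);            // one index per lane
        // v_even[i] = tab[idx[i]], v_odd[i] = tab[idx[i] + 1]
        v_lut_deinterleave(tab, vidx, v_even, v_odd);
    }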

View File

@@ -771,7 +771,7 @@ void log32f( const float *_x, float *y, int n )
     int i = 0;
     const int* x = (const int*)_x;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int VECSZ = VTraits<v_float32>::vlanes();
     const v_float32 vln2 = vx_setall_f32((float)ln_2);
     const v_float32 v1 = vx_setall_f32(1.f);
@@ -846,7 +846,7 @@ void log64f( const double *x, double *y, int n )
     int i = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const int VECSZ = VTraits<v_float64>::vlanes();
     const v_float64 vln2 = vx_setall_f64(ln_2);
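For context, my reading of the OpenCV configuration macros (not stated in the diff): on a scalable-vector RVV build, CV_SIMD and CV_SIMD_64F are 0 while CV_SIMD_SCALABLE and CV_SIMD_SCALABLE_64F are 1, so the old guards skipped the universal-intrinsics loops entirely; widening them is what "enable ui for log" in the commit title refers to. Schematically:

    #if (CV_SIMD || CV_SIMD_SCALABLE)            // taken on both fixed-size and scalable builds
        const int VECSZ = VTraits<v_float32>::vlanes();
        // ... vectorized log32f loop, VECSZ elements per iteration ...
    #endif
        // scalar fallback / tail loop follows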

View File

@@ -1099,7 +1099,7 @@ struct ExpOp : public BaseElemWiseOp
     }
     void getValueRange(int depth, double& minval, double& maxval)
     {
-        maxval = depth == CV_32F ? 50 : 100;
+        maxval = depth == CV_32F ? 80 : 700;
         minval = -maxval;
     }
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
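The widened test range presumably stresses exp() near its overflow threshold (my inference; the commit only says "strengthen test"): exp overflows float just above log(FLT_MAX) ≈ 88.72 and double just above log(DBL_MAX) ≈ 709.78, so 80 and 700 stay inside the representable domain while exercising far larger magnitudes than the old 50 / 100. A quick check:

    #include <cfloat>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Largest finite inputs exp() can take without overflowing the result type.
        std::printf("float limit  ~ %.2f\n", std::log((double)FLT_MAX)); // ~ 88.72
        std::printf("double limit ~ %.2f\n", std::log(DBL_MAX));         // ~ 709.78
        return 0;
    }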

View File

@@ -74,7 +74,8 @@ TEST_P(mathfuncs, accuracy)
     int n = 100;
     Mat src(1, n, depth), dst(1, n, depth), dst0(1, n, depth);
-    randu(src, 1, 10);
+    double maxval = depth == CV_32F ? 80 : 700;
+    randu(src, -maxval, maxval);
     switch (nfunc)
     {