Merge pull request #27348 from fengyuentau:4x/hal/riscv_rvv/faster_div_f32

hal/riscv-rvv: further optimize div
2025-08-06 06:26:29 +08:00 · 2025-05-23 11:31:16 +03:00 · 2025-05-23 11:31:16 +03:00 · b8099d3cc2
commit b8099d3cc2
parent 5a457842f1 2c4eab0969
2 changed files with 68 additions and 33 deletions
--- a/hal/riscv-rvv/src/core/common.hpp
+++ b/hal/riscv-rvv/src/core/common.hpp
@ -56,6 +56,18 @@ CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinte
 CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin)
 CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin)
 // ############ reciprocal ############
 // Element-wise approximate reciprocal 1/x of a float32 (LMUL=4) vector.
 //
 // Starts from the hardware estimate __riscv_vfrec7 and refines it with two
 // Newton-Raphson steps, rec <- rec * (2 - x * rec), without using a divide
 // instruction. The result is close to, but not guaranteed to be bit-identical
 // with, a correctly rounded 1.0f / x.
 //
 // x  - input vector
 // vl - number of active elements
 // Returns the refined reciprocal; lanes whose estimate is a zero, a positive
 // subnormal or an infinity keep the raw vfrec7 estimate.
 inline vfloat32m4_t __riscv_vfrec(const vfloat32m4_t &x, const int vl) {
    auto rec = __riscv_vfrec7(x, vl);
    auto cls = __riscv_vfclass(rec, vl);
    // vfclass bits tested here: 0 = -inf, 3 = -0, 4 = +0, 5 = +subnormal,
    // 7 = +inf. `m` is set only for lanes whose estimate is in none of these
    // classes, so refinement is skipped for the listed ones. Zeros and
    // infinities must be skipped because x * rec would be 0 * inf = NaN there.
    // Bit 0 (-inf, e.g. the estimate for x == -0.0f) is part of the mask so
    // that such lanes return -inf instead of NaN, matching the +inf case.
    // NOTE(review): -subnormal (bit 2) is still refined while +subnormal is
    // not; kept as-is to avoid changing results — confirm whether intended.
    auto m = __riscv_vmseq(__riscv_vand(cls, 0b10111001, vl), 0, vl);
    auto two = __riscv_vfmv_v_f_f32m4(2.f, vl);
    // vfnmsac(two, x, rec) computes 2 - x * rec; lanes where `m` is clear
    // keep the current `rec` (mask-undisturbed multiply).
    rec = __riscv_vfmul_mu(m, rec, __riscv_vfnmsac(two, x, rec, vl), rec, vl);
    rec = __riscv_vfmul_mu(m, rec, __riscv_vfnmsac(two, x, rec, vl), rec, vl);
    return rec;
 }
 // ############ atan ############
 // ref: mathfuncs_core.simd.hpp
--- a/hal/riscv-rvv/src/core/div.cpp
+++ b/hal/riscv-rvv/src/core/div.cpp
@ -6,6 +6,7 @@
 // Third party copyrights are property of their respective owners.
 #include "rvv_hal.hpp"
 #include "common.hpp"
 #include <limits>
 namespace cv { namespace rvv_hal { namespace core {
@ -14,48 +15,58 @@ namespace cv { namespace rvv_hal { namespace core {
 namespace {
-inline size_t setvl(int l) { return __riscv_vsetvl_e8m2(l); }
 // setvl: wraps __riscv_vsetvl for 8-bit elements and returns the resulting
 // vector length for `l` requested elements. The patched (+) version asks for
 // LMUL=1 instead of LMUL=2, in step with the m1/m2/m4 register groups used
 // by the patched load/store/convert helpers in this file.
+inline size_t setvl(int l) { return __riscv_vsetvl_e8m1(l); }
-inline   vuint8m2_t vle(const uint8_t  *p, int vl) { return __riscv_vle8_v_u8m2(p, vl); }
 // vle: overloaded load helper, one overload per supported element type.
 // Each overload loads `vl` elements starting at `p` with the __riscv_vle*
 // intrinsic of matching element width. The patched (+) overloads halve the
 // register group of every type: 8-bit m2 -> m1, 16-bit m4 -> m2,
 // 32-bit m8 -> m4.
+inline   vuint8m1_t vle(const uint8_t  *p, int vl) { return __riscv_vle8_v_u8m1(p, vl); }
-inline    vint8m2_t vle(const int8_t   *p, int vl) { return __riscv_vle8_v_i8m2(p, vl); }
+inline    vint8m1_t vle(const int8_t   *p, int vl) { return __riscv_vle8_v_i8m1(p, vl); }
-inline  vuint16m4_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m4(p, vl); }
+inline  vuint16m2_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m2(p, vl); }
-inline   vint16m4_t vle(const int16_t  *p, int vl) { return __riscv_vle16_v_i16m4(p, vl); }
+inline   vint16m2_t vle(const int16_t  *p, int vl) { return __riscv_vle16_v_i16m2(p, vl); }
-inline   vint32m8_t vle(const int      *p, int vl) { return __riscv_vle32_v_i32m8(p, vl); }
+inline   vint32m4_t vle(const int      *p, int vl) { return __riscv_vle32_v_i32m4(p, vl); }
-inline vfloat32m8_t vle(const float    *p, int vl) { return __riscv_vle32_v_f32m8(p, vl); }
+inline vfloat32m4_t vle(const float    *p, int vl) { return __riscv_vle32_v_f32m4(p, vl); }
-inline void vse(uint8_t  *p, const   vuint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); }
 // vse: overloaded store helper mirroring vle. Each overload writes `vl`
 // elements of `v` to `p` through the overloaded __riscv_vse8/16/32 intrinsic.
 // The patched (+) overloads take the smaller register groups
 // (8-bit m1, 16-bit m2, 32-bit m4) produced by the patched vle overloads.
+inline void vse(uint8_t  *p, const   vuint8m1_t &v, int vl) { __riscv_vse8(p, v, vl); }
-inline void vse(int8_t   *p, const    vint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); }
+inline void vse(int8_t   *p, const    vint8m1_t &v, int vl) { __riscv_vse8(p, v, vl); }
-inline void vse(uint16_t *p, const  vuint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); }
+inline void vse(uint16_t *p, const  vuint16m2_t &v, int vl) { __riscv_vse16(p, v, vl); }
-inline void vse(int16_t  *p, const   vint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); }
+inline void vse(int16_t  *p, const   vint16m2_t &v, int vl) { __riscv_vse16(p, v, vl); }
-inline void vse(int      *p, const   vint32m8_t &v, int vl) { __riscv_vse32(p, v, vl); }
+inline void vse(int      *p, const   vint32m4_t &v, int vl) { __riscv_vse32(p, v, vl); }
-inline void vse(float    *p, const vfloat32m8_t &v, int vl) { __riscv_vse32(p, v, vl); }
+inline void vse(float    *p, const vfloat32m4_t &v, int vl) { __riscv_vse32(p, v, vl); }
-inline vuint16m4_t ext(const  vuint8m2_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); }
 // ext: widens every lane to twice its width; unsigned overloads zero-extend
 // (__riscv_vzext_vf2), signed overloads sign-extend (__riscv_vsext_vf2).
 // The patched (+) 8-bit overloads go from m1 to 16-bit m2.
+inline vuint16m2_t ext(const  vuint8m1_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); }
-inline  vint16m4_t ext(const   vint8m2_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); }
+inline  vint16m2_t ext(const   vint8m1_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); }
 // NOTE(review): the two 16-bit m4 -> 32-bit m8 overloads below do not match
 // the m2/m4 types used by the patched helpers; they look like removed lines
 // whose diff marker was lost — confirm against the upstream commit.
 inline vuint32m8_t ext(const vuint16m4_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); }
 inline  vint32m8_t ext(const  vint16m4_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); }
-inline  vuint8m2_t nclip(const vuint16m4_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); }
 // nclip: narrows every lane to half its width with the narrowing-clip
 // intrinsics (__riscv_vnclipu for unsigned, __riscv_vnclip for signed),
 // using a shift amount of 0 and the __RISCV_VXRM_RNU rounding mode.
 // The patched (+) overloads narrow 16-bit m2 to 8-bit m1.
+inline  vuint8m1_t nclip(const vuint16m2_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); }
-inline   vint8m2_t nclip(const  vint16m4_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); }
+inline   vint8m1_t nclip(const  vint16m2_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); }
 // NOTE(review): the two 32-bit m8 -> 16-bit m4 overloads below do not match
 // the m2/m4 types used by the patched helpers; they look like removed lines
 // whose diff marker was lost — confirm against the upstream commit.
 inline vuint16m4_t nclip(const vuint32m8_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); }
 inline  vint16m4_t nclip(const  vint32m8_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); }
 // Generic scaled division for narrow integer vectors: widens both operands
 // one step with ext(), divides in the wider type through the div_sat
 // overload/specialization selected for that type, then narrows the quotient
 // back to VT with nclip().
 //   v1, v2 - dividend and divisor vectors
 //   scale  - factor forwarded to the wider div_sat
 //   vl     - number of active elements
 template <typename VT> inline
 VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) {
    return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl);
 }
 // div_sat specialization for signed 16-bit lanes (patched from the former
 // vint32m8_t signature). Both operands are converted to float32 with a
 // widening convert, the quotient is formed as v1 * (approx(1/v2) * scale)
 // using common::__riscv_vfrec, and the result is converted back to int16
 // with a narrowing float-to-int convert.
 // NOTE(review): a zero divisor is not special-cased here — confirm the
 // caller handles it.
 template <> inline
-vint32m8_t div_sat(const vint32m8_t &v1, const vint32m8_t &v2, const float scale, const int vl) {
+vint16m2_t div_sat(const vint16m2_t &v1, const vint16m2_t &v2, const float scale, const int vl) {
    auto f1 = __riscv_vfwcvt_f(v1, vl);
    auto f2 = __riscv_vfwcvt_f(v2, vl);
    // Multiply by the approximate reciprocal instead of dividing.
    auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
    return __riscv_vfncvt_x(res, vl);
 }
 // div_sat specialization for unsigned 16-bit lanes. Both operands are
 // converted to float32 with a widening convert, the quotient is formed as
 // v1 * (approx(1/v2) * scale) using common::__riscv_vfrec, and the result is
 // converted back to uint16 with a narrowing float-to-unsigned convert.
 // NOTE(review): a zero divisor is not special-cased here — confirm the
 // caller handles it.
 template <> inline
 vuint16m2_t div_sat(const vuint16m2_t &v1, const vuint16m2_t &v2, const float scale, const int vl) {
    auto f1 = __riscv_vfwcvt_f(v1, vl);
    auto f2 = __riscv_vfwcvt_f(v2, vl);
    // Multiply by the approximate reciprocal instead of dividing.
    auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
    return __riscv_vfncvt_xu(res, vl);
 }
 // div_sat specialization for signed 32-bit lanes. Operands are converted to
 // float32 at the same width, the scaled quotient is computed in float32 and
 // converted back to int32.
 // NOTE(review): int32 magnitudes above 2^24 are not exactly representable in
 // float32, and a zero divisor is not special-cased here — confirm both are
 // acceptable for the callers.
 template <> inline
 vint32m4_t div_sat(const vint32m4_t &v1, const vint32m4_t &v2, const float scale, const int vl) {
    auto f1 = __riscv_vfcvt_f(v1, vl);
    auto f2 = __riscv_vfcvt_f(v2, vl);
    // The removed (-) line computed scale / f2 with __riscv_vfrdiv; the added
    // (+) line multiplies the approximate reciprocal of f2 by scale instead.
-    auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl);
+    auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
    return __riscv_vfcvt_x(res, vl);
 }
 // div_sat specialization for unsigned 32-bit lanes (patched from m8 to m4).
 // Operands are converted to float32 at the same width, the scaled quotient
 // is computed in float32 and converted back to uint32.
 // NOTE(review): uint32 values above 2^24 are not exactly representable in
 // float32, and a zero divisor is not special-cased here — confirm both are
 // acceptable for the callers.
 template <> inline
-vuint32m8_t div_sat(const vuint32m8_t &v1, const vuint32m8_t &v2, const float scale, const int vl) {
+vuint32m4_t div_sat(const vuint32m4_t &v1, const vuint32m4_t &v2, const float scale, const int vl) {
    auto f1 = __riscv_vfcvt_f(v1, vl);
    auto f2 = __riscv_vfcvt_f(v2, vl);
    // The removed (-) line computed scale / f2 with __riscv_vfrdiv; the added
    // (+) line multiplies the approximate reciprocal of f2 by scale instead.
-    auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl);
+    auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
    return __riscv_vfcvt_xu(res, vl);
 }
@ -64,15 +75,27 @@ VT recip_sat(const VT &v, const float scale, const int vl) {
    return nclip(recip_sat(ext(v, vl), scale, vl), vl);
 }
 // recip_sat specialization for signed 16-bit lanes (patched from the former
 // vint32m8_t signature). The input is converted to float32 with a widening
 // convert, approx(1/v) * scale is computed with common::__riscv_vfrec, and
 // the result is converted back to int16 with a narrowing convert.
 // NOTE(review): a zero input is not special-cased here — confirm the caller
 // handles it.
 template <> inline
-vint32m8_t recip_sat(const vint32m8_t &v, const float scale, const int vl) {
+vint16m2_t recip_sat(const vint16m2_t &v, const float scale, const int vl) {
    auto f = __riscv_vfwcvt_f(v, vl);
    auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
    return __riscv_vfncvt_x(res, vl);
 }
 // recip_sat specialization for unsigned 16-bit lanes. The input is converted
 // to float32 with a widening convert, approx(1/v) * scale is computed with
 // common::__riscv_vfrec, and the result is converted back to uint16 with a
 // narrowing float-to-unsigned convert.
 // NOTE(review): a zero input is not special-cased here — confirm the caller
 // handles it.
 template <> inline
 vuint16m2_t recip_sat(const vuint16m2_t &v, const float scale, const int vl) {
    auto f = __riscv_vfwcvt_f(v, vl);
    auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
    return __riscv_vfncvt_xu(res, vl);
 }
 // recip_sat specialization for signed 32-bit lanes. The input is converted
 // to float32 at the same width, the scaled reciprocal is computed in float32
 // and converted back to int32.
 // NOTE(review): a zero input is not special-cased here — confirm the caller
 // handles it.
 template <> inline
 vint32m4_t recip_sat(const vint32m4_t &v, const float scale, const int vl) {
    auto f = __riscv_vfcvt_f(v, vl);
    // The removed (-) line computed scale / f with __riscv_vfrdiv; the added
    // (+) line multiplies the approximate reciprocal of f by scale instead.
-    auto res = __riscv_vfrdiv(f, scale, vl);
+    auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
    return __riscv_vfcvt_x(res, vl);
 }
 // recip_sat specialization for unsigned 32-bit lanes (patched from m8 to
 // m4). The input is converted to float32 at the same width, the scaled
 // reciprocal is computed in float32 and converted back to uint32.
 // NOTE(review): a zero input is not special-cased here — confirm the caller
 // handles it.
 template <> inline
-vuint32m8_t recip_sat(const vuint32m8_t &v, const float scale, const int vl) {
+vuint32m4_t recip_sat(const vuint32m4_t &v, const float scale, const int vl) {
    auto f = __riscv_vfcvt_f(v, vl);
    // The removed (-) line computed scale / f with __riscv_vfrdiv; the added
    // (+) line multiplies the approximate reciprocal of f by scale instead.
-    auto res = __riscv_vfrdiv(f, scale, vl);
+    auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
    return __riscv_vfcvt_xu(res, vl);
 }
@ -134,7 +157,7 @@ int div(const float *src1, size_t step1, const float *src2, size_t step2,
                auto v1 = vle(src1_h + w, vl);
                auto v2 = vle(src2_h + w, vl);
-                vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, 1.f, vl), vl), vl);
+                vse(dst_h + w, __riscv_vfmul(v1, common::__riscv_vfrec(v2, vl), vl), vl);
            }
        }
    } else {
@ -150,7 +173,7 @@ int div(const float *src1, size_t step1, const float *src2, size_t step2,
                auto v1 = vle(src1_h + w, vl);
                auto v2 = vle(src2_h + w, vl);
-                vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, scale, vl), vl), vl);
+                vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfmul(common::__riscv_vfrec(v2, vl), scale, vl), vl), vl);
            }
        }
    }
@ -209,7 +232,7 @@ int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_st
                auto v = vle(src_h + w, vl);
-                vse(dst_h + w, __riscv_vfrdiv(v, 1.f, vl), vl);
+                vse(dst_h + w, common::__riscv_vfrec(v, vl), vl);
            }
        }
    } else {
@ -223,7 +246,7 @@ int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_st
                auto v = vle(src_h + w, vl);
-                vse(dst_h + w, __riscv_vfrdiv(v, scale, vl), vl);
+                vse(dst_h + w, __riscv_vfmul(common::__riscv_vfrec(v, vl), scale, vl), vl);
            }
        }
    }