mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 06:26:29 +08:00
Merge pull request #27348 from fengyuentau:4x/hal/riscv_rvv/faster_div_f32
hal/riscv-rvv: further optimize div
This commit is contained in:
commit
b8099d3cc2
@ -56,6 +56,18 @@ CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinte
|
|||||||
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin)
|
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin)
|
||||||
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin)
|
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin)
|
||||||
|
|
||||||
|
// ############ reciprocal ############
|
||||||
|
|
||||||
|
// Element-wise reciprocal 1/x of a float32 (LMUL=4) vector.
//
// Starts from the estimate returned by __riscv_vfrec7 and applies two
// Newton-Raphson refinement steps, rec <- rec * (2 - x * rec), to the lanes
// selected by the mask below.
//
// x  : input vector
// vl : number of active elements
// Returns the refined reciprocal; lanes excluded from refinement keep the raw
// vfrec7 estimate.
inline vfloat32m4_t __riscv_vfrec(const vfloat32m4_t &x, const int vl) {
    auto rec = __riscv_vfrec7(x, vl);
    auto cls = __riscv_vfclass(rec, vl);
    // vfclass result bits: 0 = -inf, 1 = -normal, 2 = -subnormal, 3 = -0,
    //                      4 = +0,   5 = +subnormal, 6 = +normal, 7 = +inf,
    //                      8 = sNaN, 9 = qNaN.
    // Refine only lanes whose estimate is none of +-inf, +-0, +-subnormal.
    // For an infinite or zero estimate the term x * rec is 0 * inf, so a
    // refinement step would replace the correct special value with NaN.
    // The mask is sign-symmetric (0b10111101): the former 0b10111000 left out
    // -inf (bit 0) and -subnormal (bit 2), so 1 / -0.0f was refined into NaN
    // instead of staying -inf.
    auto m = __riscv_vmseq(__riscv_vand(cls, 0b10111101, vl), 0, vl);
    auto two = __riscv_vfmv_v_f_f32m4(2.f, vl);
    // Masked-undisturbed multiply: lanes where m is clear keep the old rec.
    rec = __riscv_vfmul_mu(m, rec, __riscv_vfnmsac(two, x, rec, vl), rec, vl);
    rec = __riscv_vfmul_mu(m, rec, __riscv_vfnmsac(two, x, rec, vl), rec, vl);
    return rec;
}
|
||||||
|
|
||||||
// ############ atan ############
|
// ############ atan ############
|
||||||
|
|
||||||
// ref: mathfuncs_core.simd.hpp
|
// ref: mathfuncs_core.simd.hpp
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
// Third party copyrights are property of their respective owners.
|
// Third party copyrights are property of their respective owners.
|
||||||
|
|
||||||
#include "rvv_hal.hpp"
|
#include "rvv_hal.hpp"
|
||||||
|
#include "common.hpp"
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
|
||||||
namespace cv { namespace rvv_hal { namespace core {
|
namespace cv { namespace rvv_hal { namespace core {
|
||||||
@ -14,48 +15,58 @@ namespace cv { namespace rvv_hal { namespace core {
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
// setvl(l): forwards the requested element count l to the 8-bit-element
// vsetvl intrinsic and returns its result.
// Diff row: the first line is the pre-change side (LMUL=2, e8m2), the second
// line is the post-change side (LMUL=1, e8m1).
inline size_t setvl(int l) { return __riscv_vsetvl_e8m2(l); }
|
inline size_t setvl(int l) { return __riscv_vsetvl_e8m1(l); }
|
||||||
|
|
||||||
// vle(p, vl): overload set that loads vl elements from p, choosing the load
// intrinsic from the pointer's element type (uint8_t, int8_t, uint16_t,
// int16_t, int, float).
// Each diff row shows the pre-change overload first and the post-change
// overload second; the post-change side uses half the register group size
// (m2 -> m1 for 8-bit, m4 -> m2 for 16-bit, m8 -> m4 for 32-bit).
inline vuint8m2_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m2(p, vl); }
|
inline vuint8m1_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m1(p, vl); }
|
||||||
inline vint8m2_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m2(p, vl); }
|
inline vint8m1_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m1(p, vl); }
|
||||||
inline vuint16m4_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m4(p, vl); }
|
inline vuint16m2_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m2(p, vl); }
|
||||||
inline vint16m4_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m4(p, vl); }
|
inline vint16m2_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m2(p, vl); }
|
||||||
inline vint32m8_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m8(p, vl); }
|
inline vint32m4_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m4(p, vl); }
|
||||||
inline vfloat32m8_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m8(p, vl); }
|
inline vfloat32m4_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m4(p, vl); }
|
||||||
|
|
||||||
// vse(p, v, vl): overload set that stores vl elements of vector v to p via the
// overloaded __riscv_vse8 / __riscv_vse16 / __riscv_vse32 intrinsics.
// Each diff row shows the pre-change overload first and the post-change
// overload second; only the vector register group type differs
// (m2 -> m1 for 8-bit, m4 -> m2 for 16-bit, m8 -> m4 for 32-bit).
inline void vse(uint8_t *p, const vuint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); }
|
inline void vse(uint8_t *p, const vuint8m1_t &v, int vl) { __riscv_vse8(p, v, vl); }
|
||||||
inline void vse(int8_t *p, const vint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); }
|
inline void vse(int8_t *p, const vint8m1_t &v, int vl) { __riscv_vse8(p, v, vl); }
|
||||||
inline void vse(uint16_t *p, const vuint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); }
|
inline void vse(uint16_t *p, const vuint16m2_t &v, int vl) { __riscv_vse16(p, v, vl); }
|
||||||
inline void vse(int16_t *p, const vint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); }
|
inline void vse(int16_t *p, const vint16m2_t &v, int vl) { __riscv_vse16(p, v, vl); }
|
||||||
inline void vse(int *p, const vint32m8_t &v, int vl) { __riscv_vse32(p, v, vl); }
|
inline void vse(int *p, const vint32m4_t &v, int vl) { __riscv_vse32(p, v, vl); }
|
||||||
inline void vse(float *p, const vfloat32m8_t &v, int vl) { __riscv_vse32(p, v, vl); }
|
inline void vse(float *p, const vfloat32m4_t &v, int vl) { __riscv_vse32(p, v, vl); }
|
||||||
|
|
||||||
// ext(v, vl): widens each element to twice its width, using __riscv_vzext_vf2
// for unsigned inputs and __riscv_vsext_vf2 for signed inputs.
// In the 8 -> 16 bit rows the first line is the pre-change overload (m2 -> m4)
// and the second the post-change overload (m1 -> m2).
// The two 16 -> 32 bit overloads at the end have an empty second cell, i.e.
// no post-change counterpart is shown for them.
inline vuint16m4_t ext(const vuint8m2_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); }
|
inline vuint16m2_t ext(const vuint8m1_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); }
|
||||||
inline vint16m4_t ext(const vint8m2_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); }
|
inline vint16m2_t ext(const vint8m1_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); }
|
||||||
inline vuint32m8_t ext(const vuint16m4_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); }
|
|
||||||
inline vint32m8_t ext(const vint16m4_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); }
|
|
||||||
|
|
||||||
// nclip(v, vl): narrows each element to half its width through
// __riscv_vnclipu (unsigned) or __riscv_vnclip (signed), with a shift amount
// of 0 and the __RISCV_VXRM_RNU rounding mode.
// In the 16 -> 8 bit rows the first line is the pre-change overload (m4 -> m2)
// and the second the post-change overload (m2 -> m1).
// The two 32 -> 16 bit overloads at the end have an empty second cell, i.e.
// no post-change counterpart is shown for them.
inline vuint8m2_t nclip(const vuint16m4_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); }
|
inline vuint8m1_t nclip(const vuint16m2_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); }
|
||||||
inline vint8m2_t nclip(const vint16m4_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); }
|
inline vint8m1_t nclip(const vint16m2_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); }
|
||||||
inline vuint16m4_t nclip(const vuint32m8_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); }
|
|
||||||
inline vint16m4_t nclip(const vint32m8_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); }
|
|
||||||
|
|
||||||
// div_sat(v1, v2, scale, vl): element-wise v1 / v2 * scale on integer vectors.
//
// Generic template (unchanged on both sides of the diff): widens both operands
// with ext(), calls the div_sat overload for the wider type, and narrows the
// result back with nclip().
//
// Specializations convert the operands to float32, multiply f1 by the scaled
// reciprocal of f2, and convert the product back to integer.
// Rows with two code lines show the pre-change side first and the post-change
// side second; rows with a single code line exist only on the post-change side.
//   pre-change : 32-bit m8 specializations, scaled reciprocal computed as
//                __riscv_vfrdiv(f2, scale, vl), i.e. scale / f2.
//   post-change: 32-bit m4 specializations plus new 16-bit m2 specializations,
//                scaled reciprocal computed as common::__riscv_vfrec(f2) * scale.
template <typename VT> inline
|
template <typename VT> inline
|
||||||
VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) {
|
VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) {
|
||||||
return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl);
|
return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl);
|
||||||
}
|
}
|
||||||
// Pre-change signed 32-bit (m8) header paired with the post-change signed
// 16-bit (m2) header; the body lines that follow belong to the 16-bit version,
// which uses the widening convert __riscv_vfwcvt_f and the narrowing convert
// __riscv_vfncvt_x.
template <> inline
|
template <> inline
|
||||||
vint32m8_t div_sat(const vint32m8_t &v1, const vint32m8_t &v2, const float scale, const int vl) {
|
vint16m2_t div_sat(const vint16m2_t &v1, const vint16m2_t &v2, const float scale, const int vl) {
|
||||||
|
auto f1 = __riscv_vfwcvt_f(v1, vl);
|
||||||
|
auto f2 = __riscv_vfwcvt_f(v2, vl);
|
||||||
|
auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
|
||||||
|
return __riscv_vfncvt_x(res, vl);
|
||||||
|
}
|
||||||
|
// Post-change only: unsigned 16-bit (m2) specialization, converting back with
// __riscv_vfncvt_xu.
template <> inline
|
||||||
|
vuint16m2_t div_sat(const vuint16m2_t &v1, const vuint16m2_t &v2, const float scale, const int vl) {
|
||||||
|
auto f1 = __riscv_vfwcvt_f(v1, vl);
|
||||||
|
auto f2 = __riscv_vfwcvt_f(v2, vl);
|
||||||
|
auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
|
||||||
|
return __riscv_vfncvt_xu(res, vl);
|
||||||
|
}
|
||||||
|
// Post-change signed 32-bit (m4) header; the two-line body rows below pair the
// pre-change signed 32-bit (m8) body with this post-change body.
template <> inline
|
||||||
|
vint32m4_t div_sat(const vint32m4_t &v1, const vint32m4_t &v2, const float scale, const int vl) {
|
||||||
auto f1 = __riscv_vfcvt_f(v1, vl);
|
auto f1 = __riscv_vfcvt_f(v1, vl);
|
||||||
auto f2 = __riscv_vfcvt_f(v2, vl);
|
auto f2 = __riscv_vfcvt_f(v2, vl);
|
||||||
auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl);
|
auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
|
||||||
return __riscv_vfcvt_x(res, vl);
|
return __riscv_vfcvt_x(res, vl);
|
||||||
}
|
}
|
||||||
// Unsigned 32-bit specialization: pre-change m8 vs post-change m4.
template <> inline
|
template <> inline
|
||||||
vuint32m8_t div_sat(const vuint32m8_t &v1, const vuint32m8_t &v2, const float scale, const int vl) {
|
vuint32m4_t div_sat(const vuint32m4_t &v1, const vuint32m4_t &v2, const float scale, const int vl) {
|
||||||
auto f1 = __riscv_vfcvt_f(v1, vl);
|
auto f1 = __riscv_vfcvt_f(v1, vl);
|
||||||
auto f2 = __riscv_vfcvt_f(v2, vl);
|
auto f2 = __riscv_vfcvt_f(v2, vl);
|
||||||
auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl);
|
auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
|
||||||
return __riscv_vfcvt_xu(res, vl);
|
return __riscv_vfcvt_xu(res, vl);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -64,15 +75,27 @@ VT recip_sat(const VT &v, const float scale, const int vl) {
|
|||||||
return nclip(recip_sat(ext(v, vl), scale, vl), vl);
|
return nclip(recip_sat(ext(v, vl), scale, vl), vl);
|
||||||
}
|
}
|
||||||
// recip_sat(v, scale, vl) specializations: element-wise scale / v on integer
// vectors. Each converts v to float32, forms the scaled reciprocal, and
// converts the result back to integer.
// Rows with two code lines show the pre-change side first and the post-change
// side second; rows with a single code line exist only on the post-change side.
//   pre-change : 32-bit m8 specializations using __riscv_vfrdiv(f, scale, vl).
//   post-change: 32-bit m4 specializations plus new 16-bit m2 specializations
//                using common::__riscv_vfrec(f) * scale.
// Pre-change signed 32-bit (m8) header paired with the post-change signed
// 16-bit (m2) header; the body lines that follow belong to the 16-bit version
// (widening convert __riscv_vfwcvt_f, narrowing convert __riscv_vfncvt_x).
template <> inline
|
template <> inline
|
||||||
vint32m8_t recip_sat(const vint32m8_t &v, const float scale, const int vl) {
|
vint16m2_t recip_sat(const vint16m2_t &v, const float scale, const int vl) {
|
||||||
|
auto f = __riscv_vfwcvt_f(v, vl);
|
||||||
|
auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
|
||||||
|
return __riscv_vfncvt_x(res, vl);
|
||||||
|
}
|
||||||
|
// Post-change only: unsigned 16-bit (m2) specialization, converting back with
// __riscv_vfncvt_xu.
template <> inline
|
||||||
|
vuint16m2_t recip_sat(const vuint16m2_t &v, const float scale, const int vl) {
|
||||||
|
auto f = __riscv_vfwcvt_f(v, vl);
|
||||||
|
auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
|
||||||
|
return __riscv_vfncvt_xu(res, vl);
|
||||||
|
}
|
||||||
|
// Post-change signed 32-bit (m4) header; the two-line body rows below pair the
// pre-change signed 32-bit (m8) body with this post-change body.
template <> inline
|
||||||
|
vint32m4_t recip_sat(const vint32m4_t &v, const float scale, const int vl) {
|
||||||
auto f = __riscv_vfcvt_f(v, vl);
|
auto f = __riscv_vfcvt_f(v, vl);
|
||||||
auto res = __riscv_vfrdiv(f, scale, vl);
|
auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
|
||||||
return __riscv_vfcvt_x(res, vl);
|
return __riscv_vfcvt_x(res, vl);
|
||||||
}
|
}
|
||||||
// Unsigned 32-bit specialization: pre-change m8 vs post-change m4.
template <> inline
|
template <> inline
|
||||||
vuint32m8_t recip_sat(const vuint32m8_t &v, const float scale, const int vl) {
|
vuint32m4_t recip_sat(const vuint32m4_t &v, const float scale, const int vl) {
|
||||||
auto f = __riscv_vfcvt_f(v, vl);
|
auto f = __riscv_vfcvt_f(v, vl);
|
||||||
auto res = __riscv_vfrdiv(f, scale, vl);
|
auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
|
||||||
return __riscv_vfcvt_xu(res, vl);
|
return __riscv_vfcvt_xu(res, vl);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -134,7 +157,7 @@ int div(const float *src1, size_t step1, const float *src2, size_t step2,
|
|||||||
auto v1 = vle(src1_h + w, vl);
|
auto v1 = vle(src1_h + w, vl);
|
||||||
auto v2 = vle(src2_h + w, vl);
|
auto v2 = vle(src2_h + w, vl);
|
||||||
|
|
||||||
vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, 1.f, vl), vl), vl);
|
vse(dst_h + w, __riscv_vfmul(v1, common::__riscv_vfrec(v2, vl), vl), vl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -150,7 +173,7 @@ int div(const float *src1, size_t step1, const float *src2, size_t step2,
|
|||||||
auto v1 = vle(src1_h + w, vl);
|
auto v1 = vle(src1_h + w, vl);
|
||||||
auto v2 = vle(src2_h + w, vl);
|
auto v2 = vle(src2_h + w, vl);
|
||||||
|
|
||||||
vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, scale, vl), vl), vl);
|
vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfmul(common::__riscv_vfrec(v2, vl), scale, vl), vl), vl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -209,7 +232,7 @@ int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_st
|
|||||||
|
|
||||||
auto v = vle(src_h + w, vl);
|
auto v = vle(src_h + w, vl);
|
||||||
|
|
||||||
vse(dst_h + w, __riscv_vfrdiv(v, 1.f, vl), vl);
|
vse(dst_h + w, common::__riscv_vfrec(v, vl), vl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -223,7 +246,7 @@ int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_st
|
|||||||
|
|
||||||
auto v = vle(src_h + w, vl);
|
auto v = vle(src_h + w, vl);
|
||||||
|
|
||||||
vse(dst_h + w, __riscv_vfrdiv(v, scale, vl), vl);
|
vse(dst_h + w, __riscv_vfmul(common::__riscv_vfrec(v, vl), scale, vl), vl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user