fix exp, log | enable ui for log | strengthen test

Co-authored-by: Liutong HAN <liutong2020@iscas.ac.cn>
commit 0fed1fa184 (parent 524d8ae01c)
Author: GenshinImpactStarts
Date:   2025-03-06 10:00:35 +00:00
6 changed files with 39 additions and 15 deletions

View File

@@ -123,8 +123,8 @@ inline int exp32f(const float* src, float* dst, int _len)
         vl = __riscv_vsetvl_e32m4(len);
         auto x0 = __riscv_vle32_v_f32m4(src, vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_min_val, __riscv_vmflt(x0, detail::exp_min_val, vl), vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_max_val, __riscv_vmfgt(x0, detail::exp_max_val, vl), vl);
+        x0 = __riscv_vfmax(x0, detail::exp_min_val, vl);
+        x0 = __riscv_vfmin(x0, detail::exp_max_val, vl);
         x0 = __riscv_vfmul(x0, detail::exp_prescale, vl);
         auto xi = __riscv_vfcvt_rtz_x_f_v_i32m4(x0, vl);
@@ -133,8 +133,8 @@ inline int exp32f(const float* src, float* dst, int _len)
         auto t = __riscv_vsra(xi, detail::exp_scale, vl);
         t = __riscv_vadd(t, 127, vl);
-        t = __riscv_vmerge(t, 0, __riscv_vmslt(t, 0, vl), vl);
-        t = __riscv_vand(t, 255, vl);
+        t = __riscv_vmax(t, 0, vl);
+        t = __riscv_vmin(t, 255, vl);
         auto buf = __riscv_vreinterpret_f32m4(__riscv_vsll(t, 23, vl));
         auto _xi = __riscv_vreinterpret_u32m4(xi);
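A scalar sketch of what the two exp32f hunks above change, hedged: the constant values and the NaN remark are my reading of the RVV fmax/fmin semantics, not something the commit states. __riscv_vfmax/__riscv_vfmin clamp in one instruction each and return the non-NaN operand when one input is NaN, whereas the old vfmerge clamp let NaN lanes pass through; likewise vmax/vmin saturate the biased exponent where vand(t, 255) would wrap it.

    #include <algorithm>
    #include <cmath>

    // Placeholders standing in for detail::exp_min_val / detail::exp_max_val
    // (assumed values, not the real constants).
    static const float exp_min_val = -88.0f;
    static const float exp_max_val =  88.0f;

    float clamp_input(float x)
    {
        x = std::fmax(x, exp_min_val);        // like __riscv_vfmax: a NaN lane becomes exp_min_val
        return std::fmin(x, exp_max_val);     // like __riscv_vfmin
    }

    int clamp_exponent(int t)
    {
        return std::min(std::max(t, 0), 255); // saturate; the old (t < 0 ? 0 : t) & 255 wrapped t > 255
    }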
@@ -158,6 +158,7 @@ inline int exp64f(const double* src, double* dst, int _len)
 {
     size_t vl = __riscv_vsetvlmax_e64m4();
     // all vector registers are used up, so not load more constants
+    auto exp_a2 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a2, vl);
     auto exp_a3 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a3, vl);
     auto exp_a4 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a4, vl);
     auto exp_a5 = __riscv_vfmv_v_f_f64m4(detail::exp64f_a5, vl);
@@ -166,8 +167,8 @@ inline int exp64f(const double* src, double* dst, int _len)
         vl = __riscv_vsetvl_e64m4(len);
         auto x0 = __riscv_vle64_v_f64m4(src, vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_min_val, __riscv_vmflt(x0, detail::exp_min_val, vl), vl);
-        x0 = __riscv_vfmerge(x0, detail::exp_max_val, __riscv_vmfgt(x0, detail::exp_max_val, vl), vl);
+        x0 = __riscv_vfmax(x0, detail::exp_min_val, vl);
+        x0 = __riscv_vfmin(x0, detail::exp_max_val, vl);
         x0 = __riscv_vfmul(x0, detail::exp_prescale, vl);
         auto xi = __riscv_vfcvt_rtz_x_f_v_i64m4(x0, vl);
@@ -176,8 +177,8 @@ inline int exp64f(const double* src, double* dst, int _len)
         auto t = __riscv_vsra(xi, detail::exp_scale, vl);
         t = __riscv_vadd(t, 1023, vl);
-        t = __riscv_vmerge(t, 0, __riscv_vmslt(t, 0, vl), vl);
-        t = __riscv_vand(t, 2047, vl);
+        t = __riscv_vmax(t, 0, vl);
+        t = __riscv_vmin(t, 2047, vl);
         auto buf = __riscv_vreinterpret_f64m4(__riscv_vsll(t, 52, vl));
         auto _xi = __riscv_vreinterpret_u64m4(xi);
@@ -186,7 +187,7 @@ inline int exp64f(const double* src, double* dst, int _len)
         auto res = __riscv_vfmul(buf, tab_v, vl);
         auto xn = __riscv_vfadd(__riscv_vfmul(x0, detail::exp64f_a0, vl), detail::exp64f_a1, vl);
-        xn = __riscv_vfadd(__riscv_vfmul(x0, xn, vl), detail::exp64f_a2, vl);
+        xn = __riscv_vfmadd(xn, x0, exp_a2, vl);
         xn = __riscv_vfmadd(xn, x0, exp_a3, vl);
         xn = __riscv_vfmadd(xn, x0, exp_a4, vl);
         xn = __riscv_vfmadd(xn, x0, exp_a5, vl);
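In scalar terms the changed polynomial step computes the same value but fused, which is why the extra exp_a2 broadcast is loaded above. A minimal sketch, assuming a2..a5 map onto detail::exp64f_a2..a5:

    #include <cmath>

    // Horner-style tail of the exp64f polynomial after the change: the a2 step now
    // uses an fma like the a3..a5 steps, instead of a separate multiply then add.
    double poly_tail(double x0, double xn,
                     double a2, double a3, double a4, double a5)
    {
        xn = std::fma(xn, x0, a2);   // was: xn = (x0 * xn) + a2, with an extra intermediate rounding
        xn = std::fma(xn, x0, a3);
        xn = std::fma(xn, x0, a4);
        xn = std::fma(xn, x0, a5);
        return xn;
    }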

View File

@@ -324,7 +324,7 @@ inline int log32f(const float* src, float* dst, int _len)
         tab_v = __riscv_vluxei32(detail::log_tab_32f, __riscv_vadd(idx, 4, vl), vl);
         auto buf_f = __riscv_vreinterpret_f32m4(buf_i);
         auto x0 = __riscv_vfmul(__riscv_vfsub(buf_f, 1.f, vl), tab_v, vl);
-        x0 = __riscv_vfsub(__riscv_vmseq(idx, (uint32_t)510 * 4, vl), x0, 1.f / 512, vl);
+        x0 = __riscv_vfsub_mu(__riscv_vmseq(idx, (uint32_t)510 * 4, vl), x0, x0, 1.f / 512, vl);
         auto res = __riscv_vfadd(__riscv_vfmul(x0, detail::log32f_a0, vl), detail::log32f_a1, vl);
         res = __riscv_vfmadd(res, x0, log_a2, vl);
@@ -361,7 +361,7 @@ inline int log64f(const double* src, double* dst, int _len)
         tab_v = __riscv_vluxei64(detail::log_tab_64f, __riscv_vadd(idx, 8, vl), vl);
         auto buf_f = __riscv_vreinterpret_f64m4(buf_i);
         auto x0 = __riscv_vfmul(__riscv_vfsub(buf_f, 1.0, vl), tab_v, vl);
-        x0 = __riscv_vfsub(__riscv_vmseq(idx, (uint64_t)510 * 8, vl), x0, 1. / 512, vl);
+        x0 = __riscv_vfsub_mu(__riscv_vmseq(idx, (uint64_t)510 * 8, vl), x0, x0, 1. / 512, vl);
         auto res = __riscv_vfadd(__riscv_vfmul(x0, detail::log64f_a0, vl), detail::log64f_a1, vl);
         res = __riscv_vfadd(__riscv_vfmul(x0, res, vl), detail::log64f_a2, vl);
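The log fix swaps the plain masked subtract for its _mu (mask-undisturbed) form, which takes an explicit merge source: with the old call, lanes whose index did not equal 510*4 (510*8 for doubles) could end up with unspecified values, while _mu keeps them equal to x0. A scalar model of the intended per-lane behaviour (the helper name is mine, not from the sources):

    // Only the lane hitting the last regular entry of log_tab_32f (byte offset 510 * 4)
    // gets the extra -1/512 correction; every other lane must keep x0, which is what
    // passing x0 as the maskedoff operand of vfsub_mu guarantees.
    inline float apply_last_entry_fixup(float x0, unsigned byte_idx)
    {
        return (byte_idx == 510u * 4u) ? x0 - 1.0f / 512 : x0;
    }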

View File

@@ -509,6 +509,28 @@ inline v_float64 v_lut(const double* tab, const v_int32& vidx) \
 #endif
+// Strangely, __riscv_vluxseg2ei32 is slower (tested on Muse-Pi and CanMV K230)
+#define OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(_Tpvec, _Tp, suffix) \
+inline void v_lut_deinterleave(const _Tp* tab, const v_int32& vidx, _Tpvec& vx, _Tpvec& vy) \
+{ \
+    v_uint32 vidx_ = __riscv_vmul(__riscv_vreinterpret_u32m2(vidx), sizeof(_Tp), VTraits<v_int32>::vlanes()); \
+    vx = __riscv_vluxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \
+    vy = __riscv_vluxei32(tab, __riscv_vadd(vidx_, sizeof(_Tp), VTraits<v_int32>::vlanes()), VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(v_float32, float, f32)
+OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(v_int32, int, i32)
+OPENCV_HAL_IMPL_RVV_LUT_DEINTERLEAVE(v_uint32, unsigned, u32)
+#if CV_SIMD_SCALABLE_64F
+inline void v_lut_deinterleave(const double* tab, const v_int32& vidx, v_float64& vx, v_float64& vy) \
+{ \
+    vuint32m1_t vidx_ = __riscv_vmul(__riscv_vlmul_trunc_u32m1(__riscv_vreinterpret_u32m2(vidx)), sizeof(double), VTraits<v_float64>::vlanes()); \
+    vx = __riscv_vluxei32(tab, vidx_, VTraits<v_float64>::vlanes()); \
+    vy = __riscv_vluxei32(tab, __riscv_vadd(vidx_, sizeof(double), VTraits<v_int32>::vlanes()), VTraits<v_float64>::vlanes()); \
+}
+#endif
 inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
 inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
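The new v_lut_deinterleave overloads gather the pair tab[idx] and tab[idx + 1] per lane with two indexed loads (the in-code comment notes that the segment load __riscv_vluxseg2ei32 measured slower). A usage sketch, assuming the OpenCV universal-intrinsics headers are in scope; the table layout and names are illustrative, not from this commit:

    // Gather interleaved (even, odd) element pairs, e.g. a packed (value, delta) table.
    // Indices must address the even element of each pair (2 * i), because the
    // implementation fetches tab[idx[i]] into vx and tab[idx[i] + 1] into vy.
    static void gather_pairs(const float* tab,     // tab[2*i] and tab[2*i + 1] form one pair
                             const int* idxbuf,    // even element indices, e.g. 2*i
                             v_float32& v_even, v_float32& v_odd)
    {
        v_int32 vidx = vx_load(idxbuf);            // one index per lane
        // v_even[i] = tab[idx[i]], v_odd[i] = tab[idx[i] + 1]
        v_lut_deinterleave(tab, vidx, v_even, v_odd);
    }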

View File

@@ -771,7 +771,7 @@ void log32f( const float *_x, float *y, int n )
     int i = 0;
     const int* x = (const int*)_x;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int VECSZ = VTraits<v_float32>::vlanes();
     const v_float32 vln2 = vx_setall_f32((float)ln_2);
     const v_float32 v1 = vx_setall_f32(1.f);
@@ -846,7 +846,7 @@ void log64f( const double *x, double *y, int n )
     int i = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const int VECSZ = VTraits<v_float64>::vlanes();
     const v_float64 vln2 = vx_setall_f64(ln_2);
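For context, my reading of the OpenCV configuration macros (not stated in the diff): on a scalable-vector RVV build, CV_SIMD and CV_SIMD_64F are 0 while CV_SIMD_SCALABLE and CV_SIMD_SCALABLE_64F are 1, so the old guards skipped the universal-intrinsics loops entirely; widening them is what "enable ui for log" in the commit title refers to. Schematically:

    #if (CV_SIMD || CV_SIMD_SCALABLE)            // taken on both fixed-size and scalable builds
        const int VECSZ = VTraits<v_float32>::vlanes();
        // ... vectorized log32f loop, VECSZ elements per iteration ...
    #endif
        // scalar fallback / tail loop follows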

View File

@@ -1099,7 +1099,7 @@ struct ExpOp : public BaseElemWiseOp
     }
     void getValueRange(int depth, double& minval, double& maxval)
     {
-        maxval = depth == CV_32F ? 50 : 100;
+        maxval = depth == CV_32F ? 80 : 700;
         minval = -maxval;
     }
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
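The widened test range presumably stresses exp() near its overflow threshold (my inference; the commit only says "strengthen test"): exp overflows float just above log(FLT_MAX) ≈ 88.72 and double just above log(DBL_MAX) ≈ 709.78, so 80 and 700 stay inside the representable domain while exercising far larger magnitudes than the old 50 / 100. A quick check:

    #include <cfloat>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Largest finite inputs exp() can take without overflowing the result type.
        std::printf("float limit  ~ %.2f\n", std::log((double)FLT_MAX)); // ~ 88.72
        std::printf("double limit ~ %.2f\n", std::log(DBL_MAX));         // ~ 709.78
        return 0;
    }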

View File

@@ -74,7 +74,8 @@ TEST_P(mathfuncs, accuracy)
     int n = 100;
     Mat src(1, n, depth), dst(1, n, depth), dst0(1, n, depth);
-    randu(src, 1, 10);
+    double maxval = depth == CV_32F ? 80 : 700;
+    randu(src, -maxval, maxval);
     switch (nfunc)
     {