From ea0f9336e209bdda32ace351f576af0a4ad6e32f Mon Sep 17 00:00:00 2001
From: Yuantao Feng
Date: Tue, 19 Nov 2024 14:43:59 +0800
Subject: [PATCH] Merge pull request #26454 from fengyuentau/imgproc:update_warp_c4_kernels

imgproc: fix perf regressions on the c4 kernels of warpAffine / warpPerspective / remap #26454

## Performance

Previous performance regressions on the c4 kernels were mainly observed on A311D; see https://github.com/opencv/opencv/pull/26348. Regressions on the c3 kernels on Intel platforms will be fixed in a separate pull request.
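For context, each vector lane of these kernels evaluates one bilinear sample of a 4-channel pixel. A minimal scalar sketch of that computation is below; the names (`p00`, `alpha`, `out`) are illustrative and not taken from the patch, but the fused multiply-add ordering matches the `v_fma` sequences in `warp_common.vector.hpp`:

```
// Scalar reference for one 4-channel bilinear sample (illustrative only).
// p00/p01/p10/p11 are the four neighbouring source pixels; alpha/beta are
// the horizontal/vertical fractional offsets of the mapped coordinate.
static inline void bilinear_c4(const float p00[4], const float p01[4],
                               const float p10[4], const float p11[4],
                               float alpha, float beta, float out[4])
{
    for (int c = 0; c < 4; c++) {
        // Same ordering as the SIMD macros:
        // pix0 += alpha*(pix1 - pix0); pix2 += alpha*(pix3 - pix2);
        // pix0 += beta*(pix2 - pix0);
        float top = p00[c] + alpha * (p01[c] - p00[c]);
        float bot = p10[c] + alpha * (p11[c] - p10[c]);
        out[c] = top + beta * (bot - top);
    }
}
```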
M2

```
Geometric mean (ms)

Name of Test    base    patch    patch vs base (x-factor)
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.338    0.163    2.08
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.310    0.107    2.90
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.344    0.162    2.13
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.313    0.111    2.83
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.676    0.333    2.03
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.640    0.240    2.66
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    1.212    0.885    1.37
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    1.153    0.756    1.53
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.950    0.475    2.00
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    1.158    0.500    2.32
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    3.441    3.106    1.11
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    3.351    2.837    1.18
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.336    0.163    2.07
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.314    0.124    2.54
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.385    0.226    1.70
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.364    0.183    1.99
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.541    0.290    1.87
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.523    0.243    2.16
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    1.540    1.239    1.24
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    1.504    1.134    1.33
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.751    0.465    1.62
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.958    0.507    1.89
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    3.785    3.487    1.09
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    3.602    3.280    1.10
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.331    0.153    2.16
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.304    0.128    2.37
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.329    0.156    2.11
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.306    0.121    2.53
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    2.046    0.930    2.20
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    2.122    1.391    1.53
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    2.035    0.954    2.13
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    2.127    1.410    1.51
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.329    0.157    2.09
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.306    0.124    2.47
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.327    0.158    2.08
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.308    0.127    2.43
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    2.039    0.948    2.15
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    2.175    1.373    1.58
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    2.065    0.956    2.16
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    2.158    1.372    1.57
```

Intel i7-12700K:

```
Geometric mean (ms)

Name of Test    base    patch    patch vs base (x-factor)
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.140    0.051    2.77
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.140    0.054    2.57
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.140    0.050    2.78
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.143    0.054    2.64
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.297    0.118    2.51
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.296    0.130    2.28
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.481    0.304    1.58
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.470    0.309    1.52
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.381    0.184    2.07
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.811    0.781    1.04
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    1.297    1.063    1.22
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    1.275    1.171    1.09
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.135    0.057    2.36
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.134    0.062    2.16
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.155    0.076    2.04
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.150    0.079    1.90
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.229    0.114    2.02
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.227    0.120    1.89
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.560    0.444    1.26
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.529    0.442    1.20
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.326    0.192    1.70
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.805    0.762    1.06
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    1.395    1.255    1.11
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    1.381    1.306    1.06
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.138    0.049    2.81
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.134    0.053    2.53
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.137    0.049    2.79
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.134    0.053    2.51
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    1.362    1.352    1.01
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    3.124    3.038    1.03
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    1.354    1.351    1.00
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    3.142    3.049    1.03
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.140    0.052    2.70
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.136    0.056    2.43
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.139    0.051    2.70
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.135    0.056    2.41
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    1.335    1.345    0.99
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    3.117    3.024    1.03
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    1.327    1.319    1.01
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    3.126    3.026    1.03
```

A311D

```
Geometric mean (ms)

Name of Test    base    patch    patch vs base (x-factor)
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    1.762    1.361    1.29
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    2.390    2.005    1.19
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    1.747    1.238    1.41
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    2.399    2.016    1.19
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    3.917    3.104    1.26
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    5.995    5.172    1.16
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    6.711    5.460    1.23
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    8.017    6.890    1.16
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    6.269    5.596    1.12
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    10.301    9.507    1.08
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    18.871    17.375    1.09
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    20.365    18.227    1.12
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    2.083    1.514    1.38
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    2.966    2.309    1.28
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    2.358    1.715    1.37
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    3.220    2.464    1.31
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    3.763    3.014    1.25
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    5.777    4.940    1.17
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    8.791    7.819    1.12
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    10.165    8.426    1.21
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    6.047    5.293    1.14
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    9.851    9.023    1.09
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    31.739    29.323    1.08
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    32.439    29.236    1.11
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    1.759    1.441    1.22
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    2.681    2.270    1.18
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    1.774    1.425    1.24
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    2.672    2.252    1.19
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    14.079    9.334    1.51
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    17.770    16.155    1.10
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    15.872    11.192    1.42
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    19.167    15.342    1.25
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    2.284    1.545    1.48
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    3.040    2.231    1.36
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    2.280    1.380    1.65
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    2.882    2.185    1.32
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    15.877    11.381    1.40
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    19.521    16.106    1.21
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    15.950    11.532    1.38
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    19.699    16.276    1.21
```

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
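At a high level the patch does three things: it adds fixed-lane `v_load` / `v_store` / `v_load_expand` / `v_pack_u` / `v_pack_store` template overloads to the RVV scalable backend, replaces the depth-specific `*_8UC4` shuffle/interpolate macros in `warp_common.vector.hpp` with depth-parameterized `*_C4(DEPTH)` variants covering 8U/16U/32F, and routes the all-pixels-inside fast path of the 16UC4/32FC4 kernels through those fused load/interpolate/store macros. A condensed sketch of the fixed-lane helper pattern (simplified from the first hunk below; the surrounding universal-intrinsic types such as `v_float32` and `VTraits` are those of the OpenCV HAL, so this is not a standalone program):

```
// Fixed-lane load for a scalable vector: spill a zeroed vector to a stack
// buffer, copy exactly N scalars, then reload. A specialization for a known
// N (e.g. 4) lets the compiler fully unroll the scalar copy.
template <int N = VTraits<v_float32>::max_nlanes>
inline v_float32 v_load(const float* ptr)
{
    float buf[VTraits<v_float32>::max_nlanes];
    v_store(buf, v_setzero_f32());   // zero the tail lanes beyond N
    for (int i = 0; i < N; i++)
        buf[i] = ptr[i];             // copy only the N requested lanes
    return v_load(buf);
}
```

The c4 kernels then request exactly one 4-channel pixel per call, e.g. `v_load<4>(srcptr)`, regardless of the actual VLEN.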
---
 .../opencv2/core/hal/intrin_rvv_scalable.hpp  | 115 ++++++++-
 modules/imgproc/src/warp_common.vector.hpp    | 176 +++++++++++--
 modules/imgproc/src/warp_kernels.simd.hpp     | 243 +++++++++++-------
 3 files changed, 400 insertions(+), 134 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index ff57d39041..fa5c7f280d 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -543,6 +543,40 @@
 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m2_t, float, VTraits::vlanes() / 2, VTraits::vlanes(), 64, f64)
 #endif
 
+template <int N = VTraits<v_uint16>::max_nlanes>
+inline void v_store(ushort* ptr, const v_uint16& a)
+{
+    ushort buf[VTraits<v_uint16>::max_nlanes];
+    v_store(buf, a);
+    for (int i = 0; i < N; i++) {
+        ptr[i] = buf[i];
+    }
+}
+template <> inline void v_store<8>(ushort* ptr, const v_uint16& a)
+{
+    ushort buf[VTraits<v_uint16>::max_nlanes];
+    v_store(buf, a);
+    ptr[0] = buf[0]; ptr[1] = buf[1]; ptr[2] = buf[2]; ptr[3] = buf[3];
+    ptr[4] = buf[4]; ptr[5] = buf[5]; ptr[6] = buf[6]; ptr[7] = buf[7];
+}
+
+template <int N = VTraits<v_float32>::max_nlanes>
+inline void v_store(float* ptr, const v_float32& a)
+{
+    float buf[VTraits<v_float32>::max_nlanes];
+    v_store(buf, a);
+    for (int i = 0; i < N; i++) {
+        ptr[i] = buf[i];
+    }
+}
+template <> inline void v_store<4>(float* ptr, const v_float32& a)
+{
+    float buf[VTraits<v_float32>::max_nlanes];
+    v_store(buf, a);
+    ptr[0] = buf[0]; ptr[1] = buf[1];
+    ptr[2] = buf[2]; ptr[3] = buf[3];
+}
+
 ////////////// Lookup table access ////////////////////
 #define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
@@ -1616,6 +1650,42 @@ OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m4_t, v_int16, 16, i32, i16, __
 OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m4_t, v_uint32, 32, u64, u32, __riscv_vwcvtu_x)
 OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m4_t, v_int32, 32, i64, i32, __riscv_vwcvt_x)
 
+template <int N = VTraits<v_float32>::max_nlanes>
+inline v_float32 v_load(const float* ptr)
+{
+    float buf[VTraits<v_float32>::max_nlanes];
+    v_store(buf, v_setzero_f32());
+    for (int i = 0; i < N; i++) {
+        buf[i] = ptr[i];
+    }
+    return v_load(buf);
+}
+template <> inline v_float32 v_load<4>(const float* ptr)
+{
+    float buf[VTraits<v_float32>::max_nlanes];
+    v_store(buf, v_setzero_f32());
+    buf[0] = ptr[0]; buf[1] = ptr[1]; buf[2] = ptr[2]; buf[3] = ptr[3];
+    return v_load(buf);
+}
+
+template <int N = VTraits<v_uint32>::max_nlanes>
+inline v_uint32 v_load_expand(const ushort* ptr)
+{
+    ushort buf[VTraits<v_uint16>::max_nlanes];
+    v_store(buf, v_setzero_u16());
+    for (int i = 0; i < N; i++) {
+        buf[i] = ptr[i];
+    }
+    return v_load_expand(buf);
+}
+template <> inline v_uint32 v_load_expand<4>(const ushort* ptr)
+{
+    ushort buf[VTraits<v_uint16>::max_nlanes];
+    v_store(buf, v_setzero_u16());
+    buf[0] = ptr[0]; buf[1] = ptr[1]; buf[2] = ptr[2]; buf[3] = ptr[3];
+    return v_load_expand(buf);
+}
+
 inline v_uint32 v_load_expand_q(const uchar* ptr)
 {
     return __riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vle8_v_u8mf2(ptr, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
@@ -1627,16 +1697,16 @@ inline v_int32 v_load_expand_q(const schar* ptr)
 }
 
 template <int N = VTraits<v_uint32>::max_nlanes>
-inline v_uint32 v_load_expand_q(const uchar* ptr, int n = N)
+inline v_uint32 v_load_expand_q(const uchar* ptr)
 {
     uchar buf[VTraits<v_uint8>::max_nlanes];
     v_store(buf, v_setzero_u8());
-    for (int i = 0; i < n; i++) {
+    for (int i = 0; i < N; i++) {
         buf[i] = ptr[i];
     }
     return v_load_expand_q(buf);
 }
-template <> inline v_uint32 v_load_expand_q<4>(const uchar* ptr, int n)
+template <> inline v_uint32 v_load_expand_q<4>(const uchar* ptr)
 {
     uchar buf[VTraits<v_uint8>::max_nlanes];
     v_store(buf, v_setzero_u8());
@@ -1714,19 +1784,48 @@ void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \
 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, __riscv_vreinterpret_v_i16m4_u16m4, VTraits::vlanes(), VTraits::vlanes())
 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, __riscv_vreinterpret_v_i32m4_u32m4, VTraits::vlanes(), VTraits::vlanes())
 
+template <int N = VTraits<v_uint16>::max_nlanes>
+inline v_uint16 v_pack_u(const v_int32& a, const v_int32& b)
+{
+    ushort bufa[N];
+    ushort bufb[N];
+    v_pack_u_store(bufa, a);
+    v_pack_u_store(bufb, b);
+    ushort buf[N];
+    for (int i = 0; i < N/2; i++) {
+        buf[i] = bufa[i];
+        buf[i+N/2] = bufb[i];
+    }
+    return v_load(buf);
+}
+
+template <> inline v_uint16 v_pack_u<4>(const v_int32& a, const v_int32& b)
+{
+    constexpr int N = VTraits<v_uint16>::max_nlanes;
+    ushort bufa[N];
+    ushort bufb[N];
+    v_pack_u_store(bufa, a);
+    v_pack_u_store(bufb, b);
+
+    ushort buf[N];
+    buf[0] = bufa[0]; buf[1] = bufa[1]; buf[2] = bufa[2]; buf[3] = bufa[3];
+    buf[4] = bufb[0]; buf[5] = bufb[1]; buf[6] = bufb[2]; buf[7] = bufb[3];
+    return v_load(buf);
+}
+
 template <int N = VTraits<v_int16>::max_nlanes>
-inline void v_pack_u_store(uchar* ptr, const v_int16& a, int n = N)
+inline void v_pack_store(uchar* ptr, const v_uint16& a)
 {
     uchar buf[VTraits<v_uint8>::max_nlanes];
-    v_pack_u_store(buf, a);
-    for (int i = 0; i < n; i++) {
+    v_pack_store(buf, a);
+    for (int i = 0; i < N; i++) {
         ptr[i] = buf[i];
     }
 }
-template <> inline void v_pack_u_store<8>(uchar* ptr, const v_int16& a, int n)
+template <> inline void v_pack_store<8>(uchar* ptr, const v_uint16& a)
 {
     uchar buf[VTraits<v_uint8>::max_nlanes];
-    v_pack_u_store(buf, a);
+    v_pack_store(buf, a);
     ptr[0] = buf[0]; ptr[1] = buf[1]; ptr[2] = buf[2]; ptr[3] = buf[3];
     ptr[4] = buf[4]; ptr[5] = buf[5]; ptr[6] = buf[6]; ptr[7] = buf[7];
 }
diff --git a/modules/imgproc/src/warp_common.vector.hpp b/modules/imgproc/src/warp_common.vector.hpp
index 6405ee6aab..1e14ae20d9 100644
--- a/modules/imgproc/src/warp_common.vector.hpp
+++ b/modules/imgproc/src/warp_common.vector.hpp
@@ -569,16 +569,50 @@
     i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
     i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
     i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
-#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4() \
+#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \
+    const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_pix0 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs))); \
+    v_float32 i##ofs##_pix1 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+4))); \
+    v_float32 i##ofs##_pix2 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+srcstep))); \
+    v_float32 i##ofs##_pix3 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+srcstep+4))); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
+    i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
+    i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
+    i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
+#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \
+    const float *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_pix0 = vx_load(srcptr##ofs); \
+    v_float32 i##ofs##_pix1 = vx_load(srcptr##ofs+4); \
+    v_float32 i##ofs##_pix2 = vx_load(srcptr##ofs+srcstep); \
+    v_float32 i##ofs##_pix3 = vx_load(srcptr##ofs+srcstep+4); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
+    i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
+    i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
+    i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
+#define CV_WARP_SIMD128_STORE_8UC4_I() \
+    v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \
+    v_uint16 i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
+    v_pack_store(dstptr + 4*(x+i), i01_pix); \
+    v_pack_store(dstptr + 4*(x+i+2), i23_pix);
+#define CV_WARP_SIMD128_STORE_16UC4_I() \
+    v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \
+    v_uint16 i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
+    vx_store(dstptr + 4*(x+i), i01_pix); \
+    vx_store(dstptr + 4*(x+i+2), i23_pix);
+#define CV_WARP_SIMD128_STORE_32FC4_I() \
+    vx_store(dstptr + 4*(x+i), i0_pix0); \
+    vx_store(dstptr + 4*(x+i)+4, i1_pix0); \
+    vx_store(dstptr + 4*(x+i)+8, i2_pix0); \
+    vx_store(dstptr + 4*(x+i)+12, i3_pix0);
+#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(DEPTH) \
     for (int i = 0; i < uf; i+=vlanes_32) { \
-        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(0); \
-        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(1); \
-        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(2); \
-        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(3); \
-        auto i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)), \
-             i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
-        v_pack_store(dstptr + 4*(x+i), i01_pix); \
-        v_pack_store(dstptr + 4*(x+i+2), i23_pix); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \
+        CV_WARP_SIMD128_STORE_##DEPTH##C4_I(); \
     }
 #define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \
     const uint8_t *srcptr##ofs0 = src + addr[i+ofs0]; \
@@ -602,16 +636,70 @@
     i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
     i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
     i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
-#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4() \
+#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_16UC4_I(ofs0, ofs1) \
+    const uint16_t *srcptr##ofs0 = src + addr[i+ofs0]; \
+    const uint16_t *srcptr##ofs1 = src + addr[i+ofs1]; \
+    v_int32 i##ofs0##_pix01 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs0)), \
+            i##ofs0##_pix23 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs0+srcstep)); \
+    v_int32 i##ofs1##_pix01 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs1)), \
+            i##ofs1##_pix23 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs1+srcstep)); \
+    v_float32 i##ofs0##_fpix01 = v_cvt_f32(i##ofs0##_pix01), i##ofs0##_fpix23 = v_cvt_f32(i##ofs0##_pix23); \
+    v_float32 i##ofs1##_fpix01 = v_cvt_f32(i##ofs1##_pix01), i##ofs1##_fpix23 = v_cvt_f32(i##ofs1##_pix23); \
+    v_float32 i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11, \
+              i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33; \
+    v_recombine(i##ofs0##_fpix01, i##ofs1##_fpix01, i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11); \
+    v_recombine(i##ofs0##_fpix23, i##ofs1##_fpix23, i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33); \
+    v_float32 i##ofs0##_alpha = vx_setall_f32(valpha[i+ofs0]), \
+              i##ofs1##_alpha = vx_setall_f32(valpha[i+ofs1]), \
+              i##ofs0##_beta = vx_setall_f32(vbeta[i+ofs0]), \
+              i##ofs1##_beta = vx_setall_f32(vbeta[i+ofs1]); \
+    v_float32 i##ofs0##ofs1##_alpha = v_combine_low(i##ofs0##_alpha, i##ofs1##_alpha), \
+              i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
+    i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
+#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_32FC4_I(ofs0, ofs1) \
+    const float *srcptr##ofs0 = src + addr[i+ofs0]; \
+    const float *srcptr##ofs1 = src + addr[i+ofs1]; \
+    v_float32 i##ofs0##_fpix01 = v256_load(srcptr##ofs0), \
+              i##ofs0##_fpix23 = v256_load(srcptr##ofs0+srcstep); \
+    v_float32 i##ofs1##_fpix01 = v256_load(srcptr##ofs1), \
+              i##ofs1##_fpix23 = v256_load(srcptr##ofs1+srcstep); \
+    v_float32 i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11, \
+              i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33; \
+    v_recombine(i##ofs0##_fpix01, i##ofs1##_fpix01, i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11); \
+    v_recombine(i##ofs0##_fpix23, i##ofs1##_fpix23, i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33); \
+    v_float32 i##ofs0##_alpha = vx_setall_f32(valpha[i+ofs0]), \
+              i##ofs1##_alpha = vx_setall_f32(valpha[i+ofs1]), \
+              i##ofs0##_beta = vx_setall_f32(vbeta[i+ofs0]), \
+              i##ofs1##_beta = vx_setall_f32(vbeta[i+ofs1]); \
+    v_float32 i##ofs0##ofs1##_alpha = v_combine_low(i##ofs0##_alpha, i##ofs1##_alpha), \
+              i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
+    i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
+#define CV_WARP_SIMD256_STORE_8UC4_I() \
+    auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
+    v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
+    auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
+    v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix));
+#define CV_WARP_SIMD256_STORE_16UC4_I() \
+    auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
+    vx_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
+    auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
+    vx_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix));
+#define CV_WARP_SIMD256_STORE_32FC4_I() \
+    vx_store(dstptr + 4*(x+i), i01_fpix00); \
+    vx_store(dstptr + 4*(x+i)+8, i23_fpix00); \
+    vx_store(dstptr + 4*(x+i)+16, i45_fpix00); \
+    vx_store(dstptr + 4*(x+i)+24, i67_fpix00);
+#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(DEPTH) \
     for (int i = 0; i < uf; i+=vlanes_32) { \
-        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(0, 1); \
-        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(2, 3); \
-        auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
-        v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
-        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(4, 5); \
-        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(6, 7); \
-        auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
-        v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix)); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0, 1); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2, 3); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(4, 5); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(6, 7); \
+        CV_WARP_SIMD256_STORE_##DEPTH##C4_I(); \
     }
 #define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(ofs) \
     const uint8_t *srcptr##ofs = src + addr[i+ofs]; \
@@ -624,14 +712,48 @@
     i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
     i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
     i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
-#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4() \
+#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \
+    const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_fpix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs))), \
+              i##ofs##_fpix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+4))), \
+              i##ofs##_fpix2 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+srcstep))), \
+              i##ofs##_fpix3 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+srcstep+4))); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
+    i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
+#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \
+    const float *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_fpix0 = v_load<4>(srcptr##ofs), \
+              i##ofs##_fpix1 = v_load<4>(srcptr##ofs+4), \
+              i##ofs##_fpix2 = v_load<4>(srcptr##ofs+srcstep), \
+              i##ofs##_fpix3 = v_load<4>(srcptr##ofs+srcstep+4); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
+    i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
+#define CV_WARP_SIMDX_STORE_8UC4_I() \
+    auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \
+         i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \
+    v_pack_store<8>(dstptr + 4*(x+i), i01_pix); \
+    v_pack_store<8>(dstptr + 4*(x+i+2), i23_pix);
+#define CV_WARP_SIMDX_STORE_16UC4_I() \
+    auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \
+         i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \
+    v_store<8>(dstptr + 4*(x+i), i01_pix); \
+    v_store<8>(dstptr + 4*(x+i+2), i23_pix);
+#define CV_WARP_SIMDX_STORE_32FC4_I() \
+    v_store<4>(dstptr + 4*(x+i), i0_fpix0); \
+    v_store<4>(dstptr + 4*(x+i)+4, i1_fpix0); \
+    v_store<4>(dstptr + 4*(x+i)+8, i2_fpix0); \
+    v_store<4>(dstptr + 4*(x+i)+12, i3_fpix0);
+#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(DEPTH) \
     for (int i = 0; i < uf; i+=4) { \
-        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(0); \
-        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(1); \
-        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(2); \
-        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(3); \
-        auto i01_pix = v_pack(v_round(i0_fpix0), v_round(i1_fpix0)), \
-             i23_pix = v_pack(v_round(i2_fpix0), v_round(i3_fpix0)); \
-        v_pack_u_store<8>(dstptr + 4*(x+i), i01_pix); \
-        v_pack_u_store<8>(dstptr + 4*(x+i+2), i23_pix); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \
+        CV_WARP_SIMDX_STORE_##DEPTH##C4_I(); \
     }
diff --git a/modules/imgproc/src/warp_kernels.simd.hpp b/modules/imgproc/src/warp_kernels.simd.hpp
index b1ad131079..41f994bc24 100644
--- a/modules/imgproc/src/warp_kernels.simd.hpp
+++ b/modules/imgproc/src/warp_kernels.simd.hpp
@@ -298,7 +298,7 @@ void warpAffineLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -418,7 +418,7 @@ void warpAffineLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -547,7 +547,7 @@ void warpAffineLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -590,11 +590,11 @@ void warpAffineLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int
             vx_store(vbeta, src_y0);
             vx_store(vbeta+vlanes_32, src_y1);
         #if CV_SIMD256
-            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4();
+            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U);
         #elif CV_SIMD128
-            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4();
+            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U);
        #elif CV_SIMD_SCALABLE
-            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4();
+            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U);
         #endif
         } else {
             uint8_t pixbuf[max_uf*4*4];
@@ -660,7 +660,7 @@ void warpAffineLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, in
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -761,7 +761,7 @@ void warpAffineLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, in
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -866,7 +866,7 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -876,7 +876,6 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    uint16_t pixbuf[max_uf*4*4];
 
     uint16_t bvalbuf[max_uf*4];
     for (int i = 0; i < uf; i++) {
@@ -904,18 +903,26 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in
         CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
         if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-            CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 16U);
+            float valpha[max_uf], vbeta[max_uf];
+            vx_store(valpha, src_x0);
+            vx_store(valpha+vlanes_32, src_x1);
+            vx_store(vbeta, src_y0);
+            vx_store(vbeta+vlanes_32, src_y1);
+        #if CV_SIMD256
+            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U);
+        #elif CV_SIMD128
+            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U);
+        #elif CV_SIMD_SCALABLE
+            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U);
+        #endif
         } else {
+            uint16_t pixbuf[max_uf*4*4];
             CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U);
+            CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
         }
-
-        CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
     }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -972,7 +979,7 @@ void warpAffineLinearInvoker_32FC1(const float *src_data, size_t src_step, int s
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1071,7 +1078,7 @@ void warpAffineLinearInvoker_32FC3(const float *src_data, size_t src_step, int s
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1176,7 +1183,7 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1186,7 +1193,6 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    float pixbuf[max_uf*4*4];
 
     float bvalbuf[max_uf*4];
     for (int i = 0; i < uf; i++) {
@@ -1218,16 +1224,25 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s
         CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
         if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-            CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 32F);
+            float valpha[max_uf], vbeta[max_uf];
+            vx_store(valpha, src_x0);
+            vx_store(valpha+vlanes_32, src_x1);
+            vx_store(vbeta, src_y0);
+            vx_store(vbeta+vlanes_32, src_y1);
+        #if CV_SIMD256
+            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F);
+        #elif CV_SIMD128
+            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
+        #elif CV_SIMD_SCALABLE
+            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F);
+        #endif
         } else {
+            float pixbuf[max_uf*4*4];
             CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
+            CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
         }
-
-        CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
     }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -1284,7 +1299,7 @@ void warpAffineLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1391,7 +1406,7 @@ void warpAffineLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1505,7 +1520,7 @@ void warpAffineLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1622,7 +1637,7 @@ void warpPerspectiveLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1744,7 +1759,7 @@ void warpPerspectiveLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1874,7 +1889,7 @@ void warpPerspectiveLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1917,11 +1932,11 @@ void warpPerspectiveLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step,
             vx_store(vbeta, src_y0);
             vx_store(vbeta+vlanes_32, src_y1);
         #if CV_SIMD256
-            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4();
+            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U);
         #elif CV_SIMD128
-            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4();
+            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U);
         #elif CV_SIMD_SCALABLE
-            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4();
+            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U);
         #endif
         } else {
             uint8_t pixbuf[max_uf*4*4];
@@ -1988,7 +2003,7 @@ void warpPerspectiveLinearInvoker_16UC1(const uint16_t *src_data, size_t src_ste
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2089,7 +2104,7 @@ void warpPerspectiveLinearInvoker_16UC3(const uint16_t *src_data, size_t src_ste
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2194,7 +2209,7 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2204,7 +2219,6 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    uint16_t pixbuf[max_uf*4*4];
 
     uint16_t bvalbuf[max_uf*4];
     for (int i = 0; i < uf; i++) {
@@ -2232,18 +2246,26 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste
         CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
         if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-            CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 16U);
+            float valpha[max_uf], vbeta[max_uf];
+            vx_store(valpha, src_x0);
+            vx_store(valpha+vlanes_32, src_x1);
+            vx_store(vbeta, src_y0);
+            vx_store(vbeta+vlanes_32, src_y1);
+        #if CV_SIMD256
+            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U);
+        #elif CV_SIMD128
+            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U);
+        #elif CV_SIMD_SCALABLE
+            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U);
+        #endif
         } else {
+            uint16_t pixbuf[max_uf*4*4];
             CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U);
+            CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
         }
-
-        CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
     }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -2301,7 +2323,7 @@ void warpPerspectiveLinearInvoker_32FC1(const float *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2401,7 +2423,7 @@ void warpPerspectiveLinearInvoker_32FC3(const float *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2507,7 +2529,7 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2517,7 +2539,6 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step,
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    float pixbuf[max_uf*4*4];
 
     float bvalbuf[max_uf*4];
     for (int i = 0; i < uf; i++) {
@@ -2549,16 +2570,25 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step,
         CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
         if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-            CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 32F);
+            float valpha[max_uf], vbeta[max_uf];
+            vx_store(valpha, src_x0);
+            vx_store(valpha+vlanes_32, src_x1);
+            vx_store(vbeta, src_y0);
+            vx_store(vbeta+vlanes_32, src_y1);
+        #if CV_SIMD256
+            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F);
+        #elif CV_SIMD128
+            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
+        #elif CV_SIMD_SCALABLE
+            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F);
+        #endif
         } else {
+            float pixbuf[max_uf*4*4];
             CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
+            CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
         }
-
-        CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
     }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -2616,7 +2646,7 @@ void warpPerspectiveLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2724,7 +2754,7 @@ void warpPerspectiveLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2838,7 +2868,7 @@ void warpPerspectiveLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2961,7 +2991,7 @@ void remapLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_r
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3100,7 +3130,7 @@ void remapLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_r
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3247,7 +3277,7 @@ void remapLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_r
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3292,11 +3322,11 @@ void remapLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_r
             vx_store(vbeta, src_y0);
             vx_store(vbeta+vlanes_32, src_y1);
         #if CV_SIMD256
-            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4();
+            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U);
         #elif CV_SIMD128
-            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4();
+            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U);
         #elif CV_SIMD_SCALABLE
-            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4();
+            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U);
         #endif
         } else {
            uint8_t pixbuf[max_uf*4*4];
@@ -3378,7 +3408,7 @@ void remapLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3496,7 +3526,7 @@ void remapLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3618,7 +3648,7 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3628,7 +3658,6 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    uint16_t pixbuf[max_uf*4*4];
 
     uint16_t bvalbuf[max_uf*4];
     for (int i = 0; i < uf; i++) {
@@ -3658,18 +3687,26 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src
         CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
         if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-            CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 16U);
+            float valpha[max_uf], vbeta[max_uf];
+            vx_store(valpha, src_x0);
+            vx_store(valpha+vlanes_32, src_x1);
+            vx_store(vbeta, src_y0);
+            vx_store(vbeta+vlanes_32, src_y1);
+        #if CV_SIMD256
+            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U);
+        #elif CV_SIMD128
+            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U);
+        #elif CV_SIMD_SCALABLE
+            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U);
+        #endif
         } else {
+            uint16_t pixbuf[max_uf*4*4];
             CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U);
+            CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
         }
-
-        CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
     }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -3742,7 +3779,7 @@ void remapLinearInvoker_32FC1(const float *src_data, size_t src_step, int src_ro
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3859,7 +3896,7 @@ void remapLinearInvoker_32FC3(const float *src_data, size_t src_step, int src_ro
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3982,7 +4019,7 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3992,7 +4029,6 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    float pixbuf[max_uf*4*4];
 
     float bvalbuf[max_uf*4];
     for (int i = 0; i < uf; i++) {
@@ -4026,16 +4062,25 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro
         CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
         if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-            CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 32F);
+            float valpha[max_uf], vbeta[max_uf];
+            vx_store(valpha, src_x0);
+            vx_store(valpha+vlanes_32, src_x1);
+            vx_store(vbeta, src_y0);
+            vx_store(vbeta+vlanes_32, src_y1);
+        #if CV_SIMD256
+            CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F);
+        #elif CV_SIMD128
+            CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
+        #elif CV_SIMD_SCALABLE
+            CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F);
+        #endif
         } else {
+            float pixbuf[max_uf*4*4];
             CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
+            CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
         }
-
-        CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-        CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
     }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -4107,7 +4152,7 @@ void remapLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -4229,7 +4274,7 @@ void remapLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -4359,7 +4404,7 @@ void remapLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
             inner_scols = vx_setall_u32((unsigned)srccols - 1),
             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
             outer_scols = vx_setall_u32((unsigned)srccols + 1);