From b476ed6d06c319ed5632a3f6a7e6bdc1639875eb Mon Sep 17 00:00:00 2001
From: Yuantao Feng
Date: Sat, 30 Nov 2024 15:41:21 +0800
Subject: [PATCH] Merge pull request #26505 from fengyuentau:imgproc/new_nearest_inter

imgproc: optimized nearest neighbour interpolation for warpAffine, warpPerspective and remap #26505

The PR description has a limit of 65536 characters, so the performance stats are attached below.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
 .../opencv2/core/hal/intrin_rvv_scalable.hpp  |   29 +
 modules/imgproc/perf/perf_warp.cpp            |   42 -
 modules/imgproc/src/imgwarp.cpp               |  157 +-
 modules/imgproc/src/opencl/remap.cl           |  102 +-
 modules/imgproc/src/opencl/warp_affine.cl     |   26 +-
 .../imgproc/src/opencl/warp_perspective.cl    |   21 +-
 modules/imgproc/src/warp_common.scalar.hpp    |  193 +-
 modules/imgproc/src/warp_common.vector.hpp    |  607 +--
 modules/imgproc/src/warp_kernels.simd.hpp     | 3942 ++++++++++++++---
 modules/imgproc/test/test_imgwarp_strict.cpp  |  136 +-
 10 files changed, 4026 insertions(+), 1229 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index fa5c7f280d..7c42abf703 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -1761,6 +1761,35 @@ OPENCV_HAL_IMPL_RVV_PACK(v_int16, short, v_int32, 16, i16, i32, __riscv_vnclip,
 OPENCV_HAL_IMPL_RVV_PACK_32(v_uint32, unsigned, v_uint64, 32, u32, u64, __riscv_vnclipu, __riscv_vnsrl)
 OPENCV_HAL_IMPL_RVV_PACK_32(v_int32, int, v_int64, 32, i32, i64, __riscv_vnclip, __riscv_vnsra)
 
+template <int N = VTraits<v_uint16>::max_nlanes>
+inline v_uint16 v_pack(const v_uint32& a, const v_uint32& b)
+{
+    ushort bufa[N];
+    ushort bufb[N];
+    v_pack_store(bufa, a);
+    v_pack_store(bufb, b);
+    ushort buf[N];
+    for (int i = 0; i < N/2; i++) {
+        buf[i] = bufa[i];
+        buf[i+N/2] = bufb[i];
+    }
+    return v_load(buf);
+}
+
+template <> inline v_uint16 v_pack<4>(const v_uint32& a, const v_uint32& b)
+{
+    constexpr int N = VTraits<v_uint16>::max_nlanes;
+    ushort bufa[N];
+    ushort bufb[N];
+    v_pack_store(bufa, a);
+    v_pack_store(bufb, b);
+
+    ushort buf[N];
+    buf[0] = bufa[0]; buf[1] = bufa[1]; buf[2] = bufa[2]; buf[3] = bufa[3];
+    buf[4] = bufb[0]; buf[5] = bufb[1]; buf[6] = bufb[2]; buf[7] = bufb[3];
+    return v_load(buf);
+}
+
 #define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, cast, hvl, vl) \
 inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
 { \
diff --git a/modules/imgproc/perf/perf_warp.cpp b/modules/imgproc/perf/perf_warp.cpp
index 5e966dad1f..b35ef4b5a6 100644
--- a/modules/imgproc/perf/perf_warp.cpp
+++ b/modules/imgproc/perf/perf_warp.cpp
@@ -111,48 +111,6 @@ PERF_TEST_P( TestWarpPerspective, WarpPerspective,
     SANITY_CHECK(dst, 1);
 }
 
-PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
-             Combine(
-                Values( 
Size(640,480), Size(1920,1080), Size(2592,1944) ), - InterType::all(), - BorderMode::all(), - Values( CV_8UC1, CV_8UC4 ) - ) - ) -{ - Size size; - int borderMode, interType, type; - size = get<0>(GetParam()); - interType = get<1>(GetParam()); - borderMode = get<2>(GetParam()); - type = get<3>(GetParam()); - Scalar borderColor = Scalar::all(150); - - Mat src(size, type), dst(size, type); - cvtest::fillGradient(src); - if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder(src, borderColor, 1); - int shift = static_cast(src.cols*0.04); - Mat srcVertices = (Mat_(1, 4) << Vec2f(0, 0), - Vec2f(static_cast(size.width-1), 0), - Vec2f(static_cast(size.width-1), static_cast(size.height-1)), - Vec2f(0, static_cast(size.height-1))); - Mat dstVertices = (Mat_(1, 4) << Vec2f(0, static_cast(shift)), - Vec2f(static_cast(size.width-shift/2), 0), - Vec2f(static_cast(size.width-shift), static_cast(size.height-shift)), - Vec2f(static_cast(shift/2), static_cast(size.height-1))); - Mat warpMat = getPerspectiveTransform(srcVertices, dstVertices); - - declare.in(src).out(dst); - declare.time(100); - - TEST_CYCLE() - { - warpPerspective( src, dst, warpMat, size, interType, borderMode, borderColor ); - } - - SANITY_CHECK(dst, 1); -} - PERF_TEST_P( TestRemap, map1_32fc1, Combine( Values( szVGA, sz1080p ), diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 5e7e58756c..fb0dd04ced 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1672,6 +1672,56 @@ void cv::remap( InputArray _src, OutputArray _dst, int type = src.type(), depth = CV_MAT_DEPTH(type); + if (interpolation == INTER_NEAREST && map1.depth() == CV_32F) { + const auto *src_data = src.ptr(); + auto *dst_data = dst.ptr(); + size_t src_step = src.step, dst_step = dst.step, + map1_step = map1.step, map2_step = map2.step; + int src_rows = src.rows, src_cols = src.cols; + int dst_rows = dst.rows, dst_cols = dst.cols; + const float *map1_data = map1.ptr(); + const float *map2_data = map2.ptr(); + switch (src.type()) { + case CV_8UC1: { + CV_CPU_DISPATCH(remapNearestInvoker_8UC1, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_8UC3: { + CV_CPU_DISPATCH(remapNearestInvoker_8UC3, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_8UC4: { + CV_CPU_DISPATCH(remapNearestInvoker_8UC4, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_16UC1: { + CV_CPU_DISPATCH(remapNearestInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_16UC3: { + CV_CPU_DISPATCH(remapNearestInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_16UC4: { + CV_CPU_DISPATCH(remapNearestInvoker_16UC4, ((const 
uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_32FC1: { + CV_CPU_DISPATCH(remapNearestInvoker_32FC1, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_32FC3: { + CV_CPU_DISPATCH(remapNearestInvoker_32FC3, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_32FC4: { + CV_CPU_DISPATCH(remapNearestInvoker_32FC4, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + break; + } + // no default + } + } + if (interpolation == INTER_LINEAR) { if (map1.depth() == CV_32F) { const auto *src_data = src.ptr(); @@ -1708,27 +1758,27 @@ void cv::remap( InputArray _src, OutputArray _dst, break; } case CV_16UC1: { - CV_CPU_DISPATCH(remapLinearInvoker_16UC1, ((uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + CV_CPU_DISPATCH(remapLinearInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); break; } case CV_16UC3: { - CV_CPU_DISPATCH(remapLinearInvoker_16UC3, ((uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + CV_CPU_DISPATCH(remapLinearInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); break; } case CV_16UC4: { - CV_CPU_DISPATCH(remapLinearInvoker_16UC4, ((uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + CV_CPU_DISPATCH(remapLinearInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); break; } case CV_32FC1: { - CV_CPU_DISPATCH(remapLinearInvoker_32FC1, ((float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + CV_CPU_DISPATCH(remapLinearInvoker_32FC1, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), 
CV_CPU_DISPATCH_MODES_ALL); break; } case CV_32FC3: { - CV_CPU_DISPATCH(remapLinearInvoker_32FC3, ((float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + CV_CPU_DISPATCH(remapLinearInvoker_32FC3, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); break; } case CV_32FC4: { - CV_CPU_DISPATCH(remapLinearInvoker_32FC4, ((float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); + CV_CPU_DISPATCH(remapLinearInvoker_32FC4, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); break; } // no default @@ -2657,6 +2707,48 @@ static void warpAffine(int src_type, Mat src(Size(src_width, src_height), src_type, const_cast(src_data), src_step); Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step); + if (interpolation == INTER_NEAREST) { + switch (src_type) { + case CV_8UC1: { + CV_CPU_DISPATCH(warpAffineNearestInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_8UC3: { + CV_CPU_DISPATCH(warpAffineNearestInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_8UC4: { + CV_CPU_DISPATCH(warpAffineNearestInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_16UC1: { + CV_CPU_DISPATCH(warpAffineNearestInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_16UC3: { + CV_CPU_DISPATCH(warpAffineNearestInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_16UC4: { + CV_CPU_DISPATCH(warpAffineNearestInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_32FC1: { + CV_CPU_DISPATCH(warpAffineNearestInvoker_32FC1, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_32FC3: { + CV_CPU_DISPATCH(warpAffineNearestInvoker_32FC3, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_32FC4: { + CV_CPU_DISPATCH(warpAffineNearestInvoker_32FC4, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, 
borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + // no default + } + } + if (interpolation == INTER_LINEAR) { switch (src_type) { case CV_8UC1: { @@ -3324,46 +3416,99 @@ static void warpPerspective(int src_type, { CALL_HAL(warpPerspective, cv_hal_warpPerspective, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue); + if (interpolation == INTER_NEAREST) { + switch (src_type) { + case CV_8UC1: { + CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_8UC3: { + CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_8UC4: { + CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_16UC1: { + CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_16UC3: { + CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_16UC4: { + CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_32FC1: { + CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_32FC1, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_32FC3: { + CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_32FC3, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + case CV_32FC4: { + CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_32FC4, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; + } + } + } + if (interpolation == INTER_LINEAR) { switch (src_type) { case CV_8UC1: { if (hint == cv::ALGO_HINT_APPROX) { CV_CPU_DISPATCH(warpPerspectiveLinearApproxInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } else { CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } } case CV_8UC3: { if (hint == cv::ALGO_HINT_APPROX) { CV_CPU_DISPATCH(warpPerspectiveLinearApproxInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } else { 
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } } case CV_8UC4: { if (hint == cv::ALGO_HINT_APPROX) { CV_CPU_DISPATCH(warpPerspectiveLinearApproxInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } else { CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } } case CV_16UC1: { CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } case CV_16UC3: { CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } case CV_16UC4: { CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } case CV_32FC1: { CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_32FC1, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } case CV_32FC3: { CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_32FC3, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } case CV_32FC4: { CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_32FC4, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL); + break; } // no default } diff --git a/modules/imgproc/src/opencl/remap.cl b/modules/imgproc/src/opencl/remap.cl index 1500cae98e..9a5b30d193 100644 --- a/modules/imgproc/src/opencl/remap.cl +++ b/modules/imgproc/src/opencl/remap.cl @@ -152,42 +152,36 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src if (x < dst_cols) { T scalar = convertScalar(nVal); - int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset)); int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset)); - int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); - #pragma unroll - for (int i = 0; i < ROWS_PER_WI; ++i, ++y, - map1_index += map1_step, map2_index += map2_step, dst_index += dst_step) - if (y < dst_rows) - { - __global const float * map1 = (__global const float *)(map1ptr + map1_index); - __global const float * map2 = (__global const float *)(map2ptr + map2_index); - __global T * dst = (__global T *)(dstptr + dst_index); + for (int dy = y, dy1 = min(dst_rows, y + ROWS_PER_WI); dy < dy1; ++dy, map1_index += map1_step, map2_index += map2_step) + { + __global const float * map1 = (__global const float *)(map1ptr + map1_index); + __global const float * map2 = (__global const float *)(map2ptr + map2_index); - int gx = convert_int_sat_rte(map1[0]); - int gy = convert_int_sat_rte(map2[0]); - #if 
WARP_RELATIVE - gx += x; - gy += y; - #endif + float X0 = map1[0]; + float Y0 = map2[0]; + #if WARP_RELATIVE + X0 += x; + Y0 += dy; + #endif - if (NEED_EXTRAPOLATION(gx, gy)) - { -#ifndef BORDER_CONSTANT - int2 gxy = (int2)(gx, gy); -#endif - T v; - EXTRAPOLATE(gxy, v) - storepix(v, dst); - } - else - { - int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset)); - storepix(loadpix((__global const T*)(srcptr + src_index)), dst); - } + int sx = convert_int_sat(rint(X0)); + int sy = convert_int_sat(rint(Y0)); + + int2 map_data0 = (int2)(sx, sy); + + T v0 = scalar; + if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) { + v0 = loadpix((__global const T *)(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset)))); + } else { + EXTRAPOLATE(map_data0, v0); } + + int dst_index = mad24(dy, dst_step, mad24(x, TSIZE, dst_offset)); + storepix(v0, dstptr + dst_index); + } } } @@ -202,36 +196,34 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o if (x < dst_cols) { T scalar = convertScalar(nVal); - int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset)); - #pragma unroll - for (int i = 0; i < ROWS_PER_WI; ++i, ++y, - map_index += map_step, dst_index += dst_step) - if (y < dst_rows) - { - __global const float2 * map = (__global const float2 *)(mapptr + map_index); - __global T * dst = (__global T *)(dstptr + dst_index); + for (int dy = y, dy1 = min(dst_rows, y + ROWS_PER_WI); dy < dy1; ++dy, map_index += map_step) + { + __global const float2 * map = (__global const float2 *)(mapptr + map_index); + float2 map_data = map[0]; - int2 gxy = convert_int2_sat_rte(map[0]); - #if WARP_RELATIVE - gxy.x += x; - gxy.y += y; - #endif + float X0 = map_data.x; + float Y0 = map_data.y; + #if WARP_RELATIVE + X0 += x; + Y0 += dy; + #endif - int gx = gxy.x, gy = gxy.y; + int sx = convert_int_sat(rint(X0)); + int sy = convert_int_sat(rint(Y0)); - if (NEED_EXTRAPOLATION(gx, gy)) - { - T v; - EXTRAPOLATE(gxy, v) - storepix(v, dst); - } - else - { - int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset)); - storepix(loadpix((__global const T *)(srcptr + src_index)), dst); - } + int2 map_data0 = (int2)(sx, sy); + + T v0 = scalar; + if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) { + v0 = loadpix((__global const T *)(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset)))); + } else { + EXTRAPOLATE(map_data0, v0); + } + + int dst_index = mad24(dy, dst_step, mad24(x, TSIZE, dst_offset)); + storepix(v0, dstptr + dst_index); } } } diff --git a/modules/imgproc/src/opencl/warp_affine.cl b/modules/imgproc/src/opencl/warp_affine.cl index 1c762873d9..f83aa17b0b 100644 --- a/modules/imgproc/src/opencl/warp_affine.cl +++ b/modules/imgproc/src/opencl/warp_affine.cl @@ -93,27 +93,25 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of if (dx < dst_cols) { - int round_delta = (AB_SCALE >> 1); + float X0_ = fma(M[0], (CT)dx, M[2]); + float Y0_ = fma(M[3], (CT)dx, M[5]); - int X0_ = rint(M[0] * dx * AB_SCALE); - int Y0_ = rint(M[3] * dx * AB_SCALE); - int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset)); - - for (int dy = dy0, dy1 = min(dst_rows, dy0 + ROWS_PER_WI); dy < dy1; ++dy, dst_index += dst_step) + for (int dy = dy0, dy1 = min(dst_rows, dy0 + ROWS_PER_WI); dy < dy1; ++dy) { - int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + round_delta; - int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + round_delta; + float X0 = 
fma(M[1], (CT)dy, X0_); + float Y0 = fma(M[4], (CT)dy, Y0_); - short sx = convert_short_sat(X0 >> AB_BITS); - short sy = convert_short_sat(Y0 >> AB_BITS); + int sx = convert_int_sat(rint(X0)); + int sy = convert_int_sat(rint(Y0)); + T v0 = scalar; if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) { - int src_index = mad24(sy, src_step, mad24(sx, pixsize, src_offset)); - storepix(loadpix(srcptr + src_index), dstptr + dst_index); + v0 = loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))); } - else - storepix(scalar, dstptr + dst_index); + + int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset)); + storepix(v0, dstptr + dst_index); } } } diff --git a/modules/imgproc/src/opencl/warp_perspective.cl b/modules/imgproc/src/opencl/warp_perspective.cl index 36a8cdc11f..b28eca85b8 100644 --- a/modules/imgproc/src/opencl/warp_perspective.cl +++ b/modules/imgproc/src/opencl/warp_perspective.cl @@ -92,22 +92,21 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s if (dx < dst_cols && dy < dst_rows) { - CT X0 = M[0] * dx + M[1] * dy + M[2]; - CT Y0 = M[3] * dx + M[4] * dy + M[5]; - CT W = M[6] * dx + M[7] * dy + M[8]; - W = W != 0.0f ? 1.f / W : 0.0f; - short sx = convert_short_sat_rte(X0*W); - short sy = convert_short_sat_rte(Y0*W); + float W = fma(M[6], (CT)dx, fma(M[7], (CT)dy, M[8])); + float X0 = fma(M[0], (CT)dx, fma(M[1], (CT)dy, M[2])) / W; + float Y0 = fma(M[3], (CT)dx, fma(M[4], (CT)dy, M[5])) / W; - int dst_index = mad24(dy, dst_step, dx * pixsize + dst_offset); + int sx = convert_int_sat(rint(X0)); + int sy = convert_int_sat(rint(Y0)); + T v0 = scalar; if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) { - int src_index = mad24(sy, src_step, sx * pixsize + src_offset); - storepix(loadpix(srcptr + src_index), dstptr + dst_index); + v0 = loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))); } - else - storepix(scalar, dstptr + dst_index); + + int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset)); + storepix(v0, dstptr + dst_index); } } diff --git a/modules/imgproc/src/warp_common.scalar.hpp b/modules/imgproc/src/warp_common.scalar.hpp index dd127d212b..026b6f6bc7 100644 --- a/modules/imgproc/src/warp_common.scalar.hpp +++ b/modules/imgproc/src/warp_common.scalar.hpp @@ -3,57 +3,61 @@ // of this distribution and at http://opencv.org/license.html. 
// Shuffle +#define CV_WARP_NEAREST_SCALAR_SHUFFLE_DEF(cn, dtype_reg) \ + dtype_reg p00##cn; #define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(cn, dtype_reg) \ dtype_reg p00##cn, p01##cn, p10##cn, p11##cn; -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_C1(dtype_reg, dtype_ptr) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(g, dtype_reg) \ +#define CV_WARP_SCALAR_SHUFFLE_DEF_C1(inter, dtype_reg, dtype_ptr) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(g, dtype_reg) \ const dtype_ptr *srcptr = src + srcstep * iy + ix; -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_C3(dtype_reg, dtype_ptr) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(r, dtype_reg) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(g, dtype_reg) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(b, dtype_reg) \ +#define CV_WARP_SCALAR_SHUFFLE_DEF_C3(inter, dtype_reg, dtype_ptr) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(r, dtype_reg) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(g, dtype_reg) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(b, dtype_reg) \ const dtype_ptr *srcptr = src + srcstep * iy + ix*3; -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_C4(dtype_reg, dtype_ptr) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(r, dtype_reg) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(g, dtype_reg) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(b, dtype_reg) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(a, dtype_reg) \ +#define CV_WARP_SCALAR_SHUFFLE_DEF_C4(inter, dtype_reg, dtype_ptr) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(r, dtype_reg) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(g, dtype_reg) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(b, dtype_reg) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(a, dtype_reg) \ const dtype_ptr *srcptr = src + srcstep * iy + ix*4; -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_8U(CN) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##CN(int, uint8_t) -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_16U(CN) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##CN(int, uint16_t) -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_32F(CN) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##CN(float, float) +#define CV_WARP_SCALAR_SHUFFLE_DEF_8U(INTER, CN) \ + CV_WARP_SCALAR_SHUFFLE_DEF_##CN(INTER, int, uint8_t) +#define CV_WARP_SCALAR_SHUFFLE_DEF_16U(INTER, CN) \ + CV_WARP_SCALAR_SHUFFLE_DEF_##CN(INTER, int, uint16_t) +#define CV_WARP_SCALAR_SHUFFLE_DEF_32F(INTER, CN) \ + CV_WARP_SCALAR_SHUFFLE_DEF_##CN(INTER, float, float) +#define CV_WARP_NEAREST_SCALAR_SHUFFLE_LOAD(CN, cn, i) \ + p00##CN = srcptr[i]; #define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(CN, cn, i) \ p00##CN = srcptr[i]; p01##CN = srcptr[i + cn]; \ p10##CN = srcptr[srcstep + i]; p11##CN = srcptr[srcstep + cn + i]; -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_C1() \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(g, 1, 0) -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_C3() \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(r, 3, 0) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(g, 3, 1) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(b, 3, 2) -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_C4() \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(r, 4, 0) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(g, 4, 1) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(b, 4, 2) \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(a, 4, 3) +#define CV_WARP_SCALAR_SHUFFLE_LOAD_C1(inter) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(g, 1, 0) +#define CV_WARP_SCALAR_SHUFFLE_LOAD_C3(inter) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(r, 3, 0) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(g, 3, 1) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(b, 3, 2) +#define CV_WARP_SCALAR_SHUFFLE_LOAD_C4(inter) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(r, 4, 0) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(g, 4, 1) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(b, 
4, 2) \ + CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(a, 4, 3) -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C1() \ +#define CV_WARP_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C1() \ dstptr[x] = bval[0]; -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C3() \ +#define CV_WARP_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C3() \ dstptr[x*3] = bval[0]; \ dstptr[x*3+1] = bval[1]; \ dstptr[x*3+2] = bval[2]; -#define CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C4() \ +#define CV_WARP_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C4() \ dstptr[x*4] = bval[0]; \ dstptr[x*4+1] = bval[1]; \ dstptr[x*4+2] = bval[2]; \ dstptr[x*4+3] = bval[3]; -#define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_C1(dy, dx, pxy) \ +#define CV_WARP_SCALAR_FETCH_PIXEL_C1(dy, dx, pxy) \ if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \ size_t ofs = dy*srcstep + dx; \ pxy##g = srcptr[ofs]; \ @@ -67,7 +71,7 @@ size_t glob_ofs = iy_*srcstep + ix_; \ pxy##g = src[glob_ofs]; \ } -#define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_C3(dy, dx, pxy) \ +#define CV_WARP_SCALAR_FETCH_PIXEL_C3(dy, dx, pxy) \ if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \ size_t ofs = dy*srcstep + dx*3; \ pxy##r = srcptr[ofs]; \ @@ -89,7 +93,7 @@ pxy##g = src[glob_ofs+1]; \ pxy##b = src[glob_ofs+2]; \ } -#define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_C4(dy, dx, pxy) \ +#define CV_WARP_SCALAR_FETCH_PIXEL_C4(dy, dx, pxy) \ if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \ size_t ofs = dy*srcstep + dx*4; \ pxy##r = srcptr[ofs]; \ @@ -115,83 +119,96 @@ pxy##b = src[glob_ofs+2]; \ pxy##a = src[glob_ofs+3]; \ } +#define CV_WARP_NEAREST_SCALAR_FETCH_PIXEL(CN) \ + CV_WARP_SCALAR_FETCH_PIXEL_##CN(0, 0, p00) +#define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL(CN) \ + CV_WARP_SCALAR_FETCH_PIXEL_##CN(0, 0, p00) \ + CV_WARP_SCALAR_FETCH_PIXEL_##CN(0, 1, p01) \ + CV_WARP_SCALAR_FETCH_PIXEL_##CN(1, 0, p10) \ + CV_WARP_SCALAR_FETCH_PIXEL_##CN(1, 1, p11) -#define CV_WARP_LINEAR_SCALAR_SHUFFLE(CN, DEPTH) \ +#define CV_WARP_SCALAR_NEAREST_COMPUTE_COORD() \ + int ix = cvRound(sx), iy = cvRound(sy); +#define CV_WARP_SCALAR_LINEAR_COMPUTE_COORD() \ int ix = cvFloor(sx), iy = cvFloor(sy); \ - sx -= ix; sy -= iy; \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##DEPTH(CN); \ + sx -= ix; sy -= iy; + +#define CV_WARP_SCALAR_SHUFFLE(INTER, CN, DEPTH) \ + CV_WARP_SCALAR_##INTER##_COMPUTE_COORD() \ + CV_WARP_SCALAR_SHUFFLE_DEF_##DEPTH(INTER, CN) \ if ((((unsigned)ix < (unsigned)(srccols-1)) & \ ((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_##CN() \ + CV_WARP_SCALAR_SHUFFLE_LOAD_##CN(INTER) \ } else { \ if ((border_type == BORDER_CONSTANT || border_type == BORDER_TRANSPARENT) && \ (((unsigned)(ix+1) >= (unsigned)(srccols+1))| \ ((unsigned)(iy+1) >= (unsigned)(srcrows+1))) != 0) { \ if (border_type == BORDER_CONSTANT) { \ - CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_##CN() \ + CV_WARP_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_##CN() \ } \ continue; \ } \ - CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CN(0, 0, p00); \ - CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CN(0, 1, p01); \ - CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CN(1, 0, p10); \ - CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CN(1, 1, p11); \ + CV_WARP_##INTER##_SCALAR_FETCH_PIXEL(CN) \ } // Linear interpolation calculation -#define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(cn) \ +#define CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(cn) \ float v0##cn = p00##cn + sx*(p01##cn - p00##cn); \ 
float v1##cn = p10##cn + sx*(p11##cn - p10##cn);
-#define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_C1() \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(g)
-#define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_C3() \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(r) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(g) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(b)
-#define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_C4() \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(r) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(g) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(b) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(a)
+#define CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32_C1() \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(g)
+#define CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32_C3() \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(r) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(g) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(b)
+#define CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32_C4() \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(r) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(g) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(b) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(a)
 
-#define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(cn) \
+#define CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(cn) \
     v0##cn += sy*(v1##cn - v0##cn);
-#define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_C1() \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(g)
-#define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_C3() \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(r) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(g) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(b)
-#define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_C4() \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(r) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(g) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(b) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(a)
+#define CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32_C1() \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(g)
+#define CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32_C3() \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(r) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(g) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(b)
+#define CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32_C4() \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(r) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(g) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(b) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(a)
 
-#define CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(CN) \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_##CN() \
-    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_##CN()
+#define CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(CN) \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32_##CN() \
+    CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32_##CN()
 
 // Store
-#define CV_WARP_LINEAR_SCALAR_STORE_C1(dtype) \
-    dstptr[x] = saturate_cast<dtype>(v0g);
-#define CV_WARP_LINEAR_SCALAR_STORE_C3(dtype) \
-    dstptr[x*3] = saturate_cast<dtype>(v0r); \
-    dstptr[x*3+1] = saturate_cast<dtype>(v0g); \
-    dstptr[x*3+2] = saturate_cast<dtype>(v0b);
-#define CV_WARP_LINEAR_SCALAR_STORE_C4(dtype) \
-    dstptr[x*4] = saturate_cast<dtype>(v0r); \
-    dstptr[x*4+1] = saturate_cast<dtype>(v0g); \
-    dstptr[x*4+2] = saturate_cast<dtype>(v0b); \
-    dstptr[x*4+3] = saturate_cast<dtype>(v0a);
-#define CV_WARP_LINEAR_SCALAR_STORE_8U(CN) \
-    CV_WARP_LINEAR_SCALAR_STORE_##CN(uint8_t)
-#define CV_WARP_LINEAR_SCALAR_STORE_16U(CN) \
-    CV_WARP_LINEAR_SCALAR_STORE_##CN(uint16_t)
-#define CV_WARP_LINEAR_SCALAR_STORE_32F(CN) \
-    CV_WARP_LINEAR_SCALAR_STORE_##CN(float)
+#define CV_WARP_SCALAR_STORE_C1(dtype, var) \
+    dstptr[x] = saturate_cast<dtype>(var##g);
+#define CV_WARP_SCALAR_STORE_C3(dtype, var) \
+    dstptr[x*3] = saturate_cast<dtype>(var##r); \
+    dstptr[x*3+1] = saturate_cast<dtype>(var##g); \
+    dstptr[x*3+2] = saturate_cast<dtype>(var##b);
+#define CV_WARP_SCALAR_STORE_C4(dtype, var) \
+    dstptr[x*4] = saturate_cast<dtype>(var##r); \
+    dstptr[x*4+1] = saturate_cast<dtype>(var##g); \
+    dstptr[x*4+2] = saturate_cast<dtype>(var##b); \
+    dstptr[x*4+3] = saturate_cast<dtype>(var##a);
+#define CV_WARP_SCALAR_STORE_8U(CN, var) \
+    CV_WARP_SCALAR_STORE_##CN(uint8_t, var)
+#define CV_WARP_SCALAR_STORE_16U(CN, var) \
+    CV_WARP_SCALAR_STORE_##CN(uint16_t, var)
+#define CV_WARP_SCALAR_STORE_32F(CN, var) \
+    CV_WARP_SCALAR_STORE_##CN(float, var)
+#define CV_WARP_NEAREST_SCALAR_STORE(CN, DEPTH) \
+    CV_WARP_SCALAR_STORE_##DEPTH(CN, p00)
 #define CV_WARP_LINEAR_SCALAR_STORE(CN, DEPTH) \
-    CV_WARP_LINEAR_SCALAR_STORE_##DEPTH(CN)
+    CV_WARP_SCALAR_STORE_##DEPTH(CN, v0)
+#define CV_WARP_SCALAR_STORE(INTER, CN, DEPTH) \
+    CV_WARP_##INTER##_SCALAR_STORE(CN, DEPTH)
diff --git a/modules/imgproc/src/warp_common.vector.hpp b/modules/imgproc/src/warp_common.vector.hpp
index 1e14ae20d9..384759430e 100644
--- a/modules/imgproc/src/warp_common.vector.hpp
+++ b/modules/imgproc/src/warp_common.vector.hpp
@@ -3,6 +3,26 @@
 // of this distribution and at http://opencv.org/license.html.
 
 // Shuffle (all pixels within image)
+#define CV_WARP_NEAREST_VECTOR_SHUFFLE_ALLWITHIN_C1(dtype) \
+    for (int i = 0; i < uf; i++) { \
+        const dtype* srcptr = src + addr[i]; \
+        pixbuf[i] = srcptr[0];\
+    }
+#define CV_WARP_NEAREST_VECTOR_SHUFFLE_ALLWITHIN_C3(dtype) \
+    for (int i = 0; i < uf; i++) { \
+        const dtype* srcptr = src + addr[i]; \
+        pixbuf[3*i] = srcptr[0];\
+        pixbuf[3*i + 1] = srcptr[1]; \
+        pixbuf[3*i + 2] = srcptr[2]; \
+    }
+#define CV_WARP_NEAREST_VECTOR_SHUFFLE_ALLWITHIN_C4(dtype) \
+    for (int i = 0; i < uf; i++) { \
+        const dtype* srcptr = src + addr[i]; \
+        pixbuf[4*i] = srcptr[0];\
+        pixbuf[4*i + 1] = srcptr[1]; \
+        pixbuf[4*i + 2] = srcptr[2]; \
+        pixbuf[4*i + 3] = srcptr[3]; \
+    }
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_C1(dtype) \
     for (int i = 0; i < uf; i++) { \
         const dtype* srcptr = src + addr[i]; \
@@ -47,18 +67,17 @@
         pixbuf[i + uf*11] = srcptr[srcstep + 6]; \
         pixbuf[i + uf*15] = srcptr[srcstep + 7]; \
     }
-#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_8U(CN) \
-    CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint8_t)
-#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_16U(CN) \
-    CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint16_t)
-#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_32F(CN) \
-    CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##CN(float)
-
-#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(CN, DEPTH) \
-    CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##DEPTH(CN)
+#define CV_WARP_VECTOR_SHUFFLE_ALLWITHIN_8U(INTER, CN) \
+    CV_WARP_##INTER##_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint8_t)
+#define CV_WARP_VECTOR_SHUFFLE_ALLWITHIN_16U(INTER, CN) \
+    CV_WARP_##INTER##_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint16_t)
+#define CV_WARP_VECTOR_SHUFFLE_ALLWITHIN_32F(INTER, CN) \
+    CV_WARP_##INTER##_VECTOR_SHUFFLE_ALLWITHIN_##CN(float)
+#define CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(INTER, CN, DEPTH) \
+    CV_WARP_VECTOR_SHUFFLE_ALLWITHIN_##DEPTH(INTER, CN)
 
 // Shuffle (ARM NEON)
-#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
+#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
     uint8x8x4_t t00 = { \
         vld1_u8(src + addr[0]), \
         vld1_u8(src + addr[1]), \
@@ -84,7 +103,7 @@
         vld1_u8(src + addr[7] + srcstep) \
     }; \
     uint32x2_t p00_, p01_, p10_, p11_;
-#define 
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(coords, cn) \ +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(coords, cn) \ p00_ = vreinterpret_u32_u8(vtbl4_u8(t00, coords)); \ p01_ = vreinterpret_u32_u8(vtbl4_u8(t01, coords)); \ p10_ = vreinterpret_u32_u8(vtbl4_u8(t10, coords)); \ @@ -93,58 +112,58 @@ p01##cn = vreinterpret_u8_u32(vtrn2_u32(p00_, p01_)); \ p10##cn = vreinterpret_u8_u32(vtrn1_u32(p10_, p11_)); \ p11##cn = vreinterpret_u8_u32(vtrn2_u32(p10_, p11_)); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_C1() \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(grays, g) -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_C3() \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(reds, r) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(greens, g) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(blues, b) -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_C4() \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(reds, r) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(greens, g) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(blues, b) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(alphas, a) -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(CN) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_##CN() +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_C1() \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(grays, g) +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_C3() \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(reds, r) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(greens, g) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(blues, b) +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_C4() \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(reds, r) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(greens, g) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(blues, b) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(alphas, a) +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(CN) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_##CN() // Shuffle (not all pixels within image) -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC1() \ +#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC1() \ v_store_low(dstptr + x, bval_v0); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC3() \ +#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC3() \ v_store_low(dstptr + x*3, bval_v0); \ v_store_low(dstptr + x*3 + uf, bval_v1); \ v_store_low(dstptr + x*3 + uf*2, bval_v2); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC4() \ +#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC4() \ v_store_low(dstptr + x*4, bval_v0); \ v_store_low(dstptr + x*4 + uf, bval_v1); \ v_store_low(dstptr + x*4 + uf*2, bval_v2); \ v_store_low(dstptr + x*4 + uf*3, bval_v3); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC1() \ +#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC1() \ v_store(dstptr + x, bval_v0); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC3() \ +#define 
CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC3() \ v_store(dstptr + x*3, bval_v0); \ v_store(dstptr + x*3 + uf, bval_v1); \ v_store(dstptr + x*3 + uf*2, bval_v2); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC4() \ +#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC4() \ v_store(dstptr + x*4, bval_v0); \ v_store(dstptr + x*4 + uf, bval_v1); \ v_store(dstptr + x*4 + uf*2, bval_v2); \ v_store(dstptr + x*4 + uf*3, bval_v3); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC1() \ +#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC1() \ v_store(dstptr + x, bval_v0_l); \ v_store(dstptr + x + vlanes_32, bval_v0_h); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC3() \ +#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC3() \ v_store(dstptr + x*3, bval_v0_l); \ v_store(dstptr + x*3 + vlanes_32, bval_v0_h); \ v_store(dstptr + x*3 + uf, bval_v1_l); \ v_store(dstptr + x*3 + uf + vlanes_32, bval_v1_h); \ v_store(dstptr + x*3 + uf*2, bval_v2_l); \ v_store(dstptr + x*3 + uf*2 + vlanes_32, bval_v2_h); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC4() \ +#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC4() \ v_store(dstptr + x*4, bval_v0_l); \ v_store(dstptr + x*4 + vlanes_32, bval_v0_h); \ v_store(dstptr + x*4 + uf, bval_v1_l); \ @@ -154,70 +173,83 @@ v_store(dstptr + x*4 + uf*3, bval_v3_l); \ v_store(dstptr + x*4 + uf*3 + vlanes_32, bval_v3_h); -#define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_C1(dy, dx, pixbuf_ofs) \ +#define CV_WARP_VECTOR_FETCH_PIXEL_C1(dy, dx, pixbuf_ofs0, pixbuf_ofs1) \ if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \ size_t addr_i = addr[i] + dy*srcstep + dx; \ - pixbuf[i + pixbuf_ofs] = src[addr_i]; \ + pixbuf[i + pixbuf_ofs0] = src[addr_i]; \ } else if (border_type == BORDER_CONSTANT) { \ - pixbuf[i + pixbuf_ofs] = bval[0]; \ + pixbuf[i + pixbuf_ofs0] = bval[0]; \ } else if (border_type == BORDER_TRANSPARENT) { \ - pixbuf[i + pixbuf_ofs] = dstptr[x + i]; \ + pixbuf[i + pixbuf_ofs0] = dstptr[x + i]; \ } else { \ int ix_ = borderInterpolate_fast(ix + dx, srccols, border_type_x); \ int iy_ = borderInterpolate_fast(iy + dy, srcrows, border_type_y); \ size_t addr_i = iy_*srcstep + ix_; \ - pixbuf[i + pixbuf_ofs] = src[addr_i]; \ + pixbuf[i + pixbuf_ofs0] = src[addr_i]; \ } -#define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_C3(dy, dx, pixbuf_ofs) \ +#define CV_WARP_VECTOR_FETCH_PIXEL_C3(dy, dx, pixbuf_ofs0, pixbuf_ofs1) \ if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \ size_t addr_i = addr[i] + dy*srcstep + dx*3; \ - pixbuf[i + pixbuf_ofs] = src[addr_i]; \ - pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \ - pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \ + pixbuf[i + pixbuf_ofs0] = src[addr_i]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = src[addr_i+1]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = src[addr_i+2]; \ } else if (border_type == BORDER_CONSTANT) { \ - pixbuf[i + pixbuf_ofs] = bval[0]; \ - pixbuf[i + pixbuf_ofs + uf*4] = bval[1]; \ - pixbuf[i + pixbuf_ofs + uf*8] = bval[2]; \ + pixbuf[i + pixbuf_ofs0] = bval[0]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = bval[1]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = bval[2]; \ } else if (border_type == BORDER_TRANSPARENT) { \ - pixbuf[i + pixbuf_ofs] = dstptr[(x + i)*3]; \ - pixbuf[i + pixbuf_ofs + uf*4] = dstptr[(x + i)*3 + 1]; \ - pixbuf[i + pixbuf_ofs + uf*8] = dstptr[(x + i)*3 + 2]; \ + pixbuf[i + pixbuf_ofs0] = dstptr[(x + i)*3]; \ + 
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = dstptr[(x + i)*3 + 1]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = dstptr[(x + i)*3 + 2]; \ } else { \ int ix_ = borderInterpolate_fast(ix + dx, srccols, border_type_x); \ int iy_ = borderInterpolate_fast(iy + dy, srcrows, border_type_y); \ size_t addr_i = iy_*srcstep + ix_*3; \ - pixbuf[i + pixbuf_ofs] = src[addr_i]; \ - pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \ - pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \ + pixbuf[i + pixbuf_ofs0] = src[addr_i]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = src[addr_i+1]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = src[addr_i+2]; \ } -#define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_C4(dy, dx, pixbuf_ofs) \ +#define CV_WARP_VECTOR_FETCH_PIXEL_C4(dy, dx, pixbuf_ofs0, pixbuf_ofs1) \ if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \ size_t addr_i = addr[i] + dy*srcstep + dx*4; \ - pixbuf[i + pixbuf_ofs] = src[addr_i]; \ - pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \ - pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \ - pixbuf[i + pixbuf_ofs + uf*12] = src[addr_i+3]; \ + pixbuf[i + pixbuf_ofs0] = src[addr_i]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = src[addr_i+1]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = src[addr_i+2]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*3] = src[addr_i+3]; \ } else if (border_type == BORDER_CONSTANT) { \ - pixbuf[i + pixbuf_ofs] = bval[0]; \ - pixbuf[i + pixbuf_ofs + uf*4] = bval[1]; \ - pixbuf[i + pixbuf_ofs + uf*8] = bval[2]; \ - pixbuf[i + pixbuf_ofs + uf*12] = bval[3]; \ + pixbuf[i + pixbuf_ofs0] = bval[0]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = bval[1]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = bval[2]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*3] = bval[3]; \ } else if (border_type == BORDER_TRANSPARENT) { \ - pixbuf[i + pixbuf_ofs] = dstptr[(x + i)*4]; \ - pixbuf[i + pixbuf_ofs + uf*4] = dstptr[(x + i)*4 + 1]; \ - pixbuf[i + pixbuf_ofs + uf*8] = dstptr[(x + i)*4 + 2]; \ - pixbuf[i + pixbuf_ofs + uf*12] = dstptr[(x + i)*4 + 3]; \ + pixbuf[i + pixbuf_ofs0] = dstptr[(x + i)*4]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = dstptr[(x + i)*4 + 1]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = dstptr[(x + i)*4 + 2]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*3] = dstptr[(x + i)*4 + 3]; \ } else { \ int ix_ = borderInterpolate_fast(ix + dx, srccols, border_type_x); \ int iy_ = borderInterpolate_fast(iy + dy, srcrows, border_type_y); \ size_t addr_i = iy_*srcstep + ix_*4; \ - pixbuf[i + pixbuf_ofs] = src[addr_i]; \ - pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \ - pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \ - pixbuf[i + pixbuf_ofs + uf*12] = src[addr_i+3]; \ + pixbuf[i + pixbuf_ofs0] = src[addr_i]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = src[addr_i+1]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = src[addr_i+2]; \ + pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*3] = src[addr_i+3]; \ } +#define CV_WARP_NEAREST_VECTOR_FETCH_PIXEL_C1() \ + CV_WARP_VECTOR_FETCH_PIXEL_C1(0, 0, 0, 1); +#define CV_WARP_NEAREST_VECTOR_FETCH_PIXEL_C3() \ + CV_WARP_VECTOR_FETCH_PIXEL_C3(0, 0, 2*i, 1); +#define CV_WARP_NEAREST_VECTOR_FETCH_PIXEL_C4() \ + CV_WARP_VECTOR_FETCH_PIXEL_C4(0, 0, 3*i, 1); +#define CV_WARP_NEAREST_VECTOR_FETCH_PIXEL(CN) \ + CV_WARP_NEAREST_VECTOR_FETCH_PIXEL_##CN() +#define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL(CN) \ + CV_WARP_VECTOR_FETCH_PIXEL_##CN(0, 0, 0, uf*4); \ + CV_WARP_VECTOR_FETCH_PIXEL_##CN(0, 1, uf, uf*4); \ + CV_WARP_VECTOR_FETCH_PIXEL_##CN(1, 0, uf*2, uf*4); \ + CV_WARP_VECTOR_FETCH_PIXEL_##CN(1, 1, uf*3, uf*4); 
-#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(CN, DEPTH) \ +#define CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(INTER, CN, DEPTH) \ if (border_type == BORDER_CONSTANT || border_type == BORDER_TRANSPARENT) { \ mask_0 = v_lt(v_reinterpret_as_u32(v_add(src_ix0, one)), outer_scols); \ mask_1 = v_lt(v_reinterpret_as_u32(v_add(src_ix1, one)), outer_scols); \ @@ -226,7 +258,7 @@ v_uint16 outer_mask = v_pack(mask_0, mask_1); \ if (v_reduce_max(outer_mask) == 0) { \ if (border_type == BORDER_CONSTANT) { \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_##DEPTH##CN() \ + CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_##DEPTH##CN() \ } \ continue; \ } \ @@ -237,111 +269,135 @@ vx_store(src_iy + vlanes_32, src_iy1); \ for (int i = 0; i < uf; i++) { \ int ix = src_ix[i], iy = src_iy[i]; \ - CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(0, 0, 0); \ - CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(0, 1, uf); \ - CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(1, 0, uf*2); \ - CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(1, 1, uf*3); \ + CV_WARP_##INTER##_VECTOR_FETCH_PIXEL(CN) \ } // Shuffle (not all pixels within image) (ARM NEON) -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(cn, offset)\ +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(cn, offset)\ p00##cn = vld1_u8(pixbuf + offset); \ p01##cn = vld1_u8(pixbuf + offset + 8); \ p10##cn = vld1_u8(pixbuf + offset + 16); \ p11##cn = vld1_u8(pixbuf + offset + 24); -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_C1() \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 0) -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_C3() \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(r, 0) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 32) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(b, 64) -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_C4() \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(r, 0) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 32) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(b, 64) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(a, 96) -#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(CN) \ - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_##CN() +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_C1() \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 0) +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_C3() \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(r, 0) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 32) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(b, 64) +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_C4() \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(r, 0) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 32) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(b, 64) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(a, 96) +#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(CN) \ + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_##CN() -// Load pixels for linear interpolation (uint8_t -> int16_t) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(cn, i) \ - v_int16 f00##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * i)), \ - f01##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * (i+1))), \ - f10##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * (i+2))), \ - f11##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * (i+3))); 
-#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_C1() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(g, 0) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_C3() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(r, 0) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(g, 4) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(b, 8) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_C4() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(r, 0) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(g, 4) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(b, 8) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(a, 12) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(CN) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_##CN(); +// [New] Load pixels for interpolation +#define CV_WARP_VECTOR_NEAREST_LOAD_CN_8U_16U(cn, i) \ + v_uint16 f00##cn = vx_load_expand(pixbuf + uf * i); +#define CV_WARP_VECTOR_NEAREST_LOAD_CN_16U_16U(cn, i) \ + v_uint16 f00##cn = vx_load(pixbuf + uf * i); +#define CV_WARP_VECTOR_NEAREST_LOAD_CN_32F_32F(cn, i) \ + v_float32 f00##cn##l = vx_load(pixbuf + uf * i); \ + v_float32 f00##cn##h = vx_load(pixbuf + uf * i + vlanes_32); +#define CV_WARP_VECTOR_LINEAR_LOAD_CN_8U_16U(cn, i) \ + v_uint16 f00##cn = vx_load_expand(pixbuf + uf * 4*i), \ + f01##cn = vx_load_expand(pixbuf + uf * (4*i+1)), \ + f10##cn = vx_load_expand(pixbuf + uf * (4*i+2)), \ + f11##cn = vx_load_expand(pixbuf + uf * (4*i+3)); +#define CV_WARP_VECTOR_LINEAR_LOAD_CN_16U_16U(cn, i) \ + v_uint16 f00##cn = vx_load(pixbuf + uf * 4*i), \ + f01##cn = vx_load(pixbuf + uf * (4*i+1)), \ + f10##cn = vx_load(pixbuf + uf * (4*i+2)), \ + f11##cn = vx_load(pixbuf + uf * (4*i+3)); +#define CV_WARP_VECTOR_LINEAR_LOAD_CN_32F_32F(cn, i) \ + v_float32 f00##cn##l = vx_load(pixbuf + uf * 4*i), \ + f00##cn##h = vx_load(pixbuf + uf * 4*i + vlanes_32); \ + v_float32 f01##cn##l = vx_load(pixbuf + uf * (4*i+1)), \ + f01##cn##h = vx_load(pixbuf + uf * (4*i+1) + vlanes_32); \ + v_float32 f10##cn##l = vx_load(pixbuf + uf * (4*i+2)), \ + f10##cn##h = vx_load(pixbuf + uf * (4*i+2) + vlanes_32); \ + v_float32 f11##cn##l = vx_load(pixbuf + uf * (4*i+3)), \ + f11##cn##h = vx_load(pixbuf + uf * (4*i+3) + vlanes_32); +#define CV_WARP_VECTOR_INTER_LOAD_C1(INTER, SDEPTH, DDEPTH) \ + CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(g, 0) +#define CV_WARP_VECTOR_INTER_LOAD_C3(INTER, SDEPTH, DDEPTH) \ + CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(r, 0) \ + CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(g, 1) \ + CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(b, 2) +#define CV_WARP_VECTOR_INTER_LOAD_C4(INTER, SDEPTH, DDEPTH) \ + CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(r, 0) \ + CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(g, 1) \ + CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(b, 2) \ + CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(a, 3) +#define CV_WARP_VECTOR_INTER_LOAD(INTER, CN, SDEPTH, DDEPTH) \ + CV_WARP_VECTOR_INTER_LOAD_##CN(INTER, SDEPTH, DDEPTH) -// Load pixels for linear interpolation (uint8_t -> int16_t) (ARM NEON) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(cn) \ - v_int16 f00##cn = v_reinterpret_as_s16(v_uint16(vmovl_u8(p00##cn))), \ - f01##cn = v_reinterpret_as_s16(v_uint16(vmovl_u8(p01##cn))), \ - f10##cn = v_reinterpret_as_s16(v_uint16(vmovl_u8(p10##cn))), \ - f11##cn = v_reinterpret_as_s16(v_uint16(vmovl_u8(p11##cn))); -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON_C1() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(g) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON_C3() \ - 
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(r) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(g) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(b) +// [New] Store +#define CV_WARP_VECTOR_NEAREST_STORE_C1_16U_8U() \ + v_pack_store(dstptr + x, f00g); +#define CV_WARP_VECTOR_NEAREST_STORE_C3_16U_8U() \ + v_pack_store(dstptr + 3*x, f00r); \ + v_pack_store(dstptr + 3*x + uf, f00g); \ + v_pack_store(dstptr + 3*x + uf*2, f00b); +#define CV_WARP_VECTOR_NEAREST_STORE_C4_16U_8U() \ + v_pack_store(dstptr + 4*x, f00r); \ + v_pack_store(dstptr + 4*x + uf, f00g); \ + v_pack_store(dstptr + 4*x + uf*2, f00b); \ + v_pack_store(dstptr + 4*x + uf*3, f00a); +#define CV_WARP_VECTOR_NEAREST_STORE_C1_16U_16U() \ + vx_store(dstptr + x, f00g); +#define CV_WARP_VECTOR_NEAREST_STORE_C3_16U_16U() \ + vx_store(dstptr + 3*x, f00r); \ + vx_store(dstptr + 3*x + uf, f00g); \ + vx_store(dstptr + 3*x + uf*2, f00b); +#define CV_WARP_VECTOR_NEAREST_STORE_C4_16U_16U() \ + vx_store(dstptr + 4*x, f00r); \ + vx_store(dstptr + 4*x + uf, f00g); \ + vx_store(dstptr + 4*x + uf*2, f00b); \ + vx_store(dstptr + 4*x + uf*3, f00a); +#define CV_WARP_VECTOR_NEAREST_STORE_C1_32F_32F() \ + vx_store(dstptr + x, f00gl); \ + vx_store(dstptr + x + vlanes_32, f00gh); +#define CV_WARP_VECTOR_NEAREST_STORE_C3_32F_32F() \ + vx_store(dstptr + 3*x, f00rl); \ + vx_store(dstptr + 3*x + vlanes_32, f00rh); \ + vx_store(dstptr + 3*x + uf, f00gl); \ + vx_store(dstptr + 3*x + uf + vlanes_32, f00gh); \ + vx_store(dstptr + 3*x + uf*2, f00bl); \ + vx_store(dstptr + 3*x + uf*2 + vlanes_32, f00bh); +#define CV_WARP_VECTOR_NEAREST_STORE_C4_32F_32F() \ + vx_store(dstptr + 4*x, f00rl); \ + vx_store(dstptr + 4*x + vlanes_32, f00rh); \ + vx_store(dstptr + 4*x + uf, f00gl); \ + vx_store(dstptr + 4*x + uf + vlanes_32, f00gh); \ + vx_store(dstptr + 4*x + uf*2, f00bl); \ + vx_store(dstptr + 4*x + uf*2 + vlanes_32, f00bh); \ + vx_store(dstptr + 4*x + uf*3, f00al); \ + vx_store(dstptr + 4*x + uf*3 + vlanes_32, f00ah); +#define CV_WARP_VECTOR_INTER_STORE(INTER, CN, SDEPTH, DDEPTH) \ + CV_WARP_VECTOR_##INTER##_STORE_##CN##_##SDEPTH##_##DDEPTH() + + +// Load pixels for linear interpolation (uint8_t -> uint16_t) (ARM NEON) +#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(cn) \ + v_uint16 f00##cn = v_uint16(vmovl_u8(p00##cn)), \ + f01##cn = v_uint16(vmovl_u8(p01##cn)), \ + f10##cn = v_uint16(vmovl_u8(p10##cn)), \ + f11##cn = v_uint16(vmovl_u8(p11##cn)); +#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON_C1() \ + CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(g) +#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON_C3() \ + CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(r) \ + CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(g) \ + CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(b) #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON_C4() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(r) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(g) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(b) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(a) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON(CN) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON_##CN(); - -// Load pixels for linear interpolation (uint16_t -> uint16_t) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(cn, i) \ - v_uint16 f00##cn = vx_load(pixbuf + uf * i), \ - f01##cn = vx_load(pixbuf + uf * (i+1)), \ - f10##cn = vx_load(pixbuf + uf * (i+2)), \ - f11##cn = vx_load(pixbuf + uf * (i+3)); -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_C1() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(g, 0) -#define 
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_C3() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(r, 0) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(g, 4) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(b, 8) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_C4() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(r, 0) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(g, 4) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(b, 8) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(a, 12) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(CN) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_##CN(); - -// Load pixels for linear interpolation (int16_t -> float) -#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(cn) \ - v_float32 f00##cn##l = v_cvt_f32(v_expand_low(f00##cn)), f00##cn##h = v_cvt_f32(v_expand_high(f00##cn)), \ - f01##cn##l = v_cvt_f32(v_expand_low(f01##cn)), f01##cn##h = v_cvt_f32(v_expand_high(f01##cn)), \ - f10##cn##l = v_cvt_f32(v_expand_low(f10##cn)), f10##cn##h = v_cvt_f32(v_expand_high(f10##cn)), \ - f11##cn##l = v_cvt_f32(v_expand_low(f11##cn)), f11##cn##h = v_cvt_f32(v_expand_high(f11##cn)); -#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32_C1() \ - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(g) -#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32_C3() \ - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(r) \ - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(g) \ - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(b) -#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32_C4() \ - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(r) \ - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(g) \ - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(b) \ - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(a) -#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(CN) \ - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32_##CN() + CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(r) \ + CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(g) \ + CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(b) \ + CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(a) +#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON(CN) \ + CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON_##CN(); // Load pixels for linear interpolation (uint16_t -> float) #define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_U16F32(cn) \ @@ -363,26 +419,6 @@ #define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(CN) \ CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32_##CN() -// Load pixels for linear interpolation (float -> float) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(cn, i) \ - v_float32 f00##cn##l = vx_load(pixbuf + uf * i), f00##cn##h = vx_load(pixbuf + uf * i + vlanes_32), \ - f01##cn##l = vx_load(pixbuf + uf * (i+1)), f01##cn##h = vx_load(pixbuf + uf * (i+1) + vlanes_32), \ - f10##cn##l = vx_load(pixbuf + uf * (i+2)), f10##cn##h = vx_load(pixbuf + uf * (i+2) + vlanes_32), \ - f11##cn##l = vx_load(pixbuf + uf * (i+3)), f11##cn##h = vx_load(pixbuf + uf * (i+3) + vlanes_32); -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_C1() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(g, 0) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_C3() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(r, 0) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(g, 4) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(b, 8) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_C4() \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(r, 0) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(g, 4) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(b, 8) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(a, 12) -#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(CN) \ - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_##CN() - // Load pixels for linear 
interpolation (uint8_t -> float16) #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8F16(cn) \ v_float16 f00##cn = v_float16(vcvtq_f16_u16(vmovl_u8(p00##cn))), \ @@ -556,9 +592,30 @@ #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(CN) \ CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8_##CN() - -// Special case for C4 load, shuffle and bilinear interpolation -#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(ofs) \ +// Special case for C4 shuffle, interpolation and store +// SIMD128, nearest +#define CV_WARP_SIMD128_NEAREST_SHUFFLE_INTER_8UC4_I(ofs) \ + const uint8_t *srcptr##ofs = src + addr[i+ofs]; \ + v_uint32 i##ofs##_pix0 = vx_load_expand_q(srcptr##ofs); +#define CV_WARP_SIMD128_NEAREST_SHUFFLE_INTER_16UC4_I(ofs) \ + const uint16_t *srcptr##ofs = src + addr[i+ofs]; \ + v_uint32 i##ofs##_pix0 = vx_load_expand(srcptr##ofs); +#define CV_WARP_SIMD128_NEAREST_SHUFFLE_INTER_32FC4_I(ofs) \ + const float *srcptr##ofs = src + addr[i+ofs]; \ + v_float32 i##ofs##_pix0 = vx_load(srcptr##ofs); +#define CV_WARP_SIMD128_NEAREST_STORE_8UC4_I() \ + v_pack_store(dstptr + 4*(x+i), v_pack(i0_pix0, i1_pix0)); \ + v_pack_store(dstptr + 4*(x+i+2), v_pack(i2_pix0, i3_pix0)); +#define CV_WARP_SIMD128_NEAREST_STORE_16UC4_I() \ + vx_store(dstptr + 4*(x+i), v_pack(i0_pix0, i1_pix0)); \ + vx_store(dstptr + 4*(x+i+2), v_pack(i2_pix0, i3_pix0)); +#define CV_WARP_SIMD128_NEAREST_STORE_32FC4_I() \ + vx_store(dstptr + 4*(x+i), i0_pix0); \ + vx_store(dstptr + 4*(x+i+1), i1_pix0); \ + vx_store(dstptr + 4*(x+i+2), i2_pix0); \ + vx_store(dstptr + 4*(x+i+3), i3_pix0); +// SIMD128, bilinear +#define CV_WARP_SIMD128_LINEAR_SHUFFLE_INTER_8UC4_I(ofs) \ const uint8_t *srcptr##ofs = src + addr[i+ofs]; \ v_float32 i##ofs##_pix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(srcptr##ofs))); \ v_float32 i##ofs##_pix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(srcptr##ofs+4))); \ @@ -569,7 +626,7 @@ i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \ i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \ i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0); -#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \ +#define CV_WARP_SIMD128_LINEAR_SHUFFLE_INTER_16UC4_I(ofs) \ const uint16_t *srcptr##ofs = src + addr[i+ofs]; \ v_float32 i##ofs##_pix0 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs))); \ v_float32 i##ofs##_pix1 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+4))); \ @@ -580,7 +637,7 @@ i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \ i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \ i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0); -#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \ +#define CV_WARP_SIMD128_LINEAR_SHUFFLE_INTER_32FC4_I(ofs) \ const float *srcptr##ofs = src + addr[i+ofs]; \ v_float32 i##ofs##_pix0 = vx_load(srcptr##ofs); \ v_float32 i##ofs##_pix1 = vx_load(srcptr##ofs+4); \ @@ -591,30 +648,59 @@ i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \ i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \ i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0); -#define CV_WARP_SIMD128_STORE_8UC4_I() \ +#define CV_WARP_SIMD128_LINEAR_STORE_8UC4_I() \ v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \ v_uint16 i23_pix = 
v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \ v_pack_store(dstptr + 4*(x+i), i01_pix); \ v_pack_store(dstptr + 4*(x+i+2), i23_pix); -#define CV_WARP_SIMD128_STORE_16UC4_I() \ +#define CV_WARP_SIMD128_LINEAR_STORE_16UC4_I() \ v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \ v_uint16 i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \ vx_store(dstptr + 4*(x+i), i01_pix); \ vx_store(dstptr + 4*(x+i+2), i23_pix); -#define CV_WARP_SIMD128_STORE_32FC4_I() \ - vx_store(dstptr + 4*(x+i), i0_pix0); \ - vx_store(dstptr + 4*(x+i)+4, i1_pix0); \ - vx_store(dstptr + 4*(x+i)+8, i2_pix0); \ - vx_store(dstptr + 4*(x+i)+12, i3_pix0); -#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(DEPTH) \ +#define CV_WARP_SIMD128_LINEAR_STORE_32FC4_I() \ + vx_store(dstptr + 4*(x+i), i0_pix0); \ + vx_store(dstptr + 4*(x+i+1), i1_pix0); \ + vx_store(dstptr + 4*(x+i+2), i2_pix0); \ + vx_store(dstptr + 4*(x+i+3), i3_pix0); +#define CV_WARP_SIMD128_SHUFFLE_INTER_STORE_C4(INTER, DEPTH) \ for (int i = 0; i < uf; i+=vlanes_32) { \ - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \ - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \ - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \ - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \ - CV_WARP_SIMD128_STORE_##DEPTH##C4_I(); \ + CV_WARP_SIMD128_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(0); \ + CV_WARP_SIMD128_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(1); \ + CV_WARP_SIMD128_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(2); \ + CV_WARP_SIMD128_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(3); \ + CV_WARP_SIMD128_##INTER##_STORE_##DEPTH##C4_I(); \ } -#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \ +// SIMD256, nearest +#define CV_WARP_SIMD256_NEAREST_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \ + const uint8_t *srcptr##ofs0 = src + addr[i+ofs0]; \ + const uint8_t *srcptr##ofs1 = src + addr[i+ofs1]; \ + v_uint32 i##ofs0##_pix0x = v256_load_expand_q(srcptr##ofs0); \ + v_uint32 i##ofs1##_pix0x = v256_load_expand_q(srcptr##ofs1); \ + v_uint32 i##ofs0##ofs1##_pix00 = v_combine_low(i##ofs0##_pix0x, i##ofs1##_pix0x); +#define CV_WARP_SIMD256_NEAREST_SHUFFLE_INTER_16UC4_I(ofs0, ofs1) \ + const uint16_t *srcptr##ofs0 = src + addr[i+ofs0]; \ + const uint16_t *srcptr##ofs1 = src + addr[i+ofs1]; \ + v_uint32 i##ofs0##_pix0x = v256_load_expand(srcptr##ofs0); \ + v_uint32 i##ofs1##_pix0x = v256_load_expand(srcptr##ofs1); \ + v_uint32 i##ofs0##ofs1##_pix00 = v_combine_low(i##ofs0##_pix0x, i##ofs1##_pix0x); +#define CV_WARP_SIMD256_NEAREST_SHUFFLE_INTER_32FC4_I(ofs0, ofs1) \ + const float *srcptr##ofs0 = src + addr[i+ofs0]; \ + const float *srcptr##ofs1 = src + addr[i+ofs1]; \ + v_float32 i##ofs0##ofs1##_fpix00 = vx_load_halves(srcptr##ofs0, srcptr##ofs1); +#define CV_WARP_SIMD256_NEAREST_STORE_8UC4_I() \ + v_pack_store(dstptr + 4*(x+i), v_pack(i01_pix00, i23_pix00)); \ + v_pack_store(dstptr + 4*(x+i+4), v_pack(i45_pix00, i67_pix00)); +#define CV_WARP_SIMD256_NEAREST_STORE_16UC4_I() \ + vx_store(dstptr + 4*(x+i), v_pack(i01_pix00, i23_pix00)); \ + vx_store(dstptr + 4*(x+i+4), v_pack(i45_pix00, i67_pix00)); +#define CV_WARP_SIMD256_NEAREST_STORE_32FC4_I() \ + vx_store(dstptr + 4*(x+i), i01_fpix00); \ + vx_store(dstptr + 4*(x+i)+8, i23_fpix00); \ + vx_store(dstptr + 4*(x+i)+16, i45_fpix00); \ + vx_store(dstptr + 4*(x+i)+24, i67_fpix00); +// SIMD256, bilinear +#define CV_WARP_SIMD256_LINEAR_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \ + const uint8_t *srcptr##ofs0 = src + addr[i+ofs0]; \ + const uint8_t *srcptr##ofs1 = src + addr[i+ofs1]; \ + v_int32 i##ofs0##_pix01 = 
v_reinterpret_as_s32(v256_load_expand_q(srcptr##ofs0)), \ @@ -635,8 +721,9 @@ i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \ i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \ i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \ - i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); -#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_16UC4_I(ofs0, ofs1) \ + i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \ + auto i##ofs0##ofs1##_pix00 = v_round(i##ofs0##ofs1##_fpix00); +#define CV_WARP_SIMD256_LINEAR_SHUFFLE_INTER_16UC4_I(ofs0, ofs1) \ const uint16_t *srcptr##ofs0 = src + addr[i+ofs0]; \ const uint16_t *srcptr##ofs1 = src + addr[i+ofs1]; \ v_int32 i##ofs0##_pix01 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs0)), \ @@ -657,8 +744,9 @@ i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \ i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \ i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \ - i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); -#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_32FC4_I(ofs0, ofs1) \ + i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \ + auto i##ofs0##ofs1##_pix00 = v_round(i##ofs0##ofs1##_fpix00); +#define CV_WARP_SIMD256_LINEAR_SHUFFLE_INTER_32FC4_I(ofs0, ofs1) \ const float *srcptr##ofs0 = src + addr[i+ofs0]; \ const float *srcptr##ofs1 = src + addr[i+ofs1]; \ v_float32 i##ofs0##_fpix01 = v256_load(srcptr##ofs0), \ @@ -678,30 +766,48 @@ i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \ i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \ i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); -#define CV_WARP_SIMD256_STORE_8UC4_I() \ - auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \ - v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \ - auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \ - v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix)); -#define CV_WARP_SIMD256_STORE_16UC4_I() \ - auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \ - vx_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \ - auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \ - vx_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix)); -#define CV_WARP_SIMD256_STORE_32FC4_I() \ +#define CV_WARP_SIMD256_LINEAR_STORE_8UC4_I() \ + v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix00, i23_pix00)); \ + v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix00, i67_pix00)); +#define CV_WARP_SIMD256_LINEAR_STORE_16UC4_I() \ + vx_store(dstptr + 4*(x+i), v_pack_u(i01_pix00, i23_pix00)); \ + vx_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix00, i67_pix00)); +#define 
CV_WARP_SIMD256_LINEAR_STORE_32FC4_I() \ vx_store(dstptr + 4*(x+i), i01_fpix00); \ vx_store(dstptr + 4*(x+i)+8, i23_fpix00); \ vx_store(dstptr + 4*(x+i)+16, i45_fpix00); \ vx_store(dstptr + 4*(x+i)+24, i67_fpix00); -#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(DEPTH) \ +#define CV_WARP_SIMD256_SHUFFLE_INTER_STORE_C4(INTER, DEPTH) \ for (int i = 0; i < uf; i+=vlanes_32) { \ - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0, 1); \ - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2, 3); \ - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(4, 5); \ - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(6, 7); \ - CV_WARP_SIMD256_STORE_##DEPTH##C4_I(); \ + CV_WARP_SIMD256_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(0, 1) \ + CV_WARP_SIMD256_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(2, 3) \ + CV_WARP_SIMD256_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(4, 5) \ + CV_WARP_SIMD256_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(6, 7) \ + CV_WARP_SIMD256_##INTER##_STORE_##DEPTH##C4_I() \ } -#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(ofs) \ +// SIMD_SCALABLE (SIMDX), nearest +#define CV_WARP_SIMDX_NEAREST_SHUFFLE_INTER_8UC4_I(ofs) \ + const uint8_t *srcptr##ofs = src + addr[i+ofs]; \ + v_uint32 i##ofs##_pix0 = v_load_expand_q<4>(srcptr##ofs); +#define CV_WARP_SIMDX_NEAREST_SHUFFLE_INTER_16UC4_I(ofs) \ + const uint16_t *srcptr##ofs = src + addr[i+ofs]; \ + v_uint32 i##ofs##_pix0 = v_load_expand<4>(srcptr##ofs); +#define CV_WARP_SIMDX_NEAREST_SHUFFLE_INTER_32FC4_I(ofs) \ + const float *srcptr##ofs = src + addr[i+ofs]; \ + v_float32 i##ofs##_fpix0 = v_load<4>(srcptr##ofs); +#define CV_WARP_SIMDX_NEAREST_STORE_8UC4_I() \ + v_pack_store<8>(dstptr + 4*(x+i), v_pack<4>(i0_pix0, i1_pix0)); \ + v_pack_store<8>(dstptr + 4*(x+i+2), v_pack<4>(i2_pix0, i3_pix0)); +#define CV_WARP_SIMDX_NEAREST_STORE_16UC4_I() \ + v_store<8>(dstptr + 4*(x+i), v_pack<4>(i0_pix0, i1_pix0)); \ + v_store<8>(dstptr + 4*(x+i+2), v_pack<4>(i2_pix0, i3_pix0)); +#define CV_WARP_SIMDX_NEAREST_STORE_32FC4_I() \ + v_store<4>(dstptr + 4*(x+i), i0_fpix0); \ + v_store<4>(dstptr + 4*(x+i)+4, i1_fpix0); \ + v_store<4>(dstptr + 4*(x+i)+8, i2_fpix0); \ + v_store<4>(dstptr + 4*(x+i)+12, i3_fpix0); +// SIMD_SCALABLE (SIMDX), bilinear +#define CV_WARP_SIMDX_LINEAR_SHUFFLE_INTER_8UC4_I(ofs) \ const uint8_t *srcptr##ofs = src + addr[i+ofs]; \ v_float32 i##ofs##_fpix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q<4>(srcptr##ofs))), \ i##ofs##_fpix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q<4>(srcptr##ofs+4))), \ @@ -711,8 +817,9 @@ i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \ i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \ i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \ - i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0); -#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \ + i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0); \ + auto i##ofs##_pix0 = v_round(i##ofs##_fpix0); +#define CV_WARP_SIMDX_LINEAR_SHUFFLE_INTER_16UC4_I(ofs) \ const uint16_t *srcptr##ofs = src + addr[i+ofs]; \ v_float32 i##ofs##_fpix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs))), \ i##ofs##_fpix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+4))), \ @@ -722,8 +829,9 @@ i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \ i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \ i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, 
i##ofs##_fpix2), i##ofs##_fpix2); \ - i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0); -#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \ + i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0); \ + auto i##ofs##_pix0 = v_round(i##ofs##_fpix0); +#define CV_WARP_SIMDX_LINEAR_SHUFFLE_INTER_32FC4_I(ofs) \ const float *srcptr##ofs = src + addr[i+ofs]; \ v_float32 i##ofs##_fpix0 = v_load<4>(srcptr##ofs), \ i##ofs##_fpix1 = v_load<4>(srcptr##ofs+4), \ @@ -734,26 +842,25 @@ i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \ i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \ i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0); -#define CV_WARP_SIMDX_STORE_8UC4_I() \ - auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \ - i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \ - v_pack_store<8>(dstptr + 4*(x+i), i01_pix); \ - v_pack_store<8>(dstptr + 4*(x+i+2), i23_pix); -#define CV_WARP_SIMDX_STORE_16UC4_I() \ - auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \ - i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \ - v_store<8>(dstptr + 4*(x+i), i01_pix); \ - v_store<8>(dstptr + 4*(x+i+2), i23_pix); -#define CV_WARP_SIMDX_STORE_32FC4_I() \ +#define CV_WARP_SIMDX_LINEAR_STORE_8UC4_I() \ + v_pack_store<8>(dstptr + 4*(x+i), v_pack_u<4>(i0_pix0, i1_pix0)); \ + v_pack_store<8>(dstptr + 4*(x+i+2), v_pack_u<4>(i2_pix0, i3_pix0)); +#define CV_WARP_SIMDX_LINEAR_STORE_16UC4_I() \ + v_store<8>(dstptr + 4*(x+i), v_pack_u<4>(i0_pix0, i1_pix0)); \ + v_store<8>(dstptr + 4*(x+i+2), v_pack_u<4>(i2_pix0, i3_pix0)); +#define CV_WARP_SIMDX_LINEAR_STORE_32FC4_I() \ v_store<4>(dstptr + 4*(x+i), i0_fpix0); \ v_store<4>(dstptr + 4*(x+i)+4, i1_fpix0); \ v_store<4>(dstptr + 4*(x+i)+8, i2_fpix0); \ v_store<4>(dstptr + 4*(x+i)+12, i3_fpix0); -#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(DEPTH) \ +#define CV_WARP_SIMDX_SHUFFLE_INTER_STORE_C4(INTER, DEPTH) \ for (int i = 0; i < uf; i+=4) { \ - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \ - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \ - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \ - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \ - CV_WARP_SIMDX_STORE_##DEPTH##C4_I(); \ + CV_WARP_SIMDX_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(0); \ + CV_WARP_SIMDX_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(1); \ + CV_WARP_SIMDX_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(2); \ + CV_WARP_SIMDX_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(3); \ + CV_WARP_SIMDX_##INTER##_STORE_##DEPTH##C4_I(); \ } + +#define CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD, INTER, DEPTH) \ + CV_WARP_##SIMD##_SHUFFLE_INTER_STORE_C4(INTER, DEPTH) diff --git a/modules/imgproc/src/warp_kernels.simd.hpp b/modules/imgproc/src/warp_kernels.simd.hpp index 41f994bc24..d48609656d 100644 --- a/modules/imgproc/src/warp_kernels.simd.hpp +++ b/modules/imgproc/src/warp_kernels.simd.hpp @@ -7,62 +7,66 @@ #include "warp_common.hpp" #include "opencv2/core/hal/intrin.hpp" -#define CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1() \ +#define CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1() \ v_float32 dst_x0 = vx_load(start_indices.data()); \ v_float32 dst_x1 = v_add(dst_x0, vx_setall_f32(float(vlanes_32))); \ v_float32 M0 = vx_setall_f32(M[0]), \ M3 = vx_setall_f32(M[3]); \ v_float32 M_x = vx_setall_f32(static_cast(y * M[1] + M[2])), \ M_y = 
vx_setall_f32(static_cast(y * M[4] + M[5])); -#define CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1() \ - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1() \ +#define CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1() \ + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1() \ v_float32 M6 = vx_setall_f32(M[6]); \ v_float32 M_w = vx_setall_f32(static_cast(y * M[7] + M[8])); -#define CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1() \ +#define CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1() \ v_float32 dst_x0 = vx_load(start_indices.data()); \ v_float32 dst_x1 = v_add(dst_x0, vx_setall_f32(float(vlanes_32))); \ v_float32 dst_y = vx_setall_f32(float(y)); -#define CV_WARP_LINEAR_VECTOR_GET_ADDR_C1() \ +#define CV_WARP_VECTOR_GET_ADDR_C1() \ v_int32 addr_0 = v_fma(v_srcstep, src_iy0, src_ix0), \ addr_1 = v_fma(v_srcstep, src_iy1, src_ix1); -#define CV_WARP_LINEAR_VECTOR_GET_ADDR_C3() \ +#define CV_WARP_VECTOR_GET_ADDR_C3() \ v_int32 addr_0 = v_fma(v_srcstep, src_iy0, v_mul(src_ix0, three)), \ addr_1 = v_fma(v_srcstep, src_iy1, v_mul(src_ix1, three)); -#define CV_WARP_LINEAR_VECTOR_GET_ADDR_C4() \ +#define CV_WARP_VECTOR_GET_ADDR_C4() \ v_int32 addr_0 = v_fma(v_srcstep, src_iy0, v_mul(src_ix0, four)), \ addr_1 = v_fma(v_srcstep, src_iy1, v_mul(src_ix1, four)); -#define CV_WARP_LINEAR_VECTOR_GET_ADDR(CN) \ - CV_WARP_LINEAR_VECTOR_GET_ADDR_##CN() \ +#define CV_WARP_VECTOR_GET_ADDR(CN) \ + CV_WARP_VECTOR_GET_ADDR_##CN() \ vx_store(addr, addr_0); \ vx_store(addr + vlanes_32, addr_1); -#define CV_WARP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD(CN) \ - v_int32 src_ix0 = v_floor(src_x0), \ - src_iy0 = v_floor(src_y0), \ - src_ix1 = v_floor(src_x1), \ - src_iy1 = v_floor(src_y1); \ +#define CV_WARP_VECTOR_LINEAR_COMPUTE_COORD() \ + v_int32 src_ix0 = v_floor(src_x0), src_iy0 = v_floor(src_y0); \ + v_int32 src_ix1 = v_floor(src_x1), src_iy1 = v_floor(src_y1); \ + src_x0 = v_sub(src_x0, v_cvt_f32(src_ix0)); \ + src_y0 = v_sub(src_y0, v_cvt_f32(src_iy0)); \ + src_x1 = v_sub(src_x1, v_cvt_f32(src_ix1)); \ + src_y1 = v_sub(src_y1, v_cvt_f32(src_iy1)); +#define CV_WARP_VECTOR_NEAREST_COMPUTE_COORD() \ + v_int32 src_ix0 = v_round(src_x0), src_iy0 = v_round(src_y0); \ + v_int32 src_ix1 = v_round(src_x1), src_iy1 = v_round(src_y1); \ + +#define CV_WARP_VECTOR_COMPUTE_MAPPED_COORD(INTER, CN) \ + CV_WARP_VECTOR_##INTER##_COMPUTE_COORD() \ v_uint32 mask_0 = v_lt(v_reinterpret_as_u32(src_ix0), inner_scols), \ mask_1 = v_lt(v_reinterpret_as_u32(src_ix1), inner_scols); \ mask_0 = v_and(mask_0, v_lt(v_reinterpret_as_u32(src_iy0), inner_srows)); \ mask_1 = v_and(mask_1, v_lt(v_reinterpret_as_u32(src_iy1), inner_srows)); \ v_uint16 inner_mask = v_pack(mask_0, mask_1); \ - src_x0 = v_sub(src_x0, v_cvt_f32(src_ix0)); \ - src_y0 = v_sub(src_y0, v_cvt_f32(src_iy0)); \ - src_x1 = v_sub(src_x1, v_cvt_f32(src_ix1)); \ - src_y1 = v_sub(src_y1, v_cvt_f32(src_iy1)); \ - CV_WARP_LINEAR_VECTOR_GET_ADDR(CN); + CV_WARP_VECTOR_GET_ADDR(CN) -#define CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(CN) \ +#define CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(INTER, CN) \ v_float32 src_x0 = v_fma(M0, dst_x0, M_x), \ src_y0 = v_fma(M3, dst_x0, M_y), \ src_x1 = v_fma(M0, dst_x1, M_x), \ src_y1 = v_fma(M3, dst_x1, M_y); \ dst_x0 = v_add(dst_x0, delta); \ dst_x1 = v_add(dst_x1, delta); \ - CV_WARP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD(CN) + CV_WARP_VECTOR_COMPUTE_MAPPED_COORD(INTER, CN) -#define CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(CN) \ +#define CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(INTER, CN) \ v_float32 src_x0 = v_fma(M0, dst_x0, 
M_x), \ src_y0 = v_fma(M3, dst_x0, M_y), \ src_w0 = v_fma(M6, dst_x0, M_w), \ @@ -75,9 +79,9 @@ src_y1 = v_div(src_y1, src_w1); \ dst_x0 = v_add(dst_x0, delta); \ dst_x1 = v_add(dst_x1, delta); \ - CV_WARP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD(CN) + CV_WARP_VECTOR_COMPUTE_MAPPED_COORD(INTER, CN) -#define CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(CN) \ +#define CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(INTER, CN) \ v_float32 src_x0, src_y0, \ src_x1, src_y1; \ if (map1 == map2) { \ @@ -97,11 +101,102 @@ dst_x0 = v_add(dst_x0, delta); \ dst_x1 = v_add(dst_x1, delta); \ } \ - CV_WARP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD(CN) + CV_WARP_VECTOR_COMPUTE_MAPPED_COORD(INTER, CN) namespace cv{ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +void warpAffineNearestInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[6], int border_type, const double border_value[4]); +void warpAffineNearestInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[6], int border_type, const double border_value[4]); +void warpAffineNearestInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[6], int border_type, const double border_value[4]); +void warpAffineNearestInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[6], int border_type, const double border_value[4]); +void warpAffineNearestInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[6], int border_type, const double border_value[4]); +void warpAffineNearestInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[6], int border_type, const double border_value[4]); +void warpAffineNearestInvoker_32FC1(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[6], int border_type, const double border_value[4]); +void warpAffineNearestInvoker_32FC3(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[6], int border_type, const double border_value[4]); +void warpAffineNearestInvoker_32FC4(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[6], int border_type, const double border_value[4]); +void warpPerspectiveNearestInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[9], int border_type, const double border_value[4]); +void warpPerspectiveNearestInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[9], int border_type, const double border_value[4]); +void warpPerspectiveNearestInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int 
dst_cols, + const double M[9], int border_type, const double border_value[4]); +void warpPerspectiveNearestInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[9], int border_type, const double border_value[4]); +void warpPerspectiveNearestInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[9], int border_type, const double border_value[4]); +void warpPerspectiveNearestInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[9], int border_type, const double border_value[4]); +void warpPerspectiveNearestInvoker_32FC1(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[9], int border_type, const double border_value[4]); +void warpPerspectiveNearestInvoker_32FC3(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[9], int border_type, const double border_value[4]); +void warpPerspectiveNearestInvoker_32FC4(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double M[9], int border_type, const double border_value[4]); +void remapNearestInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative); +void remapNearestInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative); +void remapNearestInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative); +void remapNearestInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative); +void remapNearestInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative); +void remapNearestInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative); +void remapNearestInvoker_32FC1(const float *src_data, size_t 
src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative); +void remapNearestInvoker_32FC3(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative); +void remapNearestInvoker_32FC4(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative); + void warpAffineLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, const double M[6], int border_type, const double border_value[4]); @@ -260,6 +355,2692 @@ static inline int borderInterpolate_fast( int p, int len, int borderType ) } } // anonymous +void warpAffineNearestInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[6], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step, dststep = dst_step; + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[6]; + for (int i = 0; i < 6; i++) { + M[i] = static_cast(dM[i]); + } + uint8_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint8_t pixbuf[max_uf]; + + uint8_t bvalbuf[max_uf]; + for (int i = 0; i < uf; i++) { + bvalbuf[i] = bval[0]; + } + v_uint8 bval_v0 = vx_load_low(&bvalbuf[0]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint8_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C1, 8U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C1, 8U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C1, 8U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C1, 16U, 8U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx = x*M[0] + y*M[1] + M[2]; + float sy = x*M[3] + y*M[4] + M[5]; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 8U); + CV_WARP_SCALAR_STORE(NEAREST, C1, 8U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} + +void warpAffineNearestInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[6], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step, dststep = dst_step; + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[6]; + for (int i = 0; i < 6; i++) { + M[i] = static_cast(dM[i]); + } + uint8_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), three = vx_setall_s32(3); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint8_t pixbuf[max_uf*3]; + + uint8_t bvalbuf[max_uf*3]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*3] = bval[0]; + bvalbuf[i*3+1] = bval[1]; + bvalbuf[i*3+2] = bval[2]; + } + v_uint8 bval_v0 = vx_load_low(&bvalbuf[0]); + v_uint8 bval_v1 = vx_load_low(&bvalbuf[uf]); + v_uint8 bval_v2 = vx_load_low(&bvalbuf[uf*2]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint8_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C3, 8U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C3, 8U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C3, 8U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 16U, 8U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx = x*M[0] + y*M[1] + M[2]; + float sy = x*M[3] + y*M[4] + M[5]; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C3, 8U); + CV_WARP_SCALAR_STORE(NEAREST, C3, 8U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} + +void warpAffineNearestInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[6], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step, dststep = dst_step; + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[6]; + for (int i = 0; i < 6; i++) { + M[i] = static_cast(dM[i]); + } + uint8_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), four = vx_setall_s32(4); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + + uint8_t bvalbuf[max_uf*4]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*4] = bval[0]; + bvalbuf[i*4+1] = bval[1]; + bvalbuf[i*4+2] = bval[2]; + bvalbuf[i*4+3] = bval[3]; + } + v_uint8 bval_v0 = vx_load_low(&bvalbuf[0]); + v_uint8 bval_v1 = vx_load_low(&bvalbuf[uf]); + v_uint8 bval_v2 = vx_load_low(&bvalbuf[uf*2]); + v_uint8 bval_v3 = vx_load_low(&bvalbuf[uf*3]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint8_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C4); + if (v_reduce_min(inner_mask) != 0) { + #if CV_SIMD256 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 8U); + #elif CV_SIMD128 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, NEAREST, 8U); + #elif CV_SIMD_SCALABLE + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, NEAREST, 8U); + #endif + } else { + uint8_t pixbuf[max_uf*4]; + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C4, 8U); + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C4, 8U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C4, 16U, 8U); + } + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx = x*M[0] + y*M[1] + M[2]; + float sy = x*M[3] + y*M[4] + M[5]; + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C4, 8U); + + CV_WARP_SCALAR_STORE(NEAREST, C4, 8U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpAffineNearestInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[6], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(uint16_t), dststep = dst_step/sizeof(uint16_t); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[6]; + for (int i = 0; i < 6; i++) { + M[i] = static_cast(dM[i]); + } + uint16_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint16_t pixbuf[max_uf]; + + uint16_t bvalbuf[max_uf]; + for (int i = 0; i < uf; i++) { + bvalbuf[i] = bval[0]; + } + v_uint16 bval_v0 = vx_load(&bvalbuf[0]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint16_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C1, 16U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C1, 16U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C1, 16U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C1, 16U, 16U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx = x*M[0] + y*M[1] + M[2]; + float sy = x*M[3] + y*M[4] + M[5]; + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 16U); + + CV_WARP_SCALAR_STORE(NEAREST, C1, 16U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} + +void warpAffineNearestInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[6], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(uint16_t), dststep = dst_step/sizeof(uint16_t); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[6]; + for (int i = 0; i < 6; i++) { + M[i] = static_cast(dM[i]); + } + uint16_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), three = vx_setall_s32(3); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint16_t pixbuf[max_uf*3]; + + uint16_t bvalbuf[max_uf*3]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*3] = bval[0]; + bvalbuf[i*3+1] = bval[1]; + bvalbuf[i*3+2] = bval[2]; + } + v_uint16 bval_v0 = vx_load(&bvalbuf[0]); + v_uint16 bval_v1 = vx_load(&bvalbuf[uf]); + v_uint16 bval_v2 = vx_load(&bvalbuf[uf*2]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint16_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C3, 16U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C3, 16U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C3, 16U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 16U, 16U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx = x*M[0] + y*M[1] + M[2]; + float sy = x*M[3] + y*M[4] + M[5]; + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C3, 16U); + + CV_WARP_SCALAR_STORE(NEAREST, C3, 16U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpAffineNearestInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[6], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(uint16_t), dststep = dst_step/sizeof(uint16_t); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[6]; + for (int i = 0; i < 6; i++) { + M[i] = static_cast(dM[i]); + } + uint16_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), four = vx_setall_s32(4); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + + uint16_t bvalbuf[max_uf*4]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*4] = bval[0]; + bvalbuf[i*4+1] = bval[1]; + bvalbuf[i*4+2] = bval[2]; + bvalbuf[i*4+3] = bval[3]; + } + v_uint16 bval_v0 = vx_load(&bvalbuf[0]); + v_uint16 bval_v1 = vx_load(&bvalbuf[uf]); + v_uint16 bval_v2 = vx_load(&bvalbuf[uf*2]); + v_uint16 bval_v3 = vx_load(&bvalbuf[uf*3]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint16_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C4); + if (v_reduce_min(inner_mask) != 0) { + #if CV_SIMD256 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 16U); + #elif CV_SIMD128 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, NEAREST, 16U); + #elif CV_SIMD_SCALABLE + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, NEAREST, 16U); + #endif + } else { + uint16_t pixbuf[max_uf*4]; + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C4, 16U); + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C4, 16U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C4, 16U, 16U); + } + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx = x*M[0] + y*M[1] + M[2]; + float sy = x*M[3] + y*M[4] + M[5]; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C4, 16U); + CV_WARP_SCALAR_STORE(NEAREST, C4, 16U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpAffineNearestInvoker_32FC1(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[6], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(float), dststep = dst_step/sizeof(float); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[6]; + for (int i = 0; i < 6; i++) { + M[i] = static_cast(dM[i]); + } + float bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + float pixbuf[max_uf]; + + float bvalbuf[max_uf]; + for (int i = 0; i < uf; i++) { + bvalbuf[i] = bval[0]; + } + v_float32 bval_v0_l = vx_load(&bvalbuf[0]); + v_float32 bval_v0_h = vx_load(&bvalbuf[vlanes_32]); +#endif + + for (int y = r.start; y < r.end; y++) { + float* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C1, 32F); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C1, 32F); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C1, 32F, 32F); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C1, 32F, 32F); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx = x*M[0] + y*M[1] + M[2]; + float sy = x*M[3] + y*M[4] + M[5]; + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 32F); + + CV_WARP_SCALAR_STORE(NEAREST, C1, 32F); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpAffineNearestInvoker_32FC3(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[6], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(float), dststep = dst_step/sizeof(float); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[6]; + for (int i = 0; i < 6; i++) { + M[i] = static_cast(dM[i]); + } + float bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), three = vx_setall_s32(3); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + float pixbuf[max_uf*3]; + + float bvalbuf[max_uf*3]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*3] = bval[0]; + bvalbuf[i*3+1] = bval[1]; + bvalbuf[i*3+2] = bval[2]; + } + v_float32 bval_v0_l = vx_load(&bvalbuf[0]); + v_float32 bval_v0_h = vx_load(&bvalbuf[vlanes_32]); + v_float32 bval_v1_l = vx_load(&bvalbuf[uf]); + v_float32 bval_v1_h = vx_load(&bvalbuf[uf+vlanes_32]); + v_float32 bval_v2_l = vx_load(&bvalbuf[uf*2]); + v_float32 bval_v2_h = vx_load(&bvalbuf[uf*2+vlanes_32]); +#endif + + for (int y = r.start; y < r.end; y++) { + float* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C3, 32F); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C3, 32F); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C3, 32F, 32F); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 32F, 32F); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx = x*M[0] + y*M[1] + M[2]; + float sy = x*M[3] + y*M[4] + M[5]; + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C3, 32F); + + CV_WARP_SCALAR_STORE(NEAREST, C3, 32F); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpAffineNearestInvoker_32FC4(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[6], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(float), dststep = dst_step/sizeof(float); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[6]; + for (int i = 0; i < 6; i++) { + M[i] = static_cast(dM[i]); + } + float bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), four = vx_setall_s32(4); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + + float bvalbuf[max_uf*4]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*4] = bval[0]; + bvalbuf[i*4+1] = bval[1]; + bvalbuf[i*4+2] = bval[2]; + bvalbuf[i*4+3] = bval[3]; + } + v_float32 bval_v0_l = vx_load(&bvalbuf[0]); + v_float32 bval_v0_h = vx_load(&bvalbuf[vlanes_32]); + v_float32 bval_v1_l = vx_load(&bvalbuf[uf]); + v_float32 bval_v1_h = vx_load(&bvalbuf[uf+vlanes_32]); + v_float32 bval_v2_l = vx_load(&bvalbuf[uf*2]); + v_float32 bval_v2_h = vx_load(&bvalbuf[uf*2+vlanes_32]); + v_float32 bval_v3_l = vx_load(&bvalbuf[uf*3]); + v_float32 bval_v3_h = vx_load(&bvalbuf[uf*3+vlanes_32]); +#endif + + for (int y = r.start; y < r.end; y++) { + float* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C4); + if (v_reduce_min(inner_mask) != 0) { + #if CV_SIMD256 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 32F); + #elif CV_SIMD128 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, NEAREST, 32F); + #elif CV_SIMD_SCALABLE + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, NEAREST, 32F); + #endif + } else { + float pixbuf[max_uf*4]; + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C4, 32F); + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C4, 32F, 32F); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C4, 32F, 32F); + } + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx = x*M[0] + y*M[1] + M[2]; + float sy = x*M[3] + y*M[4] + M[5]; + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C4, 32F); + + CV_WARP_SCALAR_STORE(NEAREST, C4, 32F); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} + +void warpPerspectiveNearestInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[9], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step, dststep = dst_step; + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[9]; + for (int i = 0; i < 9; i++) { + M[i] = static_cast(dM[i]); + } + uint8_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? 
BORDER_REPLICATE : border_type;
+        int border_type_y = border_type != BORDER_CONSTANT &&
+                            border_type != BORDER_TRANSPARENT &&
+                            srcrows <= 1 ? BORDER_REPLICATE : border_type;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        constexpr int max_vlanes_32{VTraits<v_float32>::max_nlanes};
+        constexpr int max_uf{max_vlanes_32*2};
+        int vlanes_32 = VTraits<v_float32>::vlanes();
+        // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2
+        int uf = vlanes_32 * 2;
+
+        std::array<float, max_vlanes_32> start_indices;
+        std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
+
+        v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)),
+                 inner_scols = vx_setall_u32((unsigned)srccols - 1),
+                 outer_srows = vx_setall_u32((unsigned)srcrows + 1),
+                 outer_scols = vx_setall_u32((unsigned)srccols + 1);
+        v_float32 delta = vx_setall_f32(static_cast<float>(uf));
+        v_int32 one = vx_setall_s32(1);
+        v_int32 v_srcstep = vx_setall_s32(int(srcstep));
+        int32_t addr[max_uf],
+                src_ix[max_uf],
+                src_iy[max_uf];
+        uint8_t pixbuf[max_uf];
+
+        uint8_t bvalbuf[max_uf];
+        for (int i = 0; i < uf; i++) {
+            bvalbuf[i] = bval[0];
+        }
+        v_uint8 bval_v0 = vx_load_low(&bvalbuf[0]);
+#endif
+
+        for (int y = r.start; y < r.end; y++) {
+            uint8_t* dstptr = dst + y*dststep;
+            int x = 0;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1();
+            for (; x <= dstcols - uf; x += uf) {
+                // [TODO] apply halide trick
+                CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C1);
+                if (v_reduce_min(inner_mask) != 0) {
+                    CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C1, 8U);
+                } else {
+                    CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C1, 8U);
+                }
+                CV_WARP_VECTOR_INTER_LOAD(NEAREST, C1, 8U, 16U);
+                CV_WARP_VECTOR_INTER_STORE(NEAREST, C1, 16U, 8U);
+            }
+#endif // (CV_SIMD || CV_SIMD_SCALABLE)
+
+            for (; x < dstcols; x++) {
+                float w = x*M[6] + y*M[7] + M[8];
+                float sx = (x*M[0] + y*M[1] + M[2]) / w;
+                float sy = (x*M[3] + y*M[4] + M[5]) / w;
+                CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 8U);
+                CV_WARP_SCALAR_STORE(NEAREST, C1, 8U);
+            }
+        }
+    };
+    parallel_for_(Range(0, dst_rows), worker);
+}
+void warpPerspectiveNearestInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols,
+                                        uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols,
+                                        const double dM[9], int border_type, const double border_value[4]) {
+    auto worker = [&](const Range &r) {
+        CV_INSTRUMENT_REGION();
+
+        const auto *src = src_data;
+        auto *dst = dst_data;
+        size_t srcstep = src_step, dststep = dst_step;
+        int srccols = src_cols, srcrows = src_rows;
+        int dstcols = dst_cols;
+        float M[9];
+        for (int i = 0; i < 9; i++) {
+            M[i] = static_cast<float>(dM[i]);
+        }
+        uint8_t bval[] = {
+            saturate_cast<uint8_t>(border_value[0]),
+            saturate_cast<uint8_t>(border_value[1]),
+            saturate_cast<uint8_t>(border_value[2]),
+            saturate_cast<uint8_t>(border_value[3]),
+        };
+        int border_type_x = border_type != BORDER_CONSTANT &&
+                            border_type != BORDER_TRANSPARENT &&
+                            srccols <= 1 ? BORDER_REPLICATE : border_type;
+        int border_type_y = border_type != BORDER_CONSTANT &&
+                            border_type != BORDER_TRANSPARENT &&
+                            srcrows <= 1 ?
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), three = vx_setall_s32(3); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint8_t pixbuf[max_uf*3]; + + uint8_t bvalbuf[max_uf*3]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*3] = bval[0]; + bvalbuf[i*3+1] = bval[1]; + bvalbuf[i*3+2] = bval[2]; + } + v_uint8 bval_v0 = vx_load_low(&bvalbuf[0]); + v_uint8 bval_v1 = vx_load_low(&bvalbuf[uf]); + v_uint8 bval_v2 = vx_load_low(&bvalbuf[uf*2]); +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (int y = r.start; y < r.end; y++) { + uint8_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C3, 8U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C3, 8U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C3, 8U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 16U, 8U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float w = x*M[6] + y*M[7] + M[8]; + float sx = (x*M[0] + y*M[1] + M[2]) / w; + float sy = (x*M[3] + y*M[4] + M[5]) / w; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C3, 8U); + CV_WARP_SCALAR_STORE(NEAREST, C3, 8U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpPerspectiveNearestInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[9], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step, dststep = dst_step; + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[9]; + for (int i = 0; i < 9; i++) { + M[i] = static_cast(dM[i]); + } + uint8_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), four = vx_setall_s32(4); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + + uint8_t bvalbuf[max_uf*4]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*4] = bval[0]; + bvalbuf[i*4+1] = bval[1]; + bvalbuf[i*4+2] = bval[2]; + bvalbuf[i*4+3] = bval[3]; + } + v_uint8 bval_v0 = vx_load_low(&bvalbuf[0]); + v_uint8 bval_v1 = vx_load_low(&bvalbuf[uf]); + v_uint8 bval_v2 = vx_load_low(&bvalbuf[uf*2]); + v_uint8 bval_v3 = vx_load_low(&bvalbuf[uf*3]); +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (int y = r.start; y < r.end; y++) { + uint8_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + // CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); + v_float32 dst_x0 = vx_load(start_indices.data()); + v_float32 dst_x1 = v_add(dst_x0, vx_setall_f32(float(vlanes_32))); + v_float32 M0 = vx_setall_f32(M[0]), M3 = vx_setall_f32(M[3]); + v_float32 M_x = vx_setall_f32(static_cast(y * M[1] + M[2])), + M_y = vx_setall_f32(static_cast(y * M[4] + M[5])); + v_float32 M6 = vx_setall_f32(M[6]); + v_float32 M_w = vx_setall_f32(static_cast(y * M[7] + M[8])); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C4); + if (v_reduce_min(inner_mask) != 0) { + #if CV_SIMD256 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 8U); + #elif CV_SIMD128 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, NEAREST, 8U); + #elif CV_SIMD_SCALABLE + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, NEAREST, 8U); + #endif + } else { + uint8_t pixbuf[max_uf*4]; + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C4, 8U); + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C4, 8U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C4, 16U, 8U); + } + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float w = x*M[6] + y*M[7] + M[8]; + float sx = (x*M[0] + y*M[1] + M[2]) / w; + float sy = (x*M[3] + y*M[4] + M[5]) / w; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C4, 8U); + CV_WARP_SCALAR_STORE(NEAREST, C4, 8U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpPerspectiveNearestInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[9], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(uint16_t), dststep = dst_step/sizeof(uint16_t); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[9]; + for (int i = 0; i < 9; i++) { + M[i] = static_cast(dM[i]); + } + uint16_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + 
saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint16_t pixbuf[max_uf]; + + uint16_t bvalbuf[max_uf]; + for (int i = 0; i < uf; i++) { + bvalbuf[i] = bval[0]; + } + v_uint16 bval_v0 = vx_load(&bvalbuf[0]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint16_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C1, 16U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C1, 16U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C1, 16U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C1, 16U, 16U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float w = x*M[6] + y*M[7] + M[8]; + float sx = (x*M[0] + y*M[1] + M[2]) / w; + float sy = (x*M[3] + y*M[4] + M[5]) / w; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 16U); + CV_WARP_SCALAR_STORE(NEAREST, C1, 16U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpPerspectiveNearestInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[9], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(uint16_t), dststep = dst_step/sizeof(uint16_t); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[9]; + for (int i = 0; i < 9; i++) { + M[i] = static_cast(dM[i]); + } + uint16_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), three = vx_setall_s32(3); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint16_t pixbuf[max_uf*3]; + + uint16_t bvalbuf[max_uf*3]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*3] = bval[0]; + bvalbuf[i*3+1] = bval[1]; + bvalbuf[i*3+2] = bval[2]; + } + v_uint16 bval_v0 = vx_load(&bvalbuf[0]); + v_uint16 bval_v1 = vx_load(&bvalbuf[uf]); + v_uint16 bval_v2 = vx_load(&bvalbuf[uf*2]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint16_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C3, 16U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C3, 16U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C3, 16U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 16U, 16U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float w = x*M[6] + y*M[7] + M[8]; + float sx = (x*M[0] + y*M[1] + M[2]) / w; + float sy = (x*M[3] + y*M[4] + M[5]) / w; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C3, 16U); + CV_WARP_SCALAR_STORE(NEAREST, C3, 16U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpPerspectiveNearestInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[9], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(uint16_t), dststep = dst_step/sizeof(uint16_t); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[9]; + for (int i = 0; i < 9; i++) { + M[i] = static_cast(dM[i]); + } + uint16_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), four = vx_setall_s32(4); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + + uint16_t bvalbuf[max_uf*4]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*4] = bval[0]; + bvalbuf[i*4+1] = bval[1]; + bvalbuf[i*4+2] = bval[2]; + bvalbuf[i*4+3] = bval[3]; + } + v_uint16 bval_v0 = vx_load(&bvalbuf[0]); + v_uint16 bval_v1 = vx_load(&bvalbuf[uf]); + v_uint16 bval_v2 = vx_load(&bvalbuf[uf*2]); + v_uint16 bval_v3 = vx_load(&bvalbuf[uf*3]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint16_t* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C4); + if (v_reduce_min(inner_mask) != 0) { + #if CV_SIMD256 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 16U); + #elif CV_SIMD128 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, NEAREST, 16U); + #elif CV_SIMD_SCALABLE + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, NEAREST, 16U); + #endif + } else { + uint16_t pixbuf[max_uf*4]; + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C4, 16U); + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C4, 16U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C4, 16U, 16U); + } + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float w = x*M[6] + y*M[7] + M[8]; + float sx = (x*M[0] + y*M[1] + M[2]) / w; + float sy = (x*M[3] + y*M[4] + M[5]) / w; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C4, 16U); + CV_WARP_SCALAR_STORE(NEAREST, C4, 16U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpPerspectiveNearestInvoker_32FC1(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[9], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(float), dststep = dst_step/sizeof(float); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[9]; + for (int i = 0; i < 9; i++) { + M[i] = static_cast(dM[i]); + } + float bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + float pixbuf[max_uf]; + + float bvalbuf[max_uf]; + for (int i = 0; i < uf; i++) { + bvalbuf[i] = bval[0]; + } + v_float32 bval_v0_l = vx_load(&bvalbuf[0]); + v_float32 bval_v0_h = vx_load(&bvalbuf[vlanes_32]); +#endif + + for (int y = r.start; y < r.end; y++) { + float* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C1, 32F); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C1, 32F); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C1, 32F, 32F); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C1, 32F, 32F); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float w = x*M[6] + y*M[7] + M[8]; + float sx = (x*M[0] + y*M[1] + M[2]) / w; + float sy = (x*M[3] + y*M[4] + M[5]) / w; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 32F); + CV_WARP_SCALAR_STORE(NEAREST, C1, 32F); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpPerspectiveNearestInvoker_32FC3(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[9], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(float), dststep = dst_step/sizeof(float); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[9]; + for (int i = 0; i < 9; i++) { + M[i] = static_cast(dM[i]); + } + float bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), three = vx_setall_s32(3); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + float pixbuf[max_uf*3]; + + float bvalbuf[max_uf*3]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*3] = bval[0]; + bvalbuf[i*3+1] = bval[1]; + bvalbuf[i*3+2] = bval[2]; + } + v_float32 bval_v0_l = vx_load(&bvalbuf[0]); + v_float32 bval_v0_h = vx_load(&bvalbuf[vlanes_32]); + v_float32 bval_v1_l = vx_load(&bvalbuf[uf]); + v_float32 bval_v1_h = vx_load(&bvalbuf[uf+vlanes_32]); + v_float32 bval_v2_l = vx_load(&bvalbuf[uf*2]); + v_float32 bval_v2_h = vx_load(&bvalbuf[uf*2+vlanes_32]); +#endif + + for (int y = r.start; y < r.end; y++) { + float* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C3, 32F); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C3, 32F); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C3, 32F, 32F); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 32F, 32F); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float w = x*M[6] + y*M[7] + M[8]; + float sx = (x*M[0] + y*M[1] + M[2]) / w; + float sy = (x*M[3] + y*M[4] + M[5]) / w; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C3, 32F); + CV_WARP_SCALAR_STORE(NEAREST, C3, 32F); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void warpPerspectiveNearestInvoker_32FC4(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + const double dM[9], int border_type, const double border_value[4]) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + size_t srcstep = src_step/sizeof(float), dststep = dst_step/sizeof(float); + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + float M[9]; + for (int i = 0; i < 9; i++) { + M[i] = static_cast(dM[i]); + } + float bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? 
BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), four = vx_setall_s32(4); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + + float bvalbuf[max_uf*4]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*4] = bval[0]; + bvalbuf[i*4+1] = bval[1]; + bvalbuf[i*4+2] = bval[2]; + bvalbuf[i*4+3] = bval[3]; + } + v_float32 bval_v0_l = vx_load(&bvalbuf[0]); + v_float32 bval_v0_h = vx_load(&bvalbuf[vlanes_32]); + v_float32 bval_v1_l = vx_load(&bvalbuf[uf]); + v_float32 bval_v1_h = vx_load(&bvalbuf[uf+vlanes_32]); + v_float32 bval_v2_l = vx_load(&bvalbuf[uf*2]); + v_float32 bval_v2_h = vx_load(&bvalbuf[uf*2+vlanes_32]); + v_float32 bval_v3_l = vx_load(&bvalbuf[uf*3]); + v_float32 bval_v3_h = vx_load(&bvalbuf[uf*3+vlanes_32]); +#endif + + for (int y = r.start; y < r.end; y++) { + float* dstptr = dst + y*dststep; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C4); + if (v_reduce_min(inner_mask) != 0) { + #if CV_SIMD256 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 32F); + #elif CV_SIMD128 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, NEAREST, 32F); + #elif CV_SIMD_SCALABLE + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, NEAREST, 32F); + #endif + } else { + float pixbuf[max_uf*4]; + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C4, 32F); + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C4, 32F, 32F); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C4, 32F, 32F); + } + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float w = x*M[6] + y*M[7] + M[8]; + float sx = (x*M[0] + y*M[1] + M[2]) / w; + float sy = (x*M[3] + y*M[4] + M[5]) / w; + CV_WARP_SCALAR_SHUFFLE(NEAREST, C4, 32F); + CV_WARP_SCALAR_STORE(NEAREST, C4, 32F); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} + +void remapNearestInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + auto *map1 = map1_data, *map2 = map2_data; + size_t srcstep = src_step, dststep = dst_step, + map1step = map1_step/sizeof(float), map2step=map2_step/sizeof(float); + if (map2 == nullptr) { + map2 = map1; + map2step = map1step; + } + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + bool relative = is_relative; + + uint8_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + 
saturate_cast<uint8_t>(border_value[3]),
+        };
+        int border_type_x = border_type != BORDER_CONSTANT &&
+                            border_type != BORDER_TRANSPARENT &&
+                            srccols <= 1 ? BORDER_REPLICATE : border_type;
+        int border_type_y = border_type != BORDER_CONSTANT &&
+                            border_type != BORDER_TRANSPARENT &&
+                            srcrows <= 1 ? BORDER_REPLICATE : border_type;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        constexpr int max_vlanes_32{VTraits<v_float32>::max_nlanes};
+        constexpr int max_uf{max_vlanes_32*2};
+        int vlanes_32 = VTraits<v_float32>::vlanes();
+        // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2
+        int uf = vlanes_32 * 2;
+
+        std::array<float, max_vlanes_32> start_indices;
+        std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
+
+        v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)),
+                 inner_scols = vx_setall_u32((unsigned)srccols - 1),
+                 outer_srows = vx_setall_u32((unsigned)srcrows + 1),
+                 outer_scols = vx_setall_u32((unsigned)srccols + 1);
+        v_float32 delta = vx_setall_f32(static_cast<float>(uf));
+        v_int32 one = vx_setall_s32(1);
+        v_int32 v_srcstep = vx_setall_s32(int(srcstep));
+        int32_t addr[max_uf],
+                src_ix[max_uf],
+                src_iy[max_uf];
+        uint8_t pixbuf[max_uf];
+
+        uint8_t bvalbuf[max_uf];
+        for (int i = 0; i < uf; i++) {
+            bvalbuf[i] = bval[0];
+        }
+        v_uint8 bval_v0 = vx_load_low(&bvalbuf[0]);
+#endif
+
+        for (int y = r.start; y < r.end; y++) {
+            uint8_t* dstptr = dst + y*dststep;
+            const float *sx_data = map1 + y*map1step;
+            const float *sy_data = map2 + y*map2step;
+            int x = 0;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1();
+            for (; x <= dstcols - uf; x += uf) {
+                // [TODO] apply halide trick
+                CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C1);
+                if (v_reduce_min(inner_mask) != 0) {
+                    CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C1, 8U);
+                } else {
+                    CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C1, 8U);
+                }
+                CV_WARP_VECTOR_INTER_LOAD(NEAREST, C1, 8U, 16U);
+                CV_WARP_VECTOR_INTER_STORE(NEAREST, C1, 16U, 8U);
+            }
+#endif // (CV_SIMD || CV_SIMD_SCALABLE)
+
+            for (; x < dstcols; x++) {
+                float sx, sy;
+                if (map1 == map2) {
+                    sx = sx_data[2*x];
+                    sy = sy_data[2*x+1];
+                } else {
+                    sx = sx_data[x];
+                    sy = sy_data[x];
+                }
+                if (relative) {
+                    sx += x;
+                    sy += y;
+                }
+
+                CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 8U);
+                CV_WARP_SCALAR_STORE(NEAREST, C1, 8U);
+            }
+        }
+    };
+    parallel_for_(Range(0, dst_rows), worker);
+}
+void remapNearestInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols,
+                              uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols,
+                              int border_type, const double border_value[4],
+                              const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative) {
+    auto worker = [&](const Range &r) {
+        CV_INSTRUMENT_REGION();
+
+        const auto *src = src_data;
+        auto *dst = dst_data;
+        auto *map1 = map1_data, *map2 = map2_data;
+        size_t srcstep = src_step, dststep = dst_step,
+               map1step = map1_step/sizeof(float), map2step = map2_step/sizeof(float);
+        if (map2 == nullptr) {
+            map2 = map1;
+            map2step = map1step;
+        }
+        int srccols = src_cols, srcrows = src_rows;
+        int dstcols = dst_cols;
+        bool relative = is_relative;
+
+        uint8_t bval[] = {
+            saturate_cast<uint8_t>(border_value[0]),
+            saturate_cast<uint8_t>(border_value[1]),
+            saturate_cast<uint8_t>(border_value[2]),
+            saturate_cast<uint8_t>(border_value[3]),
+        };
+        int border_type_x = border_type != BORDER_CONSTANT &&
+                            border_type != BORDER_TRANSPARENT &&
+                            srccols <= 1 ?
BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), three = vx_setall_s32(3); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint8_t pixbuf[max_uf*3]; + + uint8_t bvalbuf[max_uf*3]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*3] = bval[0]; + bvalbuf[i*3+1] = bval[1]; + bvalbuf[i*3+2] = bval[2]; + } + v_uint8 bval_v0 = vx_load_low(&bvalbuf[0]); + v_uint8 bval_v1 = vx_load_low(&bvalbuf[uf]); + v_uint8 bval_v2 = vx_load_low(&bvalbuf[uf*2]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint8_t* dstptr = dst + y*dststep; + const float *sx_data = map1 + y*map1step; + const float *sy_data = map2 + y*map2step; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C3, 8U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C3, 8U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C3, 8U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 16U, 8U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx, sy; + if (map1 == map2) { + sx = sx_data[2*x]; + sy = sy_data[2*x+1]; + } else { + sx = sx_data[x]; + sy = sy_data[x]; + } + if (relative) { + sx += x; + sy += y; + } + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C3, 8U); + CV_WARP_SCALAR_STORE(NEAREST, C3, 8U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void remapNearestInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, + uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + auto *map1 = map1_data, *map2 = map2_data; + size_t srcstep = src_step, dststep = dst_step, + map1step = map1_step/sizeof(float), map2step=map2_step/sizeof(float); + if (map2 == nullptr) { + map2 = map1; + map2step = map1step; + } + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + bool relative = is_relative; + + uint8_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? 
BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), four = vx_setall_s32(4); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + + uint8_t bvalbuf[max_uf*4]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*4] = bval[0]; + bvalbuf[i*4+1] = bval[1]; + bvalbuf[i*4+2] = bval[2]; + bvalbuf[i*4+3] = bval[3]; + } + v_uint8 bval_v0 = vx_load_low(&bvalbuf[0]); + v_uint8 bval_v1 = vx_load_low(&bvalbuf[uf]); + v_uint8 bval_v2 = vx_load_low(&bvalbuf[uf*2]); + v_uint8 bval_v3 = vx_load_low(&bvalbuf[uf*3]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint8_t* dstptr = dst + y*dststep; + const float *sx_data = map1 + y*map1step; + const float *sy_data = map2 + y*map2step; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C4); + if (v_reduce_min(inner_mask) != 0) { + #if CV_SIMD256 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 8U); + #elif CV_SIMD128 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, NEAREST, 8U); + #elif CV_SIMD_SCALABLE + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, NEAREST, 8U); + #endif + } else { + uint8_t pixbuf[max_uf*4]; + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C4, 8U); + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C4, 8U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C4, 16U, 8U); + } + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx, sy; + if (map1 == map2) { + sx = sx_data[2*x]; + sy = sy_data[2*x+1]; + } else { + sx = sx_data[x]; + sy = sy_data[x]; + } + if (relative) { + sx += x; + sy += y; + } + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C4, 8U); + CV_WARP_SCALAR_STORE(NEAREST, C4, 8U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void remapNearestInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + auto *map1 = map1_data, *map2 = map2_data; + size_t srcstep = src_step/sizeof(uint16_t), dststep = dst_step/sizeof(uint16_t), + map1step = map1_step/sizeof(float), map2step=map2_step/sizeof(float); + if (map2 == nullptr) { + map2 = map1; + map2step = map1step; + } + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + bool relative = is_relative; + + uint16_t bval[] = { + saturate_cast(border_value[0]), + 
saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint16_t pixbuf[max_uf]; + + uint16_t bvalbuf[max_uf]; + for (int i = 0; i < uf; i++) { + bvalbuf[i] = bval[0]; + } + v_uint16 bval_v0 = vx_load(&bvalbuf[0]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint16_t* dstptr = dst + y*dststep; + const float *sx_data = map1 + y*map1step; + const float *sy_data = map2 + y*map2step; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C1, 16U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C1, 16U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C1, 16U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C1, 16U, 16U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx, sy; + if (map1 == map2) { + sx = sx_data[2*x]; + sy = sy_data[2*x+1]; + } else { + sx = sx_data[x]; + sy = sy_data[x]; + } + if (relative) { + sx += x; + sy += y; + } + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 16U); + CV_WARP_SCALAR_STORE(NEAREST, C1, 16U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void remapNearestInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + auto *map1 = map1_data, *map2 = map2_data; + size_t srcstep = src_step/sizeof(uint16_t), dststep = dst_step/sizeof(uint16_t), + map1step = map1_step/sizeof(float), map2step=map2_step/sizeof(float); + if (map2 == nullptr) { + map2 = map1; + map2step = map1step; + } + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + bool relative = is_relative; + + uint16_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? 
BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), three = vx_setall_s32(3); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + uint16_t pixbuf[max_uf*3]; + + uint16_t bvalbuf[max_uf*3]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*3] = bval[0]; + bvalbuf[i*3+1] = bval[1]; + bvalbuf[i*3+2] = bval[2]; + } + v_uint16 bval_v0 = vx_load(&bvalbuf[0]); + v_uint16 bval_v1 = vx_load(&bvalbuf[uf]); + v_uint16 bval_v2 = vx_load(&bvalbuf[uf*2]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint16_t* dstptr = dst + y*dststep; + const float *sx_data = map1 + y*map1step; + const float *sy_data = map2 + y*map2step; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C3, 16U); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C3, 16U); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C3, 16U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 16U, 16U); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx, sy; + if (map1 == map2) { + sx = sx_data[2*x]; + sy = sy_data[2*x+1]; + } else { + sx = sx_data[x]; + sy = sy_data[x]; + } + if (relative) { + sx += x; + sy += y; + } + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C3, 16U); + CV_WARP_SCALAR_STORE(NEAREST, C3, 16U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void remapNearestInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src_rows, int src_cols, + uint16_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + auto *map1 = map1_data, *map2 = map2_data; + size_t srcstep = src_step/sizeof(uint16_t), dststep = dst_step/sizeof(uint16_t), + map1step = map1_step/sizeof(float), map2step=map2_step/sizeof(float); + if (map2 == nullptr) { + map2 = map1; + map2step = map1step; + } + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + bool relative = is_relative; + + uint16_t bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? 
BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), four = vx_setall_s32(4); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + + uint16_t bvalbuf[max_uf*4]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*4] = bval[0]; + bvalbuf[i*4+1] = bval[1]; + bvalbuf[i*4+2] = bval[2]; + bvalbuf[i*4+3] = bval[3]; + } + v_uint16 bval_v0 = vx_load(&bvalbuf[0]); + v_uint16 bval_v1 = vx_load(&bvalbuf[uf]); + v_uint16 bval_v2 = vx_load(&bvalbuf[uf*2]); + v_uint16 bval_v3 = vx_load(&bvalbuf[uf*3]); +#endif + + for (int y = r.start; y < r.end; y++) { + uint16_t* dstptr = dst + y*dststep; + const float *sx_data = map1 + y*map1step; + const float *sy_data = map2 + y*map2step; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C4); + if (v_reduce_min(inner_mask) != 0) { + #if CV_SIMD256 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 16U); + #elif CV_SIMD128 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, NEAREST, 16U); + #elif CV_SIMD_SCALABLE + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, NEAREST, 16U); + #endif + } else { + uint16_t pixbuf[max_uf*4]; + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C4, 16U); + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C4, 16U, 16U); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C4, 16U, 16U); + } + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx, sy; + if (map1 == map2) { + sx = sx_data[2*x]; + sy = sy_data[2*x+1]; + } else { + sx = sx_data[x]; + sy = sy_data[x]; + } + if (relative) { + sx += x; + sy += y; + } + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C4, 16U); + CV_WARP_SCALAR_STORE(NEAREST, C4, 16U); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void remapNearestInvoker_32FC1(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + auto *map1 = map1_data, *map2 = map2_data; + size_t srcstep = src_step/sizeof(float), dststep = dst_step/sizeof(float), + map1step = map1_step/sizeof(float), map2step=map2_step/sizeof(float); + if (map2 == nullptr) { + map2 = map1; + map2step = map1step; + } + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + bool relative = is_relative; + + float bval[] = { + saturate_cast(border_value[0]), + 
+        saturate_cast<float>(border_value[1]),
+        saturate_cast<float>(border_value[2]),
+        saturate_cast<float>(border_value[3]),
+    };
+    int border_type_x = border_type != BORDER_CONSTANT &&
+                        border_type != BORDER_TRANSPARENT &&
+                        srccols <= 1 ? BORDER_REPLICATE : border_type;
+    int border_type_y = border_type != BORDER_CONSTANT &&
+                        border_type != BORDER_TRANSPARENT &&
+                        srcrows <= 1 ? BORDER_REPLICATE : border_type;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    constexpr int max_vlanes_32{VTraits<v_float32>::max_nlanes};
+    constexpr int max_uf{max_vlanes_32*2};
+    int vlanes_32 = VTraits<v_float32>::vlanes();
+    // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2
+    int uf = vlanes_32 * 2;
+
+    std::array<float, max_vlanes_32> start_indices;
+    std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
+
+    v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)),
+             inner_scols = vx_setall_u32((unsigned)srccols - 1),
+             outer_srows = vx_setall_u32((unsigned)srcrows + 1),
+             outer_scols = vx_setall_u32((unsigned)srccols + 1);
+    v_float32 delta = vx_setall_f32(static_cast<float>(uf));
+    v_int32 one = vx_setall_s32(1);
+    v_int32 v_srcstep = vx_setall_s32(int(srcstep));
+    int32_t addr[max_uf],
+            src_ix[max_uf],
+            src_iy[max_uf];
+    float pixbuf[max_uf];
+
+    float bvalbuf[max_uf];
+    for (int i = 0; i < uf; i++) {
+        bvalbuf[i] = bval[0];
+    }
+    v_float32 bval_v0_l = vx_load(&bvalbuf[0]);
+    v_float32 bval_v0_h = vx_load(&bvalbuf[vlanes_32]);
+#endif
+
+    for (int y = r.start; y < r.end; y++) {
+        float* dstptr = dst + y*dststep;
+        const float *sx_data = map1 + y*map1step;
+        const float *sy_data = map2 + y*map2step;
+        int x = 0;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1();
+        for (; x <= dstcols - uf; x += uf) {
+            // [TODO] apply halide trick
+            CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C1);
+            if (v_reduce_min(inner_mask) != 0) {
+                CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C1, 32F);
+            } else {
+                CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C1, 32F);
+            }
+            CV_WARP_VECTOR_INTER_LOAD(NEAREST, C1, 32F, 32F);
+            CV_WARP_VECTOR_INTER_STORE(NEAREST, C1, 32F, 32F);
+        }
+#endif // (CV_SIMD || CV_SIMD_SCALABLE)
+
+        for (; x < dstcols; x++) {
+            float sx, sy;
+            if (map1 == map2) {
+                sx = sx_data[2*x];
+                sy = sy_data[2*x+1];
+            } else {
+                sx = sx_data[x];
+                sy = sy_data[x];
+            }
+            if (relative) {
+                sx += x;
+                sy += y;
+            }
+
+            CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 32F);
+            CV_WARP_SCALAR_STORE(NEAREST, C1, 32F);
+        }
+    }
+    };
+    parallel_for_(Range(0, dst_rows), worker);
+}
+void remapNearestInvoker_32FC3(const float *src_data, size_t src_step, int src_rows, int src_cols,
+                               float *dst_data, size_t dst_step, int dst_rows, int dst_cols,
+                               int border_type, const double border_value[4],
+                               const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative) {
+    auto worker = [&](const Range &r) {
+    CV_INSTRUMENT_REGION();
+
+    const auto *src = src_data;
+    auto *dst = dst_data;
+    auto *map1 = map1_data, *map2 = map2_data;
+    size_t srcstep = src_step/sizeof(float), dststep = dst_step/sizeof(float),
+           map1step = map1_step/sizeof(float), map2step=map2_step/sizeof(float);
+    if (map2 == nullptr) {
+        map2 = map1;
+        map2step = map1step;
+    }
+    int srccols = src_cols, srcrows = src_rows;
+    int dstcols = dst_cols;
+    bool relative = is_relative;
+
+    float bval[] = {
+        saturate_cast<float>(border_value[0]),
+        saturate_cast<float>(border_value[1]),
+        saturate_cast<float>(border_value[2]),
+        saturate_cast<float>(border_value[3]),
+    };
+    int border_type_x = border_type != BORDER_CONSTANT &&
+                        border_type != BORDER_TRANSPARENT &&
srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), three = vx_setall_s32(3); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + float pixbuf[max_uf*3]; + + float bvalbuf[max_uf*3]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*3] = bval[0]; + bvalbuf[i*3+1] = bval[1]; + bvalbuf[i*3+2] = bval[2]; + } + v_float32 bval_v0_l = vx_load(&bvalbuf[0]); + v_float32 bval_v0_h = vx_load(&bvalbuf[vlanes_32]); + v_float32 bval_v1_l = vx_load(&bvalbuf[uf]); + v_float32 bval_v1_h = vx_load(&bvalbuf[uf+vlanes_32]); + v_float32 bval_v2_l = vx_load(&bvalbuf[uf*2]); + v_float32 bval_v2_h = vx_load(&bvalbuf[uf*2+vlanes_32]); +#endif + + for (int y = r.start; y < r.end; y++) { + float* dstptr = dst + y*dststep; + const float *sx_data = map1 + y*map1step; + const float *sy_data = map2 + y*map2step; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(NEAREST, C3, 32F); + } else { + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C3, 32F); + } + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C3, 32F, 32F); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 32F, 32F); + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx, sy; + if (map1 == map2) { + sx = sx_data[2*x]; + sy = sy_data[2*x+1]; + } else { + sx = sx_data[x]; + sy = sy_data[x]; + } + if (relative) { + sx += x; + sy += y; + } + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C3, 32F); + CV_WARP_SCALAR_STORE(NEAREST, C3, 32F); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} +void remapNearestInvoker_32FC4(const float *src_data, size_t src_step, int src_rows, int src_cols, + float *dst_data, size_t dst_step, int dst_rows, int dst_cols, + int border_type, const double border_value[4], + const float *map1_data, size_t map1_step, const float *map2_data, size_t map2_step, bool is_relative) { + auto worker = [&](const Range &r) { + CV_INSTRUMENT_REGION(); + + const auto *src = src_data; + auto *dst = dst_data; + auto *map1 = map1_data, *map2 = map2_data; + size_t srcstep = src_step/sizeof(float), dststep = dst_step/sizeof(float), + map1step = map1_step/sizeof(float), map2step=map2_step/sizeof(float); + if (map2 == nullptr) { + map2 = map1; + map2step = map1step; + } + int srccols = src_cols, srcrows = src_rows; + int dstcols = dst_cols; + bool relative = is_relative; + + float bval[] = { + saturate_cast(border_value[0]), + saturate_cast(border_value[1]), + saturate_cast(border_value[2]), + saturate_cast(border_value[3]), + }; + int 
border_type_x = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srccols <= 1 ? BORDER_REPLICATE : border_type; + int border_type_y = border_type != BORDER_CONSTANT && + border_type != BORDER_TRANSPARENT && + srcrows <= 1 ? BORDER_REPLICATE : border_type; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + constexpr int max_vlanes_32{VTraits::max_nlanes}; + constexpr int max_uf{max_vlanes_32*2}; + int vlanes_32 = VTraits::vlanes(); + // unrolling_factor = lane_size / 16 = vlanes_32 * 32 / 16 = vlanes_32 * 2 + int uf = vlanes_32 * 2; + + std::array start_indices; + std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); + + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), + inner_scols = vx_setall_u32((unsigned)srccols - 1), + outer_srows = vx_setall_u32((unsigned)srcrows + 1), + outer_scols = vx_setall_u32((unsigned)srccols + 1); + v_float32 delta = vx_setall_f32(static_cast(uf)); + v_int32 one = vx_setall_s32(1), four = vx_setall_s32(4); + v_int32 v_srcstep = vx_setall_s32(int(srcstep)); + int32_t addr[max_uf], + src_ix[max_uf], + src_iy[max_uf]; + + float bvalbuf[max_uf*4]; + for (int i = 0; i < uf; i++) { + bvalbuf[i*4] = bval[0]; + bvalbuf[i*4+1] = bval[1]; + bvalbuf[i*4+2] = bval[2]; + bvalbuf[i*4+3] = bval[3]; + } + v_float32 bval_v0_l = vx_load(&bvalbuf[0]); + v_float32 bval_v0_h = vx_load(&bvalbuf[vlanes_32]); + v_float32 bval_v1_l = vx_load(&bvalbuf[uf]); + v_float32 bval_v1_h = vx_load(&bvalbuf[uf+vlanes_32]); + v_float32 bval_v2_l = vx_load(&bvalbuf[uf*2]); + v_float32 bval_v2_h = vx_load(&bvalbuf[uf*2+vlanes_32]); + v_float32 bval_v3_l = vx_load(&bvalbuf[uf*3]); + v_float32 bval_v3_h = vx_load(&bvalbuf[uf*3+vlanes_32]); +#endif + + for (int y = r.start; y < r.end; y++) { + float* dstptr = dst + y*dststep; + const float *sx_data = map1 + y*map1step; + const float *sy_data = map2 + y*map2step; + int x = 0; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); + for (; x <= dstcols - uf; x += uf) { + // [TODO] apply halide trick + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(NEAREST, C4); + if (v_reduce_min(inner_mask) != 0) { + #if CV_SIMD256 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 32F); + #elif CV_SIMD128 + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, NEAREST, 32F); + #elif CV_SIMD_SCALABLE + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, NEAREST, 32F); + #endif + } else { + float pixbuf[max_uf*4]; + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(NEAREST, C4, 32F); + CV_WARP_VECTOR_INTER_LOAD(NEAREST, C4, 32F, 32F); + CV_WARP_VECTOR_INTER_STORE(NEAREST, C4, 32F, 32F); + } + } +#endif // (CV_SIMD || CV_SIMD_SCALABLE) + + for (; x < dstcols; x++) { + float sx, sy; + if (map1 == map2) { + sx = sx_data[2*x]; + sy = sy_data[2*x+1]; + } else { + sx = sx_data[x]; + sy = sy_data[x]; + } + if (relative) { + sx += x; + sy += y; + } + + CV_WARP_SCALAR_SHUFFLE(NEAREST, C4, 32F); + CV_WARP_SCALAR_STORE(NEAREST, C4, 32F); + } + } + }; + parallel_for_(Range(0, dst_rows), worker); +} + void warpAffineLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_rows, int src_cols, uint8_t *dst_data, size_t dst_step, int dst_rows, int dst_cols, const double dM[6], int border_type, const double border_value[4]) { @@ -298,7 +3079,7 @@ void warpAffineLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = 
vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -325,40 +3106,32 @@ void warpAffineLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 uint8x8_t p00g, p01g, p10g, p11g; #endif - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + if (v_reduce_min(inner_mask) != 0) { #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C1); + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C1); #else - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C1, 8U); + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C1, 8U); #endif } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 8U); - + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 8U); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); #endif } - - #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 // In case neon fp16 intrinsics are not available; still requires A64 - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON(C1); + #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 + CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON(C1); #else - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(C1); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C1, 8U, 16U); #endif - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(C1); - + CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C1); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(C1); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -367,11 +3140,9 @@ void warpAffineLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 8U); } } }; @@ -418,7 +3189,7 @@ void warpAffineLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -451,42 +3222,34 @@ void warpAffineLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 uint8x8_t p00r, p01r, p10r, p11r, p00g, p01g, p10g, p11g, p00b, p01b, p10b, p11b; 
#endif - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + if (v_reduce_min(inner_mask) != 0) { #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C3); + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C3); #else - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C3, 8U); + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C3, 8U); #endif } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 8U); - + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 8U); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); #endif } - - #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 // In case neon fp16 intrinsics are not available; still requires A64 - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON(C3); + #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 + CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON(C3); #else - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(C3); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C3, 8U, 16U); #endif - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(C3); - + CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C3); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(C3); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -495,11 +3258,9 @@ void warpAffineLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 8U); } } }; @@ -547,7 +3308,7 @@ void warpAffineLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -576,31 +3337,31 @@ void warpAffineLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + if (v_reduce_min(inner_mask) != 0) { float valpha[max_uf], vbeta[max_uf]; vx_store(valpha, src_x0); vx_store(valpha+vlanes_32, src_x1); vx_store(vbeta, src_y0); vx_store(vbeta+vlanes_32, src_y1); #if CV_SIMD256 - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, LINEAR, 8U); #elif CV_SIMD128 - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, LINEAR, 8U); #elif CV_SIMD_SCALABLE - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, LINEAR, 8U); #endif } else { uint8_t pixbuf[max_uf*4*4]; - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 8U); - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(C4); - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(C4); + 
CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 8U); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C4, 8U, 16U); + CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4); CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(C4); } @@ -611,11 +3372,9 @@ void warpAffineLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 8U); } } }; @@ -660,7 +3419,7 @@ void warpAffineLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, in std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -684,25 +3443,18 @@ void warpAffineLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, in int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C1, 16U); + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C1, 16U); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 16U); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 16U); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C1); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C1, 16U, 16U); CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C1); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -711,11 +3463,9 @@ void warpAffineLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, in float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 16U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 16U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 16U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 16U); } } }; @@ -761,7 +3511,7 @@ void warpAffineLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, in std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -789,25 +3539,18 @@ void warpAffineLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, in int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - 
CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C3, 16U); + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C3, 16U); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 16U); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 16U); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C3); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C3, 16U, 16U); CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C3); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -816,11 +3559,9 @@ void warpAffineLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, in float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 16U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 16U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 16U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 16U); } } }; @@ -866,7 +3607,7 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -895,30 +3636,27 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); + if (v_reduce_min(inner_mask) != 0) { float valpha[max_uf], vbeta[max_uf]; vx_store(valpha, src_x0); vx_store(valpha+vlanes_32, src_x1); vx_store(vbeta, src_y0); vx_store(vbeta+vlanes_32, src_y1); #if CV_SIMD256 - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, LINEAR, 16U); #elif CV_SIMD128 - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, LINEAR, 16U); #elif CV_SIMD_SCALABLE - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, LINEAR, 16U); #endif } else { uint16_t pixbuf[max_uf*4*4]; - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U); - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 16U); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C4, 16U, 16U); CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4); CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4); @@ -930,11 +3668,9 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 16U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 16U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 16U); + 
CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 16U); } } }; @@ -979,7 +3715,7 @@ void warpAffineLinearInvoker_32FC1(const float *src_data, size_t src_step, int s std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -1004,23 +3740,17 @@ void warpAffineLinearInvoker_32FC1(const float *src_data, size_t src_step, int s int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C1, 32F); + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C1, 32F); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 32F); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 32F); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C1); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C1, 32F, 32F); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C1); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -1029,11 +3759,9 @@ void warpAffineLinearInvoker_32FC1(const float *src_data, size_t src_step, int s float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 32F); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 32F); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 32F); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 32F); } } }; @@ -1078,7 +3806,7 @@ void warpAffineLinearInvoker_32FC3(const float *src_data, size_t src_step, int s std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -1109,23 +3837,17 @@ void warpAffineLinearInvoker_32FC3(const float *src_data, size_t src_step, int s int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C3, 32F); + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C3, 32F); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 32F); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 32F); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C3); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C3, 32F, 32F); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C3); } #endif 
// (CV_SIMD || CV_SIMD_SCALABLE) @@ -1134,11 +3856,9 @@ void warpAffineLinearInvoker_32FC3(const float *src_data, size_t src_step, int s float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 32F); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 32F); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 32F); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 32F); } } }; @@ -1183,7 +3903,7 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -1216,30 +3936,28 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); + if (v_reduce_min(inner_mask) != 0) { float valpha[max_uf], vbeta[max_uf]; vx_store(valpha, src_x0); vx_store(valpha+vlanes_32, src_x1); vx_store(vbeta, src_y0); vx_store(vbeta+vlanes_32, src_y1); #if CV_SIMD256 - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, LINEAR, 32F); #elif CV_SIMD128 - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, LINEAR, 32F); #elif CV_SIMD_SCALABLE - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, LINEAR, 32F); #endif } else { float pixbuf[max_uf*4*4]; - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F); - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 32F); + // CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C4, 32F, 32F); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4); CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4); } @@ -1250,11 +3968,9 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 32F); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 32F); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 32F); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 32F); } } }; @@ -1299,7 +4015,7 @@ void warpAffineLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -1322,27 +4038,19 @@ void warpAffineLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step uint8_t* dstptr = dst + y*dststep; int x = 0; 
- CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); uint8x8_t p00g, p01g, p10g, p11g; - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C1); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 8U); - - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 8U); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); } - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8F16(C1); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F16(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(C1); } @@ -1350,11 +4058,9 @@ void warpAffineLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 8U); } } }; @@ -1406,7 +4112,7 @@ void warpAffineLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -1435,29 +4141,22 @@ void warpAffineLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step uint8_t* dstptr = dst + y*dststep; int x = 0; - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); uint8x8_t p00r, p01r, p10r, p11r, p00g, p01g, p10g, p11g, p00b, p01b, p10b, p11b; - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C3); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 8U); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 8U); - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); } - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8F16(C3); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F16(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(C3); } @@ -1465,11 +4164,9 @@ void warpAffineLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 8U); } } @@ -1520,7 +4217,7 @@ void warpAffineLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step 
std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -1552,30 +4249,27 @@ void warpAffineLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step uint8_t* dstptr = dst + y*dststep; int x = 0; - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); + CV_WARPAFFINE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); uint8x8_t p00r, p01r, p10r, p11r, p00g, p01g, p10g, p11g, p00b, p01b, p10b, p11b, p00a, p01a, p10a, p11a; - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C4); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C4); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 8U); - - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 8U); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C4); } CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8F16(C4); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F16(C4); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(C4); } @@ -1583,11 +4277,9 @@ void warpAffineLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step float sx = x*M[0] + y*M[1] + M[2]; float sy = x*M[3] + y*M[4] + M[5]; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 8U); } } }; @@ -1637,7 +4329,7 @@ void warpPerspectiveLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -1664,41 +4356,32 @@ void warpPerspectiveLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 uint8x8_t p00g, p01g, p10g, p11g; #endif - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + if (v_reduce_min(inner_mask) != 0) { #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C1); + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C1); #else - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C1, 8U); + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C1, 8U); #endif } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 8U); - + 
CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 8U); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); #endif } - - #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 // In case neon fp16 intrinsics are not available; still requires A64 - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON(C1); + #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 + CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON(C1); #else - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(C1); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C1, 8U, 16U); #endif - - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(C1); - + CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C1); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(C1); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -1708,11 +4391,9 @@ void warpPerspectiveLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 8U); } } }; @@ -1759,7 +4440,7 @@ void warpPerspectiveLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -1792,43 +4473,34 @@ void warpPerspectiveLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 uint8x8_t p00r, p01r, p10r, p11r, p00g, p01g, p10g, p11g, p00b, p01b, p10b, p11b; #endif - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + if (v_reduce_min(inner_mask) != 0) { #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C3); + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C3); #else - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C3, 8U); + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C3, 8U); #endif } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 8U); - + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 8U); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); #endif } - - #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 // In case neon fp16 intrinsics are not available; still requires A64 - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON(C3); + #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 + CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON(C3); #else - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(C3); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C3, 8U, 16U); #endif - - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(C3); - + 
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C3); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(C3); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -1838,11 +4510,9 @@ void warpPerspectiveLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 8U); } } }; @@ -1889,7 +4559,7 @@ void warpPerspectiveLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -1918,31 +4588,28 @@ void warpPerspectiveLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); + if (v_reduce_min(inner_mask) != 0) { float valpha[max_uf], vbeta[max_uf]; vx_store(valpha, src_x0); vx_store(valpha+vlanes_32, src_x1); vx_store(vbeta, src_y0); vx_store(vbeta+vlanes_32, src_y1); #if CV_SIMD256 - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, LINEAR, 8U); #elif CV_SIMD128 - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, LINEAR, 8U); #elif CV_SIMD_SCALABLE - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, LINEAR, 8U); #endif } else { uint8_t pixbuf[max_uf*4*4]; - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 8U); - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(C4); - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 8U); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C4, 8U, 16U); + CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4); CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(C4); } @@ -1954,11 +4621,9 @@ void warpPerspectiveLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 8U); } } }; @@ -2003,7 +4668,7 @@ void warpPerspectiveLinearInvoker_16UC1(const uint16_t *src_data, size_t src_ste std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = 
vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -2027,25 +4692,18 @@ void warpPerspectiveLinearInvoker_16UC1(const uint16_t *src_data, size_t src_ste int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C1, 16U); + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C1, 16U); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 16U); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 16U); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C1); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C1, 16U, 16U); CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C1); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -2055,11 +4713,9 @@ void warpPerspectiveLinearInvoker_16UC1(const uint16_t *src_data, size_t src_ste float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 16U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 16U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 16U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 16U); } } }; @@ -2104,7 +4760,7 @@ void warpPerspectiveLinearInvoker_16UC3(const uint16_t *src_data, size_t src_ste std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -2132,25 +4788,18 @@ void warpPerspectiveLinearInvoker_16UC3(const uint16_t *src_data, size_t src_ste int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C3, 16U); + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C3, 16U); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 16U); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 16U); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C3); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C3, 16U, 16U); CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C3); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -2160,11 +4809,11 @@ void warpPerspectiveLinearInvoker_16UC3(const uint16_t *src_data, size_t src_ste float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 16U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 16U); - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); 
+ CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_SCALAR_STORE(C3, 16U); + CV_WARP_SCALAR_STORE(LINEAR, C3, 16U); } } }; @@ -2209,7 +4858,7 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -2238,30 +4887,27 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); + if (v_reduce_min(inner_mask) != 0) { float valpha[max_uf], vbeta[max_uf]; vx_store(valpha, src_x0); vx_store(valpha+vlanes_32, src_x1); vx_store(vbeta, src_y0); vx_store(vbeta+vlanes_32, src_y1); #if CV_SIMD256 - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, LINEAR, 16U); #elif CV_SIMD128 - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, LINEAR, 16U); #elif CV_SIMD_SCALABLE - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, LINEAR, 16U); #endif } else { uint16_t pixbuf[max_uf*4*4]; - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U); - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 16U); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C4, 16U, 16U); CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4); CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4); @@ -2274,11 +4920,9 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 16U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 16U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 16U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 16U); } } }; @@ -2323,7 +4967,7 @@ void warpPerspectiveLinearInvoker_32FC1(const float *src_data, size_t src_step, std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -2348,23 +4992,17 @@ void warpPerspectiveLinearInvoker_32FC1(const float *src_data, size_t src_step, int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels 
are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C1, 32F); + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C1, 32F); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 32F); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 32F); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C1); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C1, 32F, 32F); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C1); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -2374,11 +5012,9 @@ void warpPerspectiveLinearInvoker_32FC1(const float *src_data, size_t src_step, float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 32F); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 32F); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 32F); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 32F); } } }; @@ -2423,7 +5059,7 @@ void warpPerspectiveLinearInvoker_32FC3(const float *src_data, size_t src_step, std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -2454,23 +5090,17 @@ void warpPerspectiveLinearInvoker_32FC3(const float *src_data, size_t src_step, int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C3, 32F); + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C3, 32F); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 32F); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 32F); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C3); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C3, 32F, 32F); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C3); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -2480,11 +5110,11 @@ void warpPerspectiveLinearInvoker_32FC3(const float *src_data, size_t src_step, float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 32F); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 32F); - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_SCALAR_STORE(C3, 32F); + CV_WARP_SCALAR_STORE(LINEAR, C3, 32F); } } }; @@ -2529,7 +5159,7 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step, std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -2562,30 
+5192,27 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step, int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); + if (v_reduce_min(inner_mask) != 0) { float valpha[max_uf], vbeta[max_uf]; vx_store(valpha, src_x0); vx_store(valpha+vlanes_32, src_x1); vx_store(vbeta, src_y0); vx_store(vbeta+vlanes_32, src_y1); #if CV_SIMD256 - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, LINEAR, 32F); #elif CV_SIMD128 - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, LINEAR, 32F); #elif CV_SIMD_SCALABLE - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, LINEAR, 32F); #endif } else { float pixbuf[max_uf*4*4]; - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F); - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 32F); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C4, 32F, 32F); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4); CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4); } @@ -2597,11 +5224,9 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step, float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 32F); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 32F); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 32F); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 32F); } } }; @@ -2646,7 +5271,7 @@ void warpPerspectiveLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -2669,27 +5294,19 @@ void warpPerspectiveLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src uint8_t* dstptr = dst + y*dststep; int x = 0; - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); uint8x8_t p00g, p01g, p10g, p11g; - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C1); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 8U); - - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 8U); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); } - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8F16(C1); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F16(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(C1); } @@ -2698,11 +5315,9 @@ void 
warpPerspectiveLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 8U); } } }; @@ -2754,7 +5369,7 @@ void warpPerspectiveLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -2783,29 +5398,21 @@ void warpPerspectiveLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src uint8_t* dstptr = dst + y*dststep; int x = 0; - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); uint8x8_t p00r, p01r, p10r, p11r, p00g, p01g, p10g, p11g, p00b, p01b, p10b, p11b; - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C3); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 8U); - - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 8U); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); } - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8F16(C3); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F16(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(C3); } @@ -2814,11 +5421,9 @@ void warpPerspectiveLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 8U); } } }; @@ -2868,7 +5473,7 @@ void warpPerspectiveLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -2900,30 +5505,22 @@ void warpPerspectiveLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src uint8_t* dstptr = dst + y*dststep; int x = 0; - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - + CV_WARPPERSPECTIVE_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); uint8x8_t p00r, p01r, p10r, p11r, p00g, p01g, p10g, p11g, p00b, 
p01b, p10b, p11b, p00a, p01a, p10a, p11a; - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C4); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C4); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 8U); - - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 8U); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C4); } - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8F16(C4); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F16(C4); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(C4); } @@ -2932,11 +5529,9 @@ void warpPerspectiveLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src float sx = (x*M[0] + y*M[1] + M[2]) / w; float sy = (x*M[3] + y*M[4] + M[5]) / w; - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 8U); } } }; @@ -2991,7 +5586,7 @@ void remapLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_r std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -3020,41 +5615,32 @@ void remapLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_r int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 uint8x8_t p00g, p01g, p10g, p11g; #endif - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + if (v_reduce_min(inner_mask) != 0) { #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C1); + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C1); #else - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C1, 8U); + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C1, 8U); #endif } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 8U); - + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 8U); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); #endif } - - #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 // In case neon fp16 intrinsics are not available; still requires A64 - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON(C1); + #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 + CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON(C1); #else - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(C1); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C1, 8U, 16U); #endif - - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(C1); - + CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C1); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(C1); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -3074,11 +5660,9 @@ void remapLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_r sy += y; } - 
CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 8U); } } }; @@ -3130,7 +5714,7 @@ void remapLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_r std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -3165,43 +5749,34 @@ void remapLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_r int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 uint8x8_t p00r, p01r, p10r, p11r, p00g, p01g, p10g, p11g, p00b, p01b, p10b, p11b; #endif - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + if (v_reduce_min(inner_mask) != 0) { #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C3); + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C3); #else - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C3, 8U); + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C3, 8U); #endif } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 8U); - + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 8U); #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); #endif } - - #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 // In case neon fp16 intrinsics are not available; still requires A64 - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON(C3); + #if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 + CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON(C3); #else - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(C3); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C3, 8U, 16U); #endif - - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(C3); - + CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C3); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(C3); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -3221,11 +5796,9 @@ void remapLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_r sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 8U); } } }; @@ -3277,7 +5850,7 @@ void remapLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_r std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -3308,31 +5881,28 @@ void 
remapLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_r int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); + if (v_reduce_min(inner_mask) != 0) { float valpha[max_uf], vbeta[max_uf]; vx_store(valpha, src_x0); vx_store(valpha+vlanes_32, src_x1); vx_store(vbeta, src_y0); vx_store(vbeta+vlanes_32, src_y1); #if CV_SIMD256 - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, LINEAR, 8U); #elif CV_SIMD128 - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, LINEAR, 8U); #elif CV_SIMD_SCALABLE - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, LINEAR, 8U); #endif } else { uint8_t pixbuf[max_uf*4*4]; - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 8U); - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(C4); - CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 8U); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C4, 8U, 16U); + CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4); CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(C4); } @@ -3354,11 +5924,9 @@ void remapLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_r sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 8U); } } }; @@ -3408,7 +5976,7 @@ void remapLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -3434,25 +6002,18 @@ void remapLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C1, 16U); + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C1, 16U); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 16U); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 16U); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C1); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C1, 16U, 16U); CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C1); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -3472,11 +6033,9 @@ void remapLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src sy += y; } - 
CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 16U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 16U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 16U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 16U); } } }; @@ -3526,7 +6085,7 @@ void remapLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -3556,25 +6115,18 @@ void remapLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C3, 16U); + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C3, 16U); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 16U); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 16U); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C3); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C3, 16U, 16U); CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C3); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -3594,11 +6146,9 @@ void remapLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 16U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 16U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 16U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 16U); } } }; @@ -3648,7 +6198,7 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -3679,30 +6229,27 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); + if (v_reduce_min(inner_mask) != 0) { float valpha[max_uf], vbeta[max_uf]; vx_store(valpha, src_x0); vx_store(valpha+vlanes_32, src_x1); vx_store(vbeta, src_y0); vx_store(vbeta+vlanes_32, src_y1); #if CV_SIMD256 - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, LINEAR, 16U); #elif CV_SIMD128 - 
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, LINEAR, 16U); #elif CV_SIMD_SCALABLE - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, LINEAR, 16U); #endif } else { uint16_t pixbuf[max_uf*4*4]; - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U); - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 16U); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C4, 16U, 16U); CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4); CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4); @@ -3725,11 +6272,9 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 16U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 16U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 16U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 16U); } } }; @@ -3779,7 +6324,7 @@ void remapLinearInvoker_32FC1(const float *src_data, size_t src_step, int src_ro std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -3806,23 +6351,17 @@ void remapLinearInvoker_32FC1(const float *src_data, size_t src_step, int src_ro int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C1, 32F); + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C1, 32F); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 32F); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 32F); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C1); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C1, 32F, 32F); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C1); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -3842,11 +6381,9 @@ void remapLinearInvoker_32FC1(const float *src_data, size_t src_step, int src_ro sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 32F); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 32F); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 32F); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 32F); } } }; @@ -3896,7 +6433,7 @@ void remapLinearInvoker_32FC3(const float *src_data, size_t src_step, int src_ro std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -3929,23 +6466,17 @@ void remapLinearInvoker_32FC3(const float *src_data, size_t src_step, int src_ro int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - 
CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C3, 32F); + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(LINEAR, C3, 32F); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 32F); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 32F); } - - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C3); - + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C3, 32F, 32F); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C3); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C3); } #endif // (CV_SIMD || CV_SIMD_SCALABLE) @@ -3965,11 +6496,9 @@ void remapLinearInvoker_32FC3(const float *src_data, size_t src_step, int src_ro sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 32F); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 32F); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 32F); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 32F); } } }; @@ -4019,7 +6548,7 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -4054,30 +6583,27 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro int x = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); + if (v_reduce_min(inner_mask) != 0) { float valpha[max_uf], vbeta[max_uf]; vx_store(valpha, src_x0); vx_store(valpha+vlanes_32, src_x1); vx_store(vbeta, src_y0); vx_store(vbeta+vlanes_32, src_y1); #if CV_SIMD256 - CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, LINEAR, 32F); #elif CV_SIMD128 - CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD128, LINEAR, 32F); #elif CV_SIMD_SCALABLE - CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F); + CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMDX, LINEAR, 32F); #endif } else { float pixbuf[max_uf*4*4]; - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F); - CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 32F); + CV_WARP_VECTOR_INTER_LOAD(LINEAR, C4, 32F, 32F); CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4); CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4); } @@ -4099,11 +6625,9 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 32F); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 32F); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 32F); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 32F); } } }; @@ -4152,7 
+6676,7 @@ void remapLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step, int std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -4177,27 +6701,19 @@ void remapLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step, int const float *sy_data = map2 + y*map2step; int x = 0; - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C1); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C1); uint8x8_t p00g, p01g, p10g, p11g; - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C1); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C1); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C1, 8U); - - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C1, 8U); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C1); } - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8F16(C1); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F16(C1); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(C1); } @@ -4216,11 +6732,9 @@ void remapLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step, int sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C1, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C1); - - CV_WARP_LINEAR_SCALAR_STORE(C1, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C1, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C1); + CV_WARP_SCALAR_STORE(LINEAR, C1, 8U); } } }; @@ -4274,7 +6788,7 @@ void remapLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step, int std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -4305,29 +6819,21 @@ void remapLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step, int const float *sy_data = map2 + y*map2step; int x = 0; - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C3); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C3); uint8x8_t p00r, p01r, p10r, p11r, p00g, p01g, p10g, p11g, p00b, p01b, p10b, p11b; - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C3); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C3); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C3, 8U); - - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C3, 8U); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C3); } - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8F16(C3); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F16(C3); - 
CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(C3); } @@ -4346,11 +6852,9 @@ void remapLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step, int sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C3, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C3); - - CV_WARP_LINEAR_SCALAR_STORE(C3, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C3, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C3); + CV_WARP_SCALAR_STORE(LINEAR, C3, 8U); } } }; @@ -4404,7 +6908,7 @@ void remapLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step, int std::array start_indices; std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f); - v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1), + v_uint32 inner_srows = vx_setall_u32((unsigned)std::max(srcrows - 2, 0)), inner_scols = vx_setall_u32((unsigned)srccols - 1), outer_srows = vx_setall_u32((unsigned)srcrows + 1), outer_scols = vx_setall_u32((unsigned)srccols + 1); @@ -4438,30 +6942,22 @@ void remapLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step, int const float *sy_data = map2 + y*map2step; int x = 0; - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD1(); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD1(); for (; x <= dstcols - uf; x += uf) { // [TODO] apply halide trick - - CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4); - + CV_REMAP_VECTOR_COMPUTE_MAPPED_COORD2(LINEAR, C4); uint8x8_t p00r, p01r, p10r, p11r, p00g, p01g, p10g, p11g, p00b, p01b, p10b, p11b, p00a, p01a, p10a, p11a; - - if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image - CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(C4); + if (v_reduce_min(inner_mask) != 0) { + CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(C4); } else { - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 8U); - - CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(C4); + CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(LINEAR, C4, 8U); + CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(C4); } - CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8F16(C4); - CV_WARP_LINEAR_VECTOR_INTER_CALC_F16(C4); - CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(C4); } @@ -4480,11 +6976,9 @@ void remapLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step, int sy += y; } - CV_WARP_LINEAR_SCALAR_SHUFFLE(C4, 8U); - - CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(C4); - - CV_WARP_LINEAR_SCALAR_STORE(C4, 8U); + CV_WARP_SCALAR_SHUFFLE(LINEAR, C4, 8U); + CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(C4); + CV_WARP_SCALAR_STORE(LINEAR, C4, 8U); } } }; diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp index 1831482686..8d61d227b6 100644 --- a/modules/imgproc/test/test_imgwarp_strict.cpp +++ b/modules/imgproc/test/test_imgwarp_strict.cpp @@ -703,6 +703,16 @@ protected: virtual void run_func(); virtual void run_reference_func(); + template + void new_nearest_c1(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep, + const T *bval, int borderType_x, int borderType_y); + template + void new_nearest_c3(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep, + const T *bval, int borderType_x, int borderType_y); + template + void new_nearest_c4(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep, + const T *bval, int borderType_x, int borderType_y); + template void new_linear_c1(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep, const T *bval, int borderType_x, int borderType_y); @@ -720,7 +730,7 @@ protected: 
remap_func funcs[2]; private: - template void new_remap(const Mat&, Mat&); + template void new_remap(const Mat&, Mat&, int); void remap_nearest(const Mat&, Mat&); void remap_generic(const Mat&, Mat&); @@ -879,19 +889,19 @@ void CV_Remap_Test::run_reference_func() if (interpolation == INTER_AREA) interpolation = INTER_LINEAR; - if (interpolation == INTER_LINEAR && mapx.depth() == CV_32F) { + if ((interpolation == INTER_LINEAR) && mapx.depth() == CV_32F) { int src_depth = src.depth(), src_channels = src.channels(); Mat tmp = Mat::zeros(dst.size(), dst.type()); if (src_depth == CV_8U && (src_channels == 1 || src_channels == 3 || src_channels == 4)) { - new_remap(src, tmp); + new_remap(src, tmp, interpolation); tmp.convertTo(reference_dst, reference_dst.depth()); return; } else if (src_depth == CV_16U && (src_channels == 1 || src_channels == 3 || src_channels == 4)) { - new_remap(src, tmp); + new_remap(src, tmp, interpolation); tmp.convertTo(reference_dst, reference_dst.depth()); return; } else if (src_depth == CV_32F && (src_channels == 1 || src_channels == 3 || src_channels == 4)) { - new_remap(src, tmp); + new_remap(src, tmp, interpolation); tmp.convertTo(reference_dst, reference_dst.depth()); return; } @@ -903,7 +913,7 @@ void CV_Remap_Test::run_reference_func() (this->*funcs[index])(src, reference_dst); } -#define FETCH_PIXEL_SCALAR(cn, dy, dx) \ +#define WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, dy, dx) \ if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \ size_t ofs = dy*srcstep + dx*cn; \ for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr[ofs+ci];} \ @@ -917,16 +927,28 @@ void CV_Remap_Test::run_reference_func() size_t glob_ofs = iy_*srcstep + ix_*cn; \ for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr_[glob_ofs+ci];} \ } - -#define WARPAFFINE_SHUFFLE(cn) \ +#define WARP_NEAREST_SHUFFLE_FETCH_PIXEL_IN_RANGE(cn) \ + for (int ci = 0; ci < cn; ci++) { \ + pxy[ci] = srcptr[ci]; \ + } +#define WARP_LINEAR_SHUFFLE_FETCH_PIXEL_IN_RANGE(cn) \ + for (int ci = 0; ci < cn; ci++) { \ + pxy[ci] = srcptr[ci]; \ + pxy[ci+cn] = srcptr[ci+cn]; \ + pxy[ci+cn*2] = srcptr[srcstep+ci]; \ + pxy[ci+cn*3] = srcptr[srcstep+ci+cn]; \ + } +#define WARP_NEAREST_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn) \ + WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 0, 0); +#define WARP_LINEAR_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn) \ + WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 0, 0); \ + WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 0, 1); \ + WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 1, 0); \ + WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 1, 1); +#define WARP_SHUFFLE(inter, cn) \ if ((((unsigned)ix < (unsigned)(srccols-1)) & \ ((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \ - for (int ci = 0; ci < cn; ci++) { \ - pxy[ci] = srcptr[ci]; \ - pxy[ci+cn] = srcptr[ci+cn]; \ - pxy[ci+cn*2] = srcptr[srcstep+ci]; \ - pxy[ci+cn*3] = srcptr[srcstep+ci+cn]; \ - } \ + WARP_##inter##_SHUFFLE_FETCH_PIXEL_IN_RANGE(cn) \ } else { \ if ((borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT) && \ (((unsigned)(ix+1) >= (unsigned)(srccols+1))| \ @@ -936,14 +958,50 @@ void CV_Remap_Test::run_reference_func() } \ return; \ } \ - FETCH_PIXEL_SCALAR(cn, 0, 0); \ - FETCH_PIXEL_SCALAR(cn, 0, 1); \ - FETCH_PIXEL_SCALAR(cn, 1, 0); \ - FETCH_PIXEL_SCALAR(cn, 1, 1); \ + WARP_##inter##_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn) \ } template -static inline void warpaffine_linear_calc(int cn, const T *pxy, T *dst, float sx, float sy) +void CV_Remap_Test::new_nearest_c1(int x, float sx, float sy, const T *srcptr_, T 
*dstptr, int srccols, int srcrows, size_t srcstep, + const T *bval, int borderType_x, int borderType_y) { + int ix = (int)roundf(sx), iy = (int)roundf(sy); + + T pxy[1]; + const T *srcptr = srcptr_ + srcstep*iy + ix; + WARP_SHUFFLE(NEAREST, 1); + + dstptr[x+0] = saturate_cast(pxy[0]); +} +template +void CV_Remap_Test::new_nearest_c3(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep, + const T *bval, int borderType_x, int borderType_y) { + int ix = (int)roundf(sx), iy = (int)roundf(sy); + + T pxy[3]; + const T *srcptr = srcptr_ + srcstep*iy + ix*3; + WARP_SHUFFLE(NEAREST, 3); + + dstptr[x*3+0] = saturate_cast(pxy[0]); + dstptr[x*3+1] = saturate_cast(pxy[1]); + dstptr[x*3+2] = saturate_cast(pxy[2]); +} +template +void CV_Remap_Test::new_nearest_c4(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep, + const T *bval, int borderType_x, int borderType_y) { + int ix = (int)roundf(sx), iy = (int)roundf(sy); + + T pxy[4]; + const T *srcptr = srcptr_ + srcstep*iy + ix*4; + WARP_SHUFFLE(NEAREST, 4); + + dstptr[x*4+0] = saturate_cast(pxy[0]); + dstptr[x*4+1] = saturate_cast(pxy[1]); + dstptr[x*4+2] = saturate_cast(pxy[2]); + dstptr[x*4+3] = saturate_cast(pxy[3]); +} + +template +static inline void warp_linear_calc(int cn, const T *pxy, T *dst, float sx, float sy) { for (int ci = 0; ci < cn; ci++) { float p00 = pxy[ci]; @@ -956,7 +1014,6 @@ static inline void warpaffine_linear_calc(int cn, const T *pxy, T *dst, float sx dst[ci] = saturate_cast(v0); } } - template void CV_Remap_Test::new_linear_c1(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep, @@ -968,11 +1025,10 @@ void CV_Remap_Test::new_linear_c1(int x, float sx, float sy, const T *srcptr_, T T pxy[4]; const T *srcptr = srcptr_ + srcstep*iy + ix; - WARPAFFINE_SHUFFLE(1); + WARP_SHUFFLE(LINEAR, 1); - warpaffine_linear_calc(1, pxy, dstptr+x, sx, sy); + warp_linear_calc(1, pxy, dstptr+x, sx, sy); } - template void CV_Remap_Test::new_linear_c3(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep, @@ -984,11 +1040,10 @@ void CV_Remap_Test::new_linear_c3(int x, float sx, float sy, const T *srcptr_, T T pxy[12]; const T *srcptr = srcptr_ + srcstep*iy + ix*3; - WARPAFFINE_SHUFFLE(3); + WARP_SHUFFLE(LINEAR, 3); - warpaffine_linear_calc(3, pxy, dstptr+x*3, sx, sy); + warp_linear_calc(3, pxy, dstptr+x*3, sx, sy); } - template void CV_Remap_Test::new_linear_c4(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep, @@ -1000,13 +1055,14 @@ void CV_Remap_Test::new_linear_c4(int x, float sx, float sy, const T *srcptr_, T T pxy[16]; const T *srcptr = srcptr_ + srcstep*iy + ix*4; - WARPAFFINE_SHUFFLE(4); + WARP_SHUFFLE(LINEAR, 4); - warpaffine_linear_calc(4, pxy, dstptr+x*4, sx, sy); + warp_linear_calc(4, pxy, dstptr+x*4, sx, sy); } template -void CV_Remap_Test::new_remap(const Mat &_src, Mat &_dst) { +void CV_Remap_Test::new_remap(const Mat &_src, Mat &_dst, int inter) { + CV_UNUSED(inter); int src_channels = _src.channels(); CV_CheckTrue(_src.channels() == 1 || _src.channels() == 3 || _src.channels() == 4, ""); CV_CheckTrue(mapx.depth() == CV_32F, ""); @@ -1232,7 +1288,7 @@ private: void warpAffine(const Mat&, Mat&); template - void newWarpAffine(const Mat&, Mat&, const Mat&); + void new_warpAffine(const Mat&, Mat&, const Mat&, int); }; CV_WarpAffine_Test::CV_WarpAffine_Test() : @@ -1287,8 +1343,9 @@ void 
CV_WarpAffine_Test::run_reference_func() } template -void CV_WarpAffine_Test::newWarpAffine(const Mat &_src, Mat &_dst, const Mat &tM) +void CV_WarpAffine_Test::new_warpAffine(const Mat &_src, Mat &_dst, const Mat &tM, int inter) { + CV_UNUSED(inter); int num_channels = _dst.channels(); CV_CheckTrue(num_channels == 1 || num_channels == 3 || num_channels == 4, ""); @@ -1360,11 +1417,11 @@ void CV_WarpAffine_Test::warpAffine(const Mat& _src, Mat& _dst) if (inter == INTER_LINEAR) { int dst_depth = _dst.depth(), dst_channels = _dst.channels(); if (dst_depth == CV_8U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) { - return newWarpAffine(_src, _dst, tM); + return new_warpAffine(_src, _dst, tM, inter); } else if (dst_depth == CV_16U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) { - return newWarpAffine(_src, _dst, tM); + return new_warpAffine(_src, _dst, tM, inter); } else if (dst_depth == CV_32F && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) { - return newWarpAffine(_src, _dst, tM); + return new_warpAffine(_src, _dst, tM, inter); } } @@ -1420,7 +1477,7 @@ private: void warpPerspective(const Mat&, Mat&); template - void newWarpPerspective(const Mat&, Mat&, const Mat&); + void new_warpPerspective(const Mat&, Mat&, const Mat&, int); }; CV_WarpPerspective_Test::CV_WarpPerspective_Test() : @@ -1470,8 +1527,9 @@ void CV_WarpPerspective_Test::run_reference_func() } template -void CV_WarpPerspective_Test::newWarpPerspective(const Mat &_src, Mat &_dst, const Mat &tM) +void CV_WarpPerspective_Test::new_warpPerspective(const Mat &_src, Mat &_dst, const Mat &tM, int inter) { + CV_UNUSED(inter); int num_channels = _dst.channels(); CV_CheckTrue(num_channels == 1 || num_channels == 3 || num_channels == 4, ""); @@ -1546,11 +1604,11 @@ void CV_WarpPerspective_Test::warpPerspective(const Mat& _src, Mat& _dst) if (inter == INTER_LINEAR) { int dst_depth = _dst.depth(), dst_channels = _dst.channels(); if (dst_depth == CV_8U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) { - return newWarpPerspective(_src, _dst, M); + return new_warpPerspective(_src, _dst, M, inter); } else if (dst_depth == CV_16U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) { - return newWarpPerspective(_src, _dst, M); + return new_warpPerspective(_src, _dst, M, inter); } else if (dst_depth == CV_32F && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) { - return newWarpPerspective(_src, _dst, M); + return new_warpPerspective(_src, _dst, M, inter); } }
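
Illustrative note (not part of the patch): the nearest-neighbour reference path added to the tests rounds the mapped source coordinate and fetches a single pixel per channel, falling back to per-pixel border handling only when the rounded coordinate is outside the image. A minimal standalone sketch of that scheme for a single-channel 8-bit image with a constant border is given below; the names remap_nearest_ref and border_value are hypothetical and chosen only for illustration, and only the BORDER_CONSTANT case is handled here.

    #include <cstdint>
    #include <cstdio>
    #include <cmath>
    #include <vector>

    // Nearest-neighbour remap, single channel, 8-bit, BORDER_CONSTANT only:
    // round the mapped coordinate, read one source pixel if it lies inside
    // the image, otherwise substitute the border value.
    static void remap_nearest_ref(const uint8_t* src, int srccols, int srcrows, size_t srcstep,
                                  uint8_t* dst, int dstcols, int dstrows, size_t dststep,
                                  const float* mapx, const float* mapy, size_t mapstep,
                                  uint8_t border_value)
    {
        for (int y = 0; y < dstrows; y++) {
            const float* sx_row = mapx + y * mapstep;
            const float* sy_row = mapy + y * mapstep;
            uint8_t* dstrow = dst + y * dststep;
            for (int x = 0; x < dstcols; x++) {
                // Round the mapped source coordinate to the nearest integer pixel.
                int ix = (int)std::roundf(sx_row[x]);
                int iy = (int)std::roundf(sy_row[x]);
                if ((unsigned)ix < (unsigned)srccols && (unsigned)iy < (unsigned)srcrows)
                    dstrow[x] = src[iy * srcstep + ix];   // fully inside: direct fetch
                else
                    dstrow[x] = border_value;             // outside: constant border
            }
        }
    }

    int main()
    {
        // 4x4 gradient source; the map shifts coordinates by a fraction of a
        // pixel so rounding matters, and the last row maps outside the image.
        const int W = 4, H = 4;
        std::vector<uint8_t> src(W * H), dst(W * H);
        for (int i = 0; i < W * H; i++) src[i] = (uint8_t)(i * 16);

        std::vector<float> mapx(W * H), mapy(W * H);
        for (int y = 0; y < H; y++)
            for (int x = 0; x < W; x++) {
                mapx[y * W + x] = x + 0.4f;   // rounds back to x
                mapy[y * W + x] = y + 0.6f;   // rounds to y + 1
            }

        remap_nearest_ref(src.data(), W, H, W, dst.data(), W, H, W,
                          mapx.data(), mapy.data(), W, /*border_value=*/255);

        for (int y = 0; y < H; y++) {
            for (int x = 0; x < W; x++) printf("%3d ", dst[y * W + x]);
            printf("\n");
        }
        return 0;
    }

The optimized kernels in this patch follow the same structure but process a full vector of destination pixels at a time, using the inner/outer coordinate masks to decide between the fast all-inside gather and the slower per-lane border path.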