Merge pull request #25984 from fengyuentau:imgproc/warpaffine_opt

imgproc: add optimized warpAffine kernels for 8U/16U/32F + C1/C3/C4 inputs #25984 Merge with https://github.com/opencv/opencv_extra/pull/1198. Merge with https://github.com/opencv/opencv_contrib/pull/3787. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2025-06-13 13:13:26 +08:00 · 2024-10-03 19:01:36 +08:00 · 2024-10-03 19:01:36 +08:00 · 97681bdfce
commit 97681bdfce
parent ebf11d36f4
16 changed files with 3070 additions and 179 deletions
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@ -2228,9 +2228,7 @@ inline v_int16x8 v_round(const v_float16x8 &a)
 inline v_int16x8 v_floor(const v_float16x8 &a)
 {
-    int16x8_t a1 = vcvtq_s16_f16(a.val);
+    return v_int16x8(vcvtmq_s16_f16(a.val));
    uint16x8_t mask = vcgtq_f16(vcvtq_f16_s16(a1), a.val);
    return v_int16x8(vaddq_s16(a1, vreinterpretq_s16_u16(mask)));
 }
 inline v_int16x8 v_ceil(const v_float16x8 &a)
@ -2271,9 +2269,13 @@ inline v_int32x4 v_round(const v_float32x4& a)
 #endif
 // Lane-wise floor of a float32x4 vector, returned as a vector of signed 32-bit integers.
 inline v_int32x4 v_floor(const v_float32x4& a)
 {
 #if __ARM_ARCH > 7
    // Newer architectures: a single vcvtm conversion produces the result.
    return v_int32x4(vcvtmq_s32_f32(a.val));
 #else
    // Fallback: convert to integer first, then adjust every lane whose converted
    // value (converted back to float) compares greater than the input.
    // NOTE(review): relies on the comparison mask reinterpreting to -1 in such lanes - confirm against the NEON reference.
    const int32x4_t converted = vcvtq_s32_f32(a.val);
    const uint32x4_t overshoot = vcgtq_f32(vcvtq_f32_s32(converted), a.val);
    const int32x4_t adjustment = vreinterpretq_s32_u32(overshoot);
    return v_int32x4(vaddq_s32(converted, adjustment));
 #endif
 }
 inline v_int32x4 v_ceil(const v_float32x4& a)
--- a/modules/features2d/src/affine_feature.cpp
+++ b/modules/features2d/src/affine_feature.cpp
@ -261,7 +261,7 @@ private:
            h = rect.height; w = rect.width;
            pose = Matx23f(c, -s, -(float)rect.x,
                        s,  c, -(float)rect.y);
-            warpAffine(image, rotImage, pose, Size(w, h), INTER_LINEAR, BORDER_REPLICATE);
+            warpAffine(image, rotImage, pose, Size(w, h), INTER_LINEAR, BORDER_REPLICATE, Scalar(), cv::ALGO_HINT_ACCURATE);
        }
        if( tilt == 1 )
            warpedImage = rotImage;
@ -275,7 +275,7 @@ private:
            pose(0, 2) /= tilt;
        }
        if( phi != 0 || tilt != 1 )
-            warpAffine(mask0, warpedMask, pose, warpedImage.size(), INTER_NEAREST);
+            warpAffine(mask0, warpedMask, pose, warpedImage.size(), INTER_NEAREST, BORDER_CONSTANT, Scalar(), cv::ALGO_HINT_ACCURATE);
        else
            warpedMask = mask0;
    }
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@ -10,6 +10,7 @@ ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(morph SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(sumpixels SSE2 AVX2 AVX512_SKX)
 ocv_add_dispatched_file(warp_kernels SSE2 SSE4_1 AVX2 NEON NEON_FP16 RVV LASX)
 ocv_define_module(imgproc opencv_core WRAP java objc python js)
 ocv_module_include_directories(opencv_imgproc ${ZLIB_INCLUDE_DIRS})
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@ -2474,6 +2474,7 @@ flag #WARP_INVERSE_MAP that means that M is the inverse transformation (
 borderMode=#BORDER_TRANSPARENT, it means that the pixels in the destination image corresponding to
 the "outliers" in the source image are not modified by the function.
@param borderValue value used in case of a constant border; by default, it is 0.
@param hint Implementation modification flags. See #AlgorithmHint
@sa  warpPerspective, resize, remap, getRectSubPix, transform
 */
@ -2481,7 +2482,8 @@ CV_EXPORTS_W void warpAffine( InputArray src, OutputArray dst,
                              InputArray M, Size dsize,
                              int flags = INTER_LINEAR,
                              int borderMode = BORDER_CONSTANT,
-                              const Scalar& borderValue = Scalar());
+                              const Scalar& borderValue = Scalar(),
                              AlgorithmHint hint = cv::ALGO_HINT_DEFAULT);
 /** @example samples/cpp/snippets/warpPerspective_demo.cpp
 An example program shows using cv::getPerspectiveTransform and cv::warpPerspective for image warping
--- a/modules/imgproc/perf/opencl/perf_imgwarp.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
@ -72,7 +72,10 @@ OCL_PERF_TEST_P(WarpAffineFixture, WarpAffine,
    const WarpAffineParams params = GetParam();
    const Size srcSize = get<0>(params);
    const int type = get<1>(params), interpolation = get<2>(params);
-    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : interpolation == INTER_CUBIC ? 2e-3 : 1e-4;
+
    // BUG: OpenCL and CPU versions diverge a bit
    // Ticket: https://github.com/opencv/opencv/issues/26235
    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 2 : interpolation == INTER_CUBIC ? 2e-3 : 3e-2;
    checkDeviceMaxMemoryAllocSize(srcSize, type);
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@ -15,24 +15,6 @@ typedef TestBaseWithParam<MatInfo_SizePair_t> MatInfo_SizePair;
                          CV_16UC1, CV_16UC2, CV_16UC3, CV_16UC4, \
                          CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4
 // Gradient-like fill for the other (non-8U) matrix formats.
 // For the pixel at (row, col) the per-channel pattern is, in order:
 //   row, col, row*col, row*col/(row+col+1)   (integer arithmetic, then converted to T).
 // Only the first img.channels() entries of that pattern are written.
 // NOTE(review): the pattern holds four entries, so this assumes at most 4 channels.
 template<typename T>
 static void fillFPGradient(Mat& img)
 {
    const int channelCount = img.channels();
    for (int row = 0; row < img.rows; ++row)
    {
        for (int col = 0; col < img.cols; ++col)
        {
            const int product = row * col;
            const T pattern[] = {(T)row, (T)col, (T)product, (T)(product / (row + col + 1))};
            T* pixel = (T*)img.ptr(row, col);
            for (int k = 0; k < channelCount; ++k)
                pixel[k] = pattern[k];
        }
    }
 }
 PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
            testing::Values(
                MatInfo_Size_Size_t(CV_8UC1, szVGA, szqHD),
@ -51,7 +33,7 @@ PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
    Size to = get<2>(GetParam());
    cv::Mat src(from, matType), dst(to, matType);
-    cvtest::fillGradient(src);
+    cvtest::fillGradient<uint8_t>(src);
    declare.in(src).out(dst);
    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR_EXACT);
@ -79,9 +61,9 @@ PERF_TEST_P(MatInfo_SizePair, resizeUpLinearNonExact,
    cv::Mat src(from, matType), dst(to, matType);
    switch(src.depth())
    {
-        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_8U: cvtest::fillGradient<uint8_t>(src); break;
-        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_16U: cvtest::fillGradient<ushort>(src); break;
-        case CV_32F: fillFPGradient<float>(src); break;
+        case CV_32F: cvtest::fillGradient<float>(src); break;
    }
    declare.in(src).out(dst);
@ -120,7 +102,7 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
    Size to = get<2>(GetParam());
    cv::Mat src(from, matType), dst(to, matType);
-    cvtest::fillGradient(src);
+    cvtest::fillGradient<uint8_t>(src);
    declare.in(src).out(dst);
    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR_EXACT);
@ -155,9 +137,9 @@ PERF_TEST_P(MatInfo_SizePair, resizeDownLinearNonExact,
    cv::Mat src(from, matType), dst(to, matType);
    switch(src.depth())
    {
-        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_8U: cvtest::fillGradient<uint8_t>(src); break;
-        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_16U: cvtest::fillGradient<ushort>(src); break;
-        case CV_32F: fillFPGradient<float>(src); break;
+        case CV_32F: cvtest::fillGradient<float>(src); break;
    }
    declare.in(src).out(dst);
--- a/modules/imgproc/perf/perf_warp.cpp
+++ b/modules/imgproc/perf/perf_warp.cpp
@ -12,7 +12,7 @@ CV_ENUM(InterType, INTER_NEAREST, INTER_LINEAR)
 CV_ENUM(InterTypeExtended, INTER_NEAREST, INTER_LINEAR, WARP_RELATIVE_MAP)
 CV_ENUM(RemapMode, HALF_SIZE, UPSIDE_DOWN, REFLECTION_X, REFLECTION_BOTH)
-typedef TestBaseWithParam< tuple<MatType, Size, InterType, BorderMode> > TestWarpAffine;
+typedef TestBaseWithParam< tuple<Size, InterType, BorderMode, MatType> > TestWarpAffine;
 typedef TestBaseWithParam< tuple<Size, InterType, BorderMode, int> > TestWarpPerspective;
 typedef TestBaseWithParam< tuple<Size, InterType, BorderMode, MatType> > TestWarpPerspectiveNear_t;
 typedef TestBaseWithParam< tuple<MatType, Size, InterTypeExtended, BorderMode, RemapMode> > TestRemap;
@ -21,24 +21,39 @@ void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode, boo
 PERF_TEST_P( TestWarpAffine, WarpAffine,
             Combine(
                Values(CV_8UC1, CV_8UC4),
                Values( szVGA, sz720p, sz1080p ),
                InterType::all(),
-                BorderMode::all()
+                BorderMode::all(),
                Values(CV_8UC3, CV_16UC3, CV_32FC3, CV_8UC1, CV_16UC1, CV_32FC1, CV_8UC4, CV_16UC4, CV_32FC4)
             )
 )
 {
    Size sz, szSrc(512, 512);
-    int borderMode, interType, dataType;
+    int type, borderMode, interType;
-    dataType   = get<0>(GetParam());
+    sz         = get<0>(GetParam());
-    sz         = get<1>(GetParam());
+    interType  = get<1>(GetParam());
-    interType  = get<2>(GetParam());
+    borderMode = get<2>(GetParam());
-    borderMode = get<3>(GetParam());
+    type       = get<3>(GetParam());
    Scalar borderColor = Scalar::all(150);
-    Mat src(szSrc, dataType), dst(sz, dataType);
+    Mat src(szSrc,type), dst(sz, type);
-    cvtest::fillGradient(src);
+    switch (src.depth()) {
-    if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder(src, borderColor, 1);
+        case CV_8U: {
            cvtest::fillGradient<uint8_t>(src);
            if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder<uint8_t>(src, borderColor, 1);
            break;
        }
        case CV_16U: {
            cvtest::fillGradient<uint16_t>(src);
            if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder<uint16_t>(src, borderColor, 1);
            break;
        }
        case CV_32F: {
            cvtest::fillGradient<float>(src);
            if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder<float>(src, borderColor, 1);
            break;
        }
    }
    Mat warpMat = getRotationMatrix2D(Point2f(src.cols/2.f, src.rows/2.f), 30., 2.2);
    declare.in(src).out(dst);
@ -47,36 +62,6 @@ PERF_TEST_P( TestWarpAffine, WarpAffine,
    SANITY_CHECK(dst, 1);
 }
 // Perf test for warpAffine, parameterized over the listed matrix types,
 // destination sizes, interpolation modes and border modes.
 // NOTE(review): the DISABLED_ name prefix presumably keeps the runner from executing it by default - confirm.
 PERF_TEST_P(TestWarpAffine, DISABLED_WarpAffine_ovx,
    Combine(
        Values(CV_8UC1, CV_8UC4),
        Values(szVGA, sz720p, sz1080p),
        InterType::all(),
        BorderMode::all()
    )
 )
 {
    // The source is always 512x512; only the destination size comes from the parameters.
    Size sz, szSrc(512, 512);
    int borderMode, interType, dataType;
    // Unpack the parameter tuple: (type, destination size, interpolation, border mode).
    dataType   = get<0>(GetParam());
    sz         = get<1>(GetParam());
    interType  = get<2>(GetParam());
    borderMode = get<3>(GetParam());
    Scalar borderColor = Scalar::all(150);
    Mat src(szSrc, dataType), dst(sz, dataType);
    cvtest::fillGradient(src);
    // For constant borders the source edge is additionally processed with the border colour.
    if (borderMode == BORDER_CONSTANT) cvtest::smoothBorder(src, borderColor, 1);
    // Transform built around the source centre with arguments 30. and 2.2.
    Mat warpMat = getRotationMatrix2D(Point2f(src.cols / 2.f, src.rows / 2.f), 30., 2.2);
    declare.in(src).out(dst);
    TEST_CYCLE() warpAffine(src, dst, warpMat, sz, interType, borderMode, borderColor);
    SANITY_CHECK(dst, 1);
 }
 PERF_TEST_P( TestWarpPerspective, WarpPerspective,
             Combine(
                Values( szVGA, sz720p, sz1080p ),
@ -96,8 +81,8 @@ PERF_TEST_P( TestWarpPerspective, WarpPerspective,
    Scalar borderColor = Scalar::all(150);
    Mat src(szSrc, CV_8UC(channels)), dst(sz, CV_8UC(channels));
-    cvtest::fillGradient(src);
+    cvtest::fillGradient<uint8_t>(src);
-    if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder(src, borderColor, 1);
+    if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder<uint8_t>(src, borderColor, 1);
    Mat rotMat = getRotationMatrix2D(Point2f(src.cols/2.f, src.rows/2.f), 30., 2.2);
    Mat warpMat(3, 3, CV_64FC1);
    for(int r=0; r<2; r++)
@ -114,42 +99,6 @@ PERF_TEST_P( TestWarpPerspective, WarpPerspective,
    SANITY_CHECK(dst, 1);
 }
 // Perf test for warpPerspective on single-channel 8-bit images, parameterized over
 // destination size, interpolation mode and border mode.
 // NOTE(review): the DISABLED_ name prefix presumably keeps the runner from executing it by default - confirm.
 PERF_TEST_P(TestWarpPerspective, DISABLED_WarpPerspective_ovx,
    Combine(
        Values(szVGA, sz720p, sz1080p),
        InterType::all(),
        BorderMode::all(),
        Values(1)
    )
 )
 {
    // The source is always 512x512; only the destination size comes from the parameters.
    Size sz, szSrc(512, 512);
    int borderMode, interType, channels;
    // Unpack the parameter tuple: (destination size, interpolation, border mode, channel count).
    sz = get<0>(GetParam());
    interType = get<1>(GetParam());
    borderMode = get<2>(GetParam());
    channels   = get<3>(GetParam());
    Scalar borderColor = Scalar::all(150);
    Mat src(szSrc, CV_8UC(channels)), dst(sz, CV_8UC(channels));
    cvtest::fillGradient(src);
    // For constant borders the source edge is additionally processed with the border colour.
    if (borderMode == BORDER_CONSTANT) cvtest::smoothBorder(src, borderColor, 1);
    Mat rotMat = getRotationMatrix2D(Point2f(src.cols / 2.f, src.rows / 2.f), 30., 2.2);
    // Build the 3x3 matrix: the top two rows are copied from the 2x3 rotation matrix,
    // the third row is set to (.3/width, .3/height, 1).
    Mat warpMat(3, 3, CV_64FC1);
    for (int r = 0; r<2; r++)
        for (int c = 0; c<3; c++)
            warpMat.at<double>(r, c) = rotMat.at<double>(r, c);
    warpMat.at<double>(2, 0) = .3 / sz.width;
    warpMat.at<double>(2, 1) = .3 / sz.height;
    warpMat.at<double>(2, 2) = 1;
    declare.in(src).out(dst);
    TEST_CYCLE() warpPerspective(src, dst, warpMat, sz, interType, borderMode, borderColor);
    SANITY_CHECK(dst, 1);
 }
 PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
             Combine(
                 Values( Size(640,480), Size(1920,1080), Size(2592,1944) ),
@ -168,8 +117,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
    Scalar borderColor = Scalar::all(150);
    Mat src(size, type), dst(size, type);
-    cvtest::fillGradient(src);
+    cvtest::fillGradient<uint8_t>(src);
-    if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder(src, borderColor, 1);
+    if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder<uint8_t>(src, borderColor, 1);
    int shift = static_cast<int>(src.cols*0.04);
    Mat srcVertices = (Mat_<Vec2f>(1, 4) << Vec2f(0, 0),
                                            Vec2f(static_cast<float>(size.width-1), 0),
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -55,6 +55,9 @@
 #include "opencv2/core/softfloat.hpp"
 #include "imgwarp.hpp"
 #include "warp_kernels.simd.hpp"
 #include "warp_kernels.simd_declarations.hpp"
 using namespace cv;
 namespace cv
@ -1351,6 +1354,9 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
    int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
            rowsPerWI = dev.isIntel() ? 4 : 1;
    if(!dev.hasFP64() && depth == CV_64F)
        return false;
    if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
            || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
        return false;
@ -2571,16 +2577,70 @@ static bool ipp_warpAffine( InputArray _src, OutputArray _dst, int interpolation
 namespace hal {
-void warpAffine(int src_type,
+static void warpAffine(int src_type,
-                const uchar * src_data, size_t src_step, int src_width, int src_height,
+                       const uchar * src_data, size_t src_step, int src_width, int src_height,
-                uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                       uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
-                const double M[6], int interpolation, int borderType, const double borderValue[4])
+                       const double M[6], int interpolation, int borderType, const double borderValue[4], AlgorithmHint hint)
 {
    CALL_HAL(warpAffine, cv_hal_warpAffine, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
    if (interpolation == INTER_LINEAR) {
        switch (src_type) {
            case CV_8UC1: {
                if (hint == cv::ALGO_HINT_APPROX) {
                    CV_CPU_DISPATCH(warpAffineLinearApproxInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                } else {
                    CV_CPU_DISPATCH(warpAffineLinearInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                }
                break;
            }
            case CV_8UC3: {
                if (hint == cv::ALGO_HINT_APPROX) {
                    CV_CPU_DISPATCH(warpAffineLinearApproxInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                } else {
                    CV_CPU_DISPATCH(warpAffineLinearInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                }
                break;
            }
            case CV_8UC4: {
                if (hint == cv::ALGO_HINT_APPROX) {
                    CV_CPU_DISPATCH(warpAffineLinearApproxInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                } else {
                    CV_CPU_DISPATCH(warpAffineLinearInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                }
                break;
            }
            case CV_16UC1: {
                CV_CPU_DISPATCH(warpAffineLinearInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                break;
            }
            case CV_16UC3: {
                CV_CPU_DISPATCH(warpAffineLinearInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                break;
            }
            case CV_16UC4: {
                CV_CPU_DISPATCH(warpAffineLinearInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                break;
            }
            case CV_32FC1: {
                CV_CPU_DISPATCH(warpAffineLinearInvoker_32FC1, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                break;
            }
            case CV_32FC3: {
                CV_CPU_DISPATCH(warpAffineLinearInvoker_32FC3, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                break;
            }
            case CV_32FC4: {
                CV_CPU_DISPATCH(warpAffineLinearInvoker_32FC4, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
                break;
            }
            // no default
        }
    }
    int x;
    AutoBuffer<int> _abdelta(dst.cols*2);
    int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
@ -2697,10 +2757,14 @@ void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int
 void cv::warpAffine( InputArray _src, OutputArray _dst,
                     InputArray _M0, Size dsize,
-                     int flags, int borderType, const Scalar& borderValue )
+                     int flags, int borderType, const Scalar& borderValue,
                     AlgorithmHint hint )
 {
    CV_INSTRUMENT_REGION();
    if (hint == cv::ALGO_HINT_DEFAULT)
        hint = cv::getDefaultAlgorithmHint();
    int interpolation = flags & INTER_MAX;
    CV_Assert( _src.channels() <= 4 || (interpolation != INTER_LANCZOS4 &&
                                        interpolation != INTER_CUBIC) );
@ -2808,7 +2872,7 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
 #endif
    hal::warpAffine(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
-                    M, interpolation, borderType, borderValue.val);
+                    M, interpolation, borderType, borderValue.val, hint);
 }
--- a/modules/imgproc/src/warp_common.hpp
+++ b/modules/imgproc/src/warp_common.hpp
@ -0,0 +1,11 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #ifndef __OPENCV_IMGPROC_WARP_COMMON_HPP__
 #define __OPENCV_IMGPROC_WARP_COMMON_HPP__
 #include "warp_common.vector.hpp"
 #include "warp_common.scalar.hpp"
 #endif // __OPENCV_IMGPROC_WARP_COMMON_HPP__
--- a/modules/imgproc/src/warp_common.scalar.hpp
+++ b/modules/imgproc/src/warp_common.scalar.hpp
@ -0,0 +1,171 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 // Shuffle
 // Loads one channel of the 2x2 neighbourhood starting at srcptr into the
 // variables p00<SFX>, p01<SFX>, p10<SFX>, p11<SFX> (SFX is pasted onto the names).
 //   nch - element distance between horizontally adjacent pixels
 //   k   - channel index inside a pixel
 // The second row is read srcstep elements after the first.
 // Expects srcptr, srcstep and the four destination variables in the expanding scope.
 #define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(SFX, nch, k) \
    p00##SFX = srcptr[k]; \
    p01##SFX = srcptr[k + nch]; \
    p10##SFX = srcptr[srcstep + k]; \
    p11##SFX = srcptr[srcstep + nch + k];
 // 1 channel: the single channel uses suffix g.
 #define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_C1() \
    CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(g, 1, 0)
 // 3 channels: suffixes r, g, b for channel indices 0, 1, 2.
 #define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_C3() \
    CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(r, 3, 0) \
    CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(g, 3, 1) \
    CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(b, 3, 2)
 // 4 channels: suffixes r, g, b, a for channel indices 0, 1, 2, 3.
 #define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_C4() \
    CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(r, 4, 0) \
    CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(g, 4, 1) \
    CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(b, 4, 2) \
    CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(a, 4, 3)
 // Writes the constant border value into output pixel x of dstptr, one bval
 // entry per channel. Expects dstptr, x and bval in the expanding scope.
 #define CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C1() \
    dstptr[x] = bval[0];
 // 3-channel variant: pixel x occupies elements 3*x .. 3*x+2.
 #define CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C3() \
    dstptr[3*x + 0] = bval[0]; \
    dstptr[3*x + 1] = bval[1]; \
    dstptr[3*x + 2] = bval[2];
 // 4-channel variant: pixel x occupies elements 4*x .. 4*x+3.
 #define CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C4() \
    dstptr[4*x + 0] = bval[0]; \
    dstptr[4*x + 1] = bval[1]; \
    dstptr[4*x + 2] = bval[2]; \
    dstptr[4*x + 3] = bval[3];
 // Fetches the single-channel sample at offset (oy, ox) from the anchor (iy, ix)
 // into the variable <P>g.
 //   - coordinate inside the image: read relative to srcptr
 //   - outside, BORDER_CONSTANT:    take bval[0]
 //   - outside, BORDER_TRANSPARENT: take the current value of dstptr[x]
 //   - outside, any other mode:     remap both coordinates with
 //                                  borderInterpolate_fast and read from src
 // The unsigned casts fold the "< 0" and ">= size" checks into one comparison.
 #define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_C1(oy, ox, P) \
    if ((((unsigned)(ix + ox) < (unsigned)srccols) & ((unsigned)(iy + oy) < (unsigned)srcrows)) != 0) { \
        size_t rel_ofs = oy*srcstep + ox; \
        P##g = srcptr[rel_ofs]; \
    } else if (border_type == BORDER_CONSTANT) { \
        P##g = bval[0]; \
    } else if (border_type == BORDER_TRANSPARENT) { \
        P##g = dstptr[x]; \
    } else { \
        int mapped_col = borderInterpolate_fast(ix + ox, srccols, border_type_x); \
        int mapped_row = borderInterpolate_fast(iy + oy, srcrows, border_type_y); \
        size_t abs_ofs = mapped_row*srcstep + mapped_col; \
        P##g = src[abs_ofs]; \
    }
 // Three-channel sample fetch at offset (oy, ox) from the anchor (iy, ix) into
 // <P>r, <P>g, <P>b.
 //   - coordinate inside the image: read relative to srcptr
 //   - outside, BORDER_CONSTANT:    take bval[0..2]
 //   - outside, BORDER_TRANSPARENT: take the current output pixel dstptr[x*3 ..]
 //   - outside, any other mode:     remap both coordinates with
 //                                  borderInterpolate_fast and read from src
 #define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_C3(oy, ox, P) \
    if ((((unsigned)(ix + ox) < (unsigned)srccols) & ((unsigned)(iy + oy) < (unsigned)srcrows)) != 0) { \
        size_t rel_ofs = oy*srcstep + ox*3; \
        P##r = srcptr[rel_ofs]; \
        P##g = srcptr[rel_ofs+1]; \
        P##b = srcptr[rel_ofs+2]; \
    } else if (border_type == BORDER_CONSTANT) { \
        P##r = bval[0]; \
        P##g = bval[1]; \
        P##b = bval[2]; \
    } else if (border_type == BORDER_TRANSPARENT) { \
        P##r = dstptr[x*3]; \
        P##g = dstptr[x*3+1]; \
        P##b = dstptr[x*3+2]; \
    } else { \
        int mapped_col = borderInterpolate_fast(ix + ox, srccols, border_type_x); \
        int mapped_row = borderInterpolate_fast(iy + oy, srcrows, border_type_y); \
        size_t abs_ofs = mapped_row*srcstep + mapped_col*3; \
        P##r = src[abs_ofs]; \
        P##g = src[abs_ofs+1]; \
        P##b = src[abs_ofs+2]; \
    }
 // Four-channel sample fetch at offset (oy, ox) from the anchor (iy, ix) into
 // <P>r, <P>g, <P>b, <P>a.
 //   - coordinate inside the image: read relative to srcptr
 //   - outside, BORDER_CONSTANT:    take bval[0..3]
 //   - outside, BORDER_TRANSPARENT: take the current output pixel dstptr[x*4 ..]
 //   - outside, any other mode:     remap both coordinates with
 //                                  borderInterpolate_fast and read from src
 #define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_C4(oy, ox, P) \
    if ((((unsigned)(ix + ox) < (unsigned)srccols) & ((unsigned)(iy + oy) < (unsigned)srcrows)) != 0) { \
        size_t rel_ofs = oy*srcstep + ox*4; \
        P##r = srcptr[rel_ofs]; \
        P##g = srcptr[rel_ofs+1]; \
        P##b = srcptr[rel_ofs+2]; \
        P##a = srcptr[rel_ofs+3]; \
    } else if (border_type == BORDER_CONSTANT) { \
        P##r = bval[0]; \
        P##g = bval[1]; \
        P##b = bval[2]; \
        P##a = bval[3]; \
    } else if (border_type == BORDER_TRANSPARENT) { \
        P##r = dstptr[x*4]; \
        P##g = dstptr[x*4+1]; \
        P##b = dstptr[x*4+2]; \
        P##a = dstptr[x*4+3]; \
    } else { \
        int mapped_col = borderInterpolate_fast(ix + ox, srccols, border_type_x); \
        int mapped_row = borderInterpolate_fast(iy + oy, srcrows, border_type_y); \
        size_t abs_ofs = mapped_row*srcstep + mapped_col*4; \
        P##r = src[abs_ofs]; \
        P##g = src[abs_ofs+1]; \
        P##b = src[abs_ofs+2]; \
        P##a = src[abs_ofs+3]; \
    }
 // Gathers the 2x2 neighbourhood for the current output pixel; CH is C1, C3 or C4.
 // Must be expanded inside a loop body because one path executes `continue`.
 //   1. (ix, iy) and its right/bottom neighbours all lie inside the image:
 //      plain loads through the SHUFFLE_LOAD macro.
 //   2. Border mode is CONSTANT or TRANSPARENT and the whole 2x2 footprint is
 //      outside in x or in y: for CONSTANT the border value is stored into the
 //      output pixel; in both modes the rest of the iteration is skipped.
 //   3. Otherwise each of the four samples is fetched individually with the
 //      per-pixel border handling of FETCH_PIXEL.
 #define CV_WARP_LINEAR_SCALAR_SHUFFLE(CH) \
    if ((((unsigned)ix < (unsigned)(srccols-1)) & \
        ((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \
        CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_##CH() \
    } else if ((border_type == BORDER_CONSTANT || border_type == BORDER_TRANSPARENT) && \
               (((unsigned)(ix+1) >= (unsigned)(srccols+1))| \
                   ((unsigned)(iy+1) >= (unsigned)(srcrows+1))) != 0) { \
        if (border_type == BORDER_CONSTANT) { \
            CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_##CH() \
        } \
        continue; \
    } else { \
        CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CH(0, 0, p00); \
        CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CH(0, 1, p01); \
        CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CH(1, 0, p10); \
        CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CH(1, 1, p11); \
    }
 // Linear interpolation calculation
 // Horizontal pass for one channel (suffix ch): declares the floats v0<ch> and
 // v1<ch> as the top-row and bottom-row samples blended with weight sx.
 #define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(ch) \
    float v0##ch = p00##ch + sx*(p01##ch - p00##ch); \
    float v1##ch = p10##ch + sx*(p11##ch - p10##ch);
 #define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_C1() \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(g)
 #define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_C3() \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(r) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(g) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(b)
 #define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_C4() \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(r) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(g) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(b) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(a)
 // Vertical pass for one channel: blends v0<ch> towards v1<ch> with weight sy,
 // leaving the final interpolated value in v0<ch>.
 #define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(ch) \
    v0##ch = v0##ch + sy*(v1##ch - v0##ch);
 #define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_C1() \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(g)
 #define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_C3() \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(r) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(g) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(b)
 #define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_C4() \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(r) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(g) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(b) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(a)
 // Full bilinear blend for NCH in {C1, C3, C4}: horizontal pass, then vertical pass.
 #define CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(NCH) \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_##NCH() \
    CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_##NCH()
 // Store
 // Writes the interpolated value(s) v0<ch> to output pixel x of dstptr,
 // converting each through saturate_cast<elem_t>.
 #define CV_WARP_LINEAR_SCALAR_STORE_C1(elem_t) \
    dstptr[x] = saturate_cast<elem_t>(v0g);
 // 3-channel variant: pixel x occupies elements 3*x .. 3*x+2 (r, g, b).
 #define CV_WARP_LINEAR_SCALAR_STORE_C3(elem_t) \
    dstptr[3*x + 0] = saturate_cast<elem_t>(v0r); \
    dstptr[3*x + 1] = saturate_cast<elem_t>(v0g); \
    dstptr[3*x + 2] = saturate_cast<elem_t>(v0b);
 // 4-channel variant: pixel x occupies elements 4*x .. 4*x+3 (r, g, b, a).
 #define CV_WARP_LINEAR_SCALAR_STORE_C4(elem_t) \
    dstptr[4*x + 0] = saturate_cast<elem_t>(v0r); \
    dstptr[4*x + 1] = saturate_cast<elem_t>(v0g); \
    dstptr[4*x + 2] = saturate_cast<elem_t>(v0b); \
    dstptr[4*x + 3] = saturate_cast<elem_t>(v0a);
 // Depth selectors: map the depth tag to the element type used for the cast.
 #define CV_WARP_LINEAR_SCALAR_STORE_8U(NCH) \
    CV_WARP_LINEAR_SCALAR_STORE_##NCH(uint8_t)
 #define CV_WARP_LINEAR_SCALAR_STORE_16U(NCH) \
    CV_WARP_LINEAR_SCALAR_STORE_##NCH(uint16_t)
 #define CV_WARP_LINEAR_SCALAR_STORE_32F(NCH) \
    CV_WARP_LINEAR_SCALAR_STORE_##NCH(float)
 // Entry point: NCH in {C1, C3, C4}, DEPTH_TAG in {8U, 16U, 32F}.
 #define CV_WARP_LINEAR_SCALAR_STORE(NCH, DEPTH_TAG) \
    CV_WARP_LINEAR_SCALAR_STORE_##DEPTH_TAG(NCH)
--- a/modules/imgproc/src/warp_common.vector.hpp
+++ b/modules/imgproc/src/warp_common.vector.hpp
@ -0,0 +1,387 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 // Shuffle (all pixels within image)
 // Gathers the 2x2 bilinear neighbourhood of each of the uf lanes from src into
 // pixbuf. For channel c and tap t (0: top-left, 1: top-right, 2: bottom-left,
 // 3: bottom-right) the value of a lane is written to pixbuf[lane + uf*(4*c + t)].
 // No bounds checks are made. src, addr, srcstep, pixbuf and uf are taken from
 // the expansion site.
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_C1(dtype) \
    for (int lane_ = 0; lane_ < uf; ++lane_) { \
        const dtype* row0_ = src + addr[lane_]; \
        const dtype* row1_ = row0_ + srcstep; \
        pixbuf[lane_]        = row0_[0]; \
        pixbuf[lane_ + uf]   = row0_[1]; \
        pixbuf[lane_ + uf*2] = row1_[0]; \
        pixbuf[lane_ + uf*3] = row1_[1]; \
    }
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_C3(dtype) \
    for (int lane_ = 0; lane_ < uf; ++lane_) { \
        const dtype* row0_ = src + addr[lane_]; \
        const dtype* row1_ = row0_ + srcstep; \
        /* channel 0 */ \
        pixbuf[lane_]         = row0_[0]; \
        pixbuf[lane_ + uf]    = row0_[3]; \
        pixbuf[lane_ + uf*2]  = row1_[0]; \
        pixbuf[lane_ + uf*3]  = row1_[3]; \
        /* channel 1 */ \
        pixbuf[lane_ + uf*4]  = row0_[1]; \
        pixbuf[lane_ + uf*5]  = row0_[4]; \
        pixbuf[lane_ + uf*6]  = row1_[1]; \
        pixbuf[lane_ + uf*7]  = row1_[4]; \
        /* channel 2 */ \
        pixbuf[lane_ + uf*8]  = row0_[2]; \
        pixbuf[lane_ + uf*9]  = row0_[5]; \
        pixbuf[lane_ + uf*10] = row1_[2]; \
        pixbuf[lane_ + uf*11] = row1_[5]; \
    }
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_C4(dtype) \
    for (int lane_ = 0; lane_ < uf; ++lane_) { \
        const dtype* row0_ = src + addr[lane_]; \
        const dtype* row1_ = row0_ + srcstep; \
        /* channel 0 */ \
        pixbuf[lane_]         = row0_[0]; \
        pixbuf[lane_ + uf]    = row0_[4]; \
        pixbuf[lane_ + uf*2]  = row1_[0]; \
        pixbuf[lane_ + uf*3]  = row1_[4]; \
        /* channel 1 */ \
        pixbuf[lane_ + uf*4]  = row0_[1]; \
        pixbuf[lane_ + uf*5]  = row0_[5]; \
        pixbuf[lane_ + uf*6]  = row1_[1]; \
        pixbuf[lane_ + uf*7]  = row1_[5]; \
        /* channel 2 */ \
        pixbuf[lane_ + uf*8]  = row0_[2]; \
        pixbuf[lane_ + uf*9]  = row0_[6]; \
        pixbuf[lane_ + uf*10] = row1_[2]; \
        pixbuf[lane_ + uf*11] = row1_[6]; \
        /* channel 3 */ \
        pixbuf[lane_ + uf*12] = row0_[3]; \
        pixbuf[lane_ + uf*13] = row0_[7]; \
        pixbuf[lane_ + uf*14] = row1_[3]; \
        pixbuf[lane_ + uf*15] = row1_[7]; \
    }
 // Depth selectors: translate the 8U/16U/32F tag into the source element type.
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_8U(CN) \
    CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint8_t)
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_16U(CN) \
    CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint16_t)
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_32F(CN) \
    CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##CN(float)
 // Entry point: CN is C1/C3/C4, DEPTH is 8U/16U/32F.
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(CN, DEPTH) \
    CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##DEPTH(CN)
 // Shuffle (not all pixels within image)
 // Constant-border fast path: write the border-value vectors (bval_v*) directly
 // to the destination starting at pixel x, one store per uf-element group.
 // dstptr, x, uf, vlanes_32 and the bval_v* vectors come from the expansion site.
 // 8U variants use v_store_low for each group.
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC1() \
    v_store_low(dstptr + x, bval_v0);
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC3() \
    v_store_low(dstptr + x*3,        bval_v0); \
    v_store_low(dstptr + x*3 + uf,   bval_v1); \
    v_store_low(dstptr + x*3 + uf*2, bval_v2);
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC4() \
    v_store_low(dstptr + x*4,        bval_v0); \
    v_store_low(dstptr + x*4 + uf,   bval_v1); \
    v_store_low(dstptr + x*4 + uf*2, bval_v2); \
    v_store_low(dstptr + x*4 + uf*3, bval_v3);
 // 16U variants use a full v_store for each group.
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC1() \
    v_store(dstptr + x, bval_v0);
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC3() \
    v_store(dstptr + x*3,        bval_v0); \
    v_store(dstptr + x*3 + uf,   bval_v1); \
    v_store(dstptr + x*3 + uf*2, bval_v2);
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC4() \
    v_store(dstptr + x*4,        bval_v0); \
    v_store(dstptr + x*4 + uf,   bval_v1); \
    v_store(dstptr + x*4 + uf*2, bval_v2); \
    v_store(dstptr + x*4 + uf*3, bval_v3);
 // 32F variants store each group as two vectors (_l, then _h at +vlanes_32).
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC1() \
    v_store(dstptr + x,             bval_v0_l); \
    v_store(dstptr + x + vlanes_32, bval_v0_h);
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC3() \
    v_store(dstptr + x*3,                    bval_v0_l); \
    v_store(dstptr + x*3 + vlanes_32,        bval_v0_h); \
    v_store(dstptr + x*3 + uf,               bval_v1_l); \
    v_store(dstptr + x*3 + uf + vlanes_32,   bval_v1_h); \
    v_store(dstptr + x*3 + uf*2,             bval_v2_l); \
    v_store(dstptr + x*3 + uf*2 + vlanes_32, bval_v2_h);
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC4() \
    v_store(dstptr + x*4,                    bval_v0_l); \
    v_store(dstptr + x*4 + vlanes_32,        bval_v0_h); \
    v_store(dstptr + x*4 + uf,               bval_v1_l); \
    v_store(dstptr + x*4 + uf + vlanes_32,   bval_v1_h); \
    v_store(dstptr + x*4 + uf*2,             bval_v2_l); \
    v_store(dstptr + x*4 + uf*2 + vlanes_32, bval_v2_h); \
    v_store(dstptr + x*4 + uf*3,             bval_v3_l); \
    v_store(dstptr + x*4 + uf*3 + vlanes_32, bval_v3_h);
 // Fetch one bilinear tap at offset (dy, dx) from (iy, ix) for lane i and write
 // it to pixbuf[i + pixbuf_ofs] (plus uf*4 per additional channel).
 // Branches, in order:
 //   1. tap inside the image (single unsigned compare per axis): read from src
 //      at addr[i] + dy*srcstep + dx*<channels>;
 //   2. BORDER_CONSTANT: use bval;
 //   3. BORDER_TRANSPARENT: use the current destination pixel (x + i);
 //   4. otherwise: remap each coordinate with borderInterpolate_fast and read src.
 // i, ix, iy, x, src, addr, srcstep, srccols, srcrows, dstptr, bval, border_type,
 // border_type_x and border_type_y all come from the expansion site.
 #define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_C1(dy, dx, pixbuf_ofs) \
    if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
        size_t addr_i = addr[i] + dy*srcstep + dx; \
        pixbuf[i + pixbuf_ofs] = src[addr_i]; \
    } else if (border_type == BORDER_CONSTANT) { \
        pixbuf[i + pixbuf_ofs] = bval[0]; \
    } else if (border_type == BORDER_TRANSPARENT) { \
        pixbuf[i + pixbuf_ofs] = dstptr[x + i]; \
    } else { \
        int ix_ = borderInterpolate_fast(ix + dx, srccols, border_type_x); \
        int iy_ = borderInterpolate_fast(iy + dy, srcrows, border_type_y); \
        size_t addr_i = iy_*srcstep + ix_; \
        pixbuf[i + pixbuf_ofs] = src[addr_i]; \
    }
 // Three-channel variant: channels land at pixbuf offsets +0, +uf*4, +uf*8.
 #define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_C3(dy, dx, pixbuf_ofs) \
    if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
        size_t addr_i = addr[i] + dy*srcstep + dx*3; \
        pixbuf[i + pixbuf_ofs] = src[addr_i]; \
        pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \
        pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \
    } else if (border_type == BORDER_CONSTANT) { \
        pixbuf[i + pixbuf_ofs] = bval[0]; \
        pixbuf[i + pixbuf_ofs + uf*4] = bval[1]; \
        pixbuf[i + pixbuf_ofs + uf*8] = bval[2]; \
    } else if (border_type == BORDER_TRANSPARENT) { \
        pixbuf[i + pixbuf_ofs] = dstptr[(x + i)*3]; \
        pixbuf[i + pixbuf_ofs + uf*4] = dstptr[(x + i)*3 + 1]; \
        pixbuf[i + pixbuf_ofs + uf*8] = dstptr[(x + i)*3 + 2]; \
    } else { \
        int ix_ = borderInterpolate_fast(ix + dx, srccols, border_type_x); \
        int iy_ = borderInterpolate_fast(iy + dy, srcrows, border_type_y); \
        size_t addr_i = iy_*srcstep + ix_*3; \
        pixbuf[i + pixbuf_ofs] = src[addr_i]; \
        pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \
        pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \
    }
 // Four-channel variant: channels land at pixbuf offsets +0, +uf*4, +uf*8, +uf*12.
 #define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_C4(dy, dx, pixbuf_ofs) \
    if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
        size_t addr_i = addr[i] + dy*srcstep + dx*4; \
        pixbuf[i + pixbuf_ofs] = src[addr_i]; \
        pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \
        pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \
        pixbuf[i + pixbuf_ofs + uf*12] = src[addr_i+3]; \
    } else if (border_type == BORDER_CONSTANT) { \
        pixbuf[i + pixbuf_ofs] = bval[0]; \
        pixbuf[i + pixbuf_ofs + uf*4] = bval[1]; \
        pixbuf[i + pixbuf_ofs + uf*8] = bval[2]; \
        pixbuf[i + pixbuf_ofs + uf*12] = bval[3]; \
    } else if (border_type == BORDER_TRANSPARENT) { \
        pixbuf[i + pixbuf_ofs] = dstptr[(x + i)*4]; \
        pixbuf[i + pixbuf_ofs + uf*4] = dstptr[(x + i)*4 + 1]; \
        pixbuf[i + pixbuf_ofs + uf*8] = dstptr[(x + i)*4 + 2]; \
        pixbuf[i + pixbuf_ofs + uf*12] = dstptr[(x + i)*4 + 3]; \
    } else { \
        int ix_ = borderInterpolate_fast(ix + dx, srccols, border_type_x); \
        int iy_ = borderInterpolate_fast(iy + dy, srcrows, border_type_y); \
        size_t addr_i = iy_*srcstep + ix_*4; \
        pixbuf[i + pixbuf_ofs] = src[addr_i]; \
        pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \
        pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \
        pixbuf[i + pixbuf_ofs + uf*12] = src[addr_i+3]; \
    }
 // Slow-path gather used when not every tap of the current uf-pixel group is
 // known to be inside the image.
 // Step 1 (BORDER_CONSTANT / BORDER_TRANSPARENT only): build a per-lane mask of
 //   (ix+1) < outer_scols and (iy+1) < outer_srows using unsigned compares. If no
 //   lane passes, the whole group is skipped: for BORDER_CONSTANT the border
 //   value is stored first; then `continue` is executed.
 //   NOTE: because of that `continue`, this macro must be expanded directly
 //   inside the loop over x.
 // Step 2: spill the integer source coordinates to the src_ix/src_iy arrays and
 //   fetch the four taps of every lane with FETCH_PIXEL_<CN> into pixbuf groups
 //   0, uf, uf*2 and uf*3.
 // mask_0, mask_1, src_ix0/1, src_iy0/1, one, outer_scols, outer_srows, src_ix,
 // src_iy, vlanes_32 and uf come from the expansion site.
 #define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(CN, DEPTH) \
    if (border_type == BORDER_CONSTANT || border_type == BORDER_TRANSPARENT) { \
        mask_0 = v_lt(v_reinterpret_as_u32(v_add(src_ix0, one)), outer_scols); \
        mask_1 = v_lt(v_reinterpret_as_u32(v_add(src_ix1, one)), outer_scols); \
        mask_0 = v_and(mask_0, v_lt(v_reinterpret_as_u32(v_add(src_iy0, one)), outer_srows)); \
        mask_1 = v_and(mask_1, v_lt(v_reinterpret_as_u32(v_add(src_iy1, one)), outer_srows)); \
        v_uint16 outer_mask = v_pack(mask_0, mask_1); \
        if (v_reduce_max(outer_mask) == 0) { \
            if (border_type == BORDER_CONSTANT) { \
                CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_##DEPTH##CN() \
            } \
            continue; \
        } \
    } \
    vx_store(src_ix, src_ix0); \
    vx_store(src_iy, src_iy0); \
    vx_store(src_ix + vlanes_32, src_ix1); \
    vx_store(src_iy + vlanes_32, src_iy1); \
    for (int i = 0; i < uf; i++) { \
        int ix = src_ix[i], iy = src_iy[i]; \
        CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(0, 0, 0); \
        CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(0, 1, uf); \
        CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(1, 0, uf*2); \
        CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(1, 1, uf*3); \
    }
 // Load pixels for linear interpolation (uint8_t -> int16_t)
 // For channel suffix cn, declares the four tap vectors f00<cn>, f01<cn>,
 // f10<cn>, f11<cn> from four consecutive uf-element groups of pixbuf starting
 // at group index i. Each group is widened with vx_load_expand and reinterpreted
 // as signed 16-bit. pixbuf and uf come from the expansion site.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(cn, i) \
    v_int16  f00##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * i)), \
             f01##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * (i+1))), \
             f10##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * (i+2))), \
             f11##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * (i+3)));
 // Per-layout wrappers: channel k uses groups 4*k .. 4*k+3.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_C1() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(g, 0)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_C3() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(r, 0) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(g, 4) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(b, 8)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_C4() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(r, 0) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(g, 4) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(b, 8) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(a, 12)
 // Entry point: CN is C1/C3/C4. The expansion ends with an extra ';' (empty statement).
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(CN) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_##CN();
 // Load pixels for linear interpolation (uint16_t -> uint16_t)
 // For channel suffix cn, declares the four tap vectors f00<cn>, f01<cn>,
 // f10<cn>, f11<cn> by loading four consecutive uf-element groups of pixbuf
 // starting at group index i, with no widening. pixbuf and uf come from the
 // expansion site.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(cn, i) \
    v_uint16 f00##cn = vx_load(pixbuf + uf * i), \
             f01##cn = vx_load(pixbuf + uf * (i+1)), \
             f10##cn = vx_load(pixbuf + uf * (i+2)), \
             f11##cn = vx_load(pixbuf + uf * (i+3));
 // Per-layout wrappers: channel k uses groups 4*k .. 4*k+3.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_C1() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(g, 0)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_C3() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(r, 0) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(g, 4) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(b, 8)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_C4() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(r, 0) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(g, 4) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(b, 8) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(a, 12)
 // Entry point: CN is C1/C3/C4. The expansion ends with an extra ';' (empty statement).
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(CN) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_##CN();
 // Load pixels for linear interpolation (int16_t -> float)
 // Converts the already-declared 16-bit tap vectors f00<cn>..f11<cn> to float:
 // each is split with v_expand_low / v_expand_high and converted with v_cvt_f32,
 // declaring the pairs f00<cn>l/f00<cn>h .. f11<cn>l/f11<cn>h.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_S16F32(cn) \
    v_float32 f00##cn##l = v_cvt_f32(v_expand_low(f00##cn)), f00##cn##h = v_cvt_f32(v_expand_high(f00##cn)), \
              f01##cn##l = v_cvt_f32(v_expand_low(f01##cn)), f01##cn##h = v_cvt_f32(v_expand_high(f01##cn)), \
              f10##cn##l = v_cvt_f32(v_expand_low(f10##cn)), f10##cn##h = v_cvt_f32(v_expand_high(f10##cn)), \
              f11##cn##l = v_cvt_f32(v_expand_low(f11##cn)), f11##cn##h = v_cvt_f32(v_expand_high(f11##cn));
 // Per-layout wrappers.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_S16F32_C1() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_S16F32(g)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_S16F32_C3() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_S16F32(r) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_S16F32(g) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_S16F32(b)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_S16F32_C4() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_S16F32(r) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_S16F32(g) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_S16F32(b) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_S16F32(a)
 // Entry point: CN is C1/C3/C4.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_S16F32(CN) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_S16F32_##CN()
 // Load pixels for linear interpolation (uint16_t -> float)
 // Converts the already-declared unsigned 16-bit tap vectors f00<cn>..f11<cn> to
 // float: each half is expanded, reinterpreted as signed 32-bit and converted
 // with v_cvt_f32, declaring the pairs f00<cn>l/f00<cn>h .. f11<cn>l/f11<cn>h.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16F32(cn) \
    v_float32 f00##cn##l = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(f00##cn))), f00##cn##h = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(f00##cn))), \
              f01##cn##l = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(f01##cn))), f01##cn##h = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(f01##cn))), \
              f10##cn##l = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(f10##cn))), f10##cn##h = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(f10##cn))), \
              f11##cn##l = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(f11##cn))), f11##cn##h = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(f11##cn)));
 // Per-layout wrappers.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16F32_C1() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16F32(g)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16F32_C3() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16F32(r) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16F32(g) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16F32(b)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16F32_C4() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16F32(r) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16F32(g) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16F32(b) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16F32(a)
 // Entry point: CN is C1/C3/C4.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16F32(CN) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16F32_##CN()
 // Load pixels for linear interpolation (float -> float)
 // For channel suffix cn, declares the float tap pairs f00<cn>l/h .. f11<cn>l/h
 // from four consecutive uf-element groups of pixbuf starting at group index i.
 // Each group is read as two vectors: the "l" half at the group start and the
 // "h" half vlanes_32 elements later. pixbuf, uf and vlanes_32 come from the
 // expansion site.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(cn, i) \
    v_float32 f00##cn##l = vx_load(pixbuf + uf * i),      f00##cn##h = vx_load(pixbuf + uf * i     + vlanes_32), \
              f01##cn##l = vx_load(pixbuf + uf * (i+1)),  f01##cn##h = vx_load(pixbuf + uf * (i+1) + vlanes_32), \
              f10##cn##l = vx_load(pixbuf + uf * (i+2)),  f10##cn##h = vx_load(pixbuf + uf * (i+2) + vlanes_32), \
              f11##cn##l = vx_load(pixbuf + uf * (i+3)),  f11##cn##h = vx_load(pixbuf + uf * (i+3) + vlanes_32);
 // Per-layout wrappers: channel k uses groups 4*k .. 4*k+3.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_C1() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(g, 0)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_C3() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(r, 0) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(g, 4) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(b, 8)
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_C4() \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(r, 0) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(g, 4) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(b, 8) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(a, 12)
 // Entry point: CN is C1/C3/C4.
 #define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(CN) \
    CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_##CN()
 // Linear interpolation calculation
 // ALPHA pass for channel suffix cn: blends each row horizontally in place,
 //   f00 = fma(alpha, f01 - f00, f00) and f10 = fma(alpha, f11 - f10, f10),
 // done separately for the low (l) and high (h) float vectors.
 #define CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32(cn) \
    f00##cn##l = v_fma(alphal, v_sub(f01##cn##l, f00##cn##l), f00##cn##l); f00##cn##h = v_fma(alphah, v_sub(f01##cn##h, f00##cn##h), f00##cn##h); \
    f10##cn##l = v_fma(alphal, v_sub(f11##cn##l, f10##cn##l), f10##cn##l); f10##cn##h = v_fma(alphah, v_sub(f11##cn##h, f10##cn##h), f10##cn##h);
 #define CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32_C1() \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32(g)
 #define CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32_C3() \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32(r) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32(g) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32(b)
 #define CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32_C4() \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32(r) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32(g) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32(b) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32(a)
 // BETA pass for channel suffix cn: blends the two row results vertically,
 //   f00 = fma(beta, f10 - f00, f00); the final value is left in f00<cn>l/h.
 #define CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32(cn) \
    f00##cn##l = v_fma(betal,  v_sub(f10##cn##l, f00##cn##l), f00##cn##l); f00##cn##h = v_fma(betah,  v_sub(f10##cn##h, f00##cn##h), f00##cn##h);
 #define CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32_C1() \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32(g)
 #define CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32_C3() \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32(r) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32(g) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32(b)
 #define CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32_C4() \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32(r) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32(g) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32(b) \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32(a)
 // Entry point: CN is C1/C3/C4. Declares the weights alphal/alphah and
 // betal/betah from src_x0/src_x1 and src_y0/src_y1, then runs the ALPHA pass
 // followed by the BETA pass.
 // NOTE(review): src_x*/src_y* presumably hold the fractional parts of the
 // source coordinates at this point - confirm against the calling kernel.
 #define CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(CN) \
    v_float32 alphal = src_x0, alphah = src_x1, \
              betal = src_y0, betah = src_y1; \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_ALPHA_F32_##CN() \
    CV_WARP_LINEAR_VECTOR_INTER_CALC_BETA_F32_##CN()
 // Store
 // float -> uint8_t: rounds the interpolated f00<cn>l/h vectors with v_round,
 // narrows them to unsigned 16-bit with v_pack_u, then narrows to 8-bit and
 // writes uf*<channels> bytes at dstptr + x*<channels>.
 // C1: packs against a zero vector and stores the low half.
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8_C1() \
    v_uint16 f00_u16 = v_pack_u(v_round(f00gl), v_round(f00gh)); \
    v_uint8 f00_u8 = v_pack(f00_u16, vx_setall_u16(0)); \
    v_store_low(dstptr + x, f00_u8);
 // C3: interleaves r/g/b as 16-bit values into the temporary tbuf, then narrows
 // each vlanes_16-element chunk to 8-bit with v_pack_store.
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8_C3() \
    v_uint16 f00r_u16 = v_pack_u(v_round(f00rl), v_round(f00rh)), \
             f00g_u16 = v_pack_u(v_round(f00gl), v_round(f00gh)), \
             f00b_u16 = v_pack_u(v_round(f00bl), v_round(f00bh)); \
    uint16_t tbuf[max_vlanes_16*3]; \
    v_store_interleave(tbuf, f00r_u16, f00g_u16, f00b_u16); \
    v_pack_store(dstptr + x*3, vx_load(tbuf)); \
    v_pack_store(dstptr + x*3 + vlanes_16, vx_load(tbuf + vlanes_16)); \
    v_pack_store(dstptr + x*3 + vlanes_16*2, vx_load(tbuf + vlanes_16*2));
 // C4: same scheme as C3 with an additional alpha channel and a fourth chunk.
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8_C4() \
    v_uint16 f00r_u16 = v_pack_u(v_round(f00rl), v_round(f00rh)), \
             f00g_u16 = v_pack_u(v_round(f00gl), v_round(f00gh)), \
             f00b_u16 = v_pack_u(v_round(f00bl), v_round(f00bh)), \
             f00a_u16 = v_pack_u(v_round(f00al), v_round(f00ah)); \
    uint16_t tbuf[max_vlanes_16*4]; \
    v_store_interleave(tbuf, f00r_u16, f00g_u16, f00b_u16, f00a_u16); \
    v_pack_store(dstptr + x*4, vx_load(tbuf)); \
    v_pack_store(dstptr + x*4 + vlanes_16, vx_load(tbuf + vlanes_16)); \
    v_pack_store(dstptr + x*4 + vlanes_16*2, vx_load(tbuf + vlanes_16*2)); \
    v_pack_store(dstptr + x*4 + vlanes_16*3, vx_load(tbuf + vlanes_16*3));
 // Entry point: CN is C1/C3/C4.
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8(CN) \
    CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U8_##CN()
 // float -> uint16_t: rounds the interpolated f00<cn>l/h vectors with v_round,
 // narrows them to unsigned 16-bit with v_pack_u and stores at dstptr + x*<channels>.
 // C1 uses a plain v_store; C3/C4 interleave the channels with v_store_interleave.
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16_C1() \
    v_uint16 f00_u16 = v_pack_u(v_round(f00gl), v_round(f00gh)); \
    v_store(dstptr + x, f00_u16);
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16_C3() \
    v_uint16 f00r_u16 = v_pack_u(v_round(f00rl), v_round(f00rh)), \
             f00g_u16 = v_pack_u(v_round(f00gl), v_round(f00gh)), \
             f00b_u16 = v_pack_u(v_round(f00bl), v_round(f00bh)); \
    v_store_interleave(dstptr + x*3, f00r_u16, f00g_u16, f00b_u16);
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16_C4() \
    v_uint16 f00r_u16 = v_pack_u(v_round(f00rl), v_round(f00rh)), \
             f00g_u16 = v_pack_u(v_round(f00gl), v_round(f00gh)), \
             f00b_u16 = v_pack_u(v_round(f00bl), v_round(f00bh)), \
             f00a_u16 = v_pack_u(v_round(f00al), v_round(f00ah)); \
    v_store_interleave(dstptr + x*4, f00r_u16, f00g_u16, f00b_u16, f00a_u16);
 // Entry point: CN is C1/C3/C4.
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(CN) \
    CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16_##CN()
 // float -> float: stores the interpolated f00<cn>l/h vectors without conversion.
 // The low half goes to dstptr + x*<channels>, the high half follows
 // vlanes_32*<channels> elements later. C3/C4 interleave the channels.
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32_C1() \
    vx_store(dstptr + x, f00gl); \
    vx_store(dstptr + x + vlanes_32, f00gh);
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32_C3() \
    v_store_interleave(dstptr + x*3, f00rl, f00gl, f00bl); \
    v_store_interleave(dstptr + x*3 + vlanes_32*3, f00rh, f00gh, f00bh);
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32_C4() \
    v_store_interleave(dstptr + x*4, f00rl, f00gl, f00bl, f00al); \
    v_store_interleave(dstptr + x*4 + vlanes_32*4, f00rh, f00gh, f00bh, f00ah);
 // Entry point: CN is C1/C3/C4.
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(CN) \
    CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32_##CN()
--- a/modules/imgproc/src/warp_kernels.simd.hpp
+++ b/modules/imgproc/src/warp_kernels.simd.hpp
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@ -172,7 +172,7 @@ OCL_TEST_P(WarpAffine, Mat)
 {
    for (int j = 0; j < test_loop_times; j++)
    {
-        double eps = depth < CV_32F ? 0.04 : 0.06;
+        double eps = depth < CV_32F ? ( depth < CV_16U ? 0.09 : 0.04 ) : 0.06;
        random_roi();
        Mat M = getRotationMatrix2D(Point2f(src_roi.cols / 2.0f, src_roi.rows / 2.0f),
@ -189,7 +189,7 @@ OCL_TEST_P(WarpAffine, inplace_25853) // when src and dst are the same variable,
 {
    for (int j = 0; j < test_loop_times; j++)
    {
-        double eps = depth < CV_32F ? 0.04 : 0.06;
+        double eps = depth < CV_32F ? ( depth < CV_16U ? 0.09 : 0.04 ) : 0.06;
        random_roi();
        Mat M = getRotationMatrix2D(Point2f(src_roi.cols / 2.0f, src_roi.rows / 2.0f),
--- a/modules/imgproc/test/test_imgwarp_strict.cpp
+++ b/modules/imgproc/test/test_imgwarp_strict.cpp
@ -150,7 +150,7 @@ void CV_ImageWarpBaseTest::generate_test_data()
    while (depth == CV_8S || depth == CV_32S)
        depth = rng.uniform(0, CV_64F);
-    int cn = rng.uniform(1, 4);
+    int cn = rng.uniform(1, 5);
    src.create(ssize, CV_MAKE_TYPE(depth, cn));
@ -1045,6 +1045,13 @@ protected:
    Mat M;
 private:
    void warpAffine(const Mat&, Mat&);
    template<typename T>
    void newWarpAffine(const Mat&, Mat&, const Mat&);
    template<int channels, typename T>
    void newLinear(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
                   const T *bval, int borderType_x, int borderType_y);
 };
 CV_WarpAffine_Test::CV_WarpAffine_Test() :
@ -1088,7 +1095,7 @@ void CV_WarpAffine_Test::run_func()
 // Allowed error for the warpAffine comparison: zero for CV_8U (results must
 // match exactly), otherwise whatever the base test class allows for this
 // interpolation/depth pair.
 float CV_WarpAffine_Test::get_success_error_level(int _interpolation, int _depth) const
 {
-    return _depth == CV_8U ? 0 : CV_ImageWarpBaseTest::get_success_error_level(_interpolation, _depth);
+    return _depth == CV_8U ? 0.f : CV_ImageWarpBaseTest::get_success_error_level(_interpolation, _depth);
 }
 void CV_WarpAffine_Test::run_reference_func()
@ -1098,6 +1105,152 @@ void CV_WarpAffine_Test::run_reference_func()
    tmp.convertTo(reference_dst, reference_dst.depth());
 }
 // Reference-implementation helper: fetch the cn-channel tap at offset (dy, dx)
 // from (iy, ix) into pxy, at element offset 2*dy*cn + dx*cn.
 // Branches, in order: tap inside the image -> read relative to srcptr;
 // BORDER_CONSTANT -> bval; BORDER_TRANSPARENT -> current destination pixel x;
 // otherwise -> remap both coordinates with borderInterpolate and read relative
 // to the image origin srcptr_.
 // ix, iy, x, pxy, srcptr, srcptr_, dstptr, bval, srcstep, srccols, srcrows,
 // borderType, borderType_x and borderType_y come from the expansion site.
 #define FETCH_PIXEL_SCALAR(cn, dy, dx) \
    if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
        size_t ofs = dy*srcstep + dx*cn; \
        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr[ofs+ci];} \
    } else if (borderType == BORDER_CONSTANT) { \
        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = bval[ci];} \
    } else if (borderType == BORDER_TRANSPARENT) { \
        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = dstptr[x*cn+ci];} \
    } else { \
        int ix_ = borderInterpolate(ix + dx, srccols, borderType_x); \
        int iy_ = borderInterpolate(iy + dy, srcrows, borderType_y); \
        size_t glob_ofs = iy_*srcstep + ix_*cn; \
        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr_[glob_ofs+ci];} \
    }
 // Reference-implementation helper: gather the 2x2 neighbourhood of (iy, ix)
 // into pxy as four consecutive cn-element groups
 // (top-left, top-right, bottom-left, bottom-right).
 //  - Fast path: all four taps are inside the image, so copy them from srcptr.
 //  - BORDER_CONSTANT / BORDER_TRANSPARENT with the whole neighbourhood outside
 //    the image: write bval to the destination pixel (constant border only) and
 //    `return` from the ENCLOSING function. Nothing after the macro runs then.
 //  - Otherwise: fetch each tap individually with FETCH_PIXEL_SCALAR.
 #define WARPAFFINE_SHUFFLE(cn) \
    if ((((unsigned)ix < (unsigned)(srccols-1)) & \
        ((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \
        for (int ci = 0; ci < cn; ci++) { \
            pxy[ci] = srcptr[ci]; \
            pxy[ci+cn] = srcptr[ci+cn]; \
            pxy[ci+cn*2] = srcptr[srcstep+ci]; \
            pxy[ci+cn*3] = srcptr[srcstep+ci+cn]; \
        } \
    } else { \
        if ((borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT) && \
            (((unsigned)(ix+1) >= (unsigned)(srccols+1))| \
            ((unsigned)(iy+1) >= (unsigned)(srcrows+1))) != 0) { \
            if (borderType == BORDER_CONSTANT) { \
                for (int ci = 0; ci < cn; ci++) { dstptr[x*cn+ci] = bval[ci]; } \
            } \
            return; \
        } \
        FETCH_PIXEL_SCALAR(cn, 0, 0); \
        FETCH_PIXEL_SCALAR(cn, 0, 1); \
        FETCH_PIXEL_SCALAR(cn, 1, 0); \
        FETCH_PIXEL_SCALAR(cn, 1, 1); \
    }
 // Bilinear blend of one output pixel.
 // pxy holds four consecutive cn-element groups: top-left, top-right,
 // bottom-left, bottom-right. For every channel the two rows are blended
 // horizontally with weight sx, then the row results are blended vertically
 // with weight sy; the result is written to dst through saturate_cast<T>.
 template<typename T>
 static inline void warpaffine_linear_calc(int cn, const T *pxy, T *dst, float sx, float sy)
 {
    for (int ch = 0; ch < cn; ++ch) {
        const float tl = pxy[ch];
        const float tr = pxy[cn + ch];
        const float bl = pxy[cn*2 + ch];
        const float br = pxy[cn*3 + ch];

        const float top = tl + sx*(tr - tl);
        const float bottom = bl + sx*(br - bl);
        const float blended = top + sy*(bottom - top);

        dst[ch] = saturate_cast<T>(blended);
    }
 }
 // float specialization of the bilinear blend: same arithmetic as the generic
 // version, but the result is stored directly, without saturate_cast.
 template<>
 inline void warpaffine_linear_calc<float>(int cn, const float *pxy, float *dst, float sx, float sy)
 {
    for (int ch = 0; ch < cn; ++ch) {
        const float tl = pxy[ch];
        const float tr = pxy[cn + ch];
        const float bl = pxy[cn*2 + ch];
        const float br = pxy[cn*3 + ch];

        const float top = tl + sx*(tr - tl);
        const float bottom = bl + sx*(br - bl);
        const float blended = top + sy*(bottom - top);

        dst[ch] = blended;
    }
 }
 // Reference bilinear sampling of one destination pixel.
 //   x            destination column; the result goes to dstptr[x*channels ...]
 //   sx, sy       source coordinates; split below into integer (ix, iy) and
 //                fractional parts (sx, sy are reused for the fractions)
 //   srcptr_      first element of the source image
 //   dstptr       first element of the current destination row
 //   srcstep      source row stride in elements of T
 //   bval         border value per channel (used for BORDER_CONSTANT)
 //   borderType_x/_y  per-axis border modes used for coordinate remapping
 // Local names must stay as they are: WARPAFFINE_SHUFFLE refers to them directly.
 template<int channels, typename T>
 void CV_WarpAffine_Test::newLinear(int x, float sx, float sy, const T *srcptr_, T *dstptr,
                                   int srccols, int srcrows, size_t srcstep,
                                   const T *bval, int borderType_x, int borderType_y)
 {
    int ix = (int)floorf(sx), iy = (int)floorf(sy);
    sx -= ix; sy -= iy;
    T pxy[channels*4];
    const T *srcptr = srcptr_ + srcstep*iy + ix*channels;
    // May return early (pixel entirely outside the image for constant/transparent borders).
    WARPAFFINE_SHUFFLE(channels);
    warpaffine_linear_calc(channels, pxy, dstptr+x*channels, sx, sy);
 }
 // Explicit specialization for 3-channel float images. The body performs the
 // same steps as the generic newLinear with channels == 3 and T == float.
 // NOTE(review): the reason a dedicated specialization is needed is not visible
 // here - confirm before removing it.
 template<>
 void CV_WarpAffine_Test::newLinear<3, float>(int x, float sx, float sy, const float *srcptr_, float *dstptr,
                                          int srccols, int srcrows, size_t srcstep,
                                          const float *bval, int borderType_x, int borderType_y)
 {
    int ix = (int)floorf(sx), iy = (int)floorf(sy);
    sx -= ix; sy -= iy;
    float pxy[12];
    const float *srcptr = srcptr_ + srcstep*iy + ix*3;
    // May return early (pixel entirely outside the image for constant/transparent borders).
    WARPAFFINE_SHUFFLE(3);
    warpaffine_linear_calc(3, pxy, dstptr+x*3, sx, sy);
 }
 template<typename T>
 void CV_WarpAffine_Test::newWarpAffine(const Mat &_src, Mat &_dst, const Mat &tM)
 {
    int num_channels = _dst.channels();
    CV_CheckTrue(num_channels == 1 || num_channels == 3 || num_channels == 4, "");
    auto *srcptr_ = _src.ptr<const T>();
    auto *dstptr_ = _dst.ptr<T>();
    size_t srcstep = _src.step/sizeof(T), dststep = _dst.step/sizeof(T);
    int srccols = _src.cols, srcrows = _src.rows;
    int dstcols = _dst.cols, dstrows = _dst.rows;
    Mat ttM;
    tM.convertTo(ttM, CV_32F);
    auto *_M = ttM.ptr<const float>();
    T bval[] = {
        saturate_cast<T>(borderValue[0]),
        saturate_cast<T>(borderValue[1]),
        saturate_cast<T>(borderValue[2]),
        saturate_cast<T>(borderValue[3]),
    };
    int borderType_x = borderType != BORDER_CONSTANT &&
                       borderType != BORDER_TRANSPARENT &&
                       srccols <= 1 ? BORDER_REPLICATE : borderType;
    int borderType_y = borderType != BORDER_CONSTANT &&
                       borderType != BORDER_TRANSPARENT &&
                       srcrows <= 1 ? BORDER_REPLICATE : borderType;
    for (int y = 0; y < dstrows; y++) {
        T* dstptr = dstptr_ + y*dststep;
        for (int x = 0; x < dstcols; x++) {
            float sx = x*_M[0] + y*_M[1] + _M[2];
            float sy = x*_M[3] + y*_M[4] + _M[5];
            if (num_channels == 3) {
                newLinear<3>(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
            } else if (num_channels == 4) {
                newLinear<4>(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
            } else {
                newLinear<1>(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
            }
        }
    }
 }
 void CV_WarpAffine_Test::warpAffine(const Mat& _src, Mat& _dst)
 {
    Size dsize = _dst.size();
@ -1122,6 +1275,17 @@ void CV_WarpAffine_Test::warpAffine(const Mat& _src, Mat& _dst)
    if (!(interpolation & cv::WARP_INVERSE_MAP))
        invertAffineTransform(tM.clone(), tM);
    if (inter == INTER_LINEAR) {
        int dst_depth = _dst.depth(), dst_channels = _dst.channels();
        if (dst_depth == CV_8U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) {
            return newWarpAffine<uint8_t>(_src, _dst, tM);
        } else if (dst_depth == CV_16U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) {
            return newWarpAffine<uint16_t>(_src, _dst, tM);
        } else if (dst_depth == CV_32F && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) {
            return newWarpAffine<float>(_src, _dst, tM);
        }
    }
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    const int AB_SCALE = 1 << AB_BITS;
    int round_delta = (inter == INTER_NEAREST) ? AB_SCALE / 2 : (AB_SCALE / INTER_TAB_SIZE / 2);
@ -1134,7 +1298,7 @@ void CV_WarpAffine_Test::warpAffine(const Mat& _src, Mat& _dst)
        {
            int v1 = saturate_cast<int>(saturate_cast<int>(data_tM[0] * dx * AB_SCALE) +
                    saturate_cast<int>((data_tM[1] * dy + data_tM[2]) * AB_SCALE) + round_delta),
-                   v2 = saturate_cast<int>(saturate_cast<int>(data_tM[3] * dx * AB_SCALE) +
+                v2 = saturate_cast<int>(saturate_cast<int>(data_tM[3] * dx * AB_SCALE) +
                    saturate_cast<int>((data_tM[4] * dy + data_tM[5]) * AB_SCALE) + round_delta);
            v1 >>= AB_BITS - INTER_BITS;
            v2 >>= AB_BITS - INTER_BITS;
--- a/modules/ts/include/opencv2/ts.hpp
+++ b/modules/ts/include/opencv2/ts.hpp
@ -748,8 +748,76 @@ struct DefaultRngAuto
 // test images generation functions
-void fillGradient(Mat& img, int delta = 5);
+template<typename T>
-void smoothBorder(Mat& img, const Scalar& color, int delta = 3);
+void fillGradient(Mat& img, int delta = 5)
 {
    CV_UNUSED(delta);
    const int ch = img.channels();
    int r, c, i;
    for(r=0; r<img.rows; r++)
    {
        for(c=0; c<img.cols; c++)
        {
            T vals[] = {(T)r, (T)c, (T)(r*c), (T)(r*c/(r+c+1))};
            T *p = (T*)img.ptr(r, c);
            for(i=0; i<ch; i++) p[i] = (T)vals[i];
        }
    }
 }
 // Declaration of the uint8_t specialization of fillGradient; its definition is
 // not in this header.
 template<>
 void fillGradient<uint8_t>(Mat& img, int delta);
 // Fades a band of pixels along every edge of img toward color.
 // The band is n = 100/delta pixels wide, clamped to half of the image in each
 // direction. A pixel at distance d from the edge becomes
 //   pixel * k1 + color * (1 - k1),  with k1 = d*delta/100,
 // so the outermost pixel takes the colour fully. Rows are processed first,
 // then columns, so corner pixels are blended twice.
 //   img    non-empty image with at most 4 channels; elements are accessed as T
 //   color  colour to fade toward
 //   delta  fade step in percent per pixel; must not be 0
 // The right-hand band is limited to nC columns, like the left-hand one, so
 // images narrower than n columns are never indexed at a negative column.
 template<typename T>
 void smoothBorder(Mat& img, const Scalar& color, int delta = 3)
 {
    const int ch = img.channels();
    CV_Assert(!img.empty() && ch <= 4);
    CV_Assert(delta != 0); // n = 100/delta below

    const int n = 100/delta;
    const int nR = std::min(n, (img.rows+1)/2), nC = std::min(n, (img.cols+1)/2);

    // Blends the pixel at (row, col) toward color, keeping weight k1 for the pixel.
    auto blend = [&](int row, int col, double k1)
    {
        const double k2 = 1-k1;
        T *p = img.ptr<T>(row, col);
        Scalar s;
        for(int i=0; i<ch; i++) s[i] = p[i];
        s = s * k1 + color * k2;
        for(int i=0; i<ch; i++) p[i] = static_cast<T>((s[i]));
    };

    // Top and bottom bands.
    for(int r=0; r<nR; r++)
    {
        const double k1 = r*delta/100.;
        for(int c=0; c<img.cols; c++) blend(r, c, k1);
        for(int c=0; c<img.cols; c++) blend(img.rows-r-1, c, k1);
    }

    // Left and right bands.
    for(int r=0; r<img.rows; r++)
    {
        for(int c=0; c<nC; c++) blend(r, c, c*delta/100.);
        for(int c=0; c<nC; c++) blend(r, img.cols-c-1, c*delta/100.);
    }
 }
 // Utility functions
--- a/modules/ts/src/ts.cpp
+++ b/modules/ts/src/ts.cpp
@ -686,7 +686,8 @@ TS* TS::ptr()
    return &ts;
 }
-void fillGradient(Mat& img, int delta)
+template<>
 void fillGradient<uint8_t>(Mat& img, int delta)
 {
    const int ch = img.channels();
    CV_Assert(!img.empty() && img.depth() == CV_8U && ch <= 4);
@ -708,57 +709,6 @@ void fillGradient(Mat& img, int delta)
    }
 }
 // Fades a band of pixels along every edge of an 8-bit image toward color.
 // The band is n = 100/delta pixels wide, clamped to half of the image in each
 // direction. A pixel at distance d from the edge becomes
 //   pixel * k1 + color * (1 - k1),  with k1 = d*delta/100.
 // Rows are processed first, then columns.
 // The right-hand band is limited to nC columns, like the left-hand one, so
 // images narrower than n columns are never indexed at a negative column.
 void smoothBorder(Mat& img, const Scalar& color, int delta)
 {
    const int ch = img.channels();
    CV_Assert(!img.empty() && img.depth() == CV_8U && ch <= 4);
    Scalar s;
    uchar *p = NULL;
    int n = 100/delta;
    int nR = std::min(n, (img.rows+1)/2), nC = std::min(n, (img.cols+1)/2);
    int r, c, i;
    // Top and bottom bands.
    for(r=0; r<nR; r++)
    {
        double k1 = r*delta/100., k2 = 1-k1;
        for(c=0; c<img.cols; c++)
        {
            p = img.ptr(r, c);
            for(i=0; i<ch; i++) s[i] = p[i];
            s = s * k1 + color * k2;
            for(i=0; i<ch; i++) p[i] = uchar(s[i]);
        }
        for(c=0; c<img.cols; c++)
        {
            p = img.ptr(img.rows-r-1, c);
            for(i=0; i<ch; i++) s[i] = p[i];
            s = s * k1 + color * k2;
            for(i=0; i<ch; i++) p[i] = uchar(s[i]);
        }
    }
    // Left and right bands.
    for(r=0; r<img.rows; r++)
    {
        for(c=0; c<nC; c++)
        {
            double k1 = c*delta/100., k2 = 1-k1;
            p = img.ptr(r, c);
            for(i=0; i<ch; i++) s[i] = p[i];
            s = s * k1 + color * k2;
            for(i=0; i<ch; i++) p[i] = uchar(s[i]);
        }
        // Bounded by nC (not n) so that img.cols-c-1 stays inside the row.
        for(c=0; c<nC; c++)
        {
            double k1 = c*delta/100., k2 = 1-k1;
            p = img.ptr(r, img.cols-c-1);
            for(i=0; i<ch; i++) s[i] = p[i];
            s = s * k1 + color * k2;
            for(i=0; i<ch; i++) p[i] = uchar(s[i]);
        }
    }
 }
 bool test_ipp_check = false;