Merge pull request #25786 from plctlab:rvp_3rdparty

3rdparty: NDSRVP - Part 1.5: New Interfaces
2025-08-06 14:36:36 +08:00 · 2024-08-05 15:26:31 +03:00 · 2024-08-05 15:26:31 +03:00 · ecbff5a20c
commit ecbff5a20c
parent 6ed603e917 35463e079c
15 changed files with 829 additions and 517 deletions
--- a/3rdparty/ndsrvp/include/core.hpp
+++ b/3rdparty/ndsrvp/include/core.hpp
@ -1,6 +1,6 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.	
+// of this distribution and at http://opencv.org/license.html.

 #ifndef OPENCV_NDSRVP_CORE_HPP
 #define OPENCV_NDSRVP_CORE_HPP
--- a/3rdparty/ndsrvp/include/imgproc.hpp
+++ b/3rdparty/ndsrvp/include/imgproc.hpp
@ -1,18 +1,12 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.	
+// of this distribution and at http://opencv.org/license.html.

 #ifndef OPENCV_NDSRVP_IMGPROC_HPP
 #define OPENCV_NDSRVP_IMGPROC_HPP

 namespace cv {

-// ################ remap ################
-
-void remap(InputArray _src, OutputArray _dst,
-    InputArray _map1, InputArray _map2,
-    int interpolation, int borderType, const Scalar& borderValue);
-
 namespace ndsrvp {

 enum InterpolationMasks {
@ -36,23 +30,36 @@ int integral(int depth, int sdepth, int sqdepth,

 // ################ warpAffine ################

-int warpAffine(int src_type,
-    const uchar* src_data, size_t src_step, int src_width, int src_height,
-    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
-    const double M[6], int interpolation, int borderType, const double borderValue[4]);
+int warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);

-#undef cv_hal_warpAffine
-#define cv_hal_warpAffine (cv::ndsrvp::warpAffine)
+#undef cv_hal_warpAffineBlocklineNN
+#define cv_hal_warpAffineBlocklineNN (cv::ndsrvp::warpAffineBlocklineNN)
+
+int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
+
+#undef cv_hal_warpAffineBlockline
+#define cv_hal_warpAffineBlockline (cv::ndsrvp::warpAffineBlockline)

 // ################ warpPerspective ################

-int warpPerspective(int src_type,
-    const uchar* src_data, size_t src_step, int src_width, int src_height,
-    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
-    const double M[9], int interpolation, int borderType, const double borderValue[4]);
+int warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw);

-#undef cv_hal_warpPerspective
-#define cv_hal_warpPerspective (cv::ndsrvp::warpPerspective)
+#undef cv_hal_warpPerspectiveBlocklineNN
+#define cv_hal_warpPerspectiveBlocklineNN (cv::ndsrvp::warpPerspectiveBlocklineNN)
+
+int warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw);
+
+#undef cv_hal_warpPerspectiveBlockline
+#define cv_hal_warpPerspectiveBlockline (cv::ndsrvp::warpPerspectiveBlockline)
+
+// ################ remap ################
+
+int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
+    uchar *dst_data, size_t dst_step, int dst_width, int dst_height, float* mapx, size_t mapx_step,
+    float* mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4]);
+
+#undef cv_hal_remap32f
+#define cv_hal_remap32f (cv::ndsrvp::remap32f)

 // ################ threshold ################

--- a/3rdparty/ndsrvp/ndsrvp_hal.hpp
+++ b/3rdparty/ndsrvp/ndsrvp_hal.hpp
@ -1,13 +1,14 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.	
+// of this distribution and at http://opencv.org/license.html.

 #ifndef OPENCV_NDSRVP_HAL_HPP
 #define OPENCV_NDSRVP_HAL_HPP

-#include "opencv2/core/mat.hpp"
 #include <nds_intrinsic.h>

+#include "opencv2/core/hal/interface.h"
+
 #include "include/core.hpp"
 #include "include/imgproc.hpp"
 #include "include/features2d.hpp"
--- a/3rdparty/ndsrvp/src/cvutils.cpp
+++ b/3rdparty/ndsrvp/src/cvutils.cpp
@ -0,0 +1,78 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#include "cvutils.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+// fastMalloc
+
+// [0][1][2][3][4][5][6][7][8][9]
+//     ^udata
+//                          ^adata
+//              ^adata[-1] == udata
+
+// Returns a block of at least `size` bytes whose address is a multiple of CV_MALLOC_ALIGN.
+// The pointer obtained from malloc() is stored in the slot right below the returned
+// address so that fastFree() can recover it. Failure is reported through
+// ndsrvp_error(Error::StsNoMem, ...).
+void* fastMalloc(size_t size)
+{
+    // payload + one back-pointer slot + alignment slack
+    const size_t total = size + sizeof(void*) + CV_MALLOC_ALIGN;
+    uchar* const raw = (uchar*)malloc(total);
+    if(!raw)
+        ndsrvp_error(Error::StsNoMem, "fastMalloc(): Not enough memory");
+
+    // leave room for the back-pointer, then round up to the alignment boundary
+    uchar** const slot = (uchar**)raw + 1;
+    uchar** const aligned = (uchar**)align((size_t)slot, CV_MALLOC_ALIGN);
+    *(aligned - 1) = raw;
+    return aligned;
+}
+
+// Releases a block obtained from fastMalloc(); a null pointer is ignored.
+// The original malloc() pointer is read from the slot just below `ptr` and checked for
+// plausibility (it must lie below `ptr`, no further away than the header plus alignment
+// slack) before being passed to free().
+void fastFree(void* ptr)
+{
+    if(!ptr)
+        return;
+
+    uchar* const block = (uchar*)ptr;
+    uchar* const raw = ((uchar**)ptr)[-1];
+    const bool plausible = raw < block && (block - raw) <= (ptrdiff_t)(sizeof(void*) + CV_MALLOC_ALIGN);
+    if(!plausible)
+        ndsrvp_error(Error::StsBadArg, "fastFree(): Invalid memory block");
+    free(raw);
+}
+
+// borderInterpolate
+
+// Maps coordinate `p` onto an axis of length `len` according to `borderType`.
+// Coordinates already inside [0, len) are returned unchanged. CV_HAL_BORDER_CONSTANT
+// yields -1 for out-of-range coordinates; an unrecognised border type is reported
+// through ndsrvp_error(Error::StsBadArg, ...).
+int borderInterpolate(int p, int len, int borderType)
+{
+    // one unsigned comparison covers both p < 0 and p >= len
+    if( (unsigned)p < (unsigned)len )
+        return p;
+
+    if( borderType == CV_HAL_BORDER_REPLICATE )
+        return p < 0 ? 0 : len - 1;
+
+    if( borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101 )
+    {
+        if( len == 1 )
+            return 0;
+        // REFLECT_101 does not repeat the edge sample, hence the extra shift by one
+        const int delta = (borderType == CV_HAL_BORDER_REFLECT_101) ? 1 : 0;
+        do
+        {
+            p = (p < 0) ? (-p - 1 + delta) : (len - 1 - (p - len) - delta);
+        }
+        while( (unsigned)p >= (unsigned)len );
+        return p;
+    }
+
+    if( borderType == CV_HAL_BORDER_WRAP )
+    {
+        ndsrvp_assert(len > 0);
+        if( p < 0 )
+            p -= ((p - len + 1) / len) * len;
+        if( p >= len )
+            p %= len;
+        return p;
+    }
+
+    if( borderType == CV_HAL_BORDER_CONSTANT )
+        return -1;
+
+    ndsrvp_error(Error::StsBadArg, "borderInterpolate(): Unknown/unsupported border type");
+    return p;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
--- a/3rdparty/ndsrvp/src/cvutils.hpp
+++ b/3rdparty/ndsrvp/src/cvutils.hpp
@ -0,0 +1,108 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#ifndef OPENCV_NDSRVP_CVUTILS_HPP
+#define OPENCV_NDSRVP_CVUTILS_HPP
+
+#include <nds_intrinsic.h>
+
+#include "opencv2/core/hal/interface.h"
+
+#include <cstring>
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <array>
+#include <climits>
+#include <algorithm>
+
+// misc functions that are not exposed to the public interface
+
+namespace cv {
+
+namespace ndsrvp {
+
+void* fastMalloc(size_t size);
+void fastFree(void* ptr);
+int borderInterpolate(int p, int len, int borderType);
+
+// Fallback MAX, defined only when no earlier header provides one.
+// Both arguments may be evaluated twice.
+#ifndef MAX
+#  define MAX(a,b)  ((a) < (b) ? (b) : (a))
+#endif
+
+// CV_MAT_CN(flags) extracts the channel count from a type value: the field selected by
+// CV_MAT_CN_MASK holds (channels - 1), hence the final "+ 1".
+#define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
+#define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
+
+// Alignment, in bytes, of the pointers returned by fastMalloc().
+#define CV_MALLOC_ALIGN 64
+
+// error codes
+
+// Status codes passed to ndsrvp_error(). All of them are negative, so ndsrvp_error()
+// throws for each one.
+// NOTE(review): the numeric values look like they mirror the cv::Error codes of the
+// same names -- confirm.
+enum Error{
+    StsNoMem = -4,   // allocation failure (fastMalloc)
+    StsBadArg = -5,   // invalid argument (fastFree, borderInterpolate)
+    StsAssert = -215   // failed ndsrvp_assert()
+};
+
+// output error
+
+// Reports the stringified expression through ndsrvp_error(Error::StsAssert, ...) when
+// `expr` evaluates to false. The body is wrapped in do { } while(0) so that the macro
+// plus the caller's trailing ';' forms exactly one statement and stays safe inside an
+// unbraced if/else.
+#define ndsrvp_assert(expr) do { if(!(expr)) ndsrvp_error(Error::StsAssert, std::string(#expr)); } while(0)
+
+// Prints the error code (and the message, when one is given) to std::cerr.
+// Negative codes are then thrown as a plain int; non-negative codes only log.
+inline void ndsrvp_error(int code, std::string msg = "")
+{
+    std::cerr << "NDSRVP Error: code " << code << std::endl;
+    if(msg.size() > 0)
+        std::cerr << msg << std::endl;
+
+    if(code >= 0)
+        return;
+    throw code;
+}
+
+// clip & vclip
+
+// Limits x to the half-open range [a, b): values below a become a,
+// values at or above b become b - 1.
+inline int clip(int x, int a, int b)
+{
+    if(x < a)
+        return a;
+    return x < b ? x : b - 1;
+}
+
+// Lane-wise counterpart of clip(): every 32-bit lane of x is limited to [a, b - 1].
+//
+// __nds__bpick(p, q, m) takes the bits of p where m is set and the bits of q elsewhere,
+// and a vector comparison produces all-ones lanes where it holds. The nested picks
+// therefore evaluate, per lane,
+//     x >= a ? (x < b ? x : b - 1) : a
+// which is exactly the scalar clip() expression.
+inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b)
+{
+    return (int32x2_t)__nds__bpick(
+        __nds__bpick((long)x, (long)(b - 1), (long)(x < b)),   // upper bound: keep x while x < b
+        (long)a,                                               // lower bound: a when x < a
+        (long)(x >= a));
+}
+
+// saturate
+
+// Generic fallbacks: a plain conversion to _Tp with no range limiting. Only the
+// destination types specialised below actually saturate.
+template<typename _Tp> static inline _Tp saturate_cast(int v)    { return _Tp(v); }
+template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
+template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
+
+// Specialisations: the int overloads narrow via the __nds__uclip32 / __nds__sclip32
+// intrinsics (second argument 8/7/16/15 -- presumably the bit width to clip to; confirm
+// against the intrinsic reference). The float/double overloads round with
+// lrintf()/lrint(), convert the result to int, and forward to the int overload.
+// NOTE(review): inputs whose rounded value does not fit in int are not handled here.
+template<> inline uchar saturate_cast<uchar>(int v)     { return __nds__uclip32(v, 8); }
+template<> inline uchar saturate_cast<uchar>(float v)     { return saturate_cast<uchar>((int)lrintf(v)); }
+template<> inline uchar saturate_cast<uchar>(double v)     { return saturate_cast<uchar>((int)lrint(v)); }
+
+// NOTE(review): plain `char` has implementation-defined signedness; if it is unsigned on
+// the target toolchain, a signed clip result will not round-trip -- confirm, or consider
+// `signed char`.
+template<> inline char saturate_cast<char>(int v)     { return __nds__sclip32(v, 7); }
+template<> inline char saturate_cast<char>(float v)     { return saturate_cast<char>((int)lrintf(v)); }
+template<> inline char saturate_cast<char>(double v)     { return saturate_cast<char>((int)lrint(v)); }
+
+template<> inline ushort saturate_cast<ushort>(int v)     { return __nds__uclip32(v, 16); }
+template<> inline ushort saturate_cast<ushort>(float v)     { return saturate_cast<ushort>((int)lrintf(v)); }
+template<> inline ushort saturate_cast<ushort>(double v)     { return saturate_cast<ushort>((int)lrint(v)); }
+
+template<> inline short saturate_cast<short>(int v)     { return __nds__sclip32(v, 15); }
+template<> inline short saturate_cast<short>(float v)     { return saturate_cast<short>((int)lrintf(v)); }
+template<> inline short saturate_cast<short>(double v)     { return saturate_cast<short>((int)lrint(v)); }
+
+// int destination: rounding only, no clipping step.
+template<> inline int saturate_cast<int>(float v)     { return (int)lrintf(v); }
+template<> inline int saturate_cast<int>(double v)     { return (int)lrint(v); }
+
+// align
+
+// Rounds v up to the next multiple of n. The mask arithmetic is only meaningful when
+// n is a power of two.
+inline long align(size_t v, int n)
+{
+    const size_t step = (size_t)n;
+    const size_t mask = ~(step - 1);   // same bit pattern as -n
+    return (long)((v + step - 1) & mask);
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
+
+#endif
--- a/3rdparty/ndsrvp/src/integral.cpp
+++ b/3rdparty/ndsrvp/src/integral.cpp
@ -3,6 +3,8 @@
 // of this distribution and at http://opencv.org/license.html.	

 #include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"

 namespace cv {

--- a/3rdparty/ndsrvp/src/remap.cpp
+++ b/3rdparty/ndsrvp/src/remap.cpp
@ -0,0 +1,188 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+// Copies one 8-bit pixel of `cn` channels from `src` to `dst`, eight bytes at a time
+// with a byte-wise tail. The pointers are taken by value, so the caller's cursors are
+// left untouched.
+static inline void copyPixel8u(uchar* dst, const uchar* src, int cn)
+{
+    int k = cn;
+    for(; k >= 8; k -= 8, dst += 8, src += 8)
+        *(uint8x8_t*)dst = *(const uint8x8_t*)src;
+    while( k-- )
+        dst[k] = src[k];
+}
+
+// HAL replacement for remap with two separate float maps (mapx / mapy).
+//
+// Handled here: 8-bit sources with nearest-neighbour interpolation, optionally with
+// CV_HAL_WARP_RELATIVE_MAP set in `interpolation`. Every other combination returns
+// CV_HAL_ERROR_NOT_IMPLEMENTED so that the caller can fall back to another
+// implementation.
+//
+// All *_step arguments are in bytes. `border_value` supplies the constant border colour;
+// it is indexed cyclically (k & 3) to fill up to CV_CN_MAX channels.
+// Returns CV_HAL_ERROR_OK once the whole destination has been processed.
+int remap32f(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height, float* mapx, size_t mapx_step,
+    float* mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4])
+{
+    const bool isRelative = ((interpolation & CV_HAL_WARP_RELATIVE_MAP) != 0);
+    interpolation &= ~CV_HAL_WARP_RELATIVE_MAP;
+
+    if( interpolation == CV_HAL_INTER_AREA )
+        interpolation = CV_HAL_INTER_LINEAR;
+
+    // only nearest-neighbour
+    if( interpolation != CV_HAL_INTER_NEAREST )
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // only CV_8U
+    if( (src_type & CV_MAT_DEPTH_MASK) != CV_8U )
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    int cn = CV_MAT_CN(src_type);
+
+    src_step /= sizeof(uchar);
+    dst_step /= sizeof(uchar);
+
+    // mapping CV_32FC1
+    mapx_step /= sizeof(float);
+    mapy_step /= sizeof(float);
+
+    // border
+    uchar border_const[CV_CN_MAX];
+    for( int k = 0; k < CV_CN_MAX; k++ )
+        border_const[k] = saturate_cast<uchar>(border_value[k & 3]);
+
+    // divide into blocks
+    // NOTE(review): aXY holds 1024 * 1024 * 2 shorts (4 MiB with 16-bit short) of
+    // automatic storage, which can exceed the stack of a worker thread -- consider a
+    // smaller block or heap storage.
+    const int BLOCK_SIZE = 1024;
+    int x, y, x1, y1;
+    std::array<short, BLOCK_SIZE * BLOCK_SIZE * 2> aXY;
+    short* XY = aXY.data();
+    size_t XY_step = BLOCK_SIZE * 2;
+
+    // vectorize
+    const int32x2_t src_wh = {src_width, src_height};
+    // multiplied lane-wise with (x, y) to form the byte offset x * cn + y * src_step
+    const int32x2_t arr_index = {cn, (int)src_step};
+
+    for (y = 0; y < dst_height; y += BLOCK_SIZE)
+    {
+        int dy = std::min(BLOCK_SIZE, dst_height - y);
+        for (x = 0; x < dst_width; x += BLOCK_SIZE)
+        {
+            // block origin, added to every coordinate of a relative map
+            const int off_y = isRelative ? y : 0;
+            const int off_x = isRelative ? x : 0;
+            const int32x2_t voff = {off_x, off_y};
+
+            int dx = std::min(BLOCK_SIZE, dst_width - x);
+            // prepare mapping data XY
+            for (y1 = 0; y1 < dy; y1++)
+            {
+                short* rXY = XY + y1 * XY_step;
+                const float* sX = mapx + (y + y1) * mapx_step + x;
+                const float* sY = mapy + (y + y1) * mapy_step + x;
+                for (x1 = 0; x1 < dx; x1++)
+                {
+                    rXY[x1 * 2] = saturate_cast<short>(sX[x1]);
+                    rXY[x1 * 2 + 1] = saturate_cast<short>(sY[x1]);
+                }
+            }
+
+            // precalculate offset: add the in-block position (x1, y1) to every entry,
+            // four interleaved (x, y) pairs at a time. Only the dy x dx region that is
+            // read below is touched; a partial last group of four stays inside the
+            // BLOCK_SIZE-wide row.
+            if(isRelative)
+            {
+                int16x8_t vrow = {0, 0, 1, 0, 2, 0, 3, 0};
+                const int16x8_t vstep_x = {4, 0, 4, 0, 4, 0, 4, 0};
+                const int16x8_t vstep_y = {0, 1, 0, 1, 0, 1, 0, 1};
+                for(y1 = 0; y1 < dy; y1++, vrow += vstep_y)
+                {
+                    int16x8_t* vrXY = (int16x8_t*)(XY + y1 * XY_step);
+                    int16x8_t vcur = vrow;
+                    for(x1 = 0; x1 < dx; x1 += 4, vrXY++, vcur += vstep_x)
+                    {
+                        *vrXY += vcur;
+                    }
+                }
+            }
+
+            // process the block
+            for( y1 = 0; y1 < dy; y1++ )
+            {
+                uchar* dst_row = dst_data + (y + y1) * dst_step + x * cn;
+                const short* rXY = XY + y1 * XY_step;
+                if( cn == 1 )
+                {
+                    for( x1 = 0; x1 < dx; x1++ )
+                    {
+                        int32x2_t vsxy = (int32x2_t){rXY[x1 * 2], rXY[x1 * 2 + 1]} + voff;
+                        // both lanes in range <=> the comparison result is all ones
+                        if( (long)((uint32x2_t)vsxy < (uint32x2_t)src_wh) == -1 )
+                            dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
+                        else
+                        {
+                            if( border_type == CV_HAL_BORDER_REPLICATE )
+                            {
+                                vsxy = vclip(vsxy, (int32x2_t){0, 0}, src_wh);
+                                dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
+                            }
+                            else if( border_type == CV_HAL_BORDER_CONSTANT )
+                                dst_row[x1] = border_const[0];
+                            else if( border_type != CV_HAL_BORDER_TRANSPARENT )
+                            {
+                                vsxy[0] = borderInterpolate(vsxy[0], src_width, border_type);
+                                vsxy[1] = borderInterpolate(vsxy[1], src_height, border_type);
+                                dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
+                            }
+                            // CV_HAL_BORDER_TRANSPARENT: destination pixel is left as is
+                        }
+                    }
+                }
+                else
+                {
+                    // dst_ptr advances by exactly cn per pixel; the copies below must
+                    // not move it, otherwise every following pixel would be misplaced.
+                    uchar* dst_ptr = dst_row;
+                    for(x1 = 0; x1 < dx; x1++, dst_ptr += cn )
+                    {
+                        int32x2_t vsxy = (int32x2_t){rXY[x1 * 2], rXY[x1 * 2 + 1]} + voff;
+                        const uchar *src_ptr;
+                        if( (long)((uint32x2_t)vsxy < (uint32x2_t)src_wh) == -1 )
+                        {
+                            src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
+                            if( cn == 3 )
+                            {
+                                dst_ptr[0] = src_ptr[0]; dst_ptr[1] = src_ptr[1]; dst_ptr[2] = src_ptr[2];
+                                // performance loss, commented out
+                                // *(unsigned*)dst_ptr = __nds__bpick(*(unsigned*)dst_ptr, *(unsigned*)src_ptr, 0xFF000000);
+                            }
+                            else if( cn == 4 )
+                            {
+                                *(uint8x4_t*)dst_ptr = *(uint8x4_t*)src_ptr;
+                            }
+                            else
+                            {
+                                copyPixel8u(dst_ptr, src_ptr, cn);
+                            }
+                        }
+                        else if( border_type != CV_HAL_BORDER_TRANSPARENT )
+                        {
+                            if( border_type == CV_HAL_BORDER_REPLICATE )
+                            {
+                                vsxy = vclip(vsxy, (int32x2_t){0, 0}, src_wh);
+                                src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
+                            }
+                            else if( border_type == CV_HAL_BORDER_CONSTANT )
+                                src_ptr = &border_const[0];
+                            else
+                            {
+                                vsxy[0] = borderInterpolate(vsxy[0], src_width, border_type);
+                                vsxy[1] = borderInterpolate(vsxy[1], src_height, border_type);
+                                src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
+                            }
+                            copyPixel8u(dst_ptr, src_ptr, cn);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
--- a/3rdparty/ndsrvp/src/threshold.cpp
+++ b/3rdparty/ndsrvp/src/threshold.cpp
@ -4,65 +4,44 @@

 #include "ndsrvp_hal.hpp"
 #include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"

 namespace cv {

 namespace ndsrvp {

 template <typename type, typename vtype>
-class operators_threshold_t {
-public:
-    virtual ~operators_threshold_t() {};
-    virtual inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
-    {
-        (void)src;
-        (void)thresh;
-        (void)maxval;
-        CV_Error(cv::Error::StsBadArg, "");
-        return vtype();
-    }
-    virtual inline type scalar(const type& src, const type& thresh, const type& maxval)
-    {
-        (void)src;
-        (void)thresh;
-        (void)maxval;
-        CV_Error(cv::Error::StsBadArg, "");
-        return type();
-    }
-};
-
-template <typename type, typename vtype>
-class opThreshBinary : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshBinary_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
    {
        return (vtype)__nds__bpick((long)maxval, (long)0, (long)(src > thresh));
    }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
    {
        return src > thresh ? maxval : 0;
    }
 };

 template <typename type, typename vtype>
-class opThreshBinaryInv : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshBinaryInv_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
    {
        return (vtype)__nds__bpick((long)0, (long)maxval, (long)(src > thresh));
    }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
    {
        return src > thresh ? 0 : maxval;
    }
 };

 template <typename type, typename vtype>
-class opThreshTrunc : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshTrunc_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
    {
        (void)maxval;
        return (vtype)__nds__bpick((long)thresh, (long)src, (long)(src > thresh));
    }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
    {
        (void)maxval;
        return src > thresh ? thresh : src;
@ -70,13 +49,13 @@ class opThreshTrunc : public operators_threshold_t<type, vtype> {
 };

 template <typename type, typename vtype>
-class opThreshToZero : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshToZero_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
    {
        (void)maxval;
        return (vtype)__nds__bpick((long)src, (long)0, (long)(src > thresh));
    }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
    {
        (void)maxval;
        return src > thresh ? src : 0;
@ -84,29 +63,36 @@ class opThreshToZero : public operators_threshold_t<type, vtype> {
 };

 template <typename type, typename vtype>
-class opThreshToZeroInv : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshToZeroInv_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
    {
        (void)maxval;
        return (vtype)__nds__bpick((long)0, (long)src, (long)(src > thresh));
    }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
    {
        (void)maxval;
        return src > thresh ? 0 : src;
    }
 };

-template <typename type, typename vtype, int nlane>
-static void threshold_op(const type* src_data, size_t src_step,
-    type* dst_data, size_t dst_step,
+template <typename type, typename vtype, int nlane,
+    template <typename ttype, typename vttype> typename opThresh_t>
+static inline void threshold_op(const uchar* src, size_t src_step,
+    uchar* dst, size_t dst_step,
    int width, int height, int cn,
-    type thresh, type maxval, int thtype)
+    double thresh_d, double maxval_d)
 {
    int i, j;
    width *= cn;
+
+    type* src_data = (type*)src;
+    type* dst_data = (type*)dst;
    src_step /= sizeof(type);
    dst_step /= sizeof(type);
+
+    type thresh = saturate_cast<type>(thresh_d);
+    type maxval = saturate_cast<type>(maxval_d);
    vtype vthresh;
    vtype vmaxval;
    for (i = 0; i < nlane; i++) {
@ -114,62 +100,63 @@ static void threshold_op(const type* src_data, size_t src_step,
        vmaxval[i] = maxval;
    }

-    operators_threshold_t<type, vtype>* op;
-    switch (thtype) {
-    case CV_HAL_THRESH_BINARY:
-        op = new opThreshBinary<type, vtype>();
-        break;
-    case CV_HAL_THRESH_BINARY_INV:
-        op = new opThreshBinaryInv<type, vtype>();
-        break;
-    case CV_HAL_THRESH_TRUNC:
-        op = new opThreshTrunc<type, vtype>();
-        break;
-    case CV_HAL_THRESH_TOZERO:
-        op = new opThreshToZero<type, vtype>();
-        break;
-    case CV_HAL_THRESH_TOZERO_INV:
-        op = new opThreshToZeroInv<type, vtype>();
-        break;
-    default:
-        CV_Error(cv::Error::StsBadArg, "");
-        return;
-    }
+    opThresh_t<type, vtype> opThresh;

    for (i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) {
        for (j = 0; j <= width - nlane; j += nlane) {
-            vtype vs = *(vtype*)(src_data + j);
-            *(vtype*)(dst_data + j) = op->vector(vs, vthresh, vmaxval);
+            *(vtype*)(dst_data + j) = opThresh.vector(*(vtype*)(src_data + j), vthresh, vmaxval);
        }
        for (; j < width; j++) {
-            dst_data[j] = op->scalar(src_data[j], thresh, maxval);
+            dst_data[j] = opThresh.scalar(src_data[j], thresh, maxval);
        }
    }

-    delete op;
    return;
 }

+typedef void (*ThreshFunc)(const uchar* src_data, size_t src_step,
+    uchar* dst_data, size_t dst_step,
+    int width, int height, int cn,
+    double thresh, double maxval);
+
 int threshold(const uchar* src_data, size_t src_step,
    uchar* dst_data, size_t dst_step,
    int width, int height, int depth, int cn,
    double thresh, double maxValue, int thresholdType)
 {
-    if (width <= 255 && height <= 255) // slower at small size
+    static ThreshFunc thfuncs[4][5] =
+    {
+        {
+            threshold_op<uchar, uint8x8_t, 8, opThreshBinary_t>,
+            threshold_op<uchar, uint8x8_t, 8, opThreshBinaryInv_t>,
+            threshold_op<uchar, uint8x8_t, 8, opThreshTrunc_t>, 
+            threshold_op<uchar, uint8x8_t, 8, opThreshToZero_t>,
+            threshold_op<uchar, uint8x8_t, 8, opThreshToZeroInv_t> },
+        {
+            threshold_op<char, int8x8_t, 8, opThreshBinary_t>,
+            threshold_op<char, int8x8_t, 8, opThreshBinaryInv_t>,
+            threshold_op<char, int8x8_t, 8, opThreshTrunc_t>, 
+            threshold_op<char, int8x8_t, 8, opThreshToZero_t>,
+            threshold_op<char, int8x8_t, 8, opThreshToZeroInv_t> },
+        {
+            threshold_op<ushort, uint16x4_t, 4, opThreshBinary_t>,
+            threshold_op<ushort, uint16x4_t, 4, opThreshBinaryInv_t>,
+            threshold_op<ushort, uint16x4_t, 4, opThreshTrunc_t>,
+            threshold_op<ushort, uint16x4_t, 4, opThreshToZero_t>,
+            threshold_op<ushort, uint16x4_t, 4, opThreshToZeroInv_t> },
+        {
+            threshold_op<short, int16x4_t, 4, opThreshBinary_t>,
+            threshold_op<short, int16x4_t, 4, opThreshBinaryInv_t>,
+            threshold_op<short, int16x4_t, 4, opThreshTrunc_t>,
+            threshold_op<short, int16x4_t, 4, opThreshToZero_t>,
+            threshold_op<short, int16x4_t, 4, opThreshToZeroInv_t> }
+    };
+
+    if(depth < 0 || depth > 3 || thresholdType < 0 || thresholdType > 4 || (width < 256 && height < 256))
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (depth == CV_8U) {
-        threshold_op<uchar, uint8x8_t, 8>((uchar*)src_data, src_step, (uchar*)dst_data, dst_step, width, height, cn, (uchar)thresh, (uchar)maxValue, thresholdType);
-        return CV_HAL_ERROR_OK;
-    } else if (depth == CV_16S) {
-        threshold_op<short, int16x4_t, 4>((short*)src_data, src_step, (short*)dst_data, dst_step, width, height, cn, (short)thresh, (short)maxValue, thresholdType);
-        return CV_HAL_ERROR_OK;
-    } else if (depth == CV_16U) {
-        threshold_op<ushort, uint16x4_t, 4>((ushort*)src_data, src_step, (ushort*)dst_data, dst_step, width, height, cn, (ushort)thresh, (ushort)maxValue, thresholdType);
-        return CV_HAL_ERROR_OK;
-    } else {
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    }
-    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    thfuncs[depth][thresholdType](src_data, src_step, dst_data, dst_step, width, height, cn, thresh, maxValue);
+    return CV_HAL_ERROR_OK;
 }

 } // namespace ndsrvp
--- a/3rdparty/ndsrvp/src/warpAffine.cpp
+++ b/3rdparty/ndsrvp/src/warpAffine.cpp
@ -3,148 +3,68 @@
 // of this distribution and at http://opencv.org/license.html.	

 #include "ndsrvp_hal.hpp"
-#include "opencv2/core.hpp"
 #include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"

 namespace cv {

 namespace ndsrvp {

-class WarpAffineInvoker : public ParallelLoopBody {
-public:
-    WarpAffineInvoker(const Mat& _src, Mat& _dst, int _interpolation, int _borderType,
-        const Scalar& _borderValue, int* _adelta, int* _bdelta, const double* _M)
-        : ParallelLoopBody()
-        , src(_src)
-        , dst(_dst)
-        , interpolation(_interpolation)
-        , borderType(_borderType)
-        , borderValue(_borderValue)
-        , adelta(_adelta)
-        , bdelta(_bdelta)
-        , M(_M)
-    {
-    }
-
-    virtual void operator()(const Range& range) const CV_OVERRIDE
-    {
-        const int BLOCK_SZ = 64;
-        AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
-        short *XY = __XY.data(), *A = __A.data();
-        const int AB_BITS = MAX(10, (int)INTER_BITS);
-        const int AB_SCALE = 1 << AB_BITS;
-        int round_delta = interpolation == CV_HAL_INTER_NEAREST ? AB_SCALE / 2 : AB_SCALE / INTER_TAB_SIZE / 2, x, y, x1, y1;
-
-        int bh0 = std::min(BLOCK_SZ / 2, dst.rows);
-        int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, dst.cols);
-        bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, dst.rows);
-
-        for (y = range.start; y < range.end; y += bh0) {
-            for (x = 0; x < dst.cols; x += bw0) {
-                int bw = std::min(bw0, dst.cols - x);
-                int bh = std::min(bh0, range.end - y);
-
-                Mat _XY(bh, bw, CV_16SC2, XY);
-                Mat dpart(dst, Rect(x, y, bw, bh));
-
-                for (y1 = 0; y1 < bh; y1++) {
-                    short* xy = XY + y1 * bw * 2;
-                    int X0 = saturate_cast<int>((M[1] * (y + y1) + M[2]) * AB_SCALE) + round_delta;
-                    int Y0 = saturate_cast<int>((M[4] * (y + y1) + M[5]) * AB_SCALE) + round_delta;
-
-                    if (interpolation == CV_HAL_INTER_NEAREST) {
-                        x1 = 0;
-
-                        for (; x1 < bw; x1 += 2) {
-                            int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
-                            int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
-
-                            vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15);
-                            vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
-
-                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
-                        }
-
-                        for (; x1 < bw; x1++) {
-                            int X = (X0 + adelta[x + x1]) >> AB_BITS;
-                            int Y = (Y0 + bdelta[x + x1]) >> AB_BITS;
-                            xy[x1 * 2] = saturate_cast<short>(X);
-                            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
-                        }
-                    } else {
-                        short* alpha = A + y1 * bw;
-                        x1 = 0;
-
-                        const int INTER_MASK = INTER_TAB_SIZE - 1;
-                        const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
-                        for (; x1 < bw; x1 += 2) {
-                            int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
-                            int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
-                            vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS));
-                            vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
-
-                            int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
-                            int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
-
-                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
-
-                            uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
-                            *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
-                        }
-
-                        for (; x1 < bw; x1++) {
-                            int X = (X0 + adelta[x + x1]) >> (AB_BITS - INTER_BITS);
-                            int Y = (Y0 + bdelta[x + x1]) >> (AB_BITS - INTER_BITS);
-                            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
-                            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
-                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
-                        }
-                    }
-                }
-
-                if (interpolation == CV_HAL_INTER_NEAREST)
-                    remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
-                else {
-                    Mat _matA(bh, bw, CV_16U, A);
-                    remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
-                }
-            }
-        }
-    }
-
-private:
-    Mat src;
-    Mat dst;
-    int interpolation, borderType;
-    Scalar borderValue;
-    int *adelta, *bdelta;
-    const double* M;
-};
-
-int warpAffine(int src_type,
-    const uchar* src_data, size_t src_step, int src_width, int src_height,
-    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
-    const double M[6], int interpolation, int borderType, const double borderValue[4])
+int warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw) // nearest-neighbour affine map for one block row: writes bw interleaved (x, y) shorts to xy
 {
-    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
-    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
-
-    int x;
-    AutoBuffer<int> _abdelta(dst.cols * 2);
-    int *adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
    const int AB_BITS = MAX(10, (int)INTER_BITS);
-    const int AB_SCALE = 1 << AB_BITS;
+    int x1 = 0;

-    for (x = 0; x < dst.cols; x++) {
-        adelta[x] = saturate_cast<int>(M[0] * x * AB_SCALE);
-        bdelta[x] = saturate_cast<int>(M[3] * x * AB_SCALE);
+    for (; x1 + 1 < bw; x1 += 2) { // two columns per iteration; only full pairs, so an odd bw never touches column bw
+        int32x2_t vX = { X0 + adelta[x1], X0 + adelta[x1 + 1] };
+        int32x2_t vY = { Y0 + bdelta[x1], Y0 + bdelta[x1 + 1] };
+
+        vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15); // shift out the AB_BITS fixed-point fraction before clipping
+        vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
+
+        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
+    }
+
+    for (; x1 < bw; x1++) { // scalar tail: the last column when bw is odd
+        int X = (X0 + adelta[x1]) >> AB_BITS; // same AB_BITS shift as the paired loop above
+        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
+        xy[x1 * 2] = saturate_cast<short>(X);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) // interpolating affine map for one block row: integer (x, y) pairs to xy, packed fractional index to alpha
+{
+    const int AB_BITS = MAX(10, (int)INTER_BITS);
+    int x1 = 0;
+
+    const int INTER_MASK = INTER_TAB_SIZE - 1;
+    const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
+    for (; x1 + 1 < bw; x1 += 2) { // two columns per iteration; only full pairs, so an odd bw never touches column bw
+        int32x2_t vX = { X0 + adelta[x1], X0 + adelta[x1 + 1] };
+        int32x2_t vY = { Y0 + bdelta[x1], Y0 + bdelta[x1 + 1] };
+        vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS)); // keep INTER_BITS fractional bits for the alpha index
+        vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
+
+        int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
+        int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
+
+        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
+
+        uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
+        *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
+    }
+
+    for (; x1 < bw; x1++) { // scalar tail: the last column when bw is odd
+        int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS); // same pre-shift as the paired loop above
+        int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);
+        xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
+        alpha[x1] = (short)((Y & INTER_MASK) * INTER_TAB_SIZE + (X & INTER_MASK));
    }

-    Range range(0, dst.rows);
-    WarpAffineInvoker invoker(src, dst, interpolation, borderType,
-        Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
-        adelta, bdelta, M);
-    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
    return CV_HAL_ERROR_OK;
 }

--- a/3rdparty/ndsrvp/src/warpPerspective.cpp
+++ b/3rdparty/ndsrvp/src/warpPerspective.cpp
@ -3,154 +3,90 @@
 // of this distribution and at http://opencv.org/license.html.	

 #include "ndsrvp_hal.hpp"
-#include "opencv2/core.hpp"
 #include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"

 namespace cv {

 namespace ndsrvp {

-class WarpPerspectiveInvoker : public ParallelLoopBody {
-public:
-    WarpPerspectiveInvoker(const Mat& _src, Mat& _dst, const double* _M, int _interpolation,
-        int _borderType, const Scalar& _borderValue)
-        : ParallelLoopBody()
-        , src(_src)
-        , dst(_dst)
-        , M(_M)
-        , interpolation(_interpolation)
-        , borderType(_borderType)
-        , borderValue(_borderValue)
-    {
-    }
-
-    virtual void operator()(const Range& range) const CV_OVERRIDE
-    {
-        const int BLOCK_SZ = 32;
-        short XY[BLOCK_SZ * BLOCK_SZ * 2], A[BLOCK_SZ * BLOCK_SZ];
-        int x, y, y1, width = dst.cols, height = dst.rows;
-
-        int bh0 = std::min(BLOCK_SZ / 2, height);
-        int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, width);
-        bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, height);
-
-        for (y = range.start; y < range.end; y += bh0) {
-            for (x = 0; x < width; x += bw0) {
-                int bw = std::min(bw0, width - x);
-                int bh = std::min(bh0, range.end - y); // height
-
-                Mat _XY(bh, bw, CV_16SC2, XY);
-                Mat dpart(dst, Rect(x, y, bw, bh));
-
-                for (y1 = 0; y1 < bh; y1++) {
-                    short* xy = XY + y1 * bw * 2;
-                    double X0 = M[0] * x + M[1] * (y + y1) + M[2];
-                    double Y0 = M[3] * x + M[4] * (y + y1) + M[5];
-                    double W0 = M[6] * x + M[7] * (y + y1) + M[8];
-
-                    if (interpolation == CV_HAL_INTER_NEAREST) {
-                        int x1 = 0;
-
-                        for (; x1 < bw; x1 += 2) {
-                            double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
-                            W1 = W1 ? 1. / W1 : 0;
-                            W2 = W2 ? 1. / W2 : 0;
-                            double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
-                            double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
-                            double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
-                            double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
-
-                            int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
-                            int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
-
-                            vX = __nds__v_sclip32(vX, 15);
-                            vY = __nds__v_sclip32(vY, 15);
-
-                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
-                        }
-
-                        for (; x1 < bw; x1++) {
-                            double W = W0 + M[6] * x1;
-                            W = W ? 1. / W : 0;
-                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
-                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
-                            int X = saturate_cast<int>(fX);
-                            int Y = saturate_cast<int>(fY);
-
-                            xy[x1 * 2] = saturate_cast<short>(X);
-                            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
-                        }
-                    } else {
-                        short* alpha = A + y1 * bw;
-                        int x1 = 0;
-
-                        const int INTER_MASK = INTER_TAB_SIZE - 1;
-                        const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
-                        for (; x1 < bw; x1 += 2) {
-                            double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
-                            W1 = W1 ? INTER_TAB_SIZE / W1 : 0;
-                            W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
-                            double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
-                            double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
-                            double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
-                            double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
-
-                            int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
-                            int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
-
-                            int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
-                            int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
-
-                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
-
-                            uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
-                            *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
-                        }
-
-                        for (; x1 < bw; x1++) {
-                            double W = W0 + M[6] * x1;
-                            W = W ? INTER_TAB_SIZE / W : 0;
-                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
-                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
-                            int X = saturate_cast<int>(fX);
-                            int Y = saturate_cast<int>(fY);
-
-                            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
-                            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
-                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
-                        }
-                    }
-                }
-
-                if (interpolation == CV_HAL_INTER_NEAREST)
-                    remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
-                else {
-                    Mat _matA(bh, bw, CV_16U, A);
-                    remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
-                }
-            }
-        }
-    }
-
-private:
-    Mat src;
-    Mat dst;
-    const double* M;
-    int interpolation, borderType;
-    Scalar borderValue;
-};
-
-int warpPerspective(int src_type,
-    const uchar* src_data, size_t src_step, int src_width, int src_height,
-    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
-    const double M[9], int interpolation, int borderType, const double borderValue[4])
+int warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) // nearest-neighbour perspective map for one block row: writes bw interleaved (x, y) shorts to xy
 {
-    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
-    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
+    int x1 = 0;
+
+    for (; x1 + 1 < bw; x1 += 2) { // two columns per iteration; only full pairs, so an odd bw never touches column bw
+        double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
+        W1 = W1 ? 1. / W1 : 0; // a zero denominator maps to 0 instead of dividing
+        W2 = W2 ? 1. / W2 : 0;
+        double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
+        double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
+        double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
+        double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
+
+        int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
+        int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
+
+        vX = __nds__v_sclip32(vX, 15);
+        vY = __nds__v_sclip32(vY, 15);
+
+        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
+    }
+
+    for (; x1 < bw; x1++) { // scalar tail: the last column when bw is odd
+        double W = W0 + M[6] * x1;
+        W = W ? 1. / W : 0;
+        double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
+        double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
+        int X = saturate_cast<int>(fX);
+        int Y = saturate_cast<int>(fY);
+
+        xy[x1 * 2] = saturate_cast<short>(X);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+int warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) // interpolating perspective map for one block row: integer (x, y) pairs to xy, packed fractional index to alpha
+{
+    int x1 = 0;
+
+    const int INTER_MASK = INTER_TAB_SIZE - 1;
+    const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
+    for (; x1 + 1 < bw; x1 += 2) { // two columns per iteration; only full pairs, so an odd bw never touches column bw
+        double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
+        W1 = W1 ? INTER_TAB_SIZE / W1 : 0; // scaling by INTER_TAB_SIZE keeps INTER_BITS fractional bits in the integer result
+        W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
+        double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
+        double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
+        double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
+        double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
+
+        int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
+        int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
+
+        int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
+        int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
+
+        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
+
+        uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
+        *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
+    }
+
+    for (; x1 < bw; x1++) { // scalar tail: the last column when bw is odd
+        double W = W0 + M[6] * x1;
+        W = W ? INTER_TAB_SIZE / W : 0;
+        double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
+        double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
+        int X = saturate_cast<int>(fX);
+        int Y = saturate_cast<int>(fY);
+
+        xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
+        alpha[x1] = (short)((Y & INTER_MASK) * INTER_TAB_SIZE + (X & INTER_MASK));
+    }

-    Range range(0, dst.rows);
-    WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]));
-    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
    return CV_HAL_ERROR_OK;
 }

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1040,7 +1040,7 @@ foreach(hal ${OpenCV_HAL})
      ocv_hal_register(NDSRVP_HAL_LIBRARIES NDSRVP_HAL_HEADERS NDSRVP_HAL_INCLUDE_DIRS)
      list(APPEND OpenCV_USED_HAL "ndsrvp (ver ${NDSRVP_HAL_VERSION})")
    else()
-      message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not open, disabling ndsrvp...")
+      message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not enabled, disabling ndsrvp...")
    endif()
  elseif(hal STREQUAL "halrvv")
    if(";${CPU_BASELINE_FINAL};" MATCHES ";RVV;")
--- a/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
+++ b/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
@ -108,11 +108,19 @@ CV_EXPORTS void warpAffine(int src_type,
                           uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
                           const double M[6], int interpolation, int borderType, const double borderValue[4]);

+CV_EXPORTS void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
+
+CV_EXPORTS void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
+
 CV_EXPORTS void warpPerspective(int src_type,
                               const uchar * src_data, size_t src_step, int src_width, int src_height,
                               uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
                               const double M[9], int interpolation, int borderType, const double borderValue[4]);

+CV_EXPORTS void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw);
+
+CV_EXPORTS void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw);
+
 CV_EXPORTS void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
                            uchar * dst_data, size_t dst_step,
                            int width, int height,
--- a/modules/imgproc/include/opencv2/imgproc/hal/interface.h
+++ b/modules/imgproc/include/opencv2/imgproc/hal/interface.h
@ -12,6 +12,12 @@
 #define CV_HAL_INTER_CUBIC 2
 #define CV_HAL_INTER_AREA 3
 #define CV_HAL_INTER_LANCZOS4 4
+#define CV_HAL_INTER_LINEAR_EXACT 5
+#define CV_HAL_INTER_NEAREST_EXACT 6
+#define CV_HAL_INTER_MAX 7
+#define CV_HAL_WARP_FILL_OUTLIERS 8
+#define CV_HAL_WARP_INVERSE_MAP 16
+#define CV_HAL_WARP_RELATIVE_MAP 32
 //! @}

 //! @name Morphology operations
--- a/modules/imgproc/src/hal_replacement.hpp
+++ b/modules/imgproc/src/hal_replacement.hpp
@ -273,6 +273,29 @@ inline int hal_ni_resize(int src_type, const uchar *src_data, size_t src_step, i
   @sa cv::warpAffine, cv::hal::warpAffine
 */
 inline int hal_ni_warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpAffineBlocklineNN computes one row of affine-transformed coordinates for nearest-neighbor interpolation
+   @param adelta input array of fixed-point M0 * x values, one per column
+   @param bdelta input array of fixed-point M3 * x values, one per column
+   @param xy output interleaved (x', y') coordinates, two shorts per column
+   @param X0 input fixed-point M1 * y + M2 value, including the rounding offset
+   @param Y0 input fixed-point M4 * y + M5 value, including the rounding offset
+   @param bw length of the row (number of columns to produce)
+   @sa cv::warpAffineBlocklineNN, cv::hal::warpAffineBlocklineNN
+ */
+inline int hal_ni_warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpAffineBlockline computes one row of affine-transformed coordinates for interpolating modes
+   @param adelta input array of fixed-point M0 * x values, one per column
+   @param bdelta input array of fixed-point M3 * x values, one per column
+   @param xy output interleaved integer (x', y') coordinates, two shorts per column
+   @param alpha output interpolation index per column, packed as fy * INTER_TAB_SIZE + fx from the low INTER_BITS of (x', y')
+   @param X0 input fixed-point M1 * y + M2 value, including the rounding offset
+   @param Y0 input fixed-point M4 * y + M5 value, including the rounding offset
+   @param bw length of the row (number of columns to produce)
+   @sa cv::warpAffineBlockline, cv::hal::warpAffineBlockline
+ */
+inline int hal_ni_warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 /**
   @brief hal_warpPerspective
   @param src_type source and destination image type
@ -291,11 +314,38 @@ inline int hal_ni_warpAffine(int src_type, const uchar *src_data, size_t src_ste
   @sa cv::warpPerspective, cv::hal::warpPerspective
 */
 inline int hal_ni_warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpPerspectiveBlocklineNN computes one row of perspective-transformed coordinates for nearest-neighbor interpolation
+   @param M 3x3 matrix with transform coefficients (9 doubles, row-major)
+   @param xy output interleaved (x', y') coordinates, two shorts per column
+   @param X0 input M0 * x0 + M1 * y + M2 value, where x0 is the first column of the row
+   @param Y0 input M3 * x0 + M4 * y + M5 value, where x0 is the first column of the row
+   @param W0 input M6 * x0 + M7 * y + M8 value, where x0 is the first column of the row
+   @param bw length of the row (number of columns to produce)
+   @sa cv::warpPerspectiveBlocklineNN, cv::hal::warpPerspectiveBlocklineNN
+ */
+inline int hal_ni_warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpPerspectiveBlockline computes one row of perspective-transformed coordinates for interpolating modes
+   @param M 3x3 matrix with transform coefficients (9 doubles, row-major)
+   @param xy output interleaved integer (x', y') coordinates, two shorts per column
+   @param alpha output interpolation index per column, packed as fy * INTER_TAB_SIZE + fx from the low INTER_BITS of (x', y')
+   @param X0 input M0 * x0 + M1 * y + M2 value, where x0 is the first column of the row
+   @param Y0 input M3 * x0 + M4 * y + M5 value, where x0 is the first column of the row
+   @param W0 input M6 * x0 + M7 * y + M8 value, where x0 is the first column of the row
+   @param bw length of the row (number of columns to produce)
+   @sa cv::warpPerspectiveBlockline, cv::hal::warpPerspectiveBlockline
+ */
+inline int hal_ni_warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }

 //! @cond IGNORED
 #define cv_hal_resize hal_ni_resize
 #define cv_hal_warpAffine hal_ni_warpAffine
+#define cv_hal_warpAffineBlocklineNN hal_ni_warpAffineBlocklineNN
+#define cv_hal_warpAffineBlockline hal_ni_warpAffineBlockline
 #define cv_hal_warpPerspective hal_ni_warpPerspective
+#define cv_hal_warpPerspectiveBlocklineNN hal_ni_warpPerspectiveBlocklineNN
+#define cv_hal_warpPerspectiveBlockline hal_ni_warpPerspectiveBlockline
 //! @endcond

 /**
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -2268,16 +2268,7 @@ public:
        short *XY = __XY.data(), *A = __A.data();
        const int AB_BITS = MAX(10, (int)INTER_BITS);
        const int AB_SCALE = 1 << AB_BITS;
-        int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
-    #if CV_TRY_AVX2
-        bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
-    #endif
-    #if CV_TRY_SSE4_1
-        bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
-    #endif
-    #if CV_TRY_LASX
-        bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
-    #endif
+        int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, y1;

        int bh0 = std::min(BLOCK_SZ/2, dst.rows);
        int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
@ -2300,84 +2291,9 @@ public:
                    int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;

                    if( interpolation == INTER_NEAREST )
-                    {
-                        x1 = 0;
-                        #if CV_TRY_SSE4_1
-                        if( useSSE4_1 )
-                            opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
-                        else
-                        #endif
-                        {
-                            #if CV_SIMD128
-                            {
-                                v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
-                                int span = VTraits<v_uint16x8>::vlanes();
-                                for( ; x1 <= bw - span; x1 += span )
-                                {
-                                    v_int16x8 v_dst[2];
-                                    #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
-                                                                                    v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
-                                    v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
-                                    v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
-                                    #undef CV_CONVERT_MAP
-                                    v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
-                                }
-                            }
-                            #endif
-                            for( ; x1 < bw; x1++ )
-                            {
-                                int X = (X0 + adelta[x+x1]) >> AB_BITS;
-                                int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
-                                xy[x1*2] = saturate_cast<short>(X);
-                                xy[x1*2+1] = saturate_cast<short>(Y);
-                            }
-                        }
-                    }
+                        hal::warpAffineBlocklineNN(adelta + x, bdelta + x, xy, X0, Y0, bw);
                    else
-                    {
-                        short* alpha = A + y1*bw;
-                        x1 = 0;
-                        #if CV_TRY_AVX2
-                        if ( useAVX2 )
-                            x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
-                        #endif
-                        #if CV_TRY_LASX
-                        if ( useLASX )
-                            x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
-                        #endif
-                        #if CV_SIMD128
-                        {
-                            v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
-                            v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
-                            int span = VTraits<v_float32x4>::vlanes();
-                            for( ; x1 <= bw - span * 2; x1 += span * 2 )
-                            {
-                                v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1)));
-                                v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1)));
-                                v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1 + span)));
-                                v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1 + span)));
-
-                                v_int16x8 v_xy[2];
-                                v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
-                                v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
-                                v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
-
-                                v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
-                                v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
-                                v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
-                            }
-                        }
-                        #endif
-                        for( ; x1 < bw; x1++ )
-                        {
-                            int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
-                            int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
-                            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
-                            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
-                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
-                                    (X & (INTER_TAB_SIZE-1)));
-                        }
-                    }
+                        hal::warpAffineBlockline(adelta + x, bdelta + x, xy, A + y1*bw, X0, Y0, bw);
                }

                if( interpolation == INTER_NEAREST )
@ -2802,6 +2718,97 @@ void warpAffine(int src_type,
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
 }

+// Nearest-neighbour block line for warpAffine.
+// For every x1 in [0, bw) the scalar reference below writes the interleaved pair
+//   xy[2*x1]   = saturate_cast<short>((X0 + adelta[x1]) >> AB_BITS)
+//   xy[2*x1+1] = saturate_cast<short>((Y0 + bdelta[x1]) >> AB_BITS)
+//  adelta, bdelta - per-column int terms added to X0 / Y0; entries [0, bw) are read
+//  xy             - output buffer; 2*bw shorts are written
+//  X0, Y0         - terms that are constant for the whole line
+//  bw             - number of columns to process
+void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
+{
+    // Offer the call to a platform HAL first.
+    // NOTE(review): presumably CALL_HAL returns from this function when the HAL handles the call - confirm against the macro definition.
+    CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);
+
+    // Number of low bits shifted out of (X0 + adelta[x1]) and (Y0 + bdelta[x1]).
+    const int AB_BITS = MAX(10, (int)INTER_BITS);
+    int x1 = 0;
+    // When SSE4.1 is compiled in and available at run time, the dedicated routine is called
+    // and the generic block below is skipped entirely (it is the 'else' branch).
+    #if CV_TRY_SSE4_1
+    bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
+    if( useSSE4_1 )
+        opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
+    else
+    #endif
+    {
+        #if CV_SIMD128
+        {
+            v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
+            // One iteration handles 'span' columns (the lane count of v_uint16x8).
+            int span = VTraits<v_uint16x8>::vlanes();
+            for( ; x1 <= bw - span; x1 += span )
+            {
+                v_int16x8 v_dst[2];
+                // CV_CONVERT_MAP loads two v_int32x4 vectors (at offset and offset + 4), adds the
+                // broadcast X0/Y0, shifts right by AB_BITS and packs both halves into one v_int16x8.
+                // NOTE(review): the literal 4 presumably matches the v_int32x4 lane count - confirm.
+                #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
+                                                                v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
+                v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
+                v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
+                #undef CV_CONVERT_MAP
+                // Write x and y results interleaved, starting at xy[2*x1].
+                v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
+            }
+        }
+        #endif
+        // Scalar loop: handles the columns left over by the vector loop, or all of them
+        // when CV_SIMD128 is not enabled.
+        for( ; x1 < bw; x1++ )
+        {
+            int X = (X0 + adelta[x1]) >> AB_BITS;
+            int Y = (Y0 + bdelta[x1]) >> AB_BITS;
+            xy[x1*2] = saturate_cast<short>(X);
+            xy[x1*2+1] = saturate_cast<short>(Y);
+        }
+    }
+}
+
+// Interpolating block line for warpAffine.
+// For every x1 in [0, bw) the scalar reference below computes
+//   X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS)
+//   Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS)
+// and writes
+//   xy[2*x1]   = saturate_cast<short>(X >> INTER_BITS)
+//   xy[2*x1+1] = saturate_cast<short>(Y >> INTER_BITS)
+//   alpha[x1]  = (Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (X & (INTER_TAB_SIZE-1))
+//  adelta, bdelta - per-column int terms added to X0 / Y0; entries [0, bw) are read
+//  xy             - output buffer; 2*bw shorts are written
+//  alpha          - output buffer; bw shorts are written, built from the low bits of X and Y
+//  X0, Y0         - terms that are constant for the whole line
+//  bw             - number of columns to process
+void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
+{
+    // Offer the call to a platform HAL first.
+    // NOTE(review): presumably CALL_HAL returns from this function when the HAL handles the call - confirm against the macro definition.
+    CALL_HAL(warpAffineBlockline, cv_hal_warpAffineBlockline, adelta, bdelta, xy, alpha, X0, Y0, bw);
+
+    const int AB_BITS = MAX(10, (int)INTER_BITS);
+    int x1 = 0;
+    // The AVX2 / LASX helpers return an index that is stored in x1; the loops below resume
+    // from that index (presumably the number of columns already processed - confirm).
+    #if CV_TRY_AVX2
+    bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
+    if ( useAVX2 )
+        x1 = opt_AVX2::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw);
+    #endif
+    #if CV_TRY_LASX
+    bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
+    if ( useLASX )
+        x1 = opt_LASX::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw);
+    #endif
+    // This block is not an 'else' branch: it always runs, starting at the current x1
+    // (0 when neither helper above was called).
+    {
+        #if CV_SIMD128
+        {
+            v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
+            v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
+            // Each iteration handles 2*span columns: two v_int32x4 vectors for X and two for Y.
+            int span = VTraits<v_float32x4>::vlanes();
+            for( ; x1 <= bw - span * 2; x1 += span * 2 )
+            {
+                v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1)));
+                v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1)));
+                v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1 + span)));
+                v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1 + span)));
+
+                // Coordinates with the low INTER_BITS shifted out, packed to 16 bit and stored interleaved.
+                v_int16x8 v_xy[2];
+                v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
+                v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
+                v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
+
+                // Masked low bits of Y shifted left by INTER_BITS, OR-ed with the masked low bits of X.
+                // NOTE(review): matches the scalar '* INTER_TAB_SIZE +' form only if
+                // INTER_TAB_SIZE == (1 << INTER_BITS) - presumably true, confirm.
+                v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
+                v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
+                v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
+            }
+        }
+        #endif
+        // Scalar loop: handles whatever columns the vector paths above left unprocessed.
+        for( ; x1 < bw; x1++ )
+        {
+            int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS);
+            int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);
+            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
+            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
+            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
+                    (X & (INTER_TAB_SIZE-1)));
+        }
+    }
+}
+
 } // hal::
 } // cv::

@ -3204,12 +3211,6 @@ public:
        int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
        bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);

-        #if CV_TRY_SSE4_1
-        Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
-        if(CV_CPU_HAS_SUPPORT_SSE4_1)
-            pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
-        #endif
-
        for( y = range.start; y < range.end; y += bh0 )
        {
            for( x = 0; x < width; x += bw0 )
@ -3228,57 +3229,9 @@ public:
                    double W0 = M[6]*x + M[7]*(y + y1) + M[8];

                    if( interpolation == INTER_NEAREST )
-                    {
-                        #if CV_TRY_SSE4_1
-                        if (pwarp_impl_sse4)
-                            pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
-                        else
-                        #endif
-                        #if CV_SIMD128_64F
-                        WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
-                        #else
-                        for( int x1 = 0; x1 < bw; x1++ )
-                        {
-                            double W = W0 + M[6]*x1;
-                            W = W ? 1./W : 0;
-                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
-                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
-                            int X = saturate_cast<int>(fX);
-                            int Y = saturate_cast<int>(fY);
-
-                            xy[x1*2] = saturate_cast<short>(X);
-                            xy[x1*2+1] = saturate_cast<short>(Y);
-                        }
-                        #endif
-                    }
+                        hal::warpPerspectiveBlocklineNN(M, xy, X0, Y0, W0, bw);
                    else
-                    {
-                        short* alpha = A + y1*bw;
-
-                        #if CV_TRY_SSE4_1
-                        if (pwarp_impl_sse4)
-                            pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
-                        else
-                        #endif
-                        #if CV_SIMD128_64F
-                        WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
-                        #else
-                        for( int x1 = 0; x1 < bw; x1++ )
-                        {
-                            double W = W0 + M[6]*x1;
-                            W = W ? INTER_TAB_SIZE/W : 0;
-                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
-                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
-                            int X = saturate_cast<int>(fX);
-                            int Y = saturate_cast<int>(fY);
-
-                            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
-                            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
-                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
-                                                (X & (INTER_TAB_SIZE-1)));
-                        }
-                        #endif
-                    }
+                        hal::warpPerspectiveBlockline(M, xy, A + y1*bw, X0, Y0, W0, bw);
                }

                if( interpolation == INTER_NEAREST )
@ -3371,6 +3324,74 @@ void warpPerspective(int src_type,
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
 }

+// Nearest-neighbour block line for warpPerspective.
+// For every x1 in [0, bw) the scalar reference below computes
+//   W  = W0 + M[6]*x1, replaced by 1/W (or 0 when W is exactly 0)
+//   fX = (X0 + M[0]*x1)*W and fY = (Y0 + M[3]*x1)*W, each clamped to [INT_MIN, INT_MAX]
+// and writes the pair saturated to short into xy[2*x1] and xy[2*x1+1].
+//  M          - coefficient array; this function itself reads M[0], M[3] and M[6] and passes M on to the helpers
+//  xy         - output buffer; 2*bw shorts are written
+//  X0, Y0, W0 - numerator and denominator terms for column x1 == 0
+//  bw         - number of columns to process
+void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw)
+{
+    // Offer the call to a platform HAL first.
+    // NOTE(review): presumably CALL_HAL returns from this function when the HAL handles the call - confirm against the macro definition.
+    CALL_HAL(warpPerspectiveBlocklineNN, cv_hal_warpPerspectiveBlocklineNN, M, xy, X0, Y0, W0, bw);
+
+    // The SSE4.1 helper object is obtained on every call of this function; it may be empty,
+    // in which case the generic block below runs instead.
+    // NOTE(review): getImpl() used to be called once per invoker run, now once per block line -
+    // check whether it allocates and whether that cost matters.
+    #if CV_TRY_SSE4_1
+    Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
+    if(CV_CPU_HAS_SUPPORT_SSE4_1)
+        pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
+
+    if (pwarp_impl_sse4)
+        pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
+    else
+    #endif
+    {
+        // With 64-bit float universal intrinsics the whole line is delegated to the SIMD helper;
+        // otherwise the scalar loop is compiled in.
+        #if CV_SIMD128_64F
+        WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
+        #else
+        for( int x1 = 0; x1 < bw; x1++ )
+        {
+            double W = W0 + M[6]*x1;
+            // Guard against division by zero: a zero denominator maps the point to (0, 0).
+            W = W ? 1./W : 0;
+            // Clamp to the int range before converting so the conversion cannot overflow.
+            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
+            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
+            int X = saturate_cast<int>(fX);
+            int Y = saturate_cast<int>(fY);
+
+            xy[x1*2] = saturate_cast<short>(X);
+            xy[x1*2+1] = saturate_cast<short>(Y);
+        }
+        #endif
+    }
+}
+
+// Interpolating block line for warpPerspective.
+// For every x1 in [0, bw) the scalar reference below computes
+//   W  = W0 + M[6]*x1, replaced by INTER_TAB_SIZE/W (or 0 when W is exactly 0)
+//   X, Y = (X0 + M[0]*x1)*W and (Y0 + M[3]*x1)*W, clamped to [INT_MIN, INT_MAX] and converted to int
+// and writes
+//   xy[2*x1]   = saturate_cast<short>(X >> INTER_BITS)
+//   xy[2*x1+1] = saturate_cast<short>(Y >> INTER_BITS)
+//   alpha[x1]  = (Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (X & (INTER_TAB_SIZE-1))
+//  M          - coefficient array; this function itself reads M[0], M[3] and M[6] and passes M on to the helpers
+//  xy         - output buffer; 2*bw shorts are written
+//  alpha      - output buffer; bw shorts are written, built from the low bits of X and Y
+//  X0, Y0, W0 - numerator and denominator terms for column x1 == 0
+//  bw         - number of columns to process
+void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
+{
+    // Offer the call to a platform HAL first.
+    // NOTE(review): presumably CALL_HAL returns from this function when the HAL handles the call - confirm against the macro definition.
+    CALL_HAL(warpPerspectiveBlockline, cv_hal_warpPerspectiveBlockline, M, xy, alpha, X0, Y0, W0, bw);
+
+    // The SSE4.1 helper object is obtained on every call of this function; it may be empty,
+    // in which case the generic block below runs instead.
+    // NOTE(review): getImpl() used to be called once per invoker run, now once per block line -
+    // check whether it allocates and whether that cost matters.
+    #if CV_TRY_SSE4_1
+    Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
+    if(CV_CPU_HAS_SUPPORT_SSE4_1)
+        pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
+
+    if (pwarp_impl_sse4)
+        pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
+    else
+    #endif
+    {
+        // With 64-bit float universal intrinsics the whole line is delegated to the SIMD helper;
+        // otherwise the scalar loop is compiled in.
+        #if CV_SIMD128_64F
+        WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
+        #else
+        for( int x1 = 0; x1 < bw; x1++ )
+        {
+            double W = W0 + M[6]*x1;
+            // Scaling by INTER_TAB_SIZE keeps extra low bits in X and Y, which are split off
+            // into alpha below; a zero denominator maps the point to (0, 0).
+            W = W ? INTER_TAB_SIZE/W : 0;
+            // Clamp to the int range before converting so the conversion cannot overflow.
+            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
+            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
+            int X = saturate_cast<int>(fX);
+            int Y = saturate_cast<int>(fY);
+
+            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
+            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
+            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
+                                (X & (INTER_TAB_SIZE-1)));
+        }
+        #endif
+    }
+}
+
 } // hal::
 } // cv::