Merge pull request #26617 from CodeLinaro:xuezha_2ndPost

FastCV-based HAL for OpenCV acceleration 2ndpost-1 #26617 ### Detailed description: - Add parallel support for cv_hal_sobel - Add cv_hal_gaussianBlurBinomial and parallel support. - Add cv_hal_addWeighted8u and parallel support - Add cv_hal_warpPerspective and parallel support Requires binary from [opencv/opencv_3rdparty#90](https://github.com/opencv/opencv_3rdparty/pull/90) Related patch to opencv_contrib: [opencv/opencv_contrib#3844](https://github.com/opencv/opencv_contrib/pull/3844) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2025-07-24 14:06:27 +08:00 · 2024-12-18 14:34:13 +08:00 · 2024-12-18 14:34:13 +08:00 · 1c28a98b34
commit 1c28a98b34
parent 23f6a9ee3e
6 changed files with 648 additions and 73 deletions
--- a/3rdparty/fastcv/fastcv.cmake
+++ b/3rdparty/fastcv/fastcv.cmake
@ -1,23 +1,23 @@
 function(download_fastcv root_dir)

  # Commit SHA in the opencv_3rdparty repo
-  set(FASTCV_COMMIT "b8f0d48fa9dbebb0237d3e0abd206f9930c89db6")
+  set(FASTCV_COMMIT "dc5d58018f3af915a8d209386d2c58c0501c0f2c")

  # Define actual FastCV versions
  if(ANDROID)
    if(AARCH64)
      message(STATUS "Download FastCV for Android aarch64")
-      set(FCV_PACKAGE_NAME  "fastcv_android_aarch64_2024_10_24.tgz")
-      set(FCV_PACKAGE_HASH  "14486af00dc0282dac591dc9ccdd957e")
+      set(FCV_PACKAGE_NAME  "fastcv_android_aarch64_2024_12_11.tgz")
+      set(FCV_PACKAGE_HASH  "9dac41e86597305f846212dae31a4a88")
    else()
      message(STATUS "Download FastCV for Android armv7")
-      set(FCV_PACKAGE_NAME  "fastcv_android_arm32_2024_10_24.tgz")
-      set(FCV_PACKAGE_HASH  "b5afadd5a5b55f8f6c2e7361f225fa21")
+      set(FCV_PACKAGE_NAME  "fastcv_android_arm32_2024_12_11.tgz")
+      set(FCV_PACKAGE_HASH  "fe2d30334180b17e3031eee92aac43b6")
    endif()
  elseif(UNIX AND NOT APPLE AND NOT IOS AND NOT XROS)
    if(AARCH64)
-      set(FCV_PACKAGE_NAME  "fastcv_linux_aarch64_2024_10_24.tgz")
-      set(FCV_PACKAGE_HASH  "d15c7b77f2d3577ba46bd94e6cf15230")
+      set(FCV_PACKAGE_NAME  "fastcv_linux_aarch64_2024_12_11.tgz")
+      set(FCV_PACKAGE_HASH  "7b33ad833e6f15ab6d4ec64fa3c17acd")
    else()
      message("FastCV: fastcv lib for 32-bit Linux is not supported for now!")
    endif()
--- a/3rdparty/fastcv/include/fastcv_hal_core.hpp
+++ b/3rdparty/fastcv/include/fastcv_hal_core.hpp
@ -24,6 +24,8 @@
 #define cv_hal_flip                 fastcv_hal_flip
 #undef  cv_hal_rotate90
 #define cv_hal_rotate90             fastcv_hal_rotate
+#undef  cv_hal_addWeighted8u
+#define cv_hal_addWeighted8u        fastcv_hal_addWeighted8u

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /// @brief look-up table transform of an array.
@ -152,4 +154,27 @@ int fastcv_hal_rotate(
    size_t          dst_step,
    int             angle);

+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief weighted sum of two arrays using formula: dst[i] = a * src1[i] + b * src2[i] + c
+/// @param src1_data first source image data
+/// @param src1_step first source image step
+/// @param src2_data second source image data
+/// @param src2_step second source image step
+/// @param dst_data  destination image data
+/// @param dst_step  destination image step
+/// @param width     width of the images
+/// @param height    height of the images
+/// @param scalars   numbers a, b, and c (scalars[0], scalars[1], scalars[2])
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_addWeighted8u(
+    const uchar*    src1_data,
+    size_t          src1_step,
+    const uchar*    src2_data,
+    size_t          src2_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    const double    scalars[3]);
+
+
 #endif
--- a/3rdparty/fastcv/include/fastcv_hal_imgproc.hpp
+++ b/3rdparty/fastcv/include/fastcv_hal_imgproc.hpp
@ -12,10 +12,14 @@
 #define cv_hal_medianBlur           fastcv_hal_medianBlur
 #undef  cv_hal_sobel
 #define cv_hal_sobel                fastcv_hal_sobel
-#undef cv_hal_boxFilter
+#undef  cv_hal_boxFilter
 #define cv_hal_boxFilter            fastcv_hal_boxFilter
-#undef cv_hal_adaptiveThreshold
+#undef  cv_hal_adaptiveThreshold
 #define cv_hal_adaptiveThreshold    fastcv_hal_adaptiveThreshold
+#undef  cv_hal_gaussianBlurBinomial
+#define cv_hal_gaussianBlurBinomial fastcv_hal_gaussianBlurBinomial
+#undef  cv_hal_warpPerspective
+#define cv_hal_warpPerspective      fastcv_hal_warpPerspective

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /// @brief Calculate medianBlur filter
@ -148,4 +152,69 @@ int fastcv_hal_adaptiveThreshold(
    int             blockSize,
    double          C);

+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Blurs an image using a Gaussian filter.
+///        Installed as the replacement for the cv_hal_gaussianBlurBinomial hook.
+/// @param src_data         Source image data
+/// @param src_step         Source image step
+/// @param dst_data         Destination image data
+/// @param dst_step         Destination image step
+/// @param width            Source image width
+/// @param height           Source image height
+/// @param depth            Depth of source and destination image
+/// @param cn               Number of channels
+/// @param margin_left      Left margins for source image
+/// @param margin_top       Top margins for source image
+/// @param margin_right     Right margins for source image
+/// @param margin_bottom    Bottom margins for source image
+/// @param ksize            Kernel size
+/// @param border_type      Border type
+/// NOTE(review): the return value presumably follows the CV_HAL_* status convention used by the other
+///               fastcv_hal_* functions - confirm against the implementation.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_gaussianBlurBinomial(
+    const uchar*    src_data,
+    size_t          src_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    int             depth,
+    int             cn,
+    size_t          margin_left,
+    size_t          margin_top,
+    size_t          margin_right,
+    size_t          margin_bottom,
+    size_t          ksize,
+    int             border_type);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Applies a perspective transformation to an image.
+///        Installed as the replacement for the cv_hal_warpPerspective hook.
+///
+/// @param src_type         Source and destination image type
+/// @param src_data         Source image data
+/// @param src_step         Source image step
+/// @param src_width        Source image width
+/// @param src_height       Source image height
+/// @param dst_data         Destination image data
+/// @param dst_step         Destination image step
+/// @param dst_width        Destination image width
+/// @param dst_height       Destination image height
+/// @param M                3x3 matrix with transform coefficients, passed as 9 doubles
+/// @param interpolation    Interpolation mode (CV_HAL_INTER_NEAREST, ...)
+/// @param border_type      Border processing mode (CV_HAL_BORDER_REFLECT, ...)
+/// @param border_value     Values to use for CV_HAL_BORDER_CONSTANT mode, passed as 4 doubles
+/// NOTE(review): the return value presumably follows the CV_HAL_* status convention used by the other
+///               fastcv_hal_* functions - confirm against the implementation.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_warpPerspective(
+    int             src_type,
+    const uchar*    src_data,
+    size_t          src_step,
+    int             src_width,
+    int             src_height,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             dst_width,
+    int             dst_height,
+    const double    M[9],
+    int             interpolation,
+    int             border_type,
+    const double    border_value[4]);
+
 #endif
--- a/3rdparty/fastcv/include/fastcv_hal_utils.hpp
+++ b/3rdparty/fastcv/include/fastcv_hal_utils.hpp
@ -29,7 +29,7 @@
            status == FASTCV_EHWGPU)                                        \
    {                                                                       \
        CV_LOG_DEBUG(NULL, "FastCV status:"<<getFastCVErrorString(status)   \
-            <<"Switching to default OpenCV solution!");                     \
+            <<", Switching to default OpenCV solution!");                   \
        return CV_HAL_ERROR_NOT_IMPLEMENTED;                                \
    }                                                                       \
    else                                                                    \
@ -38,7 +38,7 @@
        return CV_HAL_ERROR_UNKNOWN;                                        \
    }                                                                       \
 }
- 
+
 #define CV_HAL_RETURN_NOT_IMPLEMENTED(reason)                           \
 {                                                                       \
    CV_LOG_DEBUG(NULL,"Switching to default OpenCV\nInfo: "<<reason);   \
@ -47,6 +47,7 @@

 #define FCV_KernelSize_SHIFT 3
 #define FCV_MAKETYPE(ksize,depth) ((ksize<<FCV_KernelSize_SHIFT) + depth)
+// Approximate floating-point equality: true when |val1 - val2| < FLT_EPSILON.
+// Both arguments are parenthesised so that expression arguments (e.g. "a + b") are subtracted as a whole.
+#define FCV_CMP_EQ(val1,val2) (fabs((val1) - (val2)) < FLT_EPSILON)

 const char* getFastCVErrorString(int status);
 const char* borderToString(int border);
--- a/3rdparty/fastcv/src/fastcv_hal_core.cpp
+++ b/3rdparty/fastcv/src/fastcv_hal_core.cpp
@ -38,15 +38,15 @@ private:
 };

 int fastcv_hal_lut(
-    const uchar*    src_data, 
-    size_t          src_step, 
-    size_t          src_type, 
-    const uchar*    lut_data, 
-    size_t          lut_channel_size, 
-    size_t          lut_channels, 
-    uchar*          dst_data, 
-    size_t          dst_step, 
-    int             width, 
+    const uchar*    src_data,
+    size_t          src_step,
+    size_t          src_type,
+    const uchar*    lut_data,
+    size_t          lut_channel_size,
+    size_t          lut_channels,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
    int             height)
 {
    if((width*height)<=(320*240))
@ -69,10 +69,10 @@ int fastcv_hal_lut(
 }

 int fastcv_hal_normHammingDiff8u(
-    const uchar*    a, 
-    const uchar*    b, 
-    int             n, 
-    int             cellSize, 
+    const uchar*    a,
+    const uchar*    b,
+    int             n,
+    int             cellSize,
    int*            result)
 {
    fcvStatus           status;
@ -169,15 +169,15 @@ int fastcv_hal_transpose2d(
    switch (element_size)
    {
        case 1:
-            status = fcvTransposeu8_v2(src_data, src_width, src_height, src_step, 
+            status = fcvTransposeu8_v2(src_data, src_width, src_height, src_step,
                                       dst_data, dst_step);
            break;
        case 2:
-            status = fcvTransposeu16_v2((const uint16_t*)src_data, src_width, src_height, 
+            status = fcvTransposeu16_v2((const uint16_t*)src_data, src_width, src_height,
                                       src_step, (uint16_t*)dst_data, dst_step);
            break;
        case 4:
-            status = fcvTransposef32_v2((const float32_t*)src_data, src_width, src_height, 
+            status = fcvTransposef32_v2((const float32_t*)src_data, src_width, src_height,
                                       src_step, (float32_t*)dst_data, dst_step);
            break;
        default:
@ -205,18 +205,18 @@ int fastcv_hal_meanStdDev(
    if(src_type != CV_8UC1)
    {
        CV_HAL_RETURN_NOT_IMPLEMENTED("src type not supported");
-    }  
+    }
    else if(mask != nullptr)
    {
        CV_HAL_RETURN_NOT_IMPLEMENTED("mask not supported");
-    }  
+    }
    else if(mean_val == nullptr && stddev_val == nullptr)
    {
        CV_HAL_RETURN_NOT_IMPLEMENTED("null ptr for mean and stddev");
    }
-       
+
    float32_t mean, variance;
-        
+
    fcvStatus status = fcvImageIntensityStats_v2(src_data, src_step, 0, 0, width, height,
                                   &mean, &variance, FASTCV_BIASED_VARIANCE_ESTIMATOR);

@ -278,7 +278,7 @@ int fastcv_hal_flip(
        status = fcvFlipRGB888u8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data, dst_step, dir);
    else
        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Data type:%d is not supported, Switching to default OpenCV solution!", src_type));
-    
+
    CV_HAL_RETURN(status, hal_flip);
 }

@ -294,7 +294,7 @@ int fastcv_hal_rotate(
 {
    if((src_width*src_height)<(120*80))
        CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution for lower resolution!");
-    
+
    fcvStatus           status;
    fcvRotateDegree     degree;

@ -324,11 +324,63 @@ int fastcv_hal_rotate(
            status = fcvRotateImageu8(src_data, src_width, src_height, src_step, dst_data, dst_step, degree);
            break;
        case CV_8UC2:
-            status = fcvRotateImageInterleavedu8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data, 
+            status = fcvRotateImageInterleavedu8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data,
                                                    dst_step, degree);
            break;
        default:
            CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("src_type:%d is not supported", src_type));
    }
    CV_HAL_RETURN(status, hal_rotate);
+}
+
+// Weighted sum of two 8-bit images through fcvAddWeightedu8_v2, split across OpenCV worker threads.
+// scalars[0] and scalars[1] are the per-image weights and scalars[2] is passed as the third (gamma) coefficient.
+// Coefficients outside the accepted ranges make the function return through CV_HAL_RETURN_NOT_IMPLEMENTED.
+int fastcv_hal_addWeighted8u(
+    const uchar*    src1_data,
+    size_t          src1_step,
+    const uchar*    src2_data,
+    size_t          src2_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    const double    scalars[3])
+{
+    // Accepted ranges: weights in [-128, 128), gamma in [-2^23, 2^23).
+    const bool alphaOutside = (scalars[0] < -128.0f) || (scalars[0] >= 128.0f);
+    const bool betaOutside  = (scalars[1] < -128.0f) || (scalars[1] >= 128.0f);
+    const bool gammaOutside = (scalars[2] < -(1<<23)) || (scalars[2] >= (1<<23));
+
+    if (alphaOutside || betaOutside || gammaOutside)
+        CV_HAL_RETURN_NOT_IMPLEMENTED(
+            cv::format("Alpha:%f,Beta:%f,Gamma:%f is not supported because it's too large or too small\n",
+            scalars[0],scalars[1],scalars[2]));
+
+    INITIALIZATION_CHECK;
+
+    // NOTE(review): the result of fcvAddWeightedu8_v2 is not collected below, so this stays FASTCV_SUCCESS.
+    fcvStatus status = FASTCV_SUCCESS;
+
+    if (height != 1)
+    {
+        // Regular image: every worker handles a band of complete rows.
+        cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &rows){
+            const int      bandRows = rows.end - rows.start;
+            const uint8_t *inA      = src1_data + rows.start * src1_step;
+            const uint8_t *inB      = src2_data + rows.start * src2_step;
+            uint8_t       *out      = dst_data + rows.start * dst_step;
+            fcvAddWeightedu8_v2(inA, width, bandRows, src1_step, inB, src2_step,
+                scalars[0], scalars[1], scalars[2], out, dst_step);
+            });
+    }
+    else
+    {
+        // Single-row image: the row itself is cut into column segments, so the steps are reset to the row length.
+        src1_step = width*sizeof(uchar);
+        src2_step = width*sizeof(uchar);
+        dst_step  = width*sizeof(uchar);
+
+        cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &cols){
+            const int      segmentWidth = cols.end - cols.start;
+            const uint8_t *inA          = src1_data + cols.start;
+            const uint8_t *inB          = src2_data + cols.start;
+            uint8_t       *out          = dst_data + cols.start;
+            fcvAddWeightedu8_v2(inA, segmentWidth, height, src1_step, inB, src2_step,
+                scalars[0], scalars[1], scalars[2], out, dst_step);
+            });
+    }
+
+    CV_HAL_RETURN(status, hal_addWeighted8u_v2);
 }
--- a/3rdparty/fastcv/src/fastcv_hal_imgproc.cpp
+++ b/3rdparty/fastcv/src/fastcv_hal_imgproc.cpp
@ -34,7 +34,7 @@ int fastcv_hal_medianBlur(

    INITIALIZATION_CHECK;

-    fcvStatus status;
+    fcvStatus status = FASTCV_SUCCESS;
    int fcvFuncType = FCV_MAKETYPE(ksize,depth);

    switch (fcvFuncType)
@ -52,6 +52,101 @@ int fastcv_hal_medianBlur(
    CV_HAL_RETURN(status, hal_medianBlur);
 }

+// Parallel loop body that applies the FastCV Sobel filter to horizontal stripes of an image.
+// Each stripe is filtered together with up to ksize/2 neighbouring rows above and below it, so rows
+// that have real neighbours are computed from them; only the rows of the stripe itself are copied to dst.
+class FcvSobelLoop_Invoker : public cv::ParallelLoopBody
+{
+    public:
+
+    // _src and _dst are stored by reference and must outlive the invoker.
+    // (_dx,_dy) == (1,0) writes the x-derivative, (0,1) writes the y-derivative.
+    // _fcvBorder / _fcvBorderValue are forwarded to the FastCV Sobel functions.
+    FcvSobelLoop_Invoker(const cv::Mat& _src, cv::Mat& _dst, int _dx, int _dy, int _ksize, fcvBorderType _fcvBorder,
+        int _fcvBorderValue) : cv::ParallelLoopBody(), src(_src), dst(_dst), dx(_dx), dy(_dy), ksize(_ksize),
+        fcvBorder(_fcvBorder), fcvBorderValue(_fcvBorderValue)
+    {
+        width       = src.cols;
+        height      = src.rows;
+        halfKernelSize  = ksize/2;
+        fcvFuncType = FCV_MAKETYPE(ksize,src.depth());
+    }
+
+    // Filters rows [range.start, range.end) of src into the same rows of dst.
+    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        const int rangeHeight = range.end-range.start;
+
+        // Need additional lines to be border. The padding is clamped to the rows that really exist:
+        // a stripe lying closer than ksize/2 rows to the image edge must not request a ROI outside src.
+        // When the padding reaches the image edge, FastCV's own border handling applies there.
+        const int rowsAbove   = range.start;
+        const int rowsBelow   = height - range.end;
+        const int topLines    = rowsAbove < halfKernelSize ? rowsAbove : halfKernelSize;
+        const int bottomLines = rowsBelow < halfKernelSize ? rowsBelow : halfKernelSize;
+        const int paddedHeight = topLines + rangeHeight + bottomLines;
+
+        cv::Mat srcPadded = src(cv::Rect(0, range.start-topLines, width, paddedHeight));
+        cv::Mat dstPadded = cv::Mat(paddedHeight, width, dst.depth());
+
+        // Exactly one of the two output pointers is set; the other stays null.
+        int16_t *dxBuffer = nullptr, *dyBuffer = nullptr;
+
+        if ((dx == 1) && (dy == 0))
+        {
+            dxBuffer = (int16_t*)dstPadded.data;
+        }
+        else if ((dx == 0) && (dy == 1))
+        {
+            dyBuffer = (int16_t*)dstPadded.data;
+        }
+
+        const uint8_t borderValue = static_cast<uint8_t>(fcvBorderValue);
+
+        switch (fcvFuncType)
+        {
+            case FCV_MAKETYPE(3,CV_8U):
+            {
+                fcvFilterSobel3x3u8s16(srcPadded.data, width, paddedHeight, srcPadded.step, dxBuffer, dyBuffer, dstPadded.step,
+                    fcvBorder, borderValue);
+                break;
+            }
+            case FCV_MAKETYPE(5,CV_8U):
+            {
+                fcvFilterSobel5x5u8s16(srcPadded.data, width, paddedHeight, srcPadded.step, dxBuffer, dyBuffer, dstPadded.step,
+                    fcvBorder, borderValue);
+                break;
+            }
+            case FCV_MAKETYPE(7,CV_8U):
+            {
+                fcvFilterSobel7x7u8s16(srcPadded.data, width, paddedHeight, srcPadded.step, dxBuffer, dyBuffer, dstPadded.step,
+                    fcvBorder, borderValue);
+                break;
+            }
+            default:
+                CV_Error(cv::Error::StsBadArg, cv::format("Ksize:%d, src_depth:%s is not supported",
+                    ksize, cv::depthToString(src.depth())));
+                break;
+        }
+
+        // Only copy center part back to output image and ignore the padded lines
+        cv::Mat temp1 = dstPadded(cv::Rect(0, topLines, width, rangeHeight));
+        cv::Mat temp2 = dst(cv::Rect(0, range.start, width, rangeHeight));
+        temp1.copyTo(temp2);
+    }
+
+    private:
+    const cv::Mat&  src;
+    cv::Mat&        dst;
+    int             width;
+    int             height;
+    int             dx;
+    int             dy;
+    int             ksize;
+    int             halfKernelSize;
+    int             fcvFuncType;
+    fcvBorderType   fcvBorder;
+    int             fcvBorderValue;
+
+    // Copying is intentionally disabled (declared, never defined).
+    FcvSobelLoop_Invoker(const FcvSobelLoop_Invoker &);  // = delete;
+    const FcvSobelLoop_Invoker& operator= (const FcvSobelLoop_Invoker &);  // = delete;
+};
+
 int fastcv_hal_sobel(
    const uchar*    src_data,
    size_t          src_step,
@ -73,10 +168,13 @@ int fastcv_hal_sobel(
    double          delta,
    int             border_type)
 {
-
-    if(scale != 1.0f || delta != 0.0f)
+    if (!(FCV_CMP_EQ(scale, 1.0f) && FCV_CMP_EQ(delta, 0.0f)))
        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Scale:%f, delta:%f is not supported", scale, delta));

+    // Only support one direction derivatives and the order is 1.(dx=1 && dy=0)||(dx=0 && dy=1)
+    if ((dx + dy == 0) || (dx + dy > 1))
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Dx:%d Dy:%d is not supported",dx, dy));
+
    // Do not support inplace case
    if (src_data == dst_data)
        CV_HAL_RETURN_NOT_IMPLEMENTED("Inplace is not supported");
@ -89,10 +187,6 @@ int fastcv_hal_sobel(
    if (cn != 1)
        CV_HAL_RETURN_NOT_IMPLEMENTED("Multi-channels is not supported");

-    // Do not support for ROI case
-    if((margin_left!=0) || (margin_top != 0) || (margin_right != 0) || (margin_bottom !=0))
-        CV_HAL_RETURN_NOT_IMPLEMENTED("ROI is not supported");
-
    // 1. When ksize <= 0, OpenCV will use Scharr Derivatives
    // 2. When ksize == 1, OpenCV will use 3×1 or 1×3 kernel(no Gaussian smoothing is done)
    // FastCV doesn't support above two situation
@ -103,26 +197,16 @@ int fastcv_hal_sobel(
    if (dst_depth != CV_16S)
        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Dst depth:%s is not supported", cv::depthToString(dst_depth)));

+    // Only support following ksize and src_depth as input
+    if ((FCV_MAKETYPE(ksize,src_depth) != FCV_MAKETYPE(3, CV_8U))   &&
+        (FCV_MAKETYPE(ksize,src_depth) != FCV_MAKETYPE(5, CV_8U))   &&
+        (FCV_MAKETYPE(ksize,src_depth) != FCV_MAKETYPE(7, CV_8U)))
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Ksize:%d, src_depth:%s is not supported", ksize, cv::depthToString(src_depth)));
+
    INITIALIZATION_CHECK;

-    // Only support one direction derivatives and the order is 1.(dx=1 && dy=0)||(dx=0 && dy=1)
-    int16_t *dxBuffer, *dyBuffer;
-
-    if ((dx == 1) && (dy == 0))
-    {
-        dxBuffer = (int16_t*)dst_data;
-        dyBuffer = NULL;
-    }
-    else if ((dx == 0) && (dy == 1))
-    {
-        dxBuffer = NULL;
-        dyBuffer = (int16_t*)dst_data;
-    }
-    else
-        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Dx:%d Dy:%d is not supported",dx, dy));
-
-    fcvStatus       status;
-    fcvBorderType   fcvBorder;
+    fcvStatus       status    = FASTCV_SUCCESS;
+    fcvBorderType   fcvBorder = FASTCV_BORDER_CONSTANT;

    switch (border_type)
    {
@ -141,28 +225,89 @@ int fastcv_hal_sobel(
            CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Border type:%s is not supported", borderToString(border_type)));
    }

-    int fcvFuncType = FCV_MAKETYPE(ksize,src_depth);
+    cv::Mat src = cv::Mat(height, width, CV_MAKE_TYPE(src_depth, 1), (void*)src_data, src_step);
+    cv::Mat dst = cv::Mat(height, width, CV_MAKE_TYPE(dst_depth, 1), (void*)dst_data, dst_step);

-    switch (fcvFuncType)
+    // ROI case: the source has valid pixels outside the processed rectangle on at least one side.
+    // All four margins are tested so that a right-only margin is not silently ignored.
+    if (margin_left||margin_top||margin_right||margin_bottom)
    {
-        case FCV_MAKETYPE(3,CV_8U):
+        // Need additional lines to be border.
+        // NOTE(review): every non-zero margin is assumed to provide at least ksize/2 pixels; a smaller
+        // margin would make the filter read outside the parent image - confirm against the callers.
+        int paddedHeight = height, paddedWidth = width, startX = 0, startY = 0;
+
+        if(margin_left != 0)
        {
-            status = fcvFilterSobel3x3u8s16(src_data, width, height, src_step, dxBuffer, dyBuffer, dst_step, fcvBorder, 0);
-            break;
+            src_data    -= ksize/2;
+            paddedWidth += ksize/2;
+            startX      =  ksize/2;
        }
-        case FCV_MAKETYPE(5,CV_8U):
+
+        if(margin_top != 0)
        {
-            status = fcvFilterSobel5x5u8s16(src_data, width, height, src_step, dxBuffer, dyBuffer, dst_step, fcvBorder, 0);
-            break;
+            src_data     -= (ksize/2) * src_step;
+            paddedHeight += ksize/2;
+            startY       =  ksize/2;
        }
-        case FCV_MAKETYPE(7,CV_8U):
+
+        if(margin_right != 0)
        {
-            status = fcvFilterSobel7x7u8s16(src_data, width, height, src_step, dxBuffer, dyBuffer, dst_step, fcvBorder, 0);
-            break;
+            paddedWidth += ksize/2;
        }
-        default:
-            CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Ksize:%d, src_depth:%s, border type:%s is not supported",
-                ksize, cv::depthToString(src_depth), borderToString(border_type)));
+
+        if(margin_bottom != 0)
+        {
+            paddedHeight += ksize/2;
+        }
+
+        // The temporary receives the filter output (int16 samples) and is copied into dst afterwards,
+        // so it must have the destination depth, not the 8-bit source depth.
+        cv::Mat padded(paddedHeight, paddedWidth, dst_depth);
+        int16_t *dxBuffer = nullptr, *dyBuffer = nullptr;
+
+        if ((dx == 1) && (dy == 0))
+        {
+            dxBuffer = (int16_t*)padded.data;
+            dyBuffer = NULL;
+        }
+        else if ((dx == 0) && (dy == 1))
+        {
+            dxBuffer = NULL;
+            dyBuffer = (int16_t*)padded.data;
+        }
+
+        int fcvFuncType = FCV_MAKETYPE(ksize, src_depth);
+
+        switch (fcvFuncType)
+        {
+            case FCV_MAKETYPE(3,CV_8U):
+            {
+                status = fcvFilterSobel3x3u8s16(src_data, paddedWidth, paddedHeight, src_step, dxBuffer, dyBuffer, padded.step,
+                    fcvBorder, 0);
+                break;
+            }
+            case FCV_MAKETYPE(5,CV_8U):
+            {
+                status = fcvFilterSobel5x5u8s16(src_data, paddedWidth, paddedHeight, src_step, dxBuffer, dyBuffer, padded.step,
+                    fcvBorder, 0);
+                break;
+            }
+            case FCV_MAKETYPE(7,CV_8U):
+            {
+                status = fcvFilterSobel7x7u8s16(src_data, paddedWidth, paddedHeight, src_step, dxBuffer, dyBuffer, padded.step,
+                    fcvBorder, 0);
+                break;
+            }
+            default:
+                CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Ksize:%d, src_depth:%s is not supported",
+                    ksize, cv::depthToString(src_depth)));
+                break;
+        }
+
+        // Keep only the requested rectangle; the extra border rows/columns are dropped.
+        cv::Mat temp1 = padded(cv::Rect(startX, startY, width, height));
+        temp1.copyTo(dst);
+    }
+    else
+    {
+        // No usable margins: filter the image in horizontal stripes on the OpenCV thread pool.
+        int nThreads = cv::getNumThreads();
+        int nStripes = nThreads > 1 ? 3*nThreads : 1;
+
+        cv::parallel_for_(cv::Range(0, height), FcvSobelLoop_Invoker(src, dst, dx, dy, ksize, fcvBorder, 0), nStripes);
    }

    CV_HAL_RETURN(status, hal_sobel);
@ -316,3 +461,286 @@ int fastcv_hal_adaptiveThreshold(

    CV_HAL_RETURN(status,hal_adaptiveThreshold);
 }
+
+/**
+ * Parallel loop body that applies the FastCV 3x3 / 5x5 Gaussian filter to one
+ * horizontal stripe of an 8-bit single-channel image.
+ *
+ * Each stripe is filtered together with up to ksize/2 neighbouring rows above and
+ * below it, so that rows on a stripe boundary are computed from real image data
+ * instead of from a synthetic border. Only the rows belonging to the stripe are
+ * copied into the destination.
+ *
+ * The object keeps references to the source and destination matrices; both must
+ * outlive the parallel_for_ call that uses this body.
+ */
+class FcvGaussianBlurLoop_Invoker : public cv::ParallelLoopBody
+{
+    public:
+
+    /**
+     * @param _src            source image (rows x cols are taken from it)
+     * @param _dst            destination image, written stripe by stripe
+     * @param _ksize          kernel size; 3 and 5 are the sizes dispatched in operator()
+     * @param _fcvBorder      FastCV border mode forwarded to the filter
+     * @param _fcvBorderValue border value forwarded to the filter
+     */
+    FcvGaussianBlurLoop_Invoker(const cv::Mat& _src, cv::Mat& _dst, int _ksize, fcvBorderType _fcvBorder, int _fcvBorderValue) :
+        cv::ParallelLoopBody(), src(_src),dst(_dst), ksize(_ksize), fcvBorder(_fcvBorder), fcvBorderValue(_fcvBorderValue)
+    {
+        width       = src.cols;
+        height      = src.rows;
+        halfKernelSize   = ksize / 2;
+        fcvFuncType = FCV_MAKETYPE(ksize, src.depth());
+    }
+
+    /** Filters rows [range.start, range.end) of the source into the destination. */
+    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        const int rangeHeight = range.end - range.start;
+
+        // Extra rows taken around the stripe. They are clamped to the rows that really
+        // exist: a stripe lying closer than halfKernelSize to the top/bottom of the image
+        // must not request an ROI outside the source. When fewer rows are available the
+        // padded block touches the real image edge, so the FastCV border mode is applied
+        // exactly where it would be for the whole image.
+        const int rowsAbove   = range.start;
+        const int rowsBelow   = height - range.end;
+        const int topLines    = (rowsAbove < halfKernelSize) ? rowsAbove : halfKernelSize;
+        const int bottomLines = (rowsBelow < halfKernelSize) ? rowsBelow : halfKernelSize;
+        const int paddedHeight = topLines + rangeHeight + bottomLines;
+
+        const cv::Mat srcPadded = src(cv::Rect(0, range.start - topLines, width, paddedHeight));
+        cv::Mat dstPadded       = cv::Mat(paddedHeight, width, CV_8U);
+
+        // Use the border value handed to the constructor rather than a hard-coded 0
+        if (fcvFuncType == FCV_MAKETYPE(3,CV_8U))
+            fcvFilterGaussian3x3u8_v4(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step,
+                fcvBorder, fcvBorderValue);
+        else if (fcvFuncType == FCV_MAKETYPE(5,CV_8U))
+            fcvFilterGaussian5x5u8_v3(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step,
+                fcvBorder, fcvBorderValue);
+
+        // Only copy center part back to output image and ignore the padded lines
+        cv::Mat temp1 = dstPadded(cv::Rect(0, topLines, width, rangeHeight));
+        cv::Mat temp2 = dst(cv::Rect(0, range.start, width, rangeHeight));
+        temp1.copyTo(temp2);
+    }
+
+    private:
+    const cv::Mat&  src;
+    cv::Mat&        dst;
+    int             width;
+    int             height;
+    const int       ksize;
+    int             halfKernelSize;
+    int             fcvFuncType;
+    fcvBorderType   fcvBorder;
+    int             fcvBorderValue;
+
+    FcvGaussianBlurLoop_Invoker(const FcvGaussianBlurLoop_Invoker &);  // = delete;
+    const FcvGaussianBlurLoop_Invoker& operator= (const FcvGaussianBlurLoop_Invoker &);  // = delete;
+};
+
+/**
+ * HAL entry point for the binomial Gaussian blur, backed by FastCV.
+ *
+ * Handles only 8-bit single-channel images with a 3x3 or 5x5 kernel, no ROI margins,
+ * distinct source/destination buffers and an image larger than the kernel in both
+ * dimensions. Every other configuration is reported as "not implemented" through
+ * CV_HAL_RETURN_NOT_IMPLEMENTED.
+ *
+ * The work is split into row stripes and run through cv::parallel_for_.
+ *
+ * @param src_data,src_step   source buffer and its row stride in bytes
+ * @param dst_data,dst_step   destination buffer and its row stride in bytes
+ * @param width,height        image size in pixels
+ * @param depth               OpenCV depth of the image (CV_8U is the accepted one)
+ * @param cn                  number of channels (must be 1)
+ * @param margin_left,margin_top,margin_right,margin_bottom  ROI margins (must all be 0)
+ * @param ksize               kernel size (3 or 5)
+ * @param border_type         OpenCV border mode
+ * @return result of CV_HAL_RETURN(status, hal_gaussianBlurBinomial)
+ */
+int fastcv_hal_gaussianBlurBinomial(
+    const uchar*    src_data,
+    size_t          src_step,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             width,
+    int             height,
+    int             depth,
+    int             cn,
+    size_t          margin_left,
+    size_t          margin_top,
+    size_t          margin_right,
+    size_t          margin_bottom,
+    size_t          ksize,
+    int             border_type)
+{
+    // Source and destination must be different buffers
+    if (dst_data == src_data)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Inplace is not supported");
+
+    // Both image dimensions have to exceed the kernel size
+    const bool largerThanKernel = ((size_t)height > ksize) && ((size_t)width > ksize);
+    if (!largerThanKernel)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Input image size should be larger than kernel size");
+
+    // Single-channel images only
+    if (cn != 1)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Multi-channels is not supported");
+
+    // Any non-zero margin means the caller works on an ROI
+    const bool hasRoiMargin = (margin_left != 0) || (margin_top != 0) || (margin_right != 0) || (margin_bottom != 0);
+    if (hasRoiMargin)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("ROI is not supported");
+
+    INITIALIZATION_CHECK;
+
+    fcvStatus status = FASTCV_SUCCESS;
+    const int fcvFuncType = FCV_MAKETYPE(ksize,depth);
+
+    // Translate the OpenCV border mode into its FastCV counterpart
+    fcvBorderType fcvBorder = fcvBorderType::FASTCV_BORDER_UNDEFINED;
+    if (border_type == cv::BorderTypes::BORDER_REPLICATE)
+    {
+        fcvBorder = fcvBorderType::FASTCV_BORDER_REPLICATE;
+    }
+    else if (border_type == cv::BorderTypes::BORDER_CONSTANT)
+    {
+        // No border value is supplied for a constant border; 0 is passed to the filter below
+        fcvBorder = fcvBorderType::FASTCV_BORDER_CONSTANT;
+    }
+    else if (border_type == cv::BorderTypes::BORDER_REFLECT)
+    {
+        fcvBorder = fcvBorderType::FASTCV_BORDER_REFLECT;
+    }
+    else if (border_type == cv::BorderTypes::BORDER_REFLECT_101)
+    {
+        fcvBorder = fcvBorderType::FASTCV_BORDER_REFLECT_V2;
+    }
+    else
+    {
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Border type:%s is not supported", borderToString(border_type)));
+    }
+
+    // Several stripes per thread, but only when more than one thread is available
+    // and the image has more than 60 rows
+    const int threadCount = cv::getNumThreads();
+    int nStripes = 1;
+    if ((threadCount > 1) && (height > 60))
+        nStripes = 3 * threadCount;
+
+    const bool supportedKernel = (fcvFuncType == FCV_MAKETYPE(3,CV_8U)) || (fcvFuncType == FCV_MAKETYPE(5,CV_8U));
+    if (!supportedKernel)
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Ksize:%d, depth:%s is not supported", (int)ksize, cv::depthToString(depth)));
+
+    // Wrap the raw buffers (no copy) and filter the rows in parallel
+    cv::Mat src = cv::Mat(height, width, CV_8UC1, (void*)src_data, src_step);
+    cv::Mat dst = cv::Mat(height, width, CV_8UC1, (void*)dst_data, dst_step);
+    FcvGaussianBlurLoop_Invoker stripeBody(src, dst, ksize, fcvBorder, 0);
+    cv::parallel_for_(cv::Range(0, height), stripeBody, nStripes);
+
+    CV_HAL_RETURN(status, hal_gaussianBlurBinomial);
+}
+
+/**
+ * Parallel loop body that warps one horizontal stripe of the destination image
+ * with fcvWarpPerspectiveu8_v5.
+ *
+ * FastCV is handed a destination pointer that starts at the first row of the stripe,
+ * so the 3x3 matrix is adjusted per stripe: the third column of every row receives
+ * range.start times the second column, which accounts for the vertical offset of the
+ * stripe inside the full destination image.
+ */
+class FcvWarpPerspectiveLoop_Invoker : public cv::ParallelLoopBody
+{
+    public:
+
+    /** Stores the source/destination geometry, the 9-element matrix and the FastCV modes. */
+    FcvWarpPerspectiveLoop_Invoker(const uchar* _src_data, int _src_width, int _src_height, size_t _src_step, uchar* _dst_data,
+        int _dst_width, int _dst_height, size_t _dst_step, int _type, const double* _M,
+        fcvInterpolationType _fcvInterpolation, fcvBorderType _fcvBorder, int _fcvBorderValue) :
+        cv::ParallelLoopBody(),
+        src_data(_src_data), src_width(_src_width), src_height(_src_height), src_step(_src_step),
+        dst_data(_dst_data), dst_width(_dst_width), dst_height(_dst_height), dst_step(_dst_step),
+        type(_type), M(_M),
+        fcvInterpolation(_fcvInterpolation), fcvBorder(_fcvBorder), fcvBorderValue(_fcvBorderValue)
+    {
+    }
+
+    /** Warps destination rows [range.start, range.end). */
+    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        const int stripeRows = range.end - range.start;
+        uchar* stripeDst     = dst_data + range.start*dst_step;
+
+        // Build the single-precision matrix for this stripe, one matrix row at a time
+        float stripeMatrix[9];
+        for (int row = 0; row < 3; ++row)
+        {
+            const double* in  = M + 3*row;
+            float*        out = stripeMatrix + 3*row;
+
+            out[0] = (float)(in[0]);
+            out[1] = (float)(in[1]);
+            out[2] = (float)(in[2]+range.start*in[1]);
+        }
+
+        fcvWarpPerspectiveu8_v5(src_data, src_width, src_height, src_step, CV_MAT_CN(type), stripeDst, dst_width, stripeRows,
+            dst_step, stripeMatrix, fcvInterpolation, fcvBorder, fcvBorderValue);
+    }
+
+    private:
+    const uchar*            src_data;
+    const int               src_width;
+    const int               src_height;
+    const size_t            src_step;
+    uchar*                  dst_data;
+    const int               dst_width;
+    const int               dst_height;
+    const size_t            dst_step;
+    const int               type;
+    const double*           M;
+    fcvInterpolationType    fcvInterpolation;
+    fcvBorderType           fcvBorder;
+    int                     fcvBorderValue;
+
+    FcvWarpPerspectiveLoop_Invoker(const FcvWarpPerspectiveLoop_Invoker &);  // = delete;
+    const FcvWarpPerspectiveLoop_Invoker& operator= (const FcvWarpPerspectiveLoop_Invoker &);  // = delete;
+};
+
+/**
+ * HAL entry point for warpPerspective, backed by FastCV.
+ *
+ * Handles only 8-bit single-channel images with nearest-neighbour interpolation and
+ * a constant, replicate or transparent border, with distinct source and destination
+ * buffers. Every other configuration is reported as "not implemented" through
+ * CV_HAL_RETURN_NOT_IMPLEMENTED.
+ *
+ * The destination rows are split into stripes and processed with cv::parallel_for_.
+ *
+ * @param src_type                    OpenCV type of the source image
+ * @param src_data,src_step           source buffer and its row stride in bytes
+ * @param src_width,src_height        source size in pixels
+ * @param dst_data,dst_step           destination buffer and its row stride in bytes
+ * @param dst_width,dst_height        destination size in pixels
+ * @param M                           9-element 3x3 matrix, read in row-major order
+ * @param interpolation               OpenCV interpolation flag
+ * @param border_type                 OpenCV border mode
+ * @param border_value                border colour; all four entries must be equal
+ * @return result of CV_HAL_RETURN(status, hal_warpPerspective)
+ */
+int fastcv_hal_warpPerspective(
+    int             src_type,
+    const uchar*    src_data,
+    size_t          src_step,
+    int             src_width,
+    int             src_height,
+    uchar*          dst_data,
+    size_t          dst_step,
+    int             dst_width,
+    int             dst_height,
+    const double    M[9],
+    int             interpolation,
+    int             border_type,
+    const double    border_value[4])
+{
+    // Do not support inplace case
+    if (src_data == dst_data)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Inplace is not supported");
+
+    // The input channel should be 1
+    if (CV_MAT_CN(src_type) != 1)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Multi-channels is not supported");
+
+    INITIALIZATION_CHECK;
+
+    // Give every local a defined value; the switches below overwrite them or return
+    fcvStatus               status = FASTCV_SUCCESS;
+    fcvBorderType           fcvBorder = fcvBorderType::FASTCV_BORDER_UNDEFINED;
+    uint8_t                 fcvBorderValue = 0;
+    fcvInterpolationType    fcvInterpolation = FASTCV_INTERPOLATION_TYPE_NEAREST_NEIGHBOR;
+
+    switch (border_type)
+    {
+        case cv::BorderTypes::BORDER_CONSTANT:
+        {
+            // FastCV takes a single border value, so all four entries have to agree
+            const bool uniformBorder = (border_value[0] == border_value[1]) &&
+                                       (border_value[0] == border_value[2]) &&
+                                       (border_value[0] == border_value[3]);
+            if (!uniformBorder)
+                CV_HAL_RETURN_NOT_IMPLEMENTED("Different border value is not supported");
+
+            fcvBorder       = fcvBorderType::FASTCV_BORDER_CONSTANT;
+            // Round and clamp to [0, 255]: a plain cast of a negative or >255 double
+            // to uint8_t is undefined behaviour and truncates fractional values
+            fcvBorderValue  = cv::saturate_cast<uchar>(border_value[0]);
+            break;
+        }
+        case cv::BorderTypes::BORDER_REPLICATE:
+        {
+            fcvBorder = fcvBorderType::FASTCV_BORDER_REPLICATE;
+            break;
+        }
+        case cv::BorderTypes::BORDER_TRANSPARENT:
+        {
+            fcvBorder = fcvBorderType::FASTCV_BORDER_UNDEFINED;
+            break;
+        }
+        default:
+            CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Border type:%s is not supported", borderToString(border_type)));
+    }
+
+    switch(interpolation)
+    {
+        case cv::InterpolationFlags::INTER_NEAREST:
+        {
+            fcvInterpolation = FASTCV_INTERPOLATION_TYPE_NEAREST_NEIGHBOR;
+            break;
+        }
+        default:
+            CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Interpolation type:%s is not supported",
+                                          interpolationToString(interpolation)));
+    }
+
+    // Several stripes per thread when more than one thread is available
+    int nThreads = cv::getNumThreads();
+    int nStripes = nThreads > 1 ? 3*nThreads : 1;
+
+    if(CV_MAT_DEPTH(src_type) == CV_8U)
+    {
+        cv::parallel_for_(cv::Range(0, dst_height),
+            FcvWarpPerspectiveLoop_Invoker(src_data, src_width, src_height, src_step, dst_data, dst_width, dst_height,
+            dst_step, src_type, M, fcvInterpolation, fcvBorder, fcvBorderValue), nStripes);
+    }
+    else
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Src type:%s is not supported", cv::typeToString(src_type).c_str()));
+
+    CV_HAL_RETURN(status, hal_warpPerspective);
+}