Merge branch 4.x

2025-08-06 14:36:36 +08:00 · 2025-04-28 22:13:51 +03:00 · 2025-04-28 22:13:51 +03:00 · f8de2e06e6
commit f8de2e06e6
parent 1df06488b1 4ad4bd5dc0
243 changed files with 12965 additions and 3216 deletions
--- a/3rdparty/fastcv/fastcv.cmake
+++ b/3rdparty/fastcv/fastcv.cmake
@ -1,23 +1,23 @@
 function(download_fastcv root_dir)

  # Commit SHA in the opencv_3rdparty repo
-  set(FASTCV_COMMIT "f4413cc2ab7233fdfc383a4cded402c072677fb0")
+  set(FASTCV_COMMIT "8d86e68dad8b80b8575a8d3cf401d3ee96c24148")

  # Define actual FastCV versions
  if(ANDROID)
    if(AARCH64)
      message(STATUS "Download FastCV for Android aarch64")
-      set(FCV_PACKAGE_NAME  "fastcv_android_aarch64_2024_12_11.tgz")
-      set(FCV_PACKAGE_HASH  "9dac41e86597305f846212dae31a4a88")
+      set(FCV_PACKAGE_NAME  "fastcv_android_aarch64_2025_04_08.tgz")
+      set(FCV_PACKAGE_HASH  "e028966a1d1b2f3f0bc5967d316e8b64")
    else()
      message(STATUS "Download FastCV for Android armv7")
-      set(FCV_PACKAGE_NAME  "fastcv_android_arm32_2024_12_11.tgz")
-      set(FCV_PACKAGE_HASH  "fe2d30334180b17e3031eee92aac43b6")
+      set(FCV_PACKAGE_NAME  "fastcv_android_arm32_2025_04_08.tgz")
+      set(FCV_PACKAGE_HASH  "6fc1e812a4b3ef392469d2283e037ffe")
    endif()
  elseif(UNIX AND NOT APPLE AND NOT IOS AND NOT XROS)
    if(AARCH64)
-      set(FCV_PACKAGE_NAME  "fastcv_linux_aarch64_2025_02_12.tgz")
-      set(FCV_PACKAGE_HASH  "33ac2a59cf3e7d6402eee2e010de1202")
+      set(FCV_PACKAGE_NAME  "fastcv_linux_aarch64_2025_04_08.tgz")
+      set(FCV_PACKAGE_HASH  "062a26639cd2788beee2e0dd8743d680")
    else()
      message("FastCV: fastcv lib for 32-bit Linux is not supported for now!")
    endif()
--- a/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/atan.hpp
@ -1,128 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level
-// directory of this distribution and at http://opencv.org/license.html.
-#pragma once
-
-#undef cv_hal_fastAtan32f
-#define cv_hal_fastAtan32f cv::cv_hal_rvv::fast_atan_32
-
-#undef cv_hal_fastAtan64f
-#define cv_hal_fastAtan64f cv::cv_hal_rvv::fast_atan_64
-
-#include <riscv_vector.h>
-
-#include <cfloat>
-
-namespace cv::cv_hal_rvv {
-
-namespace detail {
-// ref: mathfuncs_core.simd.hpp
-static constexpr float pi = CV_PI; // float copy of CV_PI, used below to convert the coefficients from radians to degrees
-static constexpr float atan2_p1 = 0.9997878412794807F * (180 / pi); // coefficient of c in the odd polynomial evaluated by rvv_atan_f32, pre-multiplied by 180/pi
-static constexpr float atan2_p3 = -0.3258083974640975F * (180 / pi); // coefficient of c^3
-static constexpr float atan2_p5 = 0.1555786518463281F * (180 / pi); // coefficient of c^5
-static constexpr float atan2_p7 = -0.04432655554792128F * (180 / pi); // coefficient of c^7
-
-__attribute__((always_inline)) inline vfloat32m4_t // returns the per-lane angle of the vector (vx, vy) for the first vl lanes
-rvv_atan_f32(vfloat32m4_t vy, vfloat32m4_t vx, size_t vl, float p7,
-             vfloat32m4_t vp5, vfloat32m4_t vp3, vfloat32m4_t vp1,
-             float angle_90_deg) { // p7/vp5/vp3/vp1: polynomial coefficients; angle_90_deg: a quarter turn in the caller's output unit
-    const auto ax = __riscv_vfabs(vx, vl);
-    const auto ay = __riscv_vfabs(vy, vl);
-    const auto c = __riscv_vfdiv( // c = min(|x|,|y|) / (max(|x|,|y|) + FLT_EPSILON); the epsilon keeps the divisor non-zero
-        __riscv_vfmin(ax, ay, vl),
-        __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), vl);
-    const auto c2 = __riscv_vfmul(c, c, vl);
-
-    auto a = __riscv_vfmadd(c2, p7, vp5, vl); // Horner form: a = ((p7*c2 + p5)*c2 + p3)*c2 + p1, then multiplied by c
-    a = __riscv_vfmadd(a, c2, vp3, vl);
-    a = __riscv_vfmadd(a, c2, vp1, vl);
-    a = __riscv_vfmul(a, c, vl);
-
-    const auto mask = __riscv_vmflt(ax, ay, vl); // lanes where |x| < |y|
-    a = __riscv_vfrsub_mu(mask, a, a, angle_90_deg, vl); // masked lanes become angle_90_deg - a; the others keep a
-
-    a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, angle_90_deg * 2, // lanes with x < 0: a = 2*angle_90_deg - a
-                          vl);
-    a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, angle_90_deg * 4, // lanes with y < 0: a = 4*angle_90_deg - a
-                          vl);
-
-    return a;
-}
-
-} // namespace detail
-
-inline int fast_atan_32(const float *y, const float *x, float *dst, size_t n, // writes the angle of each (x[i], y[i]) pair to dst[i] for i in [0, n)
-                        bool angle_in_deg) { // angle_in_deg selects degrees; otherwise every constant is rescaled by CV_PI/180
-    const float scale = angle_in_deg ? 1.f : CV_PI / 180.f; // the detail:: coefficients are stored in degrees
-    const float p1 = detail::atan2_p1 * scale;
-    const float p3 = detail::atan2_p3 * scale;
-    const float p5 = detail::atan2_p5 * scale;
-    const float p7 = detail::atan2_p7 * scale;
-    const float angle_90_deg = 90.F * scale;
-
-    static size_t vlmax = __riscv_vsetvlmax_e32m4(); // NOTE(review): function-local static, initialised on the first call
-    auto vp1 = __riscv_vfmv_v_f_f32m4(p1, vlmax); // broadcast the coefficients once, outside the loop
-    auto vp3 = __riscv_vfmv_v_f_f32m4(p3, vlmax);
-    auto vp5 = __riscv_vfmv_v_f_f32m4(p5, vlmax);
-
-    for (size_t vl{}; n > 0; n -= vl) { // each pass handles the vl elements granted by vsetvl until n reaches 0
-        vl = __riscv_vsetvl_e32m4(n);
-
-        auto vy = __riscv_vle32_v_f32m4(y, vl);
-        auto vx = __riscv_vle32_v_f32m4(x, vl);
-
-        auto a =
-            detail::rvv_atan_f32(vy, vx, vl, p7, vp5, vp3, vp1, angle_90_deg);
-
-        __riscv_vse32(dst, a, vl);
-
-        x += vl; // advance all three pointers past the elements just processed
-        y += vl;
-        dst += vl;
-    }
-
-    return CV_HAL_ERROR_OK; // no failure path: the function always reports success
-}
-
-inline int fast_atan_64(const double *y, const double *x, double *dst, size_t n, // double-precision entry point; the angle itself is computed in float32
-                        bool angle_in_deg) {
-    // this also uses float32 version, ref: mathfuncs_core.simd.hpp
-
-    const float scale = angle_in_deg ? 1.f : CV_PI / 180.f; // the detail:: coefficients are stored in degrees
-    const float p1 = detail::atan2_p1 * scale;
-    const float p3 = detail::atan2_p3 * scale;
-    const float p5 = detail::atan2_p5 * scale;
-    const float p7 = detail::atan2_p7 * scale;
-    const float angle_90_deg = 90.F * scale;
-
-    static size_t vlmax = __riscv_vsetvlmax_e32m4(); // NOTE(review): function-local static, initialised on the first call
-    auto vp1 = __riscv_vfmv_v_f_f32m4(p1, vlmax); // broadcast the coefficients once, outside the loop
-    auto vp3 = __riscv_vfmv_v_f_f32m4(p3, vlmax);
-    auto vp5 = __riscv_vfmv_v_f_f32m4(p5, vlmax);
-
-    for (size_t vl{}; n > 0; n -= vl) {
-        vl = __riscv_vsetvl_e64m8(n); // vl is chosen for 64-bit elements at LMUL=8 and reused for the f32m4 operations below
-
-        auto wy = __riscv_vle64_v_f64m8(y, vl);
-        auto wx = __riscv_vle64_v_f64m8(x, vl);
-
-        auto vy = __riscv_vfncvt_f_f_w_f32m4(wy, vl); // narrow the double inputs to float
-        auto vx = __riscv_vfncvt_f_f_w_f32m4(wx, vl);
-
-        auto a =
-            detail::rvv_atan_f32(vy, vx, vl, p7, vp5, vp3, vp1, angle_90_deg);
-
-        auto wa = __riscv_vfwcvt_f_f_v_f64m8(a, vl); // widen the float result back to double for the store
-
-        __riscv_vse64(dst, wa, vl);
-
-        x += vl;
-        y += vl;
-        dst += vl;
-    }
-
-    return CV_HAL_ERROR_OK; // no failure path: the function always reports success
-}
-
-} // namespace cv::cv_hal_rvv
--- a/3rdparty/hal_rvv/hal_rvv_1p0/filter.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/filter.hpp
@ -1,852 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#ifndef OPENCV_HAL_RVV_FILTER_HPP_INCLUDED
-#define OPENCV_HAL_RVV_FILTER_HPP_INCLUDED
-
-#include "../../imgproc/include/opencv2/imgproc/hal/interface.h"
-#include <riscv_vector.h>
-
-struct cvhalFilter2D;
-
-namespace cv { namespace cv_hal_rvv {
-
-namespace filter {
-#undef cv_hal_filterInit
-#undef cv_hal_filter
-#undef cv_hal_filterFree
-#define cv_hal_filterInit cv::cv_hal_rvv::filter::filterInit
-#define cv_hal_filter cv::cv_hal_rvv::filter::filter
-#define cv_hal_filterFree cv::cv_hal_rvv::filter::filterFree
-
-class FilterInvoker : public ParallelLoopBody // adapts a (start, end, args...) function to the ParallelLoopBody call operator
-{
-public:
-    template<typename... Args>
-    FilterInvoker(std::function<int(int, int, Args...)> _func, Args&&... args)
-    {
-        func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward<Args>(args)...); // leave (start, end) open and bind every other argument
-    }
-
-    virtual void operator()(const Range& range) const override
-    {
-        func(range.start, range.end); // the int status returned by func is discarded here
-    }
-
-private:
-    std::function<int(int, int)> func; // the bound callable, taking only the row range
-};
-
-template<typename... Args>
-static inline int invoke(int start, int end, std::function<int(int, int, Args...)> func, Args&&... args) // runs func over rows [start, end) and returns the status of row `start` only
-{
-    cv::parallel_for_(Range(start + 1, end), FilterInvoker(func, std::forward<Args>(args)...), cv::getNumThreads()); // rows start+1 .. end-1 go through parallel_for_; their statuses are not collected
-    return func(start, start + 1, std::forward<Args>(args)...); // row `start` is handled by a direct call whose result becomes the return value
-}
-
-static inline int borderInterpolate( int p, int len, int borderType ) // maps index p into [0, len) according to borderType; yields -1 for BORDER_CONSTANT
-{
-    if( (unsigned)p < (unsigned)len ) // one unsigned compare covers both p >= 0 and p < len: already in range
-        ;
-    else if( borderType == BORDER_REPLICATE )
-        p = p < 0 ? 0 : len - 1; // clamp to the nearest valid index
-    else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
-    {
-        int delta = borderType == BORDER_REFLECT_101; // 1: index -1 maps to 1 (edge not repeated); 0: index -1 maps to 0
-        if( len == 1 )
-            return 0;
-        do // keep reflecting until the index lands inside [0, len)
-        {
-            if( p < 0 )
-                p = -p - 1 + delta;
-            else
-                p = len - 1 - (p - len) - delta;
-        }
-        while( (unsigned)p >= (unsigned)len );
-    }
-    else if( borderType == BORDER_CONSTANT )
-        p = -1; // sentinel: callers in this file treat a negative result as "no source pixel"
-    return p; // any other borderType returns an out-of-range p unchanged
-}
-
-struct Filter2D // parameters captured by filterInit and read back by filter()
-{
-    const uchar* kernel_data; // pointer as passed to filterInit; the kernel is not copied
-    size_t kernel_step; // byte distance between kernel rows (used as a byte offset in filter<ksize>)
-    int kernel_type;
-    int kernel_width;
-    int kernel_height;
-    int src_type;
-    int dst_type;
-    int borderType; // stored unmodified, so it may still carry BORDER_ISOLATED
-    double delta; // value the accumulators start from before the kernel taps are added
-    int anchor_x; // filterInit replaces negative anchors with the kernel centre
-    int anchor_y;
-};
-
-inline int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/)
-{ // validates the request and, if supported, allocates a Filter2D and returns it through *context
-    if (kernel_type != CV_32FC1 || src_type != CV_8UC4 || dst_type != CV_8UC4) // only float kernels on 4-channel 8-bit images
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (kernel_width != kernel_height) // square kernels only
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (kernel_width != 3 && kernel_width != 5) // 3x3 or 5x5 only
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) // borderInterpolate in this file has no BORDER_WRAP branch
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-
-    anchor_x = anchor_x < 0 ? kernel_width  / 2 : anchor_x; // a negative anchor selects the kernel centre
-    anchor_y = anchor_y < 0 ? kernel_height / 2 : anchor_y;
-    *context = reinterpret_cast<cvhalFilter2D*>(new Filter2D{kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, src_type, dst_type, borderType, delta, anchor_x, anchor_y}); // released by filterFree
-    return CV_HAL_ERROR_OK;
-}
-
-static void process3(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, uchar* dst)
-{ // vector 3x3 pass for output pixels [left, right) of one row; pixels are 4 interleaved bytes and a null row pointer is skipped
-    int vl;
-    for (int i = left; i < right; i += vl)
-    {
-        vl = __riscv_vsetvl_e8m1(right - i);
-        auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); // one float accumulator per channel, each starting at delta
-        auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl);
-        auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl);
-        auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl);
-
-        auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float r1, float r2) { // a += k0*b, then slide b down by one pixel (feeding r1, r2 in at the end) for taps k1 and k2
-            a = __riscv_vfmacc(a, k0, b, vl);
-            b = __riscv_vfslide1down(b, r1, vl);
-            a = __riscv_vfmacc(a, k1, b, vl);
-            b = __riscv_vfslide1down(b, r2, vl);
-            return __riscv_vfmacc(a, k2, b, vl);
-        };
-        auto loadsrc = [&](const uchar* row, float k0, float k1, float k2) { // adds one kernel row's contribution from one source row
-            if (!row) return; // no source row here: contributes nothing
-
-            const uchar* extra = row + (i - anchor) * 4; // first byte of the leftmost pixel under the kernel
-            auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); // segment load splits the 4 interleaved channels into 4 vectors
-            auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); // u8 -> u16 -> f32
-            auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl);
-            auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl);
-            auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl);
-
-            s0 = addshift(s0, v0, k0, k1, k2, extra[vl * 4    ], extra[vl * 4 + 4]); // the two pixels just past the loaded span are read as scalars
-            s1 = addshift(s1, v1, k0, k1, k2, extra[vl * 4 + 1], extra[vl * 4 + 5]);
-            s2 = addshift(s2, v2, k0, k1, k2, extra[vl * 4 + 2], extra[vl * 4 + 6]);
-            s3 = addshift(s3, v3, k0, k1, k2, extra[vl * 4 + 3], extra[vl * 4 + 7]);
-        };
-
-        loadsrc(row0, kernel[0], kernel[1], kernel[2]);
-        loadsrc(row1, kernel[3], kernel[4], kernel[5]);
-        loadsrc(row2, kernel[6], kernel[7], kernel[8]);
-        vuint8m1x4_t val{};
-        val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); // f32 -> u16, then narrowing clip to u8
-        val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl));
-        val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl));
-        val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl));
-        __riscv_vsseg4e8(dst + i * 4, val, vl); // re-interleave the four channels on store
-    }
-}
-
-static void process5(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, const uchar* row3, const uchar* row4, uchar* dst)
-{ // vector 5x5 pass for output pixels [left, right) of one row; pixels are 4 interleaved bytes and a null row pointer is skipped
-    int vl;
-    for (int i = left; i < right; i += vl)
-    {
-        vl = __riscv_vsetvl_e8m1(right - i);
-        auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); // one float accumulator per channel, each starting at delta
-        auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl);
-        auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl);
-        auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl);
-
-        auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float k3, float k4, float r1, float r2, float r3, float r4) { // a += k0*b, then four slide-by-one steps (feeding r1..r4 in at the end) for taps k1..k4
-            a = __riscv_vfmacc(a, k0, b, vl);
-            b = __riscv_vfslide1down(b, r1, vl);
-            a = __riscv_vfmacc(a, k1, b, vl);
-            b = __riscv_vfslide1down(b, r2, vl);
-            a = __riscv_vfmacc(a, k2, b, vl);
-            b = __riscv_vfslide1down(b, r3, vl);
-            a = __riscv_vfmacc(a, k3, b, vl);
-            b = __riscv_vfslide1down(b, r4, vl);
-            return __riscv_vfmacc(a, k4, b, vl);
-        };
-        auto loadsrc = [&](const uchar* row, float k0, float k1, float k2, float k3, float k4) { // adds one kernel row's contribution from one source row
-            if (!row) return; // no source row here: contributes nothing
-
-            auto src = __riscv_vlseg4e8_v_u8m1x4(row + (i - anchor) * 4, vl); // segment load splits the 4 interleaved channels into 4 vectors
-            auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); // u8 -> u16 -> f32
-            auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl);
-            auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl);
-            auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl);
-
-            const uchar* extra = row + (i + vl - anchor) * 4; // first pixel past the loaded span; the next four pixels are read as scalars
-            s0 = addshift(s0, v0, k0, k1, k2, k3, k4, *(extra    ), *(extra + 4), *(extra +  8), *(extra + 12));
-            s1 = addshift(s1, v1, k0, k1, k2, k3, k4, *(extra + 1), *(extra + 5), *(extra +  9), *(extra + 13));
-            s2 = addshift(s2, v2, k0, k1, k2, k3, k4, *(extra + 2), *(extra + 6), *(extra + 10), *(extra + 14));
-            s3 = addshift(s3, v3, k0, k1, k2, k3, k4, *(extra + 3), *(extra + 7), *(extra + 11), *(extra + 15));
-        };
-
-        loadsrc(row0, kernel[ 0], kernel[ 1], kernel[ 2], kernel[ 3], kernel[ 4]);
-        loadsrc(row1, kernel[ 5], kernel[ 6], kernel[ 7], kernel[ 8], kernel[ 9]);
-        loadsrc(row2, kernel[10], kernel[11], kernel[12], kernel[13], kernel[14]);
-        loadsrc(row3, kernel[15], kernel[16], kernel[17], kernel[18], kernel[19]);
-        loadsrc(row4, kernel[20], kernel[21], kernel[22], kernel[23], kernel[24]);
-        vuint8m1x4_t val{};
-        val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); // f32 -> u16, then narrowing clip to u8
-        val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl));
-        val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl));
-        val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl));
-        __riscv_vsseg4e8(dst + i * 4, val, vl); // re-interleave the four channels on store
-    }
-}
-
-// the algorithm is copied from 3rdparty/carotene/src/convolution.cpp,
-// in the function void CAROTENE_NS::convolution
-template<int ksize>
-static inline int filter(int start, int end, Filter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y)
-{ // filters output rows [start, end) into dst_data, which is addressed as tightly packed rows of width*4 bytes
-    float kernel[ksize * ksize]; // local contiguous copy of the kernel, read row by row using kernel_step as a byte stride
-    for (int i = 0; i < ksize * ksize; i++)
-    {
-        kernel[i] = reinterpret_cast<const float*>(data->kernel_data + (i / ksize) * data->kernel_step)[i % ksize];
-    }
-
-    constexpr int noval = std::numeric_limits<int>::max(); // marks a coordinate for which borderInterpolate returned a negative index
-    auto access = [&](int x, int y) { // x is a row and y a column in kernel-window coordinates; returns the (row, column) to read, or noval
-        int pi, pj;
-        if (data->borderType & BORDER_ISOLATED)
-        {
-            pi = borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); // borders are resolved against the ROI only
-            pj = borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED);
-            pi = pi < 0 ? noval : pi;
-            pj = pj < 0 ? noval : pj;
-        }
-        else
-        {
-            pi = borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); // borders are resolved against the full image, then shifted back by the offset
-            pj = borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType);
-            pi = pi < 0 ? noval : pi - offset_y;
-            pj = pj < 0 ? noval : pj - offset_x;
-        }
-        return std::make_pair(pi, pj);
-    };
-
-    auto process = [&](int x, int y) { // scalar path for one output pixel at row x, column y
-        float sum0, sum1, sum2, sum3;
-        sum0 = sum1 = sum2 = sum3 = data->delta;
-        for (int i = 0; i < ksize * ksize; i++)
-        {
-            auto p = access(x + i / ksize, y + i % ksize);
-            if (p.first != noval && p.second != noval) // taps without a source pixel add nothing
-            {
-                sum0 += kernel[i] * src_data[p.first * src_step + p.second * 4    ];
-                sum1 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 1];
-                sum2 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 2];
-                sum3 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 3];
-            }
-        }
-        dst_data[(x * width + y) * 4    ] = std::max(0, std::min((int)std::round(sum0), (int)std::numeric_limits<uchar>::max())); // round, then clamp to the uchar range
-        dst_data[(x * width + y) * 4 + 1] = std::max(0, std::min((int)std::round(sum1), (int)std::numeric_limits<uchar>::max()));
-        dst_data[(x * width + y) * 4 + 2] = std::max(0, std::min((int)std::round(sum2), (int)std::numeric_limits<uchar>::max()));
-        dst_data[(x * width + y) * 4 + 3] = std::max(0, std::min((int)std::round(sum3), (int)std::numeric_limits<uchar>::max()));
-    };
-
-    for (int i = start; i < end; i++)
-    {
-        const int left = ksize - 1, right = width - (ksize - 1); // columns [left, right) take the vector path; the margins take the scalar path
-        if (left >= right) // row too narrow for the vector path
-        {
-            for (int j = 0; j < width; j++)
-                process(i, j);
-        }
-        else
-        {
-            for (int j = 0; j < left; j++)
-                process(i, j);
-            for (int j = right; j < width; j++)
-                process(i, j);
-
-            const uchar* row0 = access(i    , 0).first == noval ? nullptr : src_data + access(i    , 0).first * src_step; // nullptr tells process3/process5 to skip this kernel row
-            const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step;
-            const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step;
-            if (ksize == 3)
-            {
-                process3(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, dst_data + i * width * 4);
-            }
-            else
-            {
-                const uchar* row3 = access(i + 3, 0).first == noval ? nullptr : src_data + access(i + 3, 0).first * src_step;
-                const uchar* row4 = access(i + 4, 0).first == noval ? nullptr : src_data + access(i + 4, 0).first * src_step;
-                process5(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, row3, row4, dst_data + i * width * 4);
-            }
-        }
-    }
-
-    return CV_HAL_ERROR_OK;
-}
-
-inline int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y)
-{ // filters into a packed temporary buffer, then copies it row by row to dst_data using dst_step
-    Filter2D* data = reinterpret_cast<Filter2D*>(context);
-    std::vector<uchar> dst(width * height * 4); // temporary output, 4 bytes per pixel; NOTE(review): the size is computed in int arithmetic
-
-    int res = CV_HAL_ERROR_NOT_IMPLEMENTED;
-    switch (data->kernel_width) // pick the template instance matching the kernel size accepted by filterInit
-    {
-    case 3:
-        res = invoke(0, height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y);
-        break;
-    case 5:
-        res = invoke(0, height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y);
-        break;
-    }
-
-    for (int i = 0; i < height; i++) // NOTE(review): this copy also runs when no case matched and res is still NOT_IMPLEMENTED
-        std::copy(dst.data() + i * width * 4, dst.data() + (i + 1) * width * 4, dst_data + i * dst_step);
-    return res;
-}
-
-inline int filterFree(cvhalFilter2D* context) // destroys the Filter2D allocated by filterInit
-{
-    delete reinterpret_cast<Filter2D*>(context); // cast back to the concrete type before deleting
-    return CV_HAL_ERROR_OK;
-}
-} // cv::cv_hal_rvv::filter
-
-namespace sepFilter {
-#undef cv_hal_sepFilterInit
-#undef cv_hal_sepFilter
-#undef cv_hal_sepFilterFree
-#define cv_hal_sepFilterInit cv::cv_hal_rvv::sepFilter::sepFilterInit
-#define cv_hal_sepFilter cv::cv_hal_rvv::sepFilter::sepFilter
-#define cv_hal_sepFilterFree cv::cv_hal_rvv::sepFilter::sepFilterFree
-
-struct sepFilter2D // parameters captured by sepFilterInit and read back by sepFilter()
-{
-    int src_type;
-    int dst_type; // sepFilterCol stores 16-bit integers for CV_16SC1 and floats otherwise
-    int kernel_type;
-    const uchar* kernelx_data; // pointer as passed to sepFilterInit; read as float[] by sepFilterRow
-    int kernelx_length;
-    const uchar* kernely_data; // pointer as passed to sepFilterInit; read as float[] by sepFilterCol
-    int kernely_length;
-    int anchor_x; // sepFilterInit replaces negative anchors with the kernel centre
-    int anchor_y;
-    double delta; // added once, as the starting value of the column-pass accumulator
-    int borderType; // sepFilterInit stores this with BORDER_ISOLATED cleared
-};
-
-inline int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar *kernelx_data, int kernelx_length, uchar *kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType)
-{ // validates the request and, if supported, allocates a sepFilter2D and returns it through *context
-    if (kernel_type != CV_32FC1 || src_type != CV_8UC1 || (dst_type != CV_16SC1 && dst_type != CV_32FC1)) // float kernels, 8-bit single-channel input, 16S or 32F output
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (kernelx_length != kernely_length) // both kernels must have the same length
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (kernelx_length != 3 && kernelx_length != 5) // length 3 or 5 only
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) // borderInterpolate in this file has no BORDER_WRAP branch
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-
-    anchor_x = anchor_x < 0 ? kernelx_length / 2 : anchor_x; // a negative anchor selects the kernel centre
-    anchor_y = anchor_y < 0 ? kernely_length / 2 : anchor_y;
-    *context = reinterpret_cast<cvhalFilter2D*>(new sepFilter2D{src_type, dst_type, kernel_type, kernelx_data, kernelx_length, kernely_data, kernely_length, anchor_x, anchor_y, delta, borderType & ~BORDER_ISOLATED}); // NOTE(review): BORDER_ISOLATED is cleared here, so the isolated branches in sepFilterRow/sepFilterCol are not taken for contexts created by this function
-    return CV_HAL_ERROR_OK;
-}
-
-// the algorithm is copied from 3rdparty/carotene/src/separable_filter.hpp,
-// in the functor RowFilter3x3S16Generic and ColFilter3x3S16Generic
-template<int ksize>
-static inline int sepFilterRow(int start, int end, sepFilter2D* data, const uchar* src_data, size_t src_step, float* dst_data, int width, int full_width, int offset_x)
-{ // horizontal pass: convolves rows [start, end) of the 8-bit source with kx and writes floats to dst_data (row stride = width)
-    constexpr int noval = std::numeric_limits<int>::max(); // marks a column for which borderInterpolate returned a negative index
-    auto access = [&](int y) { // maps a window column to the source column to read, or noval
-        int pj;
-        if (data->borderType & BORDER_ISOLATED)
-        {
-            pj = filter::borderInterpolate(y - data->anchor_x, width, data->borderType & ~BORDER_ISOLATED);
-            pj = pj < 0 ? noval : pj;
-        }
-        else
-        {
-            pj = filter::borderInterpolate(offset_x + y - data->anchor_x, full_width, data->borderType); // resolve against the full image width, then shift back by offset_x
-            pj = pj < 0 ? noval : pj - offset_x;
-        }
-        return pj;
-    };
-
-    const float* kx = reinterpret_cast<const float*>(data->kernelx_data);
-    auto process = [&](int x, int y) { // scalar path for one output element at row x, column y
-        float sum = 0;
-        for (int i = 0; i < ksize; i++)
-        {
-            int p = access(y + i);
-            if (p != noval) // taps without a source pixel add nothing
-            {
-                sum += kx[i] * src_data[x * src_step + p];
-            }
-        }
-        dst_data[x * width + y] = sum;
-    };
-
-    for (int i = start; i < end; i++)
-    {
-        const int left = ksize - 1, right = width - (ksize - 1); // columns [left, right) take the vector path; the margins take the scalar path
-        if (left >= right) // row too narrow for the vector path
-        {
-            for (int j = 0; j < width; j++)
-                process(i, j);
-        }
-        else
-        {
-            for (int j = 0; j < left; j++)
-                process(i, j);
-            for (int j = right; j < width; j++)
-                process(i, j);
-
-            int vl;
-            for (int j = left; j < right; j += vl)
-            {
-                vl = __riscv_vsetvl_e8m2(right - j);
-                const uchar* extra = src_data + i * src_step + j - data->anchor_x; // leftmost source byte under the kernel for column j
-                auto sum = __riscv_vfmv_v_f_f32m8(0, vl);
-                auto src = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vle8_v_u8m2(extra, vl), vl), vl); // u8 -> u16 -> f32
-                sum = __riscv_vfmacc(sum, kx[0], src, vl);
-                src = __riscv_vfslide1down(src, extra[vl], vl); // shift by one pixel, feeding in the byte just past the loaded span
-                sum = __riscv_vfmacc(sum, kx[1], src, vl);
-                src = __riscv_vfslide1down(src, extra[vl + 1], vl);
-                sum = __riscv_vfmacc(sum, kx[2], src, vl);
-                if (ksize == 5) // two extra taps for the 5-wide kernel
-                {
-                    src = __riscv_vfslide1down(src, extra[vl + 2], vl);
-                    sum = __riscv_vfmacc(sum, kx[3], src, vl);
-                    src = __riscv_vfslide1down(src, extra[vl + 3], vl);
-                    sum = __riscv_vfmacc(sum, kx[4], src, vl);
-                }
-                __riscv_vse32(dst_data + i * width + j, sum, vl);
-            }
-        }
-    }
-
-    return CV_HAL_ERROR_OK;
-}
-
-template<int ksize>
-static inline int sepFilterCol(int start, int end, sepFilter2D* data, const float* src_data, uchar* dst_data, size_t dst_step, int width, int height, int full_height, int offset_y)
-{ // vertical pass: combines float rows of src_data (row stride = width) with ky and stores output rows [start, end) to dst_data
-    constexpr int noval = std::numeric_limits<int>::max(); // marks a row for which borderInterpolate returned a negative index
-    auto access = [&](int x) { // maps a window row to the intermediate row to read, or noval
-        int pi;
-        if (data->borderType & BORDER_ISOLATED)
-        {
-            pi = filter::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED);
-            pi = pi < 0 ? noval : pi;
-        }
-        else
-        {
-            pi = filter::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); // resolve against the full image height, then shift back by offset_y
-            pi = pi < 0 ? noval : pi - offset_y;
-        }
-        return pi;
-    };
-
-    const float* ky = reinterpret_cast<const float*>(data->kernely_data);
-    for (int i = start; i < end; i++)
-    {
-        const float* row0 = access(i    ) == noval ? nullptr : src_data + access(i    ) * width; // a null row is treated as all zeros below
-        const float* row1 = access(i + 1) == noval ? nullptr : src_data + access(i + 1) * width;
-        const float* row2 = access(i + 2) == noval ? nullptr : src_data + access(i + 2) * width;
-        const float* row3, *row4; // assigned and read only when ksize == 5
-        if (ksize == 5)
-        {
-            row3 = access(i + 3) == noval ? nullptr : src_data + access(i + 3) * width;
-            row4 = access(i + 4) == noval ? nullptr : src_data + access(i + 4) * width;
-        }
-
-        int vl;
-        for (int j = 0; j < width; j += vl)
-        {
-            vl = __riscv_vsetvl_e32m4(width - j);
-            auto v0 = row0 ? __riscv_vle32_v_f32m4(row0 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
-            auto v1 = row1 ? __riscv_vle32_v_f32m4(row1 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
-            auto v2 = row2 ? __riscv_vle32_v_f32m4(row2 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
-            auto sum = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmv_v_f_f32m4(data->delta, vl), ky[0], v0, vl), ky[1], v1, vl), ky[2], v2, vl); // sum = delta + ky[0]*v0 + ky[1]*v1 + ky[2]*v2
-
-            if (ksize == 5)
-            {
-                auto v3 = row3 ? __riscv_vle32_v_f32m4(row3 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
-                auto v4 = row4 ? __riscv_vle32_v_f32m4(row4 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
-                sum = __riscv_vfmacc(__riscv_vfmacc(sum, ky[3], v3, vl), ky[4], v4, vl);
-            }
-            if (data->dst_type == CV_16SC1)
-            {
-                __riscv_vse16(reinterpret_cast<short*>(dst_data + i * dst_step) + j, __riscv_vfncvt_x(sum, vl), vl); // narrow f32 to 16-bit integers before storing
-            }
-            else
-            {
-                __riscv_vse32(reinterpret_cast<float*>(dst_data + i * dst_step) + j, sum, vl); // float output is stored as computed
-            }
-        }
-    }
-
-    return CV_HAL_ERROR_OK;
-}
-
-inline int sepFilter(cvhalFilter2D *context, uchar *src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y)
-{ // two-pass separable filter: a row pass into a float scratch buffer, then a column pass into dst_data
-    sepFilter2D* data = reinterpret_cast<sepFilter2D*>(context);
-    const int padding = data->kernelx_length - 1; // number of extra scratch rows reserved above and below the ROI
-    std::vector<float> _result(width * (height + 2 * padding));
-    float* result = _result.data() + width * padding; // row 0 of the ROI; rows -padding .. height+padding-1 are addressable
-
-    int res = CV_HAL_ERROR_NOT_IMPLEMENTED;
-    switch (data->kernelx_length)
-    {
-    case 3: // the row pass also covers up to `padding` rows outside the ROI, limited by what the full image provides
-        res = filter::invoke(-std::min(offset_y, padding), height + std::min(full_height - height - offset_y, padding), {sepFilterRow<3>}, data, src_data, src_step, result, width, full_width, offset_x);
-        break;
-    case 5:
-        res = filter::invoke(-std::min(offset_y, padding), height + std::min(full_height - height - offset_y, padding), {sepFilterRow<5>}, data, src_data, src_step, result, width, full_width, offset_x);
-        break;
-    }
-    if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) // no row-pass instance matched the kernel length
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-
-    switch (data->kernelx_length) // column pass over the ROI rows only
-    {
-    case 3:
-        return filter::invoke(0, height, {sepFilterCol<3>}, data, result, dst_data, dst_step, width, height, full_height, offset_y);
-    case 5:
-        return filter::invoke(0, height, {sepFilterCol<5>}, data, result, dst_data, dst_step, width, height, full_height, offset_y);
-    }
-
-    return CV_HAL_ERROR_NOT_IMPLEMENTED;
-}
-
-inline int sepFilterFree(cvhalFilter2D* context) // destroys the sepFilter2D allocated by sepFilterInit
-{
-    delete reinterpret_cast<sepFilter2D*>(context); // cast back to the concrete type before deleting
-    return CV_HAL_ERROR_OK;
-}
-} // cv::cv_hal_rvv::sepFilter
-
-namespace morph {
-#undef cv_hal_morphInit
-#undef cv_hal_morph
-#undef cv_hal_morphFree
-#define cv_hal_morphInit cv::cv_hal_rvv::morph::morphInit
-#define cv_hal_morph cv::cv_hal_rvv::morph::morph
-#define cv_hal_morphFree cv::cv_hal_rvv::morph::morphFree
-
-struct Morph2D // parameters captured by morphInit for the morphology functions in this namespace
-{
-    int operation; // morphInit accepts only CV_HAL_MORPH_ERODE and CV_HAL_MORPH_DILATE
-    int src_type;
-    int dst_type; // morphInit requires this to equal src_type
-    int kernel_type;
-    uchar *kernel_data; // pointer as passed to morphInit; the kernel is not copied
-    size_t kernel_step; // byte distance between kernel rows
-    int kernel_width;
-    int kernel_height;
-    int anchor_x; // morphInit replaces negative anchors with the kernel centre
-    int anchor_y;
-    int borderType; // stored unmodified, so it may still carry BORDER_ISOLATED
-    const uchar* borderValue; // heap buffer created in morphInit: 1 byte for CV_8UC1, 4 bytes for CV_8UC4
-};
-
-inline int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar *kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/)
-{ // validates the request and, if supported, allocates a Morph2D and returns it through *context
-    if (kernel_type != CV_8UC1 || src_type != dst_type) // 8-bit kernel; input and output types must match
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (src_type != CV_8UC1 && src_type != CV_8UC4) // 1- or 4-channel 8-bit images only
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (kernel_width != kernel_height || kernel_width != 3) // 3x3 kernels only
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (iterations != 1) // single iteration only
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (operation != CV_HAL_MORPH_ERODE && operation != CV_HAL_MORPH_DILATE)
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) // borderInterpolate in this file has no BORDER_WRAP branch
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-
-    uchar* borderV; // NOTE(review): allocated with scalar new for CV_8UC1 and array new[] for CV_8UC4; the matching release is outside this view
-    if (src_type == CV_8UC1)
-    {
-        borderV = new uchar{static_cast<uchar>(borderValue[0])};
-        if (operation == CV_HAL_MORPH_DILATE && borderValue[0] == DBL_MAX) // for dilation a DBL_MAX border value is replaced by 0
-            borderV[0] = 0;
-    }
-    else
-    {
-        borderV = new uchar[4]{static_cast<uchar>(borderValue[0]), static_cast<uchar>(borderValue[1]), static_cast<uchar>(borderValue[2]), static_cast<uchar>(borderValue[3])};
-        if (operation == CV_HAL_MORPH_DILATE) // same DBL_MAX -> 0 substitution, applied per channel
-        {
-            if (borderValue[0] == DBL_MAX)
-                borderV[0] = 0;
-            if (borderValue[1] == DBL_MAX)
-                borderV[1] = 0;
-            if (borderValue[2] == DBL_MAX)
-                borderV[2] = 0;
-            if (borderValue[3] == DBL_MAX)
-                borderV[3] = 0;
-        }
-    }
-
-    anchor_x = anchor_x < 0 ? kernel_width  / 2 : anchor_x; // a negative anchor selects the kernel centre
-    anchor_y = anchor_y < 0 ? kernel_height / 2 : anchor_y;
-    *context = reinterpret_cast<cvhalFilter2D*>(new Morph2D{operation, src_type, dst_type, kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, anchor_x, anchor_y, borderType, borderV});
-    return CV_HAL_ERROR_OK;
-}
-
-// Per-operation reduction traits: init() is the neutral element, mop() the scalar
-// reduction, and vop() the matching RVV reduction (vector/vector and vector/scalar).
-template<int op> struct rvv;
-
-// Erosion keeps the running minimum, seeded with the largest uchar.
-template<> struct rvv<CV_HAL_MORPH_ERODE>
-{
-    static inline uchar init() { return std::numeric_limits<uchar>::max(); }
-    static inline uchar mop(uchar lhs, uchar rhs) { return std::min(lhs, rhs); }
-    static inline vuint8m4_t vop(vuint8m4_t acc, vuint8m4_t px, size_t vl) { return __riscv_vminu(acc, px, vl); }
-    static inline vuint8m4_t vop(vuint8m4_t acc, uchar px, size_t vl) { return __riscv_vminu(acc, px, vl); }
-};
-
-// Dilation keeps the running maximum, seeded with the smallest uchar.
-template<> struct rvv<CV_HAL_MORPH_DILATE>
-{
-    static inline uchar init() { return std::numeric_limits<uchar>::min(); }
-    static inline uchar mop(uchar lhs, uchar rhs) { return std::max(lhs, rhs); }
-    static inline vuint8m4_t vop(vuint8m4_t acc, vuint8m4_t px, size_t vl) { return __riscv_vmaxu(acc, px, vl); }
-    static inline vuint8m4_t vop(vuint8m4_t acc, uchar px, size_t vl) { return __riscv_vmaxu(acc, px, vl); }
-};
-/* Computes output rows [start, end) of a 3x3 erode or dilate, selected by `op` through rvv<op>.
- * src_data/src_step address the source ROI of width x height pixels; full_width/full_height and
- * offset_x/offset_y are used for border lookup when BORDER_ISOLATED is not set.
- * dst_data is written as a densely packed buffer (row stride = width pixels, not src_step).
- * The first and last two columns of every row (or the whole row when width <= 4) go through the
- * scalar process() lambda; the columns in between use RVV. Always returns CV_HAL_ERROR_OK. */
-// the algorithm is copied from 3rdparty/carotene/src/morph.cpp,
-// in the function template void morph3x3
-template<int op>
-static inline int morph(int start, int end, Morph2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y)
-{
-    bool kernel[9]; // row-major 3x3 mask: true where the structuring element is non-zero
-    for (int i = 0; i < 9; i++)
-    {
-        kernel[i] = data->kernel_data[(i / 3) * data->kernel_step + i % 3] != 0;
-    }
-
-    constexpr int noval = std::numeric_limits<int>::max(); // sentinel: no source pixel, use the border value
-    auto access = [&](int x, int y) { // maps window coordinate (x = row, y = column) to a source (row, column); either may be noval
-        int pi, pj;
-        if (data->borderType & BORDER_ISOLATED)
-        {
-            // interpolate against the ROI size only
-            pi = filter::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED);
-            pj = filter::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED);
-            pi = pi < 0 ? noval : pi;
-            pj = pj < 0 ? noval : pj;
-        }
-        else
-        {
-            // interpolate in full-image coordinates, then translate back to ROI-relative ones
-            pi = filter::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType);
-            pj = filter::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType);
-            pi = pi < 0 ? noval : pi - offset_y;
-            pj = pj < 0 ? noval : pj - offset_x;
-        }
-        return std::make_pair(pi, pj);
-    };
-
-    auto process = [&](int x, int y) { // scalar path: reduces the 3x3 window of output pixel (row x, column y)
-        if (data->src_type == CV_8UC1)
-        {
-            uchar val = rvv<op>::init();
-            for (int i = 0; i < 9; i++)
-            {
-                if (kernel[i])
-                {
-                    auto p = access(x + i / 3, y + i % 3);
-                    if (p.first != noval && p.second != noval)
-                    {
-                        val = rvv<op>::mop(val, src_data[p.first * src_step + p.second]);
-                    }
-                    else
-                    {
-                        val = rvv<op>::mop(val, data->borderValue[0]); // tap without a source pixel
-                    }
-                }
-            }
-            dst_data[x * width + y] = val;
-        }
-        else
-        {
-            // four interleaved channels, each reduced independently
-            uchar val0, val1, val2, val3;
-            val0 = val1 = val2 = val3 = rvv<op>::init();
-            for (int i = 0; i < 9; i++)
-            {
-                if (kernel[i])
-                {
-                    auto p = access(x + i / 3, y + i % 3);
-                    if (p.first != noval && p.second != noval)
-                    {
-                        val0 = rvv<op>::mop(val0, src_data[p.first * src_step + p.second * 4    ]);
-                        val1 = rvv<op>::mop(val1, src_data[p.first * src_step + p.second * 4 + 1]);
-                        val2 = rvv<op>::mop(val2, src_data[p.first * src_step + p.second * 4 + 2]);
-                        val3 = rvv<op>::mop(val3, src_data[p.first * src_step + p.second * 4 + 3]);
-                    }
-                    else
-                    {
-                        val0 = rvv<op>::mop(val0, data->borderValue[0]);
-                        val1 = rvv<op>::mop(val1, data->borderValue[1]);
-                        val2 = rvv<op>::mop(val2, data->borderValue[2]);
-                        val3 = rvv<op>::mop(val3, data->borderValue[3]);
-                    }
-                }
-            }
-            dst_data[(x * width + y) * 4    ] = val0;
-            dst_data[(x * width + y) * 4 + 1] = val1;
-            dst_data[(x * width + y) * 4 + 2] = val2;
-            dst_data[(x * width + y) * 4 + 3] = val3;
-        }
-    };
-
-    for (int i = start; i < end; i++)
-    {
-        const int left = 2, right = width - 2; // columns [left, right) are handled by the vector path
-        if (left >= right)
-        {
-            // row too narrow for the vector path: every column is scalar
-            for (int j = 0; j < width; j++)
-                process(i, j);
-        }
-        else
-        {
-            for (int j = 0; j < left; j++)
-                process(i, j);
-            for (int j = right; j < width; j++)
-                process(i, j);
-
-            // the three source rows feeding this output row; nullptr means the row has no source and the border value is used
-            const uchar* row0 = access(i    , 0).first == noval ? nullptr : src_data + access(i    , 0).first * src_step;
-            const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step;
-            const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step;
-            if (data->src_type == CV_8UC1)
-            {
-                int vl;
-                for (int j = left; j < right; j += vl)
-                {
-                    vl = __riscv_vsetvl_e8m4(right - j);
-                    auto m0 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
-                    auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { // folds one source row (kernel taps k0, k1, k2) into m0
-                        if (!row)
-                        {
-                            m0 = rvv<op>::vop(m0, data->borderValue[0], vl);
-                            return;
-                        }
-
-                        const uchar* extra = row + j - data->anchor_x;
-                        auto v0 = __riscv_vle8_v_u8m4(extra, vl);
-
-                        if (k0) m0 = rvv<op>::vop(m0, v0, vl);
-                        v0 = __riscv_vslide1down(v0, extra[vl], vl); // shift by one pixel, pulling in the byte just past the loaded span
-                        if (k1) m0 = rvv<op>::vop(m0, v0, vl);
-                        if (!k2) return;
-                        v0 = __riscv_vslide1down(v0, extra[vl + 1], vl);
-                        m0 = rvv<op>::vop(m0, v0, vl);
-                    };
-
-                    loadsrc(row0, kernel[0], kernel[1], kernel[2]);
-                    loadsrc(row1, kernel[3], kernel[4], kernel[5]);
-                    loadsrc(row2, kernel[6], kernel[7], kernel[8]);
-                    __riscv_vse8(dst_data + i * width + j, m0, vl);
-                }
-            }
-            else
-            {
-                int vl, vl0, vl1;
-                for (int j = left; j < right; j += vl)
-                {
-                    vl = __riscv_vsetvl_e8m4(right - j);
-                    vl0 = std::min(vl, (int)__riscv_vlenb() * 2); // segment loads/stores below use m2 registers, so the m4 span is split into two parts
-                    vl1 = vl - vl0;
-                    auto m0 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
-                    auto m1 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
-                    auto m2 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
-                    auto m3 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
-
-                    auto opshift = [&](vuint8m4_t a, vuint8m4_t b, bool k0, bool k1, bool k2, uchar r1, uchar r2) { // one channel: applies taps k0..k2, sliding b by one pixel (r1, then r2 enter at the tail)
-                        if (k0) a = rvv<op>::vop(a, b, vl);
-                        b = __riscv_vslide1down(b, r1, vl);
-                        if (k1) a = rvv<op>::vop(a, b, vl);
-                        if (!k2) return a;
-                        b = __riscv_vslide1down(b, r2, vl);
-                        return rvv<op>::vop(a, b, vl);
-                    };
-                    auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { // folds one source row into the four per-channel accumulators
-                        if (!row)
-                        {
-                            m0 = rvv<op>::vop(m0, data->borderValue[0], vl);
-                            m1 = rvv<op>::vop(m1, data->borderValue[1], vl);
-                            m2 = rvv<op>::vop(m2, data->borderValue[2], vl);
-                            m3 = rvv<op>::vop(m3, data->borderValue[3], vl);
-                            return;
-                        }
-
-                        vuint8m4_t v0{}, v1{}, v2{}, v3{};
-                        const uchar* extra = row + (j - data->anchor_x) * 4;
-                        auto src = __riscv_vlseg4e8_v_u8m2x4(extra, vl0); // de-interleave the first vl0 pixels into the low halves
-                        v0 = __riscv_vset_v_u8m2_u8m4(v0, 0, __riscv_vget_v_u8m2x4_u8m2(src, 0));
-                        v1 = __riscv_vset_v_u8m2_u8m4(v1, 0, __riscv_vget_v_u8m2x4_u8m2(src, 1));
-                        v2 = __riscv_vset_v_u8m2_u8m4(v2, 0, __riscv_vget_v_u8m2x4_u8m2(src, 2));
-                        v3 = __riscv_vset_v_u8m2_u8m4(v3, 0, __riscv_vget_v_u8m2x4_u8m2(src, 3));
-                        src = __riscv_vlseg4e8_v_u8m2x4(extra + vl0 * 4, vl1); // remaining vl1 pixels go into the high halves
-                        v0 = __riscv_vset_v_u8m2_u8m4(v0, 1, __riscv_vget_v_u8m2x4_u8m2(src, 0));
-                        v1 = __riscv_vset_v_u8m2_u8m4(v1, 1, __riscv_vget_v_u8m2x4_u8m2(src, 1));
-                        v2 = __riscv_vset_v_u8m2_u8m4(v2, 1, __riscv_vget_v_u8m2x4_u8m2(src, 2));
-                        v3 = __riscv_vset_v_u8m2_u8m4(v3, 1, __riscv_vget_v_u8m2x4_u8m2(src, 3));
-
-                        m0 = opshift(m0, v0, k0, k1, k2, extra[vl * 4    ], extra[vl * 4 + 4]);
-                        m1 = opshift(m1, v1, k0, k1, k2, extra[vl * 4 + 1], extra[vl * 4 + 5]);
-                        m2 = opshift(m2, v2, k0, k1, k2, extra[vl * 4 + 2], extra[vl * 4 + 6]);
-                        m3 = opshift(m3, v3, k0, k1, k2, extra[vl * 4 + 3], extra[vl * 4 + 7]);
-                    };
-
-                    loadsrc(row0, kernel[0], kernel[1], kernel[2]);
-                    loadsrc(row1, kernel[3], kernel[4], kernel[5]);
-                    loadsrc(row2, kernel[6], kernel[7], kernel[8]);
-                    vuint8m2x4_t val{}; // re-interleave the per-channel results and store them in two parts
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 0));
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 0));
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 0));
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 0));
-                    __riscv_vsseg4e8(dst_data + (i * width + j) * 4, val, vl0);
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 1));
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 1));
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 1));
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 1));
-                    __riscv_vsseg4e8(dst_data + (i * width + j + vl0) * 4, val, vl1);
-                }
-            }
-        }
-    }
-
-    return CV_HAL_ERROR_OK;
-}
-
-// HAL entry point: runs the erode/dilate described by `context` (a Morph2D created by
-// morphInit) on a width x height ROI.
-//
-// The result is first produced in a packed scratch buffer and copied to dst_data row by
-// row (dst_step bytes apart) only when the kernel reports CV_HAL_ERROR_OK, so a failed
-// run never overwrites the destination. The scratch size is computed in size_t to avoid
-// int overflow on large images.
-//
-// Returns the status of the kernel, or CV_HAL_ERROR_NOT_IMPLEMENTED when the stored
-// operation is neither erode nor dilate.
-inline int morph(cvhalFilter2D* context, uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/)
-{
-    Morph2D* data = reinterpret_cast<Morph2D*>(context);
-    const size_t cn = data->src_type == CV_8UC1 ? 1 : 4;
-    const size_t row_bytes = static_cast<size_t>(width) * cn;
-    std::vector<uchar> dst(row_bytes * static_cast<size_t>(height));
-
-    int res = CV_HAL_ERROR_NOT_IMPLEMENTED;
-    switch (data->operation)
-    {
-    case CV_HAL_MORPH_ERODE:
-        res = filter::invoke(0, height, {morph<CV_HAL_MORPH_ERODE>}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y);
-        break;
-    case CV_HAL_MORPH_DILATE:
-        res = filter::invoke(0, height, {morph<CV_HAL_MORPH_DILATE>}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y);
-        break;
-    }
-
-    if (res != CV_HAL_ERROR_OK)
-        return res;
-
-    for (int i = 0; i < height; i++)
-        std::copy(dst.data() + i * row_bytes, dst.data() + (i + 1) * row_bytes, dst_data + i * dst_step);
-    return res;
-}
-
-// Releases a context created by morphInit: first the border value, then the Morph2D itself.
-// morphInit allocates the border value as a single uchar for CV_8UC1 and as uchar[4]
-// otherwise, so the matching form of delete has to be chosen from src_type.
-// Always returns CV_HAL_ERROR_OK.
-inline int morphFree(cvhalFilter2D* context)
-{
-    Morph2D* data = reinterpret_cast<Morph2D*>(context);
-    if (data->src_type == CV_8UC1)
-        delete data->borderValue;
-    else
-        delete[] data->borderValue;
-    delete data;
-    return CV_HAL_ERROR_OK;
-}
-} // cv::cv_hal_rvv::morph
-
-}}
-
-#endif
--- a/3rdparty/hal_rvv/hal_rvv_1p0/flip.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/flip.hpp
@ -1,225 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#pragma once
-
-#include <riscv_vector.h>
-#include <opencv2/core/base.hpp>
-#include "hal_rvv_1p0/types.hpp"
-
-namespace cv { namespace cv_hal_rvv {
-
-#undef cv_hal_flip
-#define cv_hal_flip cv::cv_hal_rvv::flip
-
-// Gather policy selected by flip() when VLEN <= 256: u8m8 data permuted with a u8m8 index table.
-struct FlipVlen256
-{
-    using SrcType = RVV_U8M8;
-    using TabType = RVV_U8M8;
-    using TabVecType = typename TabType::VecType;
-
-    // Loads vl bytes from src, permutes them through `tab` and stores the result to dst.
-    static inline void gather(const uchar* src, TabVecType tab, uchar* dst, size_t vl)
-    {
-        const auto permuted = __riscv_vrgather(SrcType::vload(src, vl), tab, vl);
-        SrcType::vstore(dst, permuted, vl);
-    }
-};
-
-// Gather policy selected by flip() when VLEN > 256: u8m4 data permuted with a 16-bit (u16m8) index table.
-struct FlipVlen512 : RVV_U8M8
-{
-    using SrcType = RVV_U8M4;
-    using TabType = RVV_U16M8;
-    using TabVecType = typename TabType::VecType;
-
-    // Loads vl bytes from src, permutes them through the 16-bit indices in `tab` and stores the result to dst.
-    static inline void gather(const uchar* src, TabVecType tab, uchar* dst, size_t vl)
-    {
-        const auto permuted = __riscv_vrgatherei16(SrcType::vload(src, vl), tab, vl);
-        SrcType::vstore(dst, permuted, vl);
-    }
-};
-
-// Fills buf with byte indices that reverse the order of esz-byte elements in a span of
-// len bytes while keeping the bytes inside each element in their original order,
-// e.g. len = 6, esz = 2 gives {4, 5, 2, 3, 0, 1}.
-template <typename T>
-inline void flipFillBuffer(T* buf, size_t len, int esz)
-{
-    T* out = buf;
-    int base = (int)len - esz;   // byte offset of the last whole element
-    while (base >= 0)
-    {
-        for (int k = 0; k < esz; k++)
-            out[k] = (T)(base + k);
-        out += esz;
-        base -= esz;
-    }
-}
-
-// Vertical flip: reverses the order of the rows. Rows are exchanged pairwise from the two
-// ends towards the middle; with an odd row count the middle row is copied on its own.
-inline void flipX(int esz,
-                  const uchar* src_data,
-                  size_t src_step,
-                  int src_width,
-                  int src_height,
-                  uchar* dst_data,
-                  size_t dst_step)
-{
-    size_t row_bytes = (size_t)src_width * esz;
-    const uchar* top_src = src_data;
-    const uchar* bottom_src = src_data + src_step * (src_height - 1);
-    uchar* top_dst = dst_data;
-    uchar* bottom_dst = dst_data + dst_step * (src_height - 1);
-    size_t vl;
-
-    int rows_left = src_height;
-    while (rows_left >= 2)
-    {
-        for (size_t off = 0; off < row_bytes; off += vl)
-        {
-            vl = __riscv_vsetvl_e8m8(row_bytes - off);
-            __riscv_vse8(bottom_dst + off, __riscv_vle8_v_u8m8(top_src + off, vl), vl);
-            __riscv_vse8(top_dst + off, __riscv_vle8_v_u8m8(bottom_src + off, vl), vl);
-        }
-        rows_left -= 2;
-        top_src += src_step;
-        top_dst += dst_step;
-        bottom_src -= src_step;
-        bottom_dst -= dst_step;
-    }
-
-    // Exactly one row left: both cursors now point at the middle row.
-    if (rows_left == 1)
-    {
-        for (size_t off = 0; off < row_bytes; off += vl)
-        {
-            vl = __riscv_vsetvl_e8m8(row_bytes - off);
-            __riscv_vse8(top_dst + off, __riscv_vle8_v_u8m8(bottom_src + off, vl), vl);
-        }
-    }
-}
-/* Horizontal flip: mirrors every row of a src_width x src_height image of esz-byte elements.
- * Each row is processed in chunks of vl bytes; a gather through tab_v reverses the element
- * order inside a chunk, and the chunk is stored at the mirrored position in the row. */
-template <typename FlipVlen,
-          typename SrcType = typename FlipVlen::SrcType,
-          typename TabType = typename FlipVlen::TabType>
-inline void flipY(int esz,
-                  const uchar* src_data,
-                  size_t src_step,
-                  int src_width,
-                  int src_height,
-                  uchar* dst_data,
-                  size_t dst_step)
-{
-    size_t w = (size_t)src_width * esz; // row length in bytes
-    size_t vl = std::min(SrcType::setvlmax() / esz * esz, w); // chunk size: rounded down to a multiple of esz, at most one row
-    typename TabType::VecType tab_v; // gather indices that reverse the elements of one chunk
-    if (esz == 1)
-        tab_v = __riscv_vrsub(TabType::vid(vl), vl - 1, vl); // index k -> vl - 1 - k
-    else
-    {
-        // max vlen supported is 1024 (vlmax of u8m4 for vlen 1024 is 512)
-        typename TabType::ElemType buf[512];
-        flipFillBuffer(buf, vl, esz);
-        tab_v = TabType::vload(buf, vl);
-    }
-    if (vl == w) // a whole row fits in one chunk: one gather per row
-        for (; src_height; src_height--, src_data += src_step, dst_data += dst_step)
-            FlipVlen::gather(src_data, tab_v, dst_data, vl);
-    else
-        for (; src_height; src_height--, src_data += src_step, dst_data += dst_step)
-        {
-            auto src0 = src_data, src1 = src_data + w - vl; // leftmost and rightmost chunk of the row
-            auto dst0 = dst_data, dst1 = dst_data + w - vl;
-            for (; src0 < src1 + vl; src0 += vl, src1 -= vl, dst0 += vl, dst1 -= vl) // walk inwards from both ends; chunks may overlap near the middle
-            {
-                FlipVlen::gather(src0, tab_v, dst1, vl);
-                FlipVlen::gather(src1, tab_v, dst0, vl);
-            }
-        }
-}
-/* Flip around both axes: row r of the source is written, mirrored, to row (src_height - 1 - r).
- * Rows are handled pairwise from the two ends towards the middle; inside a row the same
- * chunked gather as in flipY reverses the element order. */
-template <typename FlipVlen,
-          typename SrcType = typename FlipVlen::SrcType,
-          typename TabType = typename FlipVlen::TabType>
-inline void flipXY(int esz,
-                   const uchar* src_data,
-                   size_t src_step,
-                   int src_width,
-                   int src_height,
-                   uchar* dst_data,
-                   size_t dst_step)
-{
-    size_t w = (size_t)src_width * esz; // row length in bytes
-    size_t vl = std::min(SrcType::setvlmax() / esz * esz, w); // chunk size: rounded down to a multiple of esz, at most one row
-    typename TabType::VecType tab_v; // gather indices that reverse the elements of one chunk
-    if (esz == 1)
-        tab_v = __riscv_vrsub(TabType::vid(vl), vl - 1, vl); // index k -> vl - 1 - k
-    else
-    {
-        // max vlen supported is 1024 (vlmax of u8m4 for vlen 1024 is 512)
-        typename TabType::ElemType buf[512];
-        flipFillBuffer(buf, vl, esz);
-        tab_v = TabType::vload(buf, vl);
-    }
-    auto src0 = src_data, src1 = src_data + src_step * (src_height - 1); // first and last row
-    auto dst0 = dst_data, dst1 = dst_data + dst_step * (src_height - 1);
-    if (vl == w) // a whole row fits in one chunk
-    {
-        for (src_height -= 2; src_height >= 0;
-             src_height -= 2,
-             src0 += src_step,
-             dst0 += dst_step,
-             src1 -= src_step,
-             dst1 -= dst_step)
-        {
-            FlipVlen::gather(src0, tab_v, dst1, vl);
-            FlipVlen::gather(src1, tab_v, dst0, vl);
-        }
-        if (src_height == -1) // odd row count: one middle row is left
-        {
-            FlipVlen::gather(src1, tab_v, dst0, vl);
-        }
-    }
-    else
-    {
-        for (src_height -= 2; src_height >= 0;
-             src_height -= 2,
-             src0 += src_step,
-             dst0 += dst_step,
-             src1 -= src_step,
-             dst1 -= dst_step)
-        {
-            for (size_t i = 0; 2 * i < w; i += vl) // chunk at offset i from the left and its counterpart from the right
-            {
-                FlipVlen::gather(src0 + i, tab_v, dst1 + w - i - vl, vl);
-                FlipVlen::gather(src0 + w - i - vl, tab_v, dst1 + i, vl);
-                FlipVlen::gather(src1 + i, tab_v, dst0 + w - i - vl, vl);
-                FlipVlen::gather(src1 + w - i - vl, tab_v, dst0 + i, vl);
-            }
-        }
-        if (src_height == -1) // odd row count: one middle row is left
-        {
-            for (size_t i = 0; 2 * i < w; i += vl)
-            {
-                FlipVlen::gather(src1 + i, tab_v, dst0 + w - i - vl, vl);
-                FlipVlen::gather(src1 + w - i - vl, tab_v, dst0 + i, vl);
-            }
-        }
-    }
-}
-
-// HAL entry point for cv::flip.
-//
-// flip_mode == 0 reverses the row order (flipX), flip_mode > 0 mirrors each row (flipY),
-// flip_mode < 0 does both (flipXY). The gather policy is chosen from the hardware vector
-// length: FlipVlen256 for VLEN <= 256, FlipVlen512 otherwise.
-//
-// Returns CV_HAL_ERROR_NOT_IMPLEMENTED, so the caller can fall back, for:
-//  - negative sizes or in-place operation (src_data == dst_data);
-//  - mirrored modes whose element size exceeds one gather, since the chunk length would
-//    round down to zero and the chunk loops would never advance;
-//  - mirrored modes with esz != 1 on hardware whose gather width exceeds the 512-entry
-//    index buffer used by flipY/flipXY.
-// Returns CV_HAL_ERROR_OK otherwise.
-inline int flip(int src_type,
-                const uchar* src_data,
-                size_t src_step,
-                int src_width,
-                int src_height,
-                uchar* dst_data,
-                size_t dst_step,
-                int flip_mode)
-{
-    if (src_width < 0 || src_height < 0 || src_data == dst_data)
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-
-    int esz = CV_ELEM_SIZE(src_type);
-    if (flip_mode == 0)
-    {
-        flipX(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
-        return CV_HAL_ERROR_OK;
-    }
-
-    const size_t vlen_bits = __riscv_vlenb() * 8;
-    const bool use_vlen256 = vlen_bits <= 256;
-    // Bytes handled by one gather: VLMAX of e8m8 for FlipVlen256, of e8m4 for FlipVlen512.
-    // NOTE(review): assumes SrcType::setvlmax() returns exactly this VLMAX - confirm against types.hpp.
-    const size_t gather_bytes = use_vlen256 ? vlen_bits : vlen_bits / 2;
-    if ((size_t)esz > gather_bytes)
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (esz != 1 && gather_bytes > 512)
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-
-    if (flip_mode > 0)
-    {
-        if (use_vlen256)
-            flipY<FlipVlen256>(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
-        else
-            flipY<FlipVlen512>(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
-    }
-    else
-    {
-        if (use_vlen256)
-            flipXY<FlipVlen256>(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
-        else
-            flipXY<FlipVlen512>(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
-    }
-
-    return CV_HAL_ERROR_OK;
-}
-
-}}  // namespace cv::cv_hal_rvv
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@ -5,8 +5,8 @@ if (WIN32 AND NOT ARM)
  message(FATAL_ERROR "BUILD_TBB option supports Windows on ARM only!\nUse regular official TBB build instead of the BUILD_TBB option!")
 endif()

-ocv_update(OPENCV_TBB_RELEASE "v2021.11.0")
-ocv_update(OPENCV_TBB_RELEASE_MD5 "b301151120b08a17e98dcdda6e4f6011")
+ocv_update(OPENCV_TBB_RELEASE "v2022.1.0")
+ocv_update(OPENCV_TBB_RELEASE_MD5 "cce28e6cb1ceae14a93848990c98cb6b")
 ocv_update(OPENCV_TBB_FILENAME "${OPENCV_TBB_RELEASE}.tar.gz")
 string(REGEX REPLACE "^v" "" OPENCV_TBB_RELEASE_ "${OPENCV_TBB_RELEASE}")
 #ocv_update(OPENCV_TBB_SUBDIR ...)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -369,7 +369,7 @@ OCV_OPTION(WITH_ITT "Include Intel ITT support" ON
 OCV_OPTION(WITH_PROTOBUF "Enable libprotobuf" ON
  VISIBLE_IF TRUE
  VERIFY HAVE_PROTOBUF)
-OCV_OPTION(WITH_IMGCODEC_GIF "Include GIF support" OFF
+OCV_OPTION(WITH_IMGCODEC_GIF "Include GIF support" ON
  VISIBLE_IF TRUE
  VERIFY HAVE_IMGCODEC_GIF)
 OCV_OPTION(WITH_IMGCODEC_HDR "Include HDR support" ON
@ -621,6 +621,7 @@ if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)

    cmake_policy(SET CMP0092 NEW) # CMake 3.15+: leave warning flags out of default CMAKE_<LANG>_FLAGS flags.
    if(CMAKE_CUDA_COMPILER)
+      #  CMake 3.18+: if CMAKE_CUDA_ARCHITECTURES is empty, enable_language(CUDA) sets it to the default architecture chosen by the compiler. To trigger the OpenCV custom CUDA architecture search, an empty value needs to be respected; see https://github.com/opencv/opencv/pull/25941.
      if(CMAKE_CUDA_ARCHITECTURES)
        set(USER_DEFINED_CMAKE_CUDA_ARCHITECTURES TRUE)
      endif()
@ -883,6 +884,13 @@ if(NOT DEFINED OpenCV_HAL)
  set(OpenCV_HAL "OpenCV_HAL")
 endif()

+if(HAVE_IPP)
+  ocv_debug_message(STATUS "Enable IPP acceleration")
+  if(NOT ";${OpenCV_HAL};" MATCHES ";ipp;")
+    set(OpenCV_HAL "ipp;${OpenCV_HAL}")
+  endif()
+endif()
+
 if(HAVE_FASTCV)
  ocv_debug_message(STATUS "Enable FastCV acceleration")
  if(NOT ";${OpenCV_HAL};" MATCHES ";fastcv;")
@ -921,7 +929,7 @@ endif()
 foreach(hal ${OpenCV_HAL})
  if(hal STREQUAL "carotene")
    if(";${CPU_BASELINE_FINAL};" MATCHES ";NEON;")
-      add_subdirectory(3rdparty/carotene/hal)
+      add_subdirectory(hal/carotene/hal)
      ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS)
      list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION})")
    else()
@ -929,19 +937,19 @@ foreach(hal ${OpenCV_HAL})
    endif()
  elseif(hal STREQUAL "fastcv")
    if((ARM OR AARCH64) AND (ANDROID OR (UNIX AND NOT APPLE AND NOT IOS AND NOT XROS)))
-      add_subdirectory(3rdparty/fastcv)
+      add_subdirectory(hal/fastcv)
      ocv_hal_register(FASTCV_HAL_LIBRARIES FASTCV_HAL_HEADERS FASTCV_HAL_INCLUDE_DIRS)
      list(APPEND OpenCV_USED_HAL "fastcv (ver ${FASTCV_HAL_VERSION})")
    else()
      message(STATUS "FastCV: fastcv is not available, disabling fastcv...")
    endif()
  elseif(hal STREQUAL "kleidicv")
-    add_subdirectory(3rdparty/kleidicv)
+    add_subdirectory(hal/kleidicv)
    ocv_hal_register(KLEIDICV_HAL_LIBRARIES KLEIDICV_HAL_HEADERS KLEIDICV_HAL_INCLUDE_DIRS)
    list(APPEND OpenCV_USED_HAL "KleidiCV (ver ${KLEIDICV_HAL_VERSION})")
  elseif(hal STREQUAL "ndsrvp")
    if(CMAKE_C_FLAGS MATCHES "-mext-dsp" AND CMAKE_CXX_FLAGS MATCHES "-mext-dsp" AND NOT ";${CPU_BASELINE_FINAL};" MATCHES ";RVV;")
-      add_subdirectory(3rdparty/ndsrvp)
+      add_subdirectory(hal/ndsrvp)
      ocv_hal_register(NDSRVP_HAL_LIBRARIES NDSRVP_HAL_HEADERS NDSRVP_HAL_INCLUDE_DIRS)
      list(APPEND OpenCV_USED_HAL "ndsrvp (ver ${NDSRVP_HAL_VERSION})")
    else()
@ -949,12 +957,16 @@ foreach(hal ${OpenCV_HAL})
    endif()
  elseif(hal STREQUAL "halrvv")
    if(";${CPU_BASELINE_FINAL};" MATCHES ";RVV;")
-      add_subdirectory(3rdparty/hal_rvv/)
+      add_subdirectory(hal/riscv-rvv)
      ocv_hal_register(RVV_HAL_LIBRARIES RVV_HAL_HEADERS RVV_HAL_INCLUDE_DIRS)
      list(APPEND OpenCV_USED_HAL "HAL RVV (ver ${RVV_HAL_VERSION})")
    else()
      message(STATUS "HAL RVV: RVV is not available, disabling halrvv...")
    endif()
+  elseif(hal STREQUAL "ipp")
+    add_subdirectory(hal/ipp)
+    ocv_hal_register(IPP_HAL_LIBRARIES IPP_HAL_HEADERS IPP_HAL_INCLUDE_DIRS)
+    list(APPEND OpenCV_USED_HAL "ipp (ver ${IPP_HAL_VERSION})")
  else()
    ocv_debug_message(STATUS "OpenCV HAL: ${hal} ...")
    ocv_clear_vars(OpenCV_HAL_LIBRARIES OpenCV_HAL_HEADERS OpenCV_HAL_INCLUDE_DIRS)
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@ -149,13 +149,16 @@ macro(ocv_cuda_compile VAR)
  ocv_check_windows_crt_linkage()
  ocv_nvcc_flags()

-  if(UNIX OR APPLE)
-    if(NOT " ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_DEBUG} ${CUDA_NVCC_FLAGS}" MATCHES "-std=")
-      if(CUDA_VERSION VERSION_LESS "11.0")
+  if(NOT " ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_DEBUG} ${CUDA_NVCC_FLAGS}" MATCHES "-std=")
+    if(CUDA_VERSION VERSION_LESS "11.0")
+      # Windows version does not support --std option
+      if(UNIX OR APPLE)
        list(APPEND CUDA_NVCC_FLAGS "--std=c++11")
-      else()
-        list(APPEND CUDA_NVCC_FLAGS "--std=c++14")
      endif()
+    elseif(CUDA_VERSION VERSION_LESS "12.8")
+      list(APPEND CUDA_NVCC_FLAGS "--std=c++14")
+    elseif(CUDA_VERSION VERSION_GREATER_EQUAL "12.8")
+      list(APPEND CUDA_NVCC_FLAGS "--std=c++17")
    endif()
  endif()

--- a/cmake/OpenCVDetectCUDALanguage.cmake
+++ b/cmake/OpenCVDetectCUDALanguage.cmake
@ -33,10 +33,12 @@ if(CMAKE_CUDA_COMPILER AND CUDAToolkit_FOUND)
  set(CUDA_TOOLKIT_INCLUDE ${CUDAToolkit_INCLUDE_DIRS})
  set(CUDA_VERSION_STRING ${CUDAToolkit_VERSION})
  set(CUDA_VERSION ${CUDAToolkit_VERSION})
-  if(NOT CUDA_VERSION VERSION_LESS 11.0)
+  if(CUDA_VERSION VERSION_LESS 11.0)
+      set(CMAKE_CUDA_STANDARD 11)
+  elseif(CUDA_VERSION VERSION_LESS 12.8)
      set(CMAKE_CUDA_STANDARD 14)
  else()
-      set(CMAKE_CUDA_STANDARD 11)
+      set(CMAKE_CUDA_STANDARD 17)
  endif()
  if(UNIX AND NOT BUILD_SHARED_LIBS)
      set(CUDA_LIB_EXT "_static")
--- a/cmake/OpenCVFindLibsPerf.cmake
+++ b/cmake/OpenCVFindLibsPerf.cmake
@ -165,7 +165,7 @@ if(WITH_KLEIDICV)
    set(HAVE_KLEIDICV ON)
  endif()
  if(NOT HAVE_KLEIDICV)
-    include("${OpenCV_SOURCE_DIR}/3rdparty/kleidicv/kleidicv.cmake")
+    include("${OpenCV_SOURCE_DIR}/hal/kleidicv/kleidicv.cmake")
    download_kleidicv(KLEIDICV_SOURCE_PATH)
    if(KLEIDICV_SOURCE_PATH)
      set(HAVE_KLEIDICV ON)
@ -195,13 +195,16 @@ if(WITH_FASTCV)
      set(FastCV_INCLUDE_PATH "${FCV_ROOT_DIR}/inc" CACHE PATH "FastCV includes directory")
      set(FastCV_LIB_PATH "${FCV_ROOT_DIR}/libs" CACHE PATH "FastCV library directory")
      ocv_install_3rdparty_licenses(FastCV "${OpenCV_BINARY_DIR}/3rdparty/fastcv/LICENSE")
-      if(ANDROID)
-        set(FASTCV_LIBRARY "${FastCV_LIB_PATH}/libfastcvopt.so" CACHE PATH "FastCV library")
-        install(FILES "${FASTCV_LIBRARY}" DESTINATION "${OPENCV_LIB_INSTALL_PATH}" COMPONENT "bin")
-      else()
-        set(FASTCV_LIBRARY "${FastCV_LIB_PATH}/libfastcv.a" CACHE PATH "FastCV library")
-        install(FILES "${FASTCV_LIBRARY}" DESTINATION "${OPENCV_LIB_INSTALL_PATH}" COMPONENT "dev")
+      add_library(fastcv STATIC IMPORTED)
+      set_target_properties(fastcv PROPERTIES
+          IMPORTED_LINK_INTERFACE_LIBRARIES ""
+          IMPORTED_LOCATION "${FastCV_LIB_PATH}/libfastcv.a"
+      )
+      if (NOT BUILD_SHARED_LIBS)
+        install(FILES "${FastCV_LIB_PATH}/libfastcv.a" DESTINATION "${OPENCV_3P_LIB_INSTALL_PATH}" COMPONENT "dev")
      endif()
+      set(FASTCV_LIBRARY "fastcv" CACHE PATH "FastCV library")
+      list(APPEND OPENCV_LINKER_LIBS ${FASTCV_LIBRARY})
    else()
      set(HAVE_FASTCV FALSE CACHE BOOL "FastCV status")
    endif()
--- a/cmake/OpenCVFindOpenBLAS.cmake
+++ b/cmake/OpenCVFindOpenBLAS.cmake
@ -1,107 +1,42 @@
-#COPYRIGHT
-#
-#All contributions by the University of California:
-#Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
-#All rights reserved.
-#
-#All other contributions:
-#Copyright (c) 2014, 2015, the respective contributors
-#All rights reserved.
-#
-#Caffe uses a shared copyright model: each contributor holds copyright over
-#their contributions to Caffe. The project versioning records all such
-#contribution and copyright details. If a contributor wants to further mark
-#their specific copyright on a particular contribution, they should indicate
-#their copyright solely in the commit message of the change when it is
-#committed.
-#
-#LICENSE
-#
-#Redistribution and use in source and binary forms, with or without
-#modification, are permitted provided that the following conditions are met:
-#
-#1. Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#2. Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-#DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-#ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#CONTRIBUTION AGREEMENT
-#
-#By contributing to the BVLC/caffe repository through pull-request, comment,
-#or otherwise, the contributor releases their content to the
-#license and copyright terms herein.
+# Search for OpenBLAS library

-SET(Open_BLAS_INCLUDE_SEARCH_PATHS
-  $ENV{OpenBLAS_HOME}
-  $ENV{OpenBLAS_HOME}/include
-  $ENV{OpenBLAS_HOME}/include/openblas
-  /opt/OpenBLAS/include
-  /usr/local/include/openblas
-  /usr/include/openblas
-  /usr/local/include/openblas-base
-  /usr/include/openblas-base
-  /usr/local/include
-  /usr/include
-)
+if(NOT OpenBLAS_FOUND AND NOT SKIP_OPENBLAS_PACKAGE)  # step 1: try find_package(), unless SKIP_OPENBLAS_PACKAGE is set
+  find_package(OpenBLAS QUIET)
+  if(OpenBLAS_FOUND)
+    message(STATUS "Found OpenBLAS package")
+  endif()
+endif()

-SET(Open_BLAS_LIB_SEARCH_PATHS
-        $ENV{OpenBLAS}
-        $ENV{OpenBLAS}/lib
-        $ENV{OpenBLAS_HOME}
-        $ENV{OpenBLAS_HOME}/lib
-        /opt/OpenBLAS/lib
-        /usr/local/lib64
-        /usr/local/lib
-        /lib/openblas-base
-        /lib64/
-        /lib/
-        /usr/lib/openblas-base
-        /usr/lib64
-        /usr/lib
- )
+if(NOT OpenBLAS_FOUND)  # step 2: look only under the OpenBLAS / OpenBLAS_HOME environment variables (NO_DEFAULT_PATH)
+  find_library(OpenBLAS_LIBRARIES NAMES openblas PATHS ENV "OpenBLAS" ENV "OpenBLAS_HOME" PATH_SUFFIXES "lib" NO_DEFAULT_PATH)
+  find_path(OpenBLAS_INCLUDE_DIRS NAMES cblas.h PATHS ENV "OpenBLAS" ENV "OpenBLAS_HOME" PATH_SUFFIXES "include" NO_DEFAULT_PATH)
+  find_path(OpenBLAS_LAPACKE_DIR NAMES lapacke.h PATHS "${OpenBLAS_INCLUDE_DIRS}" ENV "OpenBLAS" ENV "OpenBLAS_HOME" PATH_SUFFIXES "include" NO_DEFAULT_PATH)  # lapacke.h is optional for OpenBLAS_FOUND
+  if(OpenBLAS_LIBRARIES AND OpenBLAS_INCLUDE_DIRS)  # both the library and cblas.h are required
+    message(STATUS "Found OpenBLAS using environment hint")
+    set(OpenBLAS_FOUND ON)
+  else()
+    ocv_clear_vars(OpenBLAS_LIBRARIES OpenBLAS_INCLUDE_DIRS)  # NOTE(review): presumably unsets partial results so a later search is not short-circuited - confirm
+  endif()
+endif()

-FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
-FIND_LIBRARY(OpenBLAS_LIB NAMES openblas libopenblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS}  NO_DEFAULT_PATH)
+if(NOT OpenBLAS_FOUND)  # step 3: fall back to CMake's default search locations
+  find_library(OpenBLAS_LIBRARIES NAMES openblas)
+  find_path(OpenBLAS_INCLUDE_DIRS NAMES cblas.h)
+  find_path(OpenBLAS_LAPACKE_DIR NAMES lapacke.h PATHS "${OpenBLAS_INCLUDE_DIRS}")  # lapacke.h is optional for OpenBLAS_FOUND
+  if(OpenBLAS_LIBRARIES AND OpenBLAS_INCLUDE_DIRS)  # both the library and cblas.h are required
+    message(STATUS "Found OpenBLAS in the system")
+    set(OpenBLAS_FOUND ON)
+  else()
+    ocv_clear_vars(OpenBLAS_LIBRARIES OpenBLAS_INCLUDE_DIRS)  # NOTE(review): presumably unsets partial results - confirm
+  endif()
+endif()

-SET(OpenBLAS_FOUND ON)
+if(OpenBLAS_FOUND)  # publish the result: add the lapacke.h directory (when found) to the include list and log both variables
+  if(OpenBLAS_LAPACKE_DIR)
+    set(OpenBLAS_INCLUDE_DIRS "${OpenBLAS_INCLUDE_DIRS};${OpenBLAS_LAPACKE_DIR}")
+  endif()
+  message(STATUS "OpenBLAS_LIBRARIES=${OpenBLAS_LIBRARIES}")
+  message(STATUS "OpenBLAS_INCLUDE_DIRS=${OpenBLAS_INCLUDE_DIRS}")
+endif()

-#    Check include files
-IF(NOT OpenBLAS_INCLUDE_DIR)
-    SET(OpenBLAS_FOUND OFF)
-    MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off")
-ENDIF()
-
-#    Check libraries
-IF(NOT OpenBLAS_LIB)
-    SET(OpenBLAS_FOUND OFF)
-    MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off")
-ENDIF()
-
-IF (OpenBLAS_FOUND)
-  IF (NOT OpenBLAS_FIND_QUIETLY)
-    MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}")
-    MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}")
-  ENDIF (NOT OpenBLAS_FIND_QUIETLY)
-ELSE (OpenBLAS_FOUND)
-  IF (OpenBLAS_FIND_REQUIRED)
-    MESSAGE(FATAL_ERROR "Could not find OpenBLAS")
-  ENDIF (OpenBLAS_FIND_REQUIRED)
-ENDIF (OpenBLAS_FOUND)
-
-MARK_AS_ADVANCED(
-    OpenBLAS_INCLUDE_DIR
-    OpenBLAS_LIB
-    OpenBLAS
-)
+mark_as_advanced(OpenBLAS_LIBRARIES OpenBLAS_INCLUDE_DIRS OpenBLAS_LAPACKE_DIR)
--- a/cmake/checks/cxx11.cpp
+++ b/cmake/checks/cxx11.cpp
@ -1,13 +0,0 @@
-#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1600)
-// OK
-#else
-#error "C++11 is not supported"
-#endif
-
-static int test() { return 0; }
-
-int main()
-{
-    auto res = test();
-    return res;
-}
--- a/doc/js_tutorials/js_assets/js_houghcircles_HoughCirclesP.html
+++ b/doc/js_tutorials/js_assets/js_houghcircles_HoughCirclesP.html
@ -47,7 +47,7 @@ let color = new cv.Scalar(255, 0, 0);
 cv.cvtColor(src, src, cv.COLOR_RGBA2GRAY, 0);
 // You can try more different parameters
 cv.HoughCircles(src, circles, cv.HOUGH_GRADIENT,
-                1, 45, 75, 40, 0, 0);
+                1, 45, 175, 40, 0, 0);
 // draw circles
 for (let i = 0; i < circles.cols; ++i) {
    let x = circles.data32F[i * 3];
--- a/doc/js_tutorials/js_assets/js_setup_usage.html
+++ b/doc/js_tutorials/js_assets/js_setup_usage.html
@ -36,7 +36,8 @@ inputElement.addEventListener('change', (e) => {
    imgElement.src = URL.createObjectURL(e.target.files[0]);
 }, false);

-imgElement.onload = function() {
+imgElement.onload = async function() {
+    cv = (cv instanceof Promise) ? await cv : cv;
    let mat = cv.imread(imgElement);
    cv.imshow('canvasOutput', mat);
    mat.delete();
--- a/doc/js_tutorials/js_imgproc/js_watershed/js_watershed.markdown
+++ b/doc/js_tutorials/js_imgproc/js_watershed/js_watershed.markdown
@ -17,7 +17,7 @@ nearby, water from different valleys, obviously with different colors will start
 that, you build barriers in the locations where water merges. You continue the work of filling water
 and building barriers until all the peaks are under water. Then the barriers you created gives you
 the segmentation result. This is the "philosophy" behind the watershed. You can visit the [CMM
-webpage on watershed](http://cmm.ensmp.fr/~beucher/wtshed.html) to understand it with the help of
+webpage on watershed](https://people.cmm.minesparis.psl.eu/users/beucher/wtshed.html) to understand it with the help of
 some animations.

 But this approach gives you oversegmented result due to noise or any other irregularities in the
--- a/doc/js_tutorials/js_setup/js_setup/js_setup.markdown
+++ b/doc/js_tutorials/js_setup/js_setup/js_setup.markdown
@ -73,6 +73,10 @@ Building OpenCV.js from Source
 ---------------------------------------

 -#  To build `opencv.js`, execute python script `<opencv_src_dir>/platforms/js/build_js.py <build_dir>`.
+    The build script builds the WebAssembly version by default (the `--build_wasm` switch is kept for backward compatibility).
+    By default everything is bundled into one JavaScript file by `base64` encoding the WebAssembly code. For production
+    builds you can add `--disable_single_file` which will reduce total size by writing the WebAssembly code
+    to a dedicated `.wasm` file which the generated JavaScript file will automatically load.

    For example, to build in `build_js` directory:
    @code{.bash}
@ -82,16 +86,6 @@ Building OpenCV.js from Source
    @note
    It requires `python` and `cmake` installed in your development environment.

-#  The build script builds asm.js version by default. To build WebAssembly version, append `--build_wasm` switch.
-    By default everything is bundled into one JavaScript file by `base64` encoding the WebAssembly code. For production
-    builds you can add `--disable_single_file` which will reduce total size by writing the WebAssembly code
-    to a dedicated `.wasm` file which the generated JavaScript file will automatically load.
-
-    For example, to build wasm version in `build_wasm` directory:
-    @code{.bash}
-    emcmake python ./opencv/platforms/js/build_js.py build_wasm --build_wasm
-    @endcode
-
 -#  [Optional] To build the OpenCV.js loader, append `--build_loader`.

    For example:
--- a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
+++ b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
@ -63,13 +63,16 @@ Example for asynchronous loading
 ### Use OpenCV.js

 Once `opencv.js` is ready, you can access OpenCV objects and functions through `cv` object.
+The promise-typed `cv` object should be unwrapped with the `await` operator.
+See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/await .

 For example, you can create a cv.Mat from an image by cv.imread.

@note Because image loading is asynchronous, you need to put cv.Mat creation inside the `onload` callback.

@code{.js}
-imgElement.onload = function() {
+imgElement.onload = async function() {
+  cv = (cv instanceof Promise) ? await cv : cv;
  let mat = cv.imread(imgElement);
 }
@endcode
@ -116,7 +119,8 @@ inputElement.addEventListener('change', (e) => {
  imgElement.src = URL.createObjectURL(e.target.files[0]);
 }, false);

-imgElement.onload = function() {
+imgElement.onload = async function() {
+  cv = (cv instanceof Promise) ? await cv : cv;
  let mat = cv.imread(imgElement);
  cv.imshow('canvasOutput', mat);
  mat.delete();
--- a/doc/py_tutorials/py_imgproc/py_watershed/py_watershed.markdown
+++ b/doc/py_tutorials/py_imgproc/py_watershed/py_watershed.markdown
@ -18,7 +18,7 @@ nearby, water from different valleys, obviously with different colors will start
 that, you build barriers in the locations where water merges. You continue the work of filling water
 and building barriers until all the peaks are under water. Then the barriers you created gives you
 the segmentation result. This is the "philosophy" behind the watershed. You can visit the [CMM
-webpage on watershed](http://cmm.ensmp.fr/~beucher/wtshed.html) to understand it with the help of
+webpage on watershed](https://people.cmm.minesparis.psl.eu/users/beucher/wtshed.html) to understand it with the help of
 some animations.

 But this approach gives you oversegmented result due to noise or any other irregularities in the
@ -140,7 +140,7 @@ some, they are not.
 Additional Resources
 --------------------

-#  CMM page on [Watershed Transformation](http://cmm.ensmp.fr/~beucher/wtshed.html)
+-#  CMM page on [Watershed Transformation](https://people.cmm.minesparis.psl.eu/users/beucher/wtshed.html)

 Exercises
 ---------
--- a/doc/tutorials/app/orbbec_uvc.markdown
+++ b/doc/tutorials/app/orbbec_uvc.markdown
@ -123,4 +123,5 @@ This tutorial code's is shown lines below. You can also download it from
 ![BGR And DEPTH And DepthToColor frame](images/orbbec_uvc_cpp.jpg)

 ### Note
-Mac users need `sudo` privileges to execute the code.
+ - Mac users need `sudo` privileges to execute the code.
+ - **Firmware**: If you’re using an Orbbec UVC 3D camera, please ensure your camera’s firmware is updated to the latest version to avoid potential compatibility issues. For more details, see [Orbbec’s Release Notes](https://github.com/orbbec/OrbbecSDK_v2/releases).
--- a/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown
+++ b/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown
@ -129,7 +129,7 @@ Explanation
 -#  **Find the pattern in the current input**

    The formation of the equations I mentioned above aims
-    to finding major patterns in the input: in case of the chessboard this are corners of the
+    to finding major patterns in the input: in case of the chessboard these are corners of the
    squares and for the circles, well, the circles themselves. ChArUco board is equivalent to
    chessboard, but corners are matched by ArUco markers. The position of these will form the
    result which will be written into the *pointBuf* vector.
@ -140,7 +140,7 @@ Explanation
    of the patterns. cv::findChessboardCorners and cv::findCirclesGrid return a boolean variable
    which states if the pattern was found in the input (we only need to take into account
    those images where this is true!). `CharucoDetector::detectBoard` may detect partially visible
-    pattern and returns coordunates and ids of visible inner corners.
+    pattern and returns coordinates and ids of visible inner corners.

    @note Board size and amount of matched points is different for chessboard, circles grid and ChArUco.
    All chessboard related algorithm expects amount of inner corners as board width and height.
--- a/doc/tutorials/calib3d/camera_calibration_pattern/camera_calibration_pattern.markdown
+++ b/doc/tutorials/calib3d/camera_calibration_pattern/camera_calibration_pattern.markdown
@ -11,7 +11,7 @@ Create calibration pattern {#tutorial_camera_calibration_pattern}
 | Compatibility | OpenCV >= 3.0 |


-The goal of this tutorial is to learn how to create calibration pattern.
+The goal of this tutorial is to learn how to create a calibration pattern.

 You can find a chessboard pattern in https://github.com/opencv/opencv/blob/5.x/doc/pattern.png

@ -47,14 +47,14 @@ create a ChAruco board pattern in charuco_board.svg with 7 rows, 5 columns, squa

        python gen_pattern.py -o charuco_board.svg --rows 7 --columns 5 -T charuco_board --square_size 30 --marker_size 15 -f DICT_5X5_100.json.gz

-If you want to change unit use -u option (mm inches, px, m)
+If you want to change the measurement units, use the -u option (e.g. mm, inches, px, m)

-If you want to change page size use -w and -h options
+If you want to change the page size, use the -w (width) and -h (height) options

-If you want to use your own dictionary for ChAruco board your should write name of file with your dictionary. For example
+If you want to use your own dictionary for the ChAruco board, specify the name of your dictionary file. For example

        python gen_pattern.py -o charuco_board.svg --rows 7 --columns 5 -T charuco_board -f my_dictionary.json

-You can generate your dictionary in my_dictionary.json file with number of markers 30 and markers size 5 bits by using opencv/samples/cpp/aruco_dict_utils.cpp.
+You can generate your dictionary in the file my_dictionary.json with 30 markers and a marker size of 5 bits using the utility provided in opencv/samples/cpp/aruco_dict_utils.cpp.

        bin/example_cpp_aruco_dict_utils.exe my_dict.json -nMarkers=30 -markerSize=5
--- a/doc/tutorials/calib3d/camera_calibration_square_chess/camera_calibration_square_chess.markdown
+++ b/doc/tutorials/calib3d/camera_calibration_square_chess/camera_calibration_square_chess.markdown
@ -63,4 +63,9 @@ image.
    opencv/samples/cpp/calibration.cpp, function computeReprojectionErrors).

 Question: how would you calculate distance from the camera origin to any one of the corners?
-Answer: As our image lies in a 3D space, firstly we would calculate the relative camera pose. This would give us 3D to 2D correspondences. Next, we can apply a simple L2 norm to calculate distance between any point (end point for corners).
+Answer: After obtaining the camera pose using solvePnP, the rotation (rvec) and translation (tvec) vectors define the transformation between the world (chessboard) coordinates and the camera coordinate system. To calculate the distance from the camera’s origin to any chessboard corner, first transform the 3D point from the chessboard coordinate system to the camera coordinate system (if not already done) and then compute its Euclidean distance using the L2 norm, for example:
+
+        // assuming 'point' is the 3D position of a chessboard corner in the camera coordinate system
+        double distance = norm(point);
+
+This is equivalent to applying the L2 norm on the 3D point’s coordinates (x, y, z).
--- a/doc/tutorials/introduction/config_reference/config_reference.markdown
+++ b/doc/tutorials/introduction/config_reference/config_reference.markdown
@ -64,6 +64,21 @@ Only 0- and 1-level deep module locations are supported, following command will
 cmake -DOPENCV_EXTRA_MODULES_PATH=../opencv_contrib ../opencv
 ```

+## Build with C++ Standard setting {#tutorial_config_reference_general_cxx_standard}
+
+`CMAKE_CXX_STANDARD` option can be used to set C++ standard settings for OpenCV building.
+
+```.sh
+cmake -DCMAKE_CXX_STANDARD=17 ../opencv
+cmake --build .
+```
+
+- C++11 is default/required/recommended for OpenCV 4.x. C++17 is default/required/recommended for OpenCV 5.x.
+- If your compiler does not support the required C++ Standard features, OpenCV configuration will fail.
+- If you set an older C++ Standard than required, OpenCV configuration will fail.
+  As a workaround, the `OPENCV_SKIP_CMAKE_CXX_STANDARD` option can be used to skip the `CMAKE_CXX_STANDARD` version check.
+- If you set a newer C++ Standard than recommended, numerous warnings may appear or the OpenCV build may fail.
+

 ## Debug build {#tutorial_config_reference_general_debug}

--- a/3rdparty/carotene/.gitignore
+++ b/3rdparty/carotene/.gitignore
--- a/3rdparty/carotene/CMakeLists.txt
+++ b/3rdparty/carotene/CMakeLists.txt
--- a/3rdparty/carotene/README.md
+++ b/3rdparty/carotene/README.md
--- a/3rdparty/carotene/hal/CMakeLists.txt
+++ b/3rdparty/carotene/hal/CMakeLists.txt
--- a/3rdparty/carotene/hal/dummy.cpp
+++ b/3rdparty/carotene/hal/dummy.cpp
--- a/3rdparty/carotene/hal/tegra_hal.hpp
+++ b/3rdparty/carotene/hal/tegra_hal.hpp
--- a/3rdparty/carotene/include/carotene/definitions.hpp
+++ b/3rdparty/carotene/include/carotene/definitions.hpp
--- a/3rdparty/carotene/include/carotene/functions.hpp
+++ b/3rdparty/carotene/include/carotene/functions.hpp
@ -359,7 +359,7 @@ namespace CAROTENE_NS {

    /*
        For each point `p` within `size`, do:
-        dst[p] = src0[p] * scale / src1[p] 
+        dst[p] = src0[p] * scale / src1[p]

        NOTE: ROUND_TO_ZERO convert policy is used
    */
@ -420,7 +420,7 @@ namespace CAROTENE_NS {

    /*
        For each point `p` within `size`, do:
-        dst[p] = scale / src[p] 
+        dst[p] = scale / src[p]

        NOTE: ROUND_TO_ZERO convert policy is used
    */
--- a/3rdparty/carotene/include/carotene/types.hpp
+++ b/3rdparty/carotene/include/carotene/types.hpp
--- a/3rdparty/carotene/src/absdiff.cpp
+++ b/3rdparty/carotene/src/absdiff.cpp
--- a/3rdparty/carotene/src/accumulate.cpp
+++ b/3rdparty/carotene/src/accumulate.cpp
--- a/3rdparty/carotene/src/add.cpp
+++ b/3rdparty/carotene/src/add.cpp
--- a/3rdparty/carotene/src/add_weighted.cpp
+++ b/3rdparty/carotene/src/add_weighted.cpp
--- a/3rdparty/carotene/src/bitwise.cpp
+++ b/3rdparty/carotene/src/bitwise.cpp
--- a/3rdparty/carotene/src/blur.cpp
+++ b/3rdparty/carotene/src/blur.cpp
--- a/3rdparty/carotene/src/canny.cpp
+++ b/3rdparty/carotene/src/canny.cpp
--- a/3rdparty/carotene/src/channel_extract.cpp
+++ b/3rdparty/carotene/src/channel_extract.cpp
@ -378,7 +378,7 @@ void extract4(const Size2D &size,
                                  vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
                              }

-#endif 
+#endif

 #define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size,                                          \
                                          const sgn##bits * srcBase, ptrdiff_t srcStride,               \
--- a/3rdparty/carotene/src/channels_combine.cpp
+++ b/3rdparty/carotene/src/channels_combine.cpp
--- a/3rdparty/carotene/src/cmp.cpp
+++ b/3rdparty/carotene/src/cmp.cpp
--- a/3rdparty/carotene/src/colorconvert.cpp
+++ b/3rdparty/carotene/src/colorconvert.cpp
--- a/3rdparty/carotene/src/common.cpp
+++ b/3rdparty/carotene/src/common.cpp
--- a/3rdparty/carotene/src/common.hpp
+++ b/3rdparty/carotene/src/common.hpp
--- a/3rdparty/carotene/src/convert.cpp
+++ b/3rdparty/carotene/src/convert.cpp
--- a/3rdparty/carotene/src/convert_depth.cpp
+++ b/3rdparty/carotene/src/convert_depth.cpp
--- a/3rdparty/carotene/src/convert_scale.cpp
+++ b/3rdparty/carotene/src/convert_scale.cpp
--- a/3rdparty/carotene/src/convolution.cpp
+++ b/3rdparty/carotene/src/convolution.cpp
--- a/3rdparty/carotene/src/count_nonzero.cpp
+++ b/3rdparty/carotene/src/count_nonzero.cpp
--- a/3rdparty/carotene/src/div.cpp
+++ b/3rdparty/carotene/src/div.cpp
--- a/3rdparty/carotene/src/dot_product.cpp
+++ b/3rdparty/carotene/src/dot_product.cpp
--- a/3rdparty/carotene/src/dummy.cpp
+++ b/3rdparty/carotene/src/dummy.cpp
--- a/3rdparty/carotene/src/fast.cpp
+++ b/3rdparty/carotene/src/fast.cpp
--- a/3rdparty/carotene/src/fill_minmaxloc.cpp
+++ b/3rdparty/carotene/src/fill_minmaxloc.cpp
--- a/3rdparty/carotene/src/flip.cpp
+++ b/3rdparty/carotene/src/flip.cpp
--- a/3rdparty/carotene/src/gaussian_blur.cpp
+++ b/3rdparty/carotene/src/gaussian_blur.cpp
--- a/3rdparty/carotene/src/in_range.cpp
+++ b/3rdparty/carotene/src/in_range.cpp
--- a/3rdparty/carotene/src/integral.cpp
+++ b/3rdparty/carotene/src/integral.cpp
--- a/3rdparty/carotene/src/intrinsics.hpp
+++ b/3rdparty/carotene/src/intrinsics.hpp
--- a/3rdparty/carotene/src/laplacian.cpp
+++ b/3rdparty/carotene/src/laplacian.cpp
--- a/3rdparty/carotene/src/magnitude.cpp
+++ b/3rdparty/carotene/src/magnitude.cpp
--- a/3rdparty/carotene/src/meanstddev.cpp
+++ b/3rdparty/carotene/src/meanstddev.cpp
--- a/3rdparty/carotene/src/median_filter.cpp
+++ b/3rdparty/carotene/src/median_filter.cpp
--- a/3rdparty/carotene/src/min_max.cpp
+++ b/3rdparty/carotene/src/min_max.cpp
--- a/3rdparty/carotene/src/minmaxloc.cpp
+++ b/3rdparty/carotene/src/minmaxloc.cpp
--- a/3rdparty/carotene/src/morph.cpp
+++ b/3rdparty/carotene/src/morph.cpp
--- a/3rdparty/carotene/src/mul.cpp
+++ b/3rdparty/carotene/src/mul.cpp
--- a/3rdparty/carotene/src/norm.cpp
+++ b/3rdparty/carotene/src/norm.cpp
--- a/3rdparty/carotene/src/opticalflow.cpp
+++ b/3rdparty/carotene/src/opticalflow.cpp
--- a/3rdparty/carotene/src/phase.cpp
+++ b/3rdparty/carotene/src/phase.cpp
--- a/3rdparty/carotene/src/pyramid.cpp
+++ b/3rdparty/carotene/src/pyramid.cpp
--- a/3rdparty/carotene/src/reduce.cpp
+++ b/3rdparty/carotene/src/reduce.cpp
--- a/3rdparty/carotene/src/remap.cpp
+++ b/3rdparty/carotene/src/remap.cpp
--- a/3rdparty/carotene/src/remap.hpp
+++ b/3rdparty/carotene/src/remap.hpp
--- a/3rdparty/carotene/src/resize.cpp
+++ b/3rdparty/carotene/src/resize.cpp
--- a/3rdparty/carotene/src/saturate_cast.hpp
+++ b/3rdparty/carotene/src/saturate_cast.hpp
--- a/3rdparty/carotene/src/scharr.cpp
+++ b/3rdparty/carotene/src/scharr.cpp
--- a/3rdparty/carotene/src/separable_filter.cpp
+++ b/3rdparty/carotene/src/separable_filter.cpp
--- a/3rdparty/carotene/src/separable_filter.hpp
+++ b/3rdparty/carotene/src/separable_filter.hpp
--- a/3rdparty/carotene/src/sobel.cpp
+++ b/3rdparty/carotene/src/sobel.cpp
--- a/3rdparty/carotene/src/sub.cpp
+++ b/3rdparty/carotene/src/sub.cpp
--- a/3rdparty/carotene/src/sum.cpp
+++ b/3rdparty/carotene/src/sum.cpp
--- a/3rdparty/carotene/src/template_matching.cpp
+++ b/3rdparty/carotene/src/template_matching.cpp
--- a/3rdparty/carotene/src/threshold.cpp
+++ b/3rdparty/carotene/src/threshold.cpp
--- a/3rdparty/carotene/src/vround_helper.hpp
+++ b/3rdparty/carotene/src/vround_helper.hpp
--- a/3rdparty/carotene/src/vtransform.hpp
+++ b/3rdparty/carotene/src/vtransform.hpp
--- a/3rdparty/carotene/src/warp_affine.cpp
+++ b/3rdparty/carotene/src/warp_affine.cpp
--- a/3rdparty/carotene/src/warp_perspective.cpp
+++ b/3rdparty/carotene/src/warp_perspective.cpp
--- a/3rdparty/fastcv/CMakeLists.txt
+++ b/3rdparty/fastcv/CMakeLists.txt
@ -9,7 +9,7 @@ if(HAVE_FASTCV)

  file(GLOB FASTCV_HAL_FILES    "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")

-  add_library(fastcv_hal STATIC ${FASTCV_HAL_FILES})
+  add_library(fastcv_hal STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} ${FASTCV_HAL_FILES})

  target_include_directories(fastcv_hal PRIVATE
    ${CMAKE_SOURCE_DIR}/modules/core/include
--- a/3rdparty/fastcv/include/fastcv_hal_core.hpp
+++ b/3rdparty/fastcv/include/fastcv_hal_core.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */

@ -32,6 +32,10 @@
 #define cv_hal_mul16s               fastcv_hal_mul16s
 #undef  cv_hal_mul32f
 #define cv_hal_mul32f               fastcv_hal_mul32f
+#undef  cv_hal_SVD32f
+#define cv_hal_SVD32f               fastcv_hal_SVD32f
+#undef  cv_hal_gemm32f
+#define cv_hal_gemm32f              fastcv_hal_gemm32f

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /// @brief look-up table transform of an array.
@ -219,4 +223,48 @@ int fastcv_hal_mul32f(
    int             height,
    double          scale);

+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Performs singular value decomposition of \f$M\times N\f$(\f$M>N\f$) matrix \f$A = U*\Sigma*V^T\f$.
+///
+/// @param src      Pointer to input MxN matrix A stored in column major order.
+///                 On completion src is either filled with rows of U or left unmodified (depending on flag CV_HAL_SVD_MODIFY_A).
+/// @param src_step Number of bytes between two consecutive columns of matrix A.
+/// @param w        Pointer to array for singular values of matrix A (i. e. first N diagonal elements of matrix \f$\Sigma\f$).
+/// @param u        Pointer to output MxN or MxM matrix U (size depends on flags).
+///                 Pointer must be valid if flag CV_HAL_SVD_MODIFY_A is not used.
+/// @param u_step   Number of bytes between two consecutive rows of matrix U.
+/// @param vt       Pointer to array for NxN matrix V^T.
+/// @param vt_step  Number of bytes between two consecutive rows of matrix V^T.
+/// @param m        Number of rows in matrix A.
+/// @param n        Number of columns in matrix A.
+/// @param flags    Algorithm options (combination of CV_HAL_SVD_FULL_UV, ...).
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_SVD32f(
+    float* src,
+    size_t src_step,
+    float* w,
+    float* u,
+    size_t u_step,
+    float* vt,
+    size_t vt_step,
+    int    m,
+    int    n,
+    int    flags);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Generalized matrix multiplication for 32-bit float matrices:
+/// dst = alpha*op(src1)*op(src2) + beta*op(src3), where each op() is either the identity or a transpose
+/// selected through flags.
+///
+/// @param src1      Pointer to the first input matrix.
+/// @param src1_step Number of bytes between two consecutive rows of src1.
+/// @param src2      Pointer to the second input matrix.
+/// @param src2_step Number of bytes between two consecutive rows of src2.
+/// @param alpha     Scale factor applied to the matrix product.
+/// @param src3      Pointer to the optional third input matrix (may be NULL).
+/// @param src3_step Number of bytes between two consecutive rows of src3.
+/// @param beta      Scale factor applied to src3.
+/// @param dst       Pointer to the output matrix; it may alias one of the inputs.
+/// @param dst_step  Number of bytes between two consecutive rows of dst.
+/// @param m         Number of rows of src1 as stored (before any transpose).
+/// @param n         Number of columns of src1 as stored (before any transpose).
+/// @param k         Number of columns of dst.
+/// @param flags     Combination of cv::GEMM_1_T, cv::GEMM_2_T and cv::GEMM_3_T.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int fastcv_hal_gemm32f(
+    const float*    src1,
+    size_t          src1_step,
+    const float*    src2,
+    size_t          src2_step,
+    float           alpha,
+    const float*    src3,
+    size_t          src3_step,
+    float           beta,
+    float*          dst,
+    size_t          dst_step,
+    int             m,
+    int             n,
+    int             k,
+    int             flags);
+
 #endif
--- a/3rdparty/fastcv/include/fastcv_hal_imgproc.hpp
+++ b/3rdparty/fastcv/include/fastcv_hal_imgproc.hpp
--- a/3rdparty/fastcv/include/fastcv_hal_utils.hpp
+++ b/3rdparty/fastcv/include/fastcv_hal_utils.hpp
--- a/3rdparty/fastcv/src/fastcv_hal_core.cpp
+++ b/3rdparty/fastcv/src/fastcv_hal_core.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */

@ -399,7 +399,7 @@ int fastcv_hal_mul8u(
    int8_t sF;

    if(FCV_CMP_EQ(scale,1.0))              { sF =  0; }
-    else if(scale > 1.0)                    
+    else if(scale > 1.0)
    {
        if(FCV_CMP_EQ(scale,2.0))          { sF = -1; }
        else if(FCV_CMP_EQ(scale,4.0))     { sF = -2; }
@ -471,7 +471,7 @@ int fastcv_hal_mul16s(
    int8_t sF;

    if(FCV_CMP_EQ(scale,1.0))              { sF =  0; }
-    else if(scale > 1.0)                    
+    else if(scale > 1.0)
    {
        if(FCV_CMP_EQ(scale,2.0))          { sF = -1; }
        else if(FCV_CMP_EQ(scale,4.0))     { sF = -2; }
@ -571,4 +571,170 @@ int fastcv_hal_mul32f(

    fcvStatus status = FASTCV_SUCCESS;
    CV_HAL_RETURN(status, hal_mul32f);
-}
+}
+
+// FastCV-backed implementation of the SVD HAL hook for 32-bit float matrices.
+//
+// Only tightly packed buffers are accepted: src_step must equal n*sizeof(float), and for the
+// SHORT_UV / FULL_UV modes u_step and vt_step must equal n*sizeof(float) as well. Any other
+// layout or flag value is reported as "not implemented" through CV_HAL_RETURN_NOT_IMPLEMENTED.
+//
+// All argument validation happens before the scratch matrices are allocated, so rejected
+// requests do not pay for two temporary cv::Mat allocations.
+int fastcv_hal_SVD32f(
+    float*  src,
+    size_t  src_step,
+    float*  w,
+    float*  u,
+    size_t  u_step,
+    float*  vt,
+    size_t  vt_step,
+    int     m,
+    int     n,
+    int     flags)
+{
+    // Byte length of one packed row of n floats; every step check compares against it.
+    const size_t packedStep = n * sizeof(float);
+
+    if (packedStep != src_step)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("step is not supported");
+
+    INITIALIZATION_CHECK;
+
+    // The three supported modes call fcvSVDf32_v2 identically except for its last argument,
+    // which is true only for CV_HAL_SVD_FULL_UV.
+    bool fullUV = false;
+
+    switch (flags)
+    {
+        case CV_HAL_SVD_NO_UV:
+            // u_step / vt_step are not checked in this mode.
+            break;
+        case CV_HAL_SVD_SHORT_UV:
+        case CV_HAL_SVD_FULL_UV:
+        {
+            // NOTE(review): for CV_HAL_SVD_FULL_UV this still requires rows of U to be
+            // n floats wide - confirm this restriction is intended.
+            if ((packedStep != u_step) || (packedStep != vt_step))
+                CV_HAL_RETURN_NOT_IMPLEMENTED("step is not supported");
+            fullUV = (flags == CV_HAL_SVD_FULL_UV);
+            break;
+        }
+        default:
+            CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Flags:%d is not supported", flags));
+    }
+
+    // Scratch buffers handed to fcvSVDf32_v2.
+    cv::Mat tmpU(m, n, CV_32F);
+    cv::Mat tmpV(n, n, CV_32F);
+
+    fcvStatus status = fcvSVDf32_v2(src, m, n, w, u, vt, (float32_t *)tmpU.data, (float32_t *)tmpV.data, fullUV);
+
+    CV_HAL_RETURN(status, fastcv_hal_SVD32f);
+}
+
+// FastCV-backed implementation of the GEMM HAL hook for 32-bit float matrices:
+//     dst = alpha*op(src1)*op(src2) + beta*op(src3)
+// where op() is a transpose when the matching cv::GEMM_x_T bit is set in flags.
+//
+// src1 is m x n as stored, dst has k columns. All *_step values are row strides in bytes.
+// dst may alias any of the inputs; in that case the result is built in a temporary matrix
+// and copied back at the end.
+//
+// alpha == 0 is reported as "not implemented" so the caller can fall back to another
+// implementation: without a product nothing would initialise dst before the optional
+// beta*src3 term is accumulated into it.
+int fastcv_hal_gemm32f(
+    const float*    src1,
+    size_t          src1_step,
+    const float*    src2,
+    size_t          src2_step,
+    float           alpha,
+    const float*    src3,
+    size_t          src3_step,
+    float           beta,
+    float*          dst,
+    size_t          dst_step,
+    int             m,
+    int             n,
+    int             k,
+    int             flags)
+{
+    cv::Mat src1_t, src2_t, src3_t, dst_temp1;
+    int height_a = m, width_a = n, width_d = k;
+    const float *src1p = src1, *src2p = src2, *src3p = src3;
+
+    INITIALIZATION_CHECK;
+
+    // Every code path below relies on the matrix product having written dst first.
+    if(alpha == 0.0)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("alpha == 0 is not supported");
+
+    const bool transposeBoth = (flags & (cv::GEMM_1_T)) && (flags & (cv::GEMM_2_T));
+
+    if(transposeBoth)
+    {
+        // A^T * B^T is evaluated later as (B * A)^T, so no input is transposed here;
+        // only the logical size of op(src1) is swapped.
+        height_a = n; width_a = m;
+    }
+    else if(flags & (cv::GEMM_1_T))
+    {
+        // Materialise src1^T (n x m) in a temporary and use it as the left operand.
+        src1_t = cv::Mat(width_a, height_a, CV_32FC1);
+        fcvTransposef32_v2(src1, width_a, height_a, src1_step, src1_t.ptr<float>(), src1_t.step[0]);
+        src1p = src1_t.ptr<float>();
+        src1_step = src1_t.step[0];
+        height_a = n; width_a = m;
+    }
+    else if(flags & (cv::GEMM_2_T))
+    {
+        // Materialise src2^T (n x k) in a temporary and use it as the right operand.
+        src2_t = cv::Mat(width_a, width_d, CV_32FC1);
+        fcvTransposef32_v2(src2, width_a, width_d, src2_step, src2_t.ptr<float>(), src2_t.step[0]);
+        src2p = src2_t.ptr<float>();
+        src2_step = src2_t.step[0];
+    }
+
+    if((flags & cv::GEMM_3_T) && beta != 0.0 && src3 != NULL)
+    {
+        // src3 is only transposed when it will actually contribute to the result.
+        src3_t = cv::Mat(height_a, width_d, CV_32FC1);
+        fcvTransposef32_v2(src3, height_a, width_d, src3_step, src3_t.ptr<float>(), src3_t.step[0]);
+        src3p = src3_t.ptr<float>();
+        src3_step = src3_t.step[0];
+    }
+
+    bool inplace = false;
+    size_t dst_stride;
+    float *dstp = NULL;
+
+    if(src1 == dst || src2 == dst || src3 == dst)
+    {
+        // dst aliases an input: compute into a temporary and copy back at the end.
+        dst_temp1 = cv::Mat(height_a, width_d, CV_32FC1);
+        dstp = dst_temp1.ptr<float>();
+        inplace = true;
+        dst_stride = dst_temp1.step[0];
+    }
+    else
+    {
+        dstp = dst;
+        dst_stride = dst_step;
+    }
+
+    fcvStatus status = FASTCV_SUCCESS;
+
+    if(transposeBoth)
+    {
+        // (src2 * src1) is k x n; its transpose is the requested n x k result.
+        cv::Mat dst_temp2 = cv::Mat(k, n, CV_32FC1);
+        status = fcvMatrixMultiplyf32_v2(src2p, m, k, src2_step, src1p, n, src1_step,
+                                         dst_temp2.ptr<float>(), dst_temp2.step[0]);
+        if(status == FASTCV_SUCCESS)
+            fcvTransposef32_v2(dst_temp2.ptr<float>(), n, k, dst_temp2.step[0], dstp, dst_stride);
+    }
+    else
+    {
+        status = fcvMatrixMultiplyf32_v2(src1p, width_a, height_a, src1_step, src2p, width_d,
+                                            src2_step, dstp, dst_stride);
+    }
+
+    // Scale the product in place; skipped when alpha is exactly 1.
+    if(alpha != 1.0 && status == FASTCV_SUCCESS)
+    {
+        status = fcvMultiplyScalarf32(dstp, width_d, height_a, dst_stride, alpha, dstp, dst_stride);
+    }
+
+    if(src3 != NULL && beta != 0.0 && status == FASTCV_SUCCESS)
+    {
+        cv::Mat dst3 = cv::Mat(height_a, width_d, CV_32FC1);
+        if(beta != 1.0)
+        {
+            // dst3 = beta * op(src3), then accumulate it into the product.
+            status = fcvMultiplyScalarf32(src3p, width_d, height_a, src3_step, beta, (float32_t*)dst3.data, dst3.step);
+            if(status == FASTCV_SUCCESS)
+                fcvAddf32_v2(dstp, width_d, height_a, dst_stride, (float32_t*)dst3.data, dst3.step, dstp, dst_stride);
+        }
+        else
+            fcvAddf32_v2(dstp, width_d, height_a, dst_stride, src3p, src3_step, dstp, dst_stride);
+    }
+
+    // Publish the temporary result only on success, so a failed call does not
+    // overwrite the input that dst aliases.
+    if(inplace && status == FASTCV_SUCCESS)
+    {
+        cv::Mat dst_mat = cv::Mat(height_a, width_d, CV_32FC1, (void*)dst, dst_step);
+        dst_temp1.copyTo(dst_mat);
+    }
+
+    CV_HAL_RETURN(status,hal_gemm32f);
+}
--- a/3rdparty/fastcv/src/fastcv_hal_imgproc.cpp
+++ b/3rdparty/fastcv/src/fastcv_hal_imgproc.cpp
@ -314,6 +314,69 @@ int fastcv_hal_sobel(
    CV_HAL_RETURN(status, hal_sobel);
 }

+// Parallel loop body that box-filters a band of rows of a single-channel image with FastCV.
+// The image is split into nStripes horizontal stripes of stripeHeight rows; a cv::Range of
+// stripe indices selects the band handled by one invocation. Each band is copied into a
+// scratch image padded by knl/2 pixels on every side, filtered, and the un-padded interior
+// is copied into the matching rows of dst.
+class FcvBoxLoop_Invoker : public cv::ParallelLoopBody
+{
+public:
+
+    // src_/dst_: whole input/output images; width_/height_: image size in pixels;
+    // bdr_: border mode forwarded to cv::copyMakeBorder; knl_: square kernel size;
+    // normalize_: forwarded to the 8-bit FastCV box filters; depth_: Mat type used for the
+    // scratch buffers (CV_32F selects the float path, anything else the 8-bit path).
+    FcvBoxLoop_Invoker(cv::Mat src_, int width_, int height_, cv::Mat dst_, int bdr_, int knl_, int normalize_, int stripeHeight_, int nStripes_, int depth_) :
+        cv::ParallelLoopBody(), src(src_), width(width_), height(height_), dst(dst_), bdr(bdr_), knl(knl_), normalize(normalize_), stripeHeight(stripeHeight_), nStripes(nStripes_), depth(depth_)
+    {
+    }
+
+    // Filters the stripes [range.start, range.end).
+    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        int height_ = stripeHeight * (range.end - range.start);
+        int width_  = width;
+        cv::Mat src_;
+        // Kernel radius; also the padding added on each side of the band.
+        int n = knl/2;
+
+        // The last band also takes the rows left over when height is not a multiple of stripeHeight.
+        if(range.end == nStripes)
+            height_ += (height - range.end * stripeHeight);
+
+        src_ = cv::Mat(height_ + 2*n, width_ + 2*n, depth);
+
+        // Build the padded band. Left/right padding always comes from copyMakeBorder; top/bottom
+        // padding comes from copyMakeBorder only at the image edge, otherwise the n neighbouring
+        // source rows are included in the copied rectangle instead.
+        if(range.start == 0 && range.end == nStripes)
+            cv::copyMakeBorder(src(cv::Rect(0, 0, width_, height_)), src_, n, n, n, n, bdr);
+        else if(range.start == 0)
+            cv::copyMakeBorder(src(cv::Rect(0, 0, width_, height_ + n)), src_, n, 0, n, n, bdr);
+        else if(range.end == nStripes)
+            cv::copyMakeBorder(src(cv::Rect(0, range.start * stripeHeight - n, width_, height_ + n)), src_, 0, n, n, n, bdr);
+        else
+            cv::copyMakeBorder(src(cv::Rect(0, range.start * stripeHeight - n, width_, height_ + 2*n)), src_, 0, 0, n, n, bdr);
+
+        // Filter the whole padded band into a scratch output of the same size.
+        cv::Mat dst_padded = cv::Mat(height_ + 2*n, width_ + 2*n, depth);
+        if(depth == CV_32F)
+            fcvBoxFilterNxNf32((float*)src_.data, width_ + 2*n, height_ + 2*n, (width_ + 2*n)*sizeof(float),
+                                       knl, (float*)dst_padded.data, dst_padded.step[0]);
+        else
+        {
+            // 8-bit path: knl == 3 selects the 3x3 filter, any other value the 5x5 filter.
+            auto func = knl == 3 ? fcvBoxFilter3x3u8_v3 : fcvBoxFilter5x5u8_v2;
+
+            // NOTE(review): FASTCV_BORDER_UNDEFINED is presumably acceptable because the border
+            // pixels were already materialised by copyMakeBorder above - confirm.
+            func(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n,
+                            dst_padded.data, dst_padded.step[0], normalize, FASTCV_BORDER_UNDEFINED, 0);
+        }
+        // Drop the padding and write the band into its rows of the destination image.
+        int start_val = stripeHeight * range.start;
+        cv::Mat dst_temp1 = dst_padded(cv::Rect(n, n, width_, height_));
+        cv::Mat dst_temp2 = dst(cv::Rect(0, start_val, width_, height_));
+        dst_temp1.copyTo(dst_temp2);
+    }
+
+private:
+    cv::Mat src;             // whole input image
+    const int width;         // image width in pixels
+    const int height;        // image height in pixels
+    cv::Mat dst;             // whole output image
+    const int bdr;           // border mode passed to cv::copyMakeBorder
+    const int knl;           // square kernel size
+    const int normalize;     // forwarded to the 8-bit FastCV box filters
+    const int stripeHeight;  // rows per stripe
+    const int nStripes;      // total number of stripes
+    int depth;               // Mat type of the scratch buffers
+
+    // Declared but not defined: copying is intentionally unavailable.
+    FcvBoxLoop_Invoker(const FcvBoxLoop_Invoker &);  // = delete;
+    const FcvBoxLoop_Invoker& operator= (const FcvBoxLoop_Invoker &);  // = delete;
+};
+
 int fastcv_hal_boxFilter(
    const uchar*     src_data,
    size_t           src_step,
@ -335,15 +398,7 @@ int fastcv_hal_boxFilter(
    bool             normalize,
    int              border_type)
 {
-    if((width*height) < (320*240))
-    {
-        CV_HAL_RETURN_NOT_IMPLEMENTED("input size not supported");
-    }
-    else if(src_data == dst_data)
-    {
-        CV_HAL_RETURN_NOT_IMPLEMENTED("in-place processing not supported");
-    }
-    else if(src_depth != CV_8U || cn != 1)
+    if((src_depth != CV_8U && src_depth != CV_32F) || cn != 1)
    {
        CV_HAL_RETURN_NOT_IMPLEMENTED("src type not supported");
    }
@ -351,8 +406,7 @@ int fastcv_hal_boxFilter(
    {
        CV_HAL_RETURN_NOT_IMPLEMENTED("same src and dst type supported");
    }
-    else if(ksize_width != ksize_height ||
-           (ksize_width != 3 && ksize_width != 5))
+    else if(ksize_width != ksize_height)
    {
        CV_HAL_RETURN_NOT_IMPLEMENTED("kernel size not supported");
    }
@ -363,37 +417,52 @@ int fastcv_hal_boxFilter(
        CV_HAL_RETURN_NOT_IMPLEMENTED("ROI not supported");
    }

+    if(src_depth == CV_32F && normalize != 1)
+        CV_HAL_RETURN_NOT_IMPLEMENTED("normalized kernel supported for float types");
+
+    if(src_depth == CV_32F && (height < 5 || width < 5 || ksize_height < 5))
+        CV_HAL_RETURN_NOT_IMPLEMENTED("size not supported");
+
+    if(src_depth == CV_8U && (ksize_width != 3 && ksize_width != 5))
+        CV_HAL_RETURN_NOT_IMPLEMENTED("kernel size not supported");
+
    INITIALIZATION_CHECK;

-    fcvBorderType bdr;
-    uint8_t bdrVal = 0;
-    switch(border_type)
+    cv::Mat dst_temp;
+    bool inPlace = src_data == dst_data ? true : false ;
+
+    int nThreads = cv::getNumThreads();
+
+    cv::Mat src = cv::Mat(height, width, src_depth, (void*)src_data, src_step);
+
+    if(inPlace)
+        dst_temp = cv::Mat(height, width, src_depth);
+    else
+        dst_temp = cv::Mat(height, width, src_depth, (void*)dst_data, dst_step);
+
+    int nStripes, stripeHeight = src.rows/nThreads;
+
+    if((size_t)src.rows < ksize_height || stripeHeight < 5 || nThreads <= 1)
    {
-        case cv::BORDER_REPLICATE:
-            bdr = FASTCV_BORDER_REPLICATE;
-            break;
-        case cv::BORDER_REFLECT:
-            bdr = FASTCV_BORDER_REFLECT;
-            break;
-        case cv::BORDER_REFLECT101:    // cv::BORDER_REFLECT_101, BORDER_DEFAULT
-            bdr = FASTCV_BORDER_REFLECT_V2;
-            break;
-        default:
-            CV_HAL_RETURN_NOT_IMPLEMENTED("border type not supported");
+        nStripes = 1;
+        stripeHeight = src.rows;
+    }
+    else
+    {
+        nStripes = nThreads;
+        stripeHeight = src.rows/nThreads;
+    }
+
+    cv::parallel_for_(cv::Range(0, nStripes),
+            FcvBoxLoop_Invoker(src, width, height, dst_temp, border_type, ksize_width, normalize, stripeHeight, nStripes, src_depth), nStripes);
+
+    if(inPlace)
+    {
+        cv::Mat dst = cv::Mat(height, width, src_depth, (void*)dst_data, dst_step);
+        dst_temp.copyTo(dst);
    }

    fcvStatus status = FASTCV_SUCCESS;
-    if(ksize_width == 3)
-    {
-        status = fcvBoxFilter3x3u8_v3(src_data, width, height, src_step,
-                                      dst_data, dst_step, normalize, bdr, bdrVal);
-    }
-    else if(ksize_width == 5)
-    {
-        status = fcvBoxFilter5x5u8_v2(src_data, width, height, src_step,
-                                      dst_data, dst_step, normalize, bdr, bdrVal);
-    }
-
    CV_HAL_RETURN(status,hal_boxFilter);
 }

@ -467,30 +536,88 @@ class FcvGaussianBlurLoop_Invoker : public cv::ParallelLoopBody
 {
    public:

-    FcvGaussianBlurLoop_Invoker(const cv::Mat& _src, cv::Mat& _dst, int _ksize, int _borderType, int _fcvBorderValue) :
-        cv::ParallelLoopBody(), src(_src),dst(_dst), ksize(_ksize), borderType(_borderType), fcvBorderValue(_fcvBorderValue)
+    // Caches the image geometry and kernel/depth selector, and translates the
+    // OpenCV border mode (_borderType) into the FastCV border mode that
+    // operator() passes to the FastCV filter.
+    FcvGaussianBlurLoop_Invoker(const cv::Mat& _src, cv::Mat& _dst, int _ksize, int _borderType) :
+        cv::ParallelLoopBody(), src(_src),dst(_dst), ksize(_ksize), borderType(_borderType)
    {
        width       = src.cols;
        height      = src.rows;
        halfKSize   = ksize / 2;
        fcvFuncType = FCV_MAKETYPE(ksize, src.depth());
+
+        switch (borderType)
+        {
+            case cv::BorderTypes::BORDER_REPLICATE:
+            {
+                fcvBorder = fcvBorderType::FASTCV_BORDER_REPLICATE;
+                break;
+            }
+            // No explicit border value is carried for a constant border; operator() passes 0
+            case cv::BorderTypes::BORDER_CONSTANT:
+            {
+                fcvBorder = fcvBorderType::FASTCV_BORDER_CONSTANT;
+                break;
+            }
+            case cv::BorderTypes::BORDER_REFLECT:
+            {
+                fcvBorder = fcvBorderType::FASTCV_BORDER_REFLECT;
+                break;
+            }
+            case cv::BorderTypes::BORDER_REFLECT_101:
+            {
+                fcvBorder = fcvBorderType::FASTCV_BORDER_REFLECT_V2;
+                break;
+            }
+            // Any other mode has no mapping above; assign a defined value so the
+            // member is never read uninitialized in operator().
+            default:
+            {
+                fcvBorder = fcvBorderType::FASTCV_BORDER_UNDEFINED;
+                break;
+            }
+        }
    }

    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
    {
-        int rangeHeight  = range.end - range.start;
-        int paddedHeight = rangeHeight + halfKSize * 2;
-        int paddedWidth  = width;
+        int topLines     = 0; // extra rows taken from above this stripe
+        int bottomLines  = 0; // extra rows taken from below this stripe
+        int rangeHeight  = range.end-range.start; // rows [range.start, range.end) belong to this stripe
+        int paddedHeight = rangeHeight; // stripe height plus the extra rows added below

-        cv::Mat srcPadded = src(cv::Rect(0, range.start, paddedWidth, paddedHeight));
-        cv::Mat dstPadded = dst(cv::Rect(0, range.start, paddedWidth, paddedHeight));
+        // Extend the stripe by up to halfKSize neighbouring image rows on each side, when such rows exist.
+        if(range.start > 0)
+        {
+            topLines     = MIN(range.start, halfKSize);
+            paddedHeight += topLines;
+        }
+
+        if(range.end < height)
+        {
+            bottomLines  = MIN(height-range.end, halfKSize);
+            paddedHeight += bottomLines;
+        }

        if (fcvFuncType == FCV_MAKETYPE(3,CV_8U))
-            fcvFilterGaussian3x3u8_v4(srcPadded.data, paddedWidth, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step,
-                fcvBorderType::FASTCV_BORDER_UNDEFINED, fcvBorderValue);
+        {
+            cv::Mat srcPadded = src(cv::Rect(0, range.start - topLines, width, paddedHeight)); // view into src, no copy
+            cv::Mat dstPadded = cv::Mat(paddedHeight, width, CV_8UC1); // scratch output for the extended stripe
+            fcvFilterGaussian3x3u8_v4(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step,
+                                      fcvBorder, 0);
+
+            // Copy only this stripe's own rows into dst; the extra rows above/below are discarded
+            cv::Mat temp1 = dstPadded(cv::Rect(0, topLines, width, rangeHeight));
+            cv::Mat temp2 = dst(cv::Rect(0, range.start, width, rangeHeight));
+            temp1.copyTo(temp2);
+        }
        else if (fcvFuncType == FCV_MAKETYPE(5,CV_8U))
-            fcvFilterGaussian5x5u8_v3(srcPadded.data, paddedWidth, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step,
-                fcvBorderType::FASTCV_BORDER_UNDEFINED, fcvBorderValue);
+        {
+            int width_  = width + ksize - 1; // stripe width plus halfKSize columns on each side
+            int height_ = rangeHeight + ksize - 1; // stripe height plus halfKSize rows on each side
+            cv::Mat srcPadded = cv::Mat(height_, width_, CV_8UC1);
+            cv::Mat dstPadded = cv::Mat(height_, width_, CV_8UC1);
+            cv::copyMakeBorder(src(cv::Rect(0, range.start - topLines, width, paddedHeight)), srcPadded,
+                               halfKSize - topLines, halfKSize - bottomLines, halfKSize, halfKSize, borderType); // synthesize only the rows the image could not supply
+            fcvFilterGaussian5x5u8_v3(srcPadded.data, width_, height_, srcPadded.step, dstPadded.data, dstPadded.step,
+                                      fcvBorderType::FASTCV_BORDER_UNDEFINED, 0);
+
+            // Copy only the central width x rangeHeight region into dst; the halfKSize margin is discarded
+            cv::Mat temp1 = dstPadded(cv::Rect(halfKSize, halfKSize, width, rangeHeight));
+            cv::Mat temp2 = dst(cv::Rect(0, range.start, width, rangeHeight));
+            temp1.copyTo(temp2);
+        }
    }

    private:
@ -500,9 +627,9 @@ class FcvGaussianBlurLoop_Invoker : public cv::ParallelLoopBody
    int             height;
    const int       ksize;
    int             halfKSize;
-    int             fcvFuncType;
    int             borderType;
-    int             fcvBorderValue;
+    int             fcvFuncType;
+    fcvBorderType   fcvBorder;

    FcvGaussianBlurLoop_Invoker(const FcvGaussianBlurLoop_Invoker &);  // = delete;
    const FcvGaussianBlurLoop_Invoker& operator= (const FcvGaussianBlurLoop_Invoker &);  // = delete;
@ -528,9 +655,9 @@ int fastcv_hal_gaussianBlurBinomial(
    if (src_data == dst_data)
        CV_HAL_RETURN_NOT_IMPLEMENTED("Inplace is not supported");

-    // The pixels of input image should larger than 320*240
-    if((width*height) < (320*240))
-        CV_HAL_RETURN_NOT_IMPLEMENTED("Input image size should be larger than 320*240");
+    // The input image width and height should greater than kernel size
+    if (((size_t)height <= ksize) || ((size_t)width <= ksize))
+        CV_HAL_RETURN_NOT_IMPLEMENTED("Input image size should be larger than kernel size");

    // The input channel should be 1
    if (cn != 1)
@ -540,26 +667,31 @@ int fastcv_hal_gaussianBlurBinomial(
    if((margin_left!=0) || (margin_top != 0) || (margin_right != 0) || (margin_bottom !=0))
        CV_HAL_RETURN_NOT_IMPLEMENTED("ROI is not supported");

+    // Border type check
+    if( border_type != cv::BorderTypes::BORDER_CONSTANT  &&
+        border_type != cv::BorderTypes::BORDER_REPLICATE &&
+        border_type != cv::BorderTypes::BORDER_REFLECT   &&
+        border_type != cv::BorderTypes::BORDER_REFLECT101)
+        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Border type:%s is not supported", borderToString(border_type)));
+
    INITIALIZATION_CHECK;

    fcvStatus status = FASTCV_SUCCESS;
-    int fcvFuncType = FCV_MAKETYPE(ksize, depth);
+    int fcvFuncType  = FCV_MAKETYPE(ksize, depth);

    int nThreads = cv::getNumThreads();
-    int nStripes = (nThreads > 1) ? ((height > 60) ? 3 * nThreads : 1) : 1;
+    // In each stripe, the height should be equal or larger than ksize.
+    // Use 3*nThreads stripes to avoid too many threads.
+    int nStripes = nThreads > 1 ? MIN(height / (int)ksize, 3 * nThreads) : 1;

    switch (fcvFuncType)
    {
        case FCV_MAKETYPE(3,CV_8U):
        case FCV_MAKETYPE(5,CV_8U):
        {
-            cv::Mat src = cv::Mat(height, width, CV_8UC1, (void *)src_data, src_step);
-            cv::Mat dst = cv::Mat(height, width, CV_8UC1, (void *)dst_data, dst_step);
-            cv::Mat src_tmp = cv::Mat(height + ksize - 1, width + ksize - 1, CV_8UC1);
-            cv::Mat dst_tmp = cv::Mat(height + ksize - 1, width + ksize - 1, CV_8UC1);
-            cv::copyMakeBorder(src, src_tmp, ksize / 2, ksize / 2, ksize / 2, ksize / 2, border_type);
-            cv::parallel_for_(cv::Range(0, height), FcvGaussianBlurLoop_Invoker(src_tmp, dst_tmp, ksize, border_type, 0), nStripes);
-            dst_tmp(cv::Rect(ksize / 2, ksize / 2, width, height)).copyTo(dst);
+            cv::Mat src = cv::Mat(height, width, CV_8UC1, (void*)src_data, src_step);
+            cv::Mat dst = cv::Mat(height, width, CV_8UC1, (void*)dst_data, dst_step);
+            cv::parallel_for_(cv::Range(0, height), FcvGaussianBlurLoop_Invoker(src, dst, ksize, border_type), nStripes);
            break;
        }
        default:
@ -1007,4 +1139,4 @@ int fastcv_hal_canny(
        CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Ksize:%d is not supported", ksize));
    }
    CV_HAL_RETURN(status, hal_canny);
-}
+}
--- a/3rdparty/fastcv/src/fastcv_hal_utils.cpp
+++ b/3rdparty/fastcv/src/fastcv_hal_utils.cpp
--- a/hal/ipp/CMakeLists.txt
+++ b/hal/ipp/CMakeLists.txt
@ -0,0 +1,49 @@
+project(ipphal)
+
+set(IPP_HAL_VERSION 0.0.1 CACHE INTERNAL "")  # cached description of this HAL; presumably read by the parent build (confirm)
+set(IPP_HAL_LIBRARIES "ipphal" CACHE INTERNAL "")
+set(IPP_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/include" CACHE INTERNAL "")
+set(IPP_HAL_HEADERS
+  "${CMAKE_CURRENT_SOURCE_DIR}/include/ipp_hal_core.hpp"
+  CACHE INTERNAL "")
+
+add_library(ipphal STATIC
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/mean_ipp.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/minmax_ipp.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/norm_ipp.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/cart_polar_ipp.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/transforms_ipp.cpp"
+)
+
+#TODO: HAVE_IPP_ICV and HAVE_IPP_IW are PRIVATE definitions for now: OpenCV itself is
+#      still a source of IPP, so PUBLIC definitions would lead to redefinition warnings.
+#      Redefine the macros as PUBLIC once the IPP part is removed from core,
+#      making this HAL the source of the IPP integration.
+if(HAVE_IPP_ICV)
+  target_compile_definitions(ipphal PRIVATE HAVE_IPP_ICV)
+endif()
+
+if(HAVE_IPP_IW)
+  target_compile_definitions(ipphal PRIVATE HAVE_IPP_IW)
+endif()
+
+target_include_directories(ipphal PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-suggest-override)
+
+target_include_directories(ipphal PRIVATE
+  "${CMAKE_CURRENT_SOURCE_DIR}/src"
+  ${CMAKE_SOURCE_DIR}/modules/core/include
+  ${IPP_INCLUDE_DIRS}
+)
+
+target_link_libraries(ipphal PUBLIC ${IPP_IW_LIBRARY} ${IPP_LIBRARIES})  # PUBLIC: consumers of ipphal link the IPP libraries too
+
+set_target_properties(ipphal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
+
+if(NOT BUILD_SHARED_LIBS)  # the static archive is installed only for static OpenCV builds
+  ocv_install_target(ipphal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
+endif()
+
+if(ENABLE_SOLUTION_FOLDERS)
+  set_target_properties(ipphal PROPERTIES FOLDER "3rdparty")
+endif()
--- a/hal/ipp/include/ipp_hal_core.hpp
+++ b/hal/ipp/include/ipp_hal_core.hpp
@ -0,0 +1,59 @@
+#ifndef __IPP_HAL_CORE_HPP__
+#define __IPP_HAL_CORE_HPP__
+
+#include <opencv2/core/base.hpp>
+#include "ipp_utils.hpp"
+
+#if (IPP_VERSION_X100 >= 700) // the meanStdDev / minMaxIdx / norm / normDiff hooks below are gated on the IPP version
+int ipp_hal_meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type,
+                       double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
+
+#undef cv_hal_meanStdDev // drop any earlier binding before pointing the hook at the IPP implementation
+#define cv_hal_meanStdDev ipp_hal_meanStdDev
+
+int ipp_hal_minMaxIdxMaskStep(const uchar* src_data, size_t src_step, int width, int height, int depth,
+                              double* _minVal, double* _maxVal, int* _minIdx, int* _maxIdx, uchar* mask, size_t mask_step);
+
+#undef cv_hal_minMaxIdxMaskStep
+#define cv_hal_minMaxIdxMaskStep ipp_hal_minMaxIdxMaskStep
+
+#define IPP_DISABLE_NORM_8U             1 // accuracy difference in perf test sanity check
+
+int ipp_hal_norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step,
+                 int width, int height, int type, int norm_type, double* result);
+
+#undef cv_hal_norm
+#define cv_hal_norm ipp_hal_norm
+
+
+int ipp_hal_normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask,
+                     size_t mask_step, int width, int height, int type, int norm_type, double* result);
+
+#undef cv_hal_normDiff
+#define cv_hal_normDiff ipp_hal_normDiff
+
+#endif // IPP_VERSION_X100 >= 700
+
+int ipp_hal_polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees);
+int ipp_hal_polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees);
+
+#undef cv_hal_polarToCart32f
+#define cv_hal_polarToCart32f ipp_hal_polarToCart32f
+#undef cv_hal_polarToCart64f
+#define cv_hal_polarToCart64f ipp_hal_polarToCart64f
+
+#ifdef HAVE_IPP_IW // the flip hook is declared only when HAVE_IPP_IW is defined
+int ipp_hal_flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
+                 uchar* dst_data, size_t dst_step, int flip_mode);
+
+#undef cv_hal_flip
+#define cv_hal_flip ipp_hal_flip
+#endif // HAVE_IPP_IW
+
+int ipp_hal_transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int src_width,
+                        int src_height, int element_size);
+
+#undef cv_hal_transpose2d
+#define cv_hal_transpose2d ipp_hal_transpose2d
+
+#endif // __IPP_HAL_CORE_HPP__
--- a/hal/ipp/include/ipp_utils.hpp
+++ b/hal/ipp/include/ipp_utils.hpp
@ -0,0 +1,24 @@
+#ifndef __IPP_HAL_UTILS_HPP__
+#define __IPP_HAL_UTILS_HPP__
+
+#include "ippversion.h"
+#ifndef IPP_VERSION_UPDATE // prior to 7.1
+#define IPP_VERSION_UPDATE 0
+#endif
+
+#define IPP_VERSION_X100 (IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR*10 + IPP_VERSION_UPDATE) // one comparable integer: major*100 + minor*10 + update
+
+#ifdef HAVE_IPP_ICV
+# define ICV_BASE
+#if IPP_VERSION_X100 >= 201700 // at or above this version include ippicv.h, otherwise fall back to ipp.h
+# include "ippicv.h"
+#else
+# include "ipp.h"
+#endif
+#else // !HAVE_IPP_ICV
+# include "ipp.h"
+#endif // HAVE_IPP_ICV
+
+#define CV_INSTRUMENT_FUN_IPP(FUN, ...) ((FUN)(__VA_ARGS__)) // plain call-through: invokes FUN with the given arguments
+
+#endif // __IPP_HAL_UTILS_HPP__
--- a/hal/ipp/src/cart_polar_ipp.cpp
+++ b/hal/ipp/src/cart_polar_ipp.cpp
@ -0,0 +1,28 @@
+#include "ipp_hal_core.hpp"
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/core/base.hpp>
+
+// Converts `len` single-precision (magnitude, angle) pairs to (x, y) via
+// ippsPolarToCart_32f. Returns CV_HAL_ERROR_NOT_IMPLEMENTED when the angles are
+// in degrees, when an output buffer is also an input buffer, or when the IPP
+// call reports a negative status; otherwise CV_HAL_ERROR_OK.
+int ipp_hal_polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees)
+{
+    // Degree-valued angles are not handled by this path.
+    if (angleInDegrees)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // In-place operation (an output pointer equal to an input pointer) is not handled either.
+    if (x == mag || x == angle || y == mag || y == angle)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    return CV_INSTRUMENT_FUN_IPP(ippsPolarToCart_32f, mag, angle, x, y, len) < 0
+               ? CV_HAL_ERROR_NOT_IMPLEMENTED
+               : CV_HAL_ERROR_OK;
+}
+
+// Converts `len` double-precision (magnitude, angle) pairs to (x, y) via
+// ippsPolarToCart_64f. Returns CV_HAL_ERROR_NOT_IMPLEMENTED when the angles are
+// in degrees, when an output buffer is also an input buffer, or when the IPP
+// call reports a negative status; otherwise CV_HAL_ERROR_OK.
+int ipp_hal_polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees)
+{
+    // Degree-valued angles are not handled by this path.
+    if (angleInDegrees)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // In-place operation (an output pointer equal to an input pointer) is not handled either.
+    if (x == mag || x == angle || y == mag || y == angle)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    return CV_INSTRUMENT_FUN_IPP(ippsPolarToCart_64f, mag, angle, x, y, len) < 0
+               ? CV_HAL_ERROR_NOT_IMPLEMENTED
+               : CV_HAL_ERROR_OK;
+}
--- a/hal/ipp/src/mean_ipp.cpp
+++ b/hal/ipp/src/mean_ipp.cpp
@ -0,0 +1,206 @@
+#include "ipp_hal_core.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/base.hpp>
+
+#if IPP_VERSION_X100 >= 700
+
+// Computes the per-channel mean of an image with IPP, optionally restricted by `mask`.
+// Writes the result to mean_val and returns CV_HAL_ERROR_OK on success. Returns
+// CV_HAL_ERROR_NOT_IMPLEMENTED when the type has more than 4 channels, when the
+// step values fail the check below, when no IPP function matches src_type, or
+// when the IPP call reports a negative status.
+static int ipp_mean(const uchar* src_data, size_t src_step, int width, int height,
+                    int src_type, double* mean_val, uchar* mask, size_t mask_step)
+{
+    if (CV_MAT_CN(src_type) > 4)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // Only steps equal to 1 or to `width` are accepted, for both source and mask.
+    const size_t rowWidth = static_cast<size_t>(width);
+    const bool srcStepOk  = (src_step == 1) || (src_step == rowWidth);
+    const bool maskStepOk = (mask_step == 1) || (mask_step == rowWidth);
+    if (!srcStepOk || !maskStepOk)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    IppiSize roi = { width, height };
+
+    if (mask)
+    {
+        typedef IppStatus (CV_STDCALL* MaskedMeanC1Fn)(const void *, int, const void *, int, IppiSize, Ipp64f *);
+        typedef IppStatus (CV_STDCALL* MaskedMeanC3Fn)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
+
+        MaskedMeanC1Fn meanC1 = 0;
+        MaskedMeanC3Fn meanC3 = 0;
+        switch (src_type)
+        {
+            case CV_8UC1:  meanC1 = (MaskedMeanC1Fn)ippiMean_8u_C1MR;   break;
+            case CV_16UC1: meanC1 = (MaskedMeanC1Fn)ippiMean_16u_C1MR;  break;
+            case CV_32FC1: meanC1 = (MaskedMeanC1Fn)ippiMean_32f_C1MR;  break;
+            case CV_8UC3:  meanC3 = (MaskedMeanC3Fn)ippiMean_8u_C3CMR;  break;
+            case CV_16UC3: meanC3 = (MaskedMeanC3Fn)ippiMean_16u_C3CMR; break;
+            case CV_32FC3: meanC3 = (MaskedMeanC3Fn)ippiMean_32f_C3CMR; break;
+            default: break;
+        }
+
+        if (meanC1 &&
+            CV_INSTRUMENT_FUN_IPP(meanC1, src_data, (int)src_step, mask, (int)mask_step, roi, mean_val) >= 0)
+        {
+            return CV_HAL_ERROR_OK;
+        }
+
+        if (meanC3)
+        {
+            // One call per channel of interest (1..3); stop at the first failure.
+            bool allOk = true;
+            for (int ch = 0; ch < 3 && allOk; ch++)
+            {
+                allOk = CV_INSTRUMENT_FUN_IPP(meanC3, src_data, (int)src_step, mask, (int)mask_step, roi, ch + 1, &mean_val[ch]) >= 0;
+            }
+            if (allOk)
+                return CV_HAL_ERROR_OK;
+        }
+
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    // Unmasked path: the 32f functions take an extra algorithm-hint argument, the integer ones do not.
+    typedef IppStatus (CV_STDCALL* MeanWithHintFn)(const void*, int, IppiSize, double *, IppHintAlgorithm);
+    typedef IppStatus (CV_STDCALL* MeanPlainFn)(const void*, int, IppiSize, double *);
+
+    MeanWithHintFn meanWithHint = 0;
+    MeanPlainFn    meanPlain    = 0;
+    switch (src_type)
+    {
+        case CV_32FC1: meanWithHint = (MeanWithHintFn)ippiMean_32f_C1R; break;
+        case CV_32FC3: meanWithHint = (MeanWithHintFn)ippiMean_32f_C3R; break;
+        case CV_32FC4: meanWithHint = (MeanWithHintFn)ippiMean_32f_C4R; break;
+        case CV_8UC1:  meanPlain = (MeanPlainFn)ippiMean_8u_C1R;  break;
+        case CV_8UC3:  meanPlain = (MeanPlainFn)ippiMean_8u_C3R;  break;
+        case CV_8UC4:  meanPlain = (MeanPlainFn)ippiMean_8u_C4R;  break;
+        case CV_16UC1: meanPlain = (MeanPlainFn)ippiMean_16u_C1R; break;
+        case CV_16UC3: meanPlain = (MeanPlainFn)ippiMean_16u_C3R; break;
+        case CV_16UC4: meanPlain = (MeanPlainFn)ippiMean_16u_C4R; break;
+        case CV_16SC1: meanPlain = (MeanPlainFn)ippiMean_16s_C1R; break;
+        case CV_16SC3: meanPlain = (MeanPlainFn)ippiMean_16s_C3R; break;
+        case CV_16SC4: meanPlain = (MeanPlainFn)ippiMean_16s_C4R; break;
+        default: break;
+    }
+
+    // At most one of the two function pointers may be set
+    CV_Assert(!meanWithHint || !meanPlain);
+
+    IppStatus status;
+    if (meanWithHint)
+        status = CV_INSTRUMENT_FUN_IPP(meanWithHint, src_data, (int)src_step, roi, mean_val, ippAlgHintAccurate);
+    else if (meanPlain)
+        status = CV_INSTRUMENT_FUN_IPP(meanPlain, src_data, (int)src_step, roi, mean_val);
+    else
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    return status >= 0 ? CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+
+
+
+// Computes the per-channel mean and standard deviation of an image with IPP,
+// optionally restricted by `mask`.
+//
+// mean_val / stddev_val may each be null; scratch storage is then used for that
+// output. Returns CV_HAL_ERROR_OK on success, and CV_HAL_ERROR_NOT_IMPLEMENTED
+// when the step values fail the check below, when no IPP function matches
+// src_type, or when an IPP call reports a negative status.
+//
+// The masked IPP variants (*_C1MR / *_C3CMR) are used when a mask is supplied
+// and the unmasked variants (*_C1R / *_C3CR) when it is not, matching ipp_mean.
+static int ipp_meanStdDev(const uchar* src_data, size_t src_step, int width, int height,
+                          int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step)
+{
+    // Only steps equal to 1 or to `width` are accepted, for both source and mask.
+    if((src_step == 1 || src_step == static_cast<size_t>(width)) && (mask_step == 1 || mask_step == static_cast<size_t>(width)))
+    {
+        // Scratch storage stands in for whichever output the caller did not
+        // request, so the IPP calls below always receive non-null pointers.
+        Ipp64f mean_temp[3];
+        Ipp64f stddev_temp[3];
+        Ipp64f *pmean = mean_val ? mean_val : &mean_temp[0];
+        Ipp64f *pstddev = stddev_val ? stddev_val : &stddev_temp[0];
+
+        IppiSize sz = { width, height };
+        if( mask )
+        {
+            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *);
+            ippiMaskMeanStdDevFuncC1 ippiMean_StdDev_C1MR =
+            src_type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
+            src_type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
+            src_type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR :
+            nullptr;
+            if( ippiMean_StdDev_C1MR )
+            {
+                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C1MR, src_data, (int)src_step, mask, (int)mask_step, sz, pmean, pstddev) >= 0 )
+                {
+                    return CV_HAL_ERROR_OK;
+                }
+            }
+
+            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
+            ippiMaskMeanStdDevFuncC3 ippiMean_StdDev_C3CMR =
+            src_type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
+            src_type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
+            src_type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR :
+            nullptr;
+            if( ippiMean_StdDev_C3CMR )
+            {
+                // One call per channel of interest (1..3); all three must succeed.
+                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src_data, (int)src_step, mask, (int)mask_step, sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
+                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src_data, (int)src_step, mask, (int)mask_step, sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
+                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src_data, (int)src_step, mask, (int)mask_step, sz, 3, &pmean[2], &pstddev[2]) >= 0 )
+                {
+                    return CV_HAL_ERROR_OK;
+                }
+            }
+        }
+        else
+        {
+            typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *);
+            ippiMeanStdDevFuncC1 ippiMean_StdDev_C1R =
+                src_type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R :
+                src_type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R :
+            #if (IPP_VERSION_X100 >= 810)
+                src_type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0
+            #endif
+                nullptr;
+            if( ippiMean_StdDev_C1R )
+            {
+                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C1R, src_data, (int)src_step, sz, pmean, pstddev) >= 0 )
+                {
+                    return CV_HAL_ERROR_OK;
+                }
+            }
+
+            typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
+            ippiMeanStdDevFuncC3 ippiMean_StdDev_C3CR =
+                src_type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR :
+                src_type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR :
+                src_type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR :
+                nullptr;
+            if( ippiMean_StdDev_C3CR )
+            {
+                // One call per channel of interest (1..3); all three must succeed.
+                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src_data, (int)src_step, sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
+                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src_data, (int)src_step, sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
+                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src_data, (int)src_step, sz, 3, &pmean[2], &pstddev[2]) >= 0 )
+                {
+                    return CV_HAL_ERROR_OK;
+                }
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+
+// HAL entry point for meanStdDev: forwards to ipp_mean when no standard
+// deviation output is requested, and to ipp_meanStdDev otherwise. The return
+// value of the selected helper is passed through unchanged.
+int ipp_hal_meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type,
+                       double* mean_val, double* stddev_val, uchar* mask, size_t mask_step)
+{
+    if (!stddev_val)
+        return ipp_mean(src_data, src_step, width, height, src_type, mean_val, mask, mask_step);
+
+    return ipp_meanStdDev(src_data, src_step, width, height, src_type, mean_val, stddev_val, mask, mask_step);
+}
+
+
+#endif // IPP_VERSION_X100 >= 700
--- a/Show More
+++ b/Show More