Merge pull request #25792 from asmorkalov:as/HAL_fast_GaussianBlur

Added flag to GaussianBlur for faster but not bit-exact implementation #25792 Rationale: The current implementation of GaussianBlur is almost always bit-exact. It helps to get predictable results across platforms, but prohibits most approximations and optimization tricks. The patch converts the `borderType` parameter to a more generic `flags` and introduces the `GAUSS_ALLOW_APPROXIMATIONS` flag to allow a non-bit-exact implementation. With the flag, the IPP and generic HAL implementations are called first. The flag naming and location are a subject for discussion. Replaces https://github.com/opencv/opencv/pull/22073 Possibly related issue: https://github.com/opencv/opencv/issues/24135 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2025-08-06 14:36:36 +08:00 · 2024-07-12 15:03:33 +03:00 · 2024-07-12 15:03:33 +03:00 · 15783d6598
commit 15783d6598
parent 3ff97c5580
10 changed files with 143 additions and 12 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1258,7 +1258,11 @@ if(CMAKE_GENERATOR MATCHES "Xcode|Visual Studio|Multi-Config")
 else()
  status("    Configuration:"  ${CMAKE_BUILD_TYPE})
 endif()
-
+if(DEFINED OPENCV_ALGO_HINT_DEFAULT)
+  status("    Algorithm Hint:"  ${OPENCV_ALGO_HINT_DEFAULT})
+else()
+  status("    Algorithm Hint:" " ALGO_ACCURATE")
+endif()

 # ========================= CPU code generation mode =========================
 status("")
--- a/doc/tutorials/introduction/config_reference/config_reference.markdown
+++ b/doc/tutorials/introduction/config_reference/config_reference.markdown
@ -217,6 +217,7 @@ Following options can be used to produce special builds with instrumentation or
 | `ENABLE_BUILD_HARDENING` | GCC, Clang, MSVC | Enable compiler options which reduce possibility of code exploitation.  |
 | `ENABLE_LTO` | GCC, Clang, MSVC | Enable Link Time Optimization (LTO). |
 | `ENABLE_THIN_LTO` | Clang | Enable thin LTO which incorporates intermediate bitcode to binaries allowing consumers optimize their applications later. |
+| `OPENCV_ALGO_HINT_DEFAULT` | Any | Set default OpenCV implementation hint value: `ALGO_ACCURATE` or `ALGO_APPROX`. Dangerous! The option changes behaviour globally and may affect accuracy of many algorithms. |

@see [GCC instrumentation](https://gcc.gnu.org/onlinedocs/gcc/Instrumentation-Options.html)
@see [Build hardening](https://en.wikipedia.org/wiki/Hardening_(computing))
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@ -186,6 +186,10 @@ if(OPENCV_SEMIHOSTING)
  ocv_target_compile_definitions(${the_module} PRIVATE "-DOPENCV_SEMIHOSTING")
 endif(OPENCV_SEMIHOSTING)

+if(DEFINED OPENCV_ALGO_HINT_DEFAULT)
+  ocv_target_compile_definitions(${the_module} PRIVATE "-DOPENCV_ALGO_HINT_DEFAULT=${OPENCV_ALGO_HINT_DEFAULT}")
+endif(DEFINED OPENCV_ALGO_HINT_DEFAULT)
+
 if(HAVE_HPX)
  ocv_target_link_libraries(${the_module} LINK_PRIVATE "${HPX_LIBRARIES}")
 endif()
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@ -150,6 +150,18 @@ It is possible to alternate error processing by using #redirectError().
 */
 CV_EXPORTS CV_NORETURN void error(const Exception& exc);

+/*! @brief Flags that allow modifying the behavior of some functions. Used as a set of flags.
+*/
+enum AlgorithmHint {
+    ALGO_DEFAULT = 0, //!< Default algorithm behaviour defined during OpenCV build
+    ALGO_ACCURATE = 1, //!< Use generic portable implementation
+    ALGO_APPROX = 2, //!< Allow alternative approximations to get faster implementation. Behaviour and result depends on a platform
+};
+
+/*! @brief Returns the AlgorithmHint selected by default, i.e. the value that `ALGO_DEFAULT` resolves to, defined during OpenCV compilation.
+ */
+CV_EXPORTS_W AlgorithmHint getDefaultAlgorithmHint();
+
 enum SortFlags { SORT_EVERY_ROW    = 0, //!< each matrix row is sorted independently
                 SORT_EVERY_COLUMN = 1, //!< each matrix column is sorted
                                        //!< independently; this flag and the previous one are
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -46,6 +46,7 @@
 #include <iostream>
 #include <ostream>

+#include <opencv2/core.hpp>
 #include <opencv2/core/utils/configuration.private.hpp>
 #include <opencv2/core/utils/trace.private.hpp>

@ -2888,6 +2889,14 @@ bool restoreFPDenormalsState(const FPDenormalsModeState& state)

 }  // namespace details

+AlgorithmHint getDefaultAlgorithmHint()
+{
+#ifdef OPENCV_ALGO_HINT_DEFAULT
+    return OPENCV_ALGO_HINT_DEFAULT;
+#else
+    return ALGO_ACCURATE;
+#endif
+};

 } // namespace cv

--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@ -1536,12 +1536,14 @@ respectively (see #getGaussianKernel for details); to fully control the result r
 possible future modifications of all this semantics, it is recommended to specify all of ksize,
 sigmaX, and sigmaY.
@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@param hint Implementation modification flags. See #AlgorithmHint

@sa  sepFilter2D, filter2D, blur, boxFilter, bilateralFilter, medianBlur
 */
 CV_EXPORTS_W void GaussianBlur( InputArray src, OutputArray dst, Size ksize,
                                double sigmaX, double sigmaY = 0,
-                                int borderType = BORDER_DEFAULT );
+                                int borderType = BORDER_DEFAULT,
+                                AlgorithmHint hint = cv::ALGO_DEFAULT );

 /** @brief Applies the bilateral filter to an image.

--- a/modules/imgproc/src/smooth.dispatch.cpp
+++ b/modules/imgproc/src/smooth.dispatch.cpp
@ -468,7 +468,7 @@ static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize,

 #endif

-#if defined ENABLE_IPP_GAUSSIAN_BLUR  // see CMake's OPENCV_IPP_GAUSSIAN_BLUR option
+#ifdef ENABLE_IPP_GAUSSIAN_BLUR  // see CMake's OPENCV_IPP_GAUSSIAN_BLUR option

 #define IPP_DISABLE_GAUSSIAN_BLUR_LARGE_KERNELS_1TH 1
 #define IPP_DISABLE_GAUSSIAN_BLUR_16SC4_1TH 1
@ -526,14 +526,14 @@ private:

 #endif

-static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
+static bool ipp_GaussianBlur(cv::Mat& src, cv::Mat& dst, Size ksize,
                   double sigma1, double sigma2, int borderType )
 {
 #ifdef HAVE_IPP_IW
    CV_INSTRUMENT_REGION_IPP();

 #if IPP_VERSION_X100 < 201800 && ((defined _MSC_VER && defined _M_IX86) || (defined __GNUC__ && defined __i386__))
-    CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
+    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
    return false; // bug on ia32
 #else
    if(sigma1 != sigma2)
@ -548,8 +548,6 @@ static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
    // Acquire data and begin processing
    try
    {
-        Mat src = _src.getMat();
-        Mat dst = _dst.getMat();
        ::ipp::IwiImage       iwSrc      = ippiGetImage(src);
        ::ipp::IwiImage       iwDst      = ippiGetImage(dst);
        ::ipp::IwiBorderSize  borderSize = ::ipp::iwiSizeToBorderSize(ippiGetSize(ksize));
@ -589,7 +587,7 @@ static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
    return true;
 #endif
 #else
-    CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
+    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
    return false;
 #endif
 }
@ -610,10 +608,13 @@ static bool validateGaussianBlurKernel(std::vector<T>& kernel)

 void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
                  double sigma1, double sigma2,
-                  int borderType)
+                  int borderType, AlgorithmHint hint)
 {
    CV_INSTRUMENT_REGION();

+    if (hint == cv::ALGO_DEFAULT)
+        hint = cv::getDefaultAlgorithmHint();
+
    CV_Assert(!_src.empty());

    int type = _src.type();
@ -693,7 +694,27 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
                    src2.locateROI( wsz, ofs );

                CALL_HAL(gaussianBlurBinomial, cv_hal_gaussianBlurBinomial, src2.ptr(), src2.step, dst.ptr(), dst.step, src2.cols, src2.rows, sdepth, cn,
-                         ofs.x, ofs.y, wsz.width - src2.cols - ofs.x,  wsz.height - src2.rows - ofs.y, ksize.width, borderType&~BORDER_ISOLATED);
+                         ofs.x, ofs.y, wsz.width - src2.cols - ofs.x,  wsz.height - src2.rows - ofs.y, ksize.width,
+                         borderType & ~BORDER_ISOLATED);
+            }
+
+            if (hint == ALGO_APPROX)
+            {
+                Point ofs;
+                Size wsz(src.cols, src.rows);
+                if(!(borderType & BORDER_ISOLATED))
+                    src.locateROI( wsz, ofs );
+
+                CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
+                        ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
+                        sigma1, sigma2, borderType & ~BORDER_ISOLATED);
+
+#ifdef ENABLE_IPP_GAUSSIAN_BLUR
+                // IPP is not bit-exact to OpenCV implementation
+                CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType));
+#endif
+                CV_OVX_RUN(true,
+                        openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
            }

            CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType),
@ -747,6 +768,25 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
                         ofs.x, ofs.y, wsz.width - src2.cols - ofs.x,  wsz.height - src2.rows - ofs.y, ksize.width, borderType&~BORDER_ISOLATED);
            }

+            if (hint == ALGO_APPROX)
+            {
+                Point ofs;
+                Size wsz(src.cols, src.rows);
+                if(!(borderType & BORDER_ISOLATED))
+                    src.locateROI( wsz, ofs );
+
+                CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
+                        ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
+                        sigma1, sigma2, borderType & ~BORDER_ISOLATED);
+
+#ifdef ENABLE_IPP_GAUSSIAN_BLUR
+                // IPP is not bit-exact to OpenCV implementation
+                CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType));
+#endif
+                CV_OVX_RUN(true,
+                        openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
+            }
+
            CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint32_t*)&fkx[0], (int)fkx.size(), (const uint32_t*)&fky[0], (int)fky.size(), borderType),
                CV_CPU_DISPATCH_MODES_ALL);

@ -772,7 +812,7 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,

    CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
             ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
-             sigma1, sigma2, borderType&~BORDER_ISOLATED);
+             sigma1, sigma2, borderType & ~BORDER_ISOLATED);

    CV_OVX_RUN(true,
               openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
--- a/modules/imgproc/test/test_smooth_bitexact.cpp
+++ b/modules/imgproc/test/test_smooth_bitexact.cpp
@ -244,7 +244,7 @@ static void checkGaussianBlur_8Uvs32F(const Mat& src8u, const Mat& src32f, int N
 TEST(GaussianBlur_Bitexact, regression_9863)
 {
    Mat src8u = imread(cvtest::findDataFile("shared/lena.png"));
-     Mat src32f; src8u.convertTo(src32f, CV_32F);
+    Mat src32f; src8u.convertTo(src32f, CV_32F);

    checkGaussianBlur_8Uvs32F(src8u, src32f, 151, 30);
 }
@ -260,4 +260,58 @@ TEST(GaussianBlur_Bitexact, overflow_20792)
    EXPECT_GT(count, nintyPercent);
 }

+CV_ENUM(GaussInputType, CV_8U, CV_16S);
+CV_ENUM(GaussBorder, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT_101);
+
+struct GaussianBlurVsBitexact: public testing::TestWithParam<tuple<GaussInputType, int, double, GaussBorder>>
+{
+    virtual void SetUp()
+    {
+        orig = imread(findDataFile("shared/lena.png"));
+        EXPECT_FALSE(orig.empty()) << "Cannot find test image shared/lena.png";
+    }
+
+    Mat orig;
+};
+
+// NOTE: The test was designed for IPP (-DOPENCV_IPP_GAUSSIAN_BLUR=ON)
+// Should be extended after new HAL integration
+TEST_P(GaussianBlurVsBitexact, approx)
+{
+    auto testParams = GetParam();
+    int dtype = get<0>(testParams);
+    int ksize = get<1>(testParams);
+    double sigma = get<2>(testParams);
+    int border = get<3>(testParams);
+
+    Mat src;
+    orig.convertTo(src, dtype);
+
+    cv::Mat gt;
+    GaussianBlur(src, gt, Size(ksize, ksize), sigma, sigma, border, ALGO_ACCURATE);
+
+    cv::Mat dst;
+    GaussianBlur(src, dst, Size(ksize, ksize), sigma, sigma, border, ALGO_APPROX);
+
+    cv::Mat diff;
+    cv::absdiff(dst, gt, diff);
+    cv::Mat flatten_diff = diff.reshape(1, diff.rows);
+
+    int nz = countNonZero(flatten_diff);
+    EXPECT_LE(nz, 0.06*src.total()); // Less 6% of different pixels
+
+    double min_val, max_val;
+    minMaxLoc(flatten_diff, &min_val, &max_val);
+    EXPECT_LE(max_val, 2); // expects results to fluctuate by +-1
+}
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/, GaussianBlurVsBitexact,
+    testing::Combine(
+        GaussInputType::all(),
+        testing::Values(3, 5, 7),
+        testing::Values(0.75, 1.25),
+        GaussBorder::all()
+    )
+);
+
 }} // namespace
--- a/modules/python/test/test_misc.py
+++ b/modules/python/test/test_misc.py
@ -987,6 +987,10 @@ class SamplesFindFile(NewOpenCVTests):
        except cv.error as _e:
            pass

+class AlgorithmImplHit(NewOpenCVTests):
+    def test_callable(self):
+        res = cv.getDefaultAlgorithmHint()
+        self.assertTrue(res is not None)

 if __name__ == '__main__':
    NewOpenCVTests.bootstrap()
--- a/modules/ts/src/ts.cpp
+++ b/modules/ts/src/ts.cpp
@ -1126,6 +1126,7 @@ void SystemInfoCollector::OnTestProgramStart(const testing::UnitTest&)
    recordPropertyVerbose("cv_vcs_version", "OpenCV VCS version", getSnippetFromConfig("Version control:", "\n"));
    recordPropertyVerbose("cv_build_type", "Build type", getSnippetFromConfig("Configuration:", "\n"), CV_TEST_BUILD_CONFIG);
    recordPropertyVerbose("cv_compiler", "Compiler", getSnippetFromConfig("C++ Compiler:", "\n"));
+    recordPropertyVerbose("implementation_hint", "Algorithm hint", getSnippetFromConfig("Algorithm Hint:", "\n"));
    const char* parallelFramework = cv::currentParallelFramework();
    if (parallelFramework)
    {