Merge pull request #5810 from mshabunin:hal_interface

2025-07-22 20:39:41 +08:00 · 2015-12-17 16:48:01 +00:00 · 2015-12-17 16:48:01 +00:00 · 9aeb8c8d5a
commit 9aeb8c8d5a
parent 5021350082 c04d62db8b
65 changed files with 3782 additions and 3437 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -590,10 +590,30 @@ include(cmake/OpenCVFindMatlab.cmake)

 include(cmake/OpenCVDetectVTK.cmake)

+# -- Custom HAL replacement --
+# Builds the text substituted for @_includes@ in cmake/templates/custom_hal.hpp.in
+# and appends the custom HAL libraries to OPENCV_LINKER_LIBS.
+set(_includes "")
+# assuming OPENCV_HAL_HEADERS and OPENCV_HAL_LIBS are lists of files:
+# option example: -DOPENCV_HAL_HEADERS="<some-path>/header1.h;<some-path>/header2.h"
 if (OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS)
-  get_filename_component(OPENCV_HAL_HEADERS "${OPENCV_HAL_HEADERS}" ABSOLUTE)
-  get_filename_component(OPENCV_HAL_LIBS "${OPENCV_HAL_LIBS}" ABSOLUTE)
+  # one '#include "<absolute path>"' line is generated per listed header
+  foreach (h ${OPENCV_HAL_HEADERS})
+    get_filename_component(h "${h}" ABSOLUTE)
+    set(_includes "${_includes}\n#include \"${h}\"")
+  endforeach()
+  # every listed library is converted to an absolute path and added to the linker list
+  foreach (l ${OPENCV_HAL_LIBS})
+    get_filename_component(l "${l}" ABSOLUTE)
+    set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${l})
+    # TODO: install?
+    # ocv_install_target(${l} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
+  endforeach()
+else()
+  # at least one of the two options is empty: the generated header only gets a
+  # placeholder comment and both cache entries are dropped
+  set(_includes "// using default HAL")
+  unset(OPENCV_HAL_HEADERS CACHE)
+  unset(OPENCV_HAL_LIBS CACHE)
 endif()
+# declare both options as STRING cache entries with their help text
+set(OPENCV_HAL_HEADERS "${OPENCV_HAL_HEADERS}" CACHE STRING "Headers with custom HAL implementation")
+set(OPENCV_HAL_LIBS "${OPENCV_HAL_LIBS}" CACHE STRING "Libraries with custom HAL implementation")
+# generate custom_hal.hpp in the build tree; @ONLY limits substitution to @VAR@ references
+configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY)
+# _includes is only a temporary for the template substitution above
+unset(_includes)

 # ----------------------------------------------------------------------------
 # Add CUDA libraries (needed for apps/tools, samples)
@ -1091,6 +1111,7 @@ endif(DEFINED WITH_VA_INTEL)
 status("    Use Eigen:"      HAVE_EIGEN       THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO)
 status("    Use Cuda:"       HAVE_CUDA        THEN "YES (ver ${CUDA_VERSION_STRING})" ELSE NO)
 status("    Use OpenCL:"     HAVE_OPENCL      THEN YES ELSE NO)
+status("    Use custom HAL:" OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS THEN "YES (${OPENCV_HAL_HEADERS}; ${OPENCV_HAL_LIBS})" ELSE "NO")

 if(HAVE_CUDA)
  status("")
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -655,6 +655,8 @@ macro(ocv_glob_module_sources)
       "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/*.hpp"
       "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/*.hpp"
       "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/*.h"
+       "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/hal/*.hpp"
+       "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/hal/*.h"
  )
  file(GLOB lib_hdrs_detail
       "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/detail/*.hpp"
--- a/cmake/templates/custom_hal.hpp.in
+++ b/cmake/templates/custom_hal.hpp.in
@ -1,6 +1,6 @@
 #ifndef _CUSTOM_HAL_INCLUDED_
 #define _CUSTOM_HAL_INCLUDED_

-@OPENCV_HAL_HEADERS_INCLUDES@
+@_includes@

 #endif
--- a/cmake/templates/opencv_abi.xml.in
+++ b/cmake/templates/opencv_abi.xml.in
@ -21,7 +21,7 @@
 </libs>

 <skip_headers>
-    opencv2/hal/intrin*
+    opencv2/core/hal/intrin*
    opencv2/core/cuda*
    opencv2/core/private*
    opencv/cxeigen.hpp
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@ -243,6 +243,7 @@ PREDEFINED             = __cplusplus=1 \
                         CV_NORETURN= \
                         CV_DEFAULT(x)=" = x" \
                         CV_NEON=1 \
+                         CV_SSE2=1 \
                         FLANN_DEPRECATED=
 EXPAND_AS_DEFINED      =
 SKIP_FUNCTION_MACROS   = YES
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@ -52,7 +52,7 @@

 #include "precomp.hpp"
 #include <limits.h>
-#include "opencv2/hal/intrin.hpp"
+#include "opencv2/core/hal/intrin.hpp"

 namespace cv
 {
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@ -1,6 +1,5 @@
 set(the_description "The Core Functionality")
 ocv_add_module(core
-               opencv_hal
               PRIVATE_REQUIRED ${ZLIB_LIBRARIES} "${OPENCL_LIBRARIES}" "${VA_LIBRARIES}"
               OPTIONAL opencv_cudev
               WRAP java python)
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@ -72,6 +72,7 @@
    @defgroup core_cluster Clustering
    @defgroup core_utils Utility and system functions and macros
    @{
+        @defgroup core_utils_sse SSE utilities
        @defgroup core_utils_neon NEON utilities
    @}
    @defgroup core_opengl OpenGL interoperability
@ -80,6 +81,16 @@
    @defgroup core_directx DirectX interoperability
    @defgroup core_eigen Eigen support
    @defgroup core_opencl OpenCL support
+    @defgroup core_va_intel Intel VA-API/OpenCL (CL-VA) interoperability
+    @defgroup core_hal Hardware Acceleration Layer
+    @{
+        @defgroup core_hal_functions Functions
+        @defgroup core_hal_interface Interface
+        @defgroup core_hal_intrin Universal intrinsics
+        @{
+            @defgroup core_hal_intrin_impl Private implementation helpers
+        @}
+    @}
@}
 */

--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@ -50,10 +50,10 @@
 #endif

 #include <climits>
+#include <algorithm>

 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/cvstd.hpp"
-#include "opencv2/hal.hpp"

 namespace cv
 {
@ -679,8 +679,11 @@ CV_EXPORTS void setUseIPP(bool flag);

 //! @} core_utils

+
+
+
 } // cv

-#include "opencv2/hal/neon_utils.hpp"
+#include "opencv2/core/neon_utils.hpp"

 #endif //__OPENCV_CORE_BASE_HPP__
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@ -45,6 +45,9 @@
 #ifndef __OPENCV_CORE_CVDEF_H__
 #define __OPENCV_CORE_CVDEF_H__

+//! @addtogroup core_utils
+//! @{
+
 #if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
 #  define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
 #endif
@ -56,7 +59,265 @@
 #undef abs
 #undef Complex

-#include "opencv2/hal/defs.h"
+#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
+#  define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
+#endif
+
+#include <limits.h>
+#include "opencv2/core/hal/interface.h"
+
+/* CV_ICC is set to the value of the first macro that is defined among
+   __ICL, __ICC, __ECL, __ECC and __INTEL_COMPILER; it is left undefined
+   when none of them is defined. */
+#if defined __ICL
+#  define CV_ICC   __ICL
+#elif defined __ICC
+#  define CV_ICC   __ICC
+#elif defined __ECL
+#  define CV_ICC   __ECL
+#elif defined __ECC
+#  define CV_ICC   __ECC
+#elif defined __INTEL_COMPILER
+#  define CV_ICC   __INTEL_COMPILER
+#endif
+
+/* CV_INLINE: qualifier placed in front of functions defined in headers.
+   A definition supplied before this header is kept. Otherwise it expands to
+   "static inline" in C++, "__inline" for MSVC in C mode and plain "static"
+   for any other C compiler. */
+#ifndef CV_INLINE
+#  if defined __cplusplus
+#    define CV_INLINE static inline
+#  elif defined _MSC_VER
+#    define CV_INLINE __inline
+#  else
+#    define CV_INLINE static
+#  endif
+#endif
+
+/* CV_ENABLE_UNROLLED defaults to 0 when CV_ICC is defined and to 1 otherwise.
+   A value defined before this header is included is kept untouched (it is no
+   longer redefined to 1 by the fallback branch). */
+#ifndef CV_ENABLE_UNROLLED
+#  ifdef CV_ICC
+#    define CV_ENABLE_UNROLLED 0
+#  else
+#    define CV_ENABLE_UNROLLED 1
+#  endif
+#endif
+
+/* CV_DECL_ALIGNED(x): alignment specifier. Uses __attribute__((aligned(x)))
+   when __GNUC__ is defined, __declspec(align(x)) for MSVC, and expands to
+   nothing for any other compiler. */
+#ifdef __GNUC__
+#  define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
+#elif defined _MSC_VER
+#  define CV_DECL_ALIGNED(x) __declspec(align(x))
+#else
+#  define CV_DECL_ALIGNED(x)
+#endif
+
+/* CPU features and intrinsics support.
+   Numeric identifiers of CPU features; the CpuFeatures enum below repeats the same values. */
+#define CV_CPU_NONE             0
+#define CV_CPU_MMX              1
+#define CV_CPU_SSE              2
+#define CV_CPU_SSE2             3
+#define CV_CPU_SSE3             4
+#define CV_CPU_SSSE3            5
+#define CV_CPU_SSE4_1           6
+#define CV_CPU_SSE4_2           7
+#define CV_CPU_POPCNT           8
+
+#define CV_CPU_AVX              10
+#define CV_CPU_AVX2             11
+#define CV_CPU_FMA3             12
+
+#define CV_CPU_AVX_512F         13
+#define CV_CPU_AVX_512BW        14
+#define CV_CPU_AVX_512CD        15
+#define CV_CPU_AVX_512DQ        16
+#define CV_CPU_AVX_512ER        17
+#define CV_CPU_AVX_512IFMA512   18
+#define CV_CPU_AVX_512PF        19
+#define CV_CPU_AVX_512VBMI      20
+#define CV_CPU_AVX_512VL        21
+
+#define CV_CPU_NEON   100
+
+// when adding a feature to the list above, remember to update the CpuFeatures enum below as well
+#define CV_HARDWARE_MAX_FEATURE 255
+
+/** @brief Available CPU features.
+
+Each enumerator has the same numeric value as the CV_CPU_* macro of the same name.
+*/
+enum CpuFeatures {
+    CPU_MMX             = 1,
+    CPU_SSE             = 2,
+    CPU_SSE2            = 3,
+    CPU_SSE3            = 4,
+    CPU_SSSE3           = 5,
+    CPU_SSE4_1          = 6,
+    CPU_SSE4_2          = 7,
+    CPU_POPCNT          = 8,
+
+    CPU_AVX             = 10,
+    CPU_AVX2            = 11,
+    CPU_FMA3            = 12,
+
+    CPU_AVX_512F        = 13,
+    CPU_AVX_512BW       = 14,
+    CPU_AVX_512CD       = 15,
+    CPU_AVX_512DQ       = 16,
+    CPU_AVX_512ER       = 17,
+    CPU_AVX_512IFMA512  = 18,
+    CPU_AVX_512PF       = 19,
+    CPU_AVX_512VBMI     = 20,
+    CPU_AVX_512VL       = 21,
+
+    CPU_NEON            = 100
+};
+
+// do not include SSE/AVX/NEON headers for NVCC compiler
+// For every instruction set selected by the compiler's predefined macros the matching
+// intrinsics header is included and the corresponding CV_<feature> macro is defined as 1.
+#ifndef __CUDACC__
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#  if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
+#    include <pmmintrin.h>
+#    define CV_SSE3 1
+#  endif
+#  if defined __SSSE3__  || (defined _MSC_VER && _MSC_VER >= 1500)
+#    include <tmmintrin.h>
+#    define CV_SSSE3 1
+#  endif
+#  if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
+#    include <smmintrin.h>
+#    define CV_SSE4_1 1
+#  endif
+#  if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
+#    include <nmmintrin.h>
+#    define CV_SSE4_2 1
+#  endif
+#  if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
+#    ifdef _MSC_VER
+#      include <nmmintrin.h>
+#    else
+#      include <popcntintrin.h>
+#    endif
+#    define CV_POPCNT 1
+#  endif
+#  if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
+// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
+// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
+// The trailing "&& 0" makes the _MSC_VER alternative always false, so only __AVX__ enables this branch.
+#    include <immintrin.h>
+#    define CV_AVX 1
+#    if defined(_XCR_XFEATURE_ENABLED_MASK)
+#      define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
+#    else
+#      define __xgetbv() 0
+#    endif
+#  endif
+// Same "&& 0" pattern: only __AVX2__ enables this branch; CV_FMA3 additionally requires __FMA__.
+#  if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
+#    include <immintrin.h>
+#    define CV_AVX2 1
+#    if defined __FMA__
+#      define CV_FMA3 1
+#    endif
+#  endif
+#endif
+
+// NEON: Windows builds for ARM (_M_ARM) or compilers defining __ARM_NEON__ / (__ARM_NEON with __aarch64__)
+#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
+# include <Intrin.h>
+# include "arm_neon.h"
+# define CV_NEON 1
+# define CPU_HAS_NEON_FEATURE (true)
+#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#  include <arm_neon.h>
+#  define CV_NEON 1
+#endif
+
+// CV_VFP: GCC-compatible ARM build with one of the VFP/NEON macros defined and without __SOFTFP__
+#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
+#  define CV_VFP 1
+#endif
+
+#endif // __CUDACC__
+
+/* Every CV_<feature> macro that was not defined above falls back to 0,
+   so each of them can be tested with a plain "#if CV_<feature>". */
+#ifndef CV_POPCNT
+#define CV_POPCNT 0
+#endif
+#ifndef CV_MMX
+#  define CV_MMX 0
+#endif
+#ifndef CV_SSE
+#  define CV_SSE 0
+#endif
+#ifndef CV_SSE2
+#  define CV_SSE2 0
+#endif
+#ifndef CV_SSE3
+#  define CV_SSE3 0
+#endif
+#ifndef CV_SSSE3
+#  define CV_SSSE3 0
+#endif
+#ifndef CV_SSE4_1
+#  define CV_SSE4_1 0
+#endif
+#ifndef CV_SSE4_2
+#  define CV_SSE4_2 0
+#endif
+#ifndef CV_AVX
+#  define CV_AVX 0
+#endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
+#ifndef CV_FMA3
+#  define CV_FMA3 0
+#endif
+#ifndef CV_AVX_512F
+#  define CV_AVX_512F 0
+#endif
+#ifndef CV_AVX_512BW
+#  define CV_AVX_512BW 0
+#endif
+#ifndef CV_AVX_512CD
+#  define CV_AVX_512CD 0
+#endif
+#ifndef CV_AVX_512DQ
+#  define CV_AVX_512DQ 0
+#endif
+#ifndef CV_AVX_512ER
+#  define CV_AVX_512ER 0
+#endif
+#ifndef CV_AVX_512IFMA512
+#  define CV_AVX_512IFMA512 0
+#endif
+#ifndef CV_AVX_512PF
+#  define CV_AVX_512PF 0
+#endif
+#ifndef CV_AVX_512VBMI
+#  define CV_AVX_512VBMI 0
+#endif
+#ifndef CV_AVX_512VL
+#  define CV_AVX_512VL 0
+#endif
+
+#ifndef CV_NEON
+#  define CV_NEON 0
+#endif
+
+#ifndef CV_VFP
+#  define CV_VFP 0
+#endif
+
+/* fundamental constants:
+   CV_PI is pi, CV_2PI is 2*pi, CV_LOG2 is the natural logarithm of 2 */
+#define CV_PI   3.1415926535897932384626433832795
+#define CV_2PI 6.283185307179586476925286766559
+#define CV_LOG2 0.69314718055994530941723212145818
+
+/* Union overlaying an int, an unsigned and a float on the same storage,
+   giving access to the bit pattern of a float value. */
+typedef union Cv32suf
+{
+    int i;
+    unsigned u;
+    float f;
+}
+Cv32suf;
+
+/* Union overlaying an int64, a uint64 and a double on the same storage,
+   giving access to the bit pattern of a double value. */
+typedef union Cv64suf
+{
+    int64 i;
+    uint64 u;
+    double f;
+}
+Cv64suf;

 #define OPENCV_ABI_COMPATIBILITY 300

@ -169,12 +430,12 @@
 #define CV_SUBMAT_FLAG          (1 << CV_SUBMAT_FLAG_SHIFT)
 #define CV_IS_SUBMAT(flags)     ((flags) & CV_MAT_SUBMAT_FLAG)

-/* Size of each channel item,
+/** Size of each channel item,
   0x8442211 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
 #define CV_ELEM_SIZE1(type) \
    ((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15)

-/* 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
+/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
 #define CV_ELEM_SIZE(type) \
    (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))

@ -249,4 +510,6 @@
 #  endif
 #endif

+//! @}
+
 #endif // __OPENCV_CORE_CVDEF_H__
--- a/modules/core/include/opencv2/core/fast_math.hpp
+++ b/modules/core/include/opencv2/core/fast_math.hpp
@ -0,0 +1,302 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_CORE_FAST_MATH_HPP__
+#define __OPENCV_CORE_FAST_MATH_HPP__
+
+#include "opencv2/core/cvdef.h"
+
+//! @addtogroup core_utils
+//! @{
+
+/****************************************************************************************\
+*                                      fast math                                         *
+\****************************************************************************************/
+
+#if defined __BORLANDC__
+#  include <fastmath.h>
+#elif defined __cplusplus
+#  include <cmath>
+#else
+#  include <math.h>
+#endif
+
+#ifdef HAVE_TEGRA_OPTIMIZATION
+#  include "tegra_round.hpp"
+#endif
+
+#if CV_VFP
+    // Rounding helpers implemented with inline assembly (vcvtr.s32.* followed by vmov).
+    // Each macro expands to a complete sequence that declares 'res' and 'temp' and ends
+    // with 'return res', so it is meant to be the last statement of an int-returning function.
+    // 1. general scheme
+    #define ARM_ROUND(_value, _asm_string) \
+        int res; \
+        float temp; \
+        asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
+        return res
+    // 2. version for double (clang and other compilers differ only in the %[value] / %P[value] operand form)
+    #ifdef __clang__
+        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
+    #else
+        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
+    #endif
+    // 3. version for float
+    #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
+#endif // CV_VFP
+
+/** @brief Rounds floating-point number to the nearest integer
+
+ The implementation is selected at compile time: _mm_cvtsd_si32 for 64-bit MSVC and for
+ GCC-compatible x86-64 builds with SSE2 (except Apple and CUDA compilation), fld/fistp inline
+ assembly for 32-bit MSVC, TEGRA_ROUND_DBL when HAVE_TEGRA_OPTIMIZATION is defined,
+ ARM_ROUND_DBL when CV_VFP is non-zero, lrint() for other ICC/GCC-compatible builds, and
+ otherwise adding +0.5/-0.5 (by sign) followed by truncation.
+
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int
+cvRound( double value )
+{
+#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
+    && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128d t = _mm_set_sd( value );
+    return _mm_cvtsd_si32(t);
+#elif defined _MSC_VER && defined _M_IX86
+    int t;
+    __asm
+    {
+        fld value;
+        fistp t;
+    }
+    return t;
+#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
+        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
+    TEGRA_ROUND_DBL(value);
+#elif defined CV_ICC || defined __GNUC__
+# if CV_VFP
+    ARM_ROUND_DBL(value);
+# else
+    return (int)lrint(value);
+# endif
+#else
+    /* it's ok if round does not comply with IEEE754 standard;
+       the tests should allow +/-1 difference when the tested functions use round */
+    return (int)(value + (value >= 0 ? 0.5 : -0.5));
+#endif
+}
+
+
+/** @brief Rounds floating-point number to the nearest integer not larger than the original.
+
+ The function computes an integer i such that:
+ \f[i \le \texttt{value} < i+1\f]
+ Every variant first converts value to a candidate integer (SSE2 conversion, C cast or
+ cvRound) and then subtracts 1 when value is smaller than that candidate.
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int cvFloor( double value )
+{
+#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128d t = _mm_set_sd( value );
+    int i = _mm_cvtsd_si32(t);
+    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
+#elif defined __GNUC__
+    int i = (int)value;
+    return i - (i > value);
+#else
+    int i = cvRound(value);
+    float diff = (float)(value - i);
+    return i - (diff < 0);
+#endif
+}
+
+/** @brief Rounds floating-point number to the nearest integer not smaller than the original.
+
+ The function computes an integer i such that:
+ \f[i-1 < \texttt{value} \le i\f]
+ Every variant first converts value to a candidate integer (SSE2 conversion, C cast or
+ cvRound) and then adds 1 when that candidate is smaller than value.
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int cvCeil( double value )
+{
+#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128d t = _mm_set_sd( value );
+    int i = _mm_cvtsd_si32(t);
+    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
+#elif defined __GNUC__
+    int i = (int)value;
+    return i + (i < value);
+#else
+    int i = cvRound(value);
+    float diff = (float)(i - value);
+    return i + (diff < 0);
+#endif
+}
+
+/** @brief Determines if the argument is Not A Number.
+
+ @param value The input floating-point value
+
+ The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0
+ otherwise.
+
+ The test works on the bit pattern: the upper 32 bits with the sign bit masked off, plus 1 if
+ the lower 32 bits are non-zero, exceed 0x7ff00000 only when all exponent bits are set and the
+ mantissa is non-zero. */
+CV_INLINE int cvIsNaN( double value )
+{
+    Cv64suf ieee754;
+    ieee754.f = value;
+    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
+           ((unsigned)ieee754.u != 0) > 0x7ff00000;
+}
+
+/** @brief Determines if the argument is Infinity.
+
+ @param value The input floating-point value
+
+ The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard)
+ and 0 otherwise.
+
+ The test works on the bit pattern: the upper 32 bits with the sign bit masked off must equal
+ 0x7ff00000 (all exponent bits set, upper mantissa bits clear) and the lower 32 bits must be zero. */
+CV_INLINE int cvIsInf( double value )
+{
+    Cv64suf ieee754;
+    ieee754.f = value;
+    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
+            (unsigned)ieee754.u == 0;
+}
+
+#ifdef __cplusplus
+
+/** @overload
+
+ Single-precision variant. The compile-time branches mirror the double version, using
+ _mm_cvtss_si32, fld/fistp, TEGRA_ROUND_FLT, ARM_ROUND_FLT or lrintf(), and otherwise adding
+ +0.5f/-0.5f (by sign) followed by truncation. */
+CV_INLINE int cvRound(float value)
+{
+#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \
+      defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128 t = _mm_set_ss( value );
+    return _mm_cvtss_si32(t);
+#elif defined _MSC_VER && defined _M_IX86
+    int t;
+    __asm
+    {
+        fld value;
+        fistp t;
+    }
+    return t;
+#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
+        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
+    TEGRA_ROUND_FLT(value);
+#elif defined CV_ICC || defined __GNUC__
+# if CV_VFP
+    ARM_ROUND_FLT(value);
+# else
+    return (int)lrintf(value);
+# endif
+#else
+    /* it's ok if round does not comply with IEEE754 standard;
+     the tests should allow +/-1 difference when the tested functions use round */
+    return (int)(value + (value >= 0 ? 0.5f : -0.5f));
+#endif
+}
+
+/** @overload
+
+ Integer variant: the argument is returned unchanged. */
+CV_INLINE int cvRound( int value )
+{
+    return value;
+}
+
+/** @overload
+
+ Single-precision variant: converts value to a candidate integer (SSE conversion, C cast or
+ cvRound) and subtracts 1 when value is smaller than that candidate. */
+CV_INLINE int cvFloor( float value )
+{
+#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128 t = _mm_set_ss( value );
+    int i = _mm_cvtss_si32(t);
+    return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i)));
+#elif defined __GNUC__
+    int i = (int)value;
+    return i - (i > value);
+#else
+    int i = cvRound(value);
+    float diff = (float)(value - i);
+    return i - (diff < 0);
+#endif
+}
+
+/** @overload
+
+ Integer variant: the argument is returned unchanged. */
+CV_INLINE int cvFloor( int value )
+{
+    return value;
+}
+
+/** @overload
+
+ Single-precision variant: converts value to a candidate integer (SSE conversion, C cast or
+ cvRound) and adds 1 when that candidate is smaller than value. */
+CV_INLINE int cvCeil( float value )
+{
+#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128 t = _mm_set_ss( value );
+    int i = _mm_cvtss_si32(t);
+    return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
+#elif defined __GNUC__
+    int i = (int)value;
+    return i + (i < value);
+#else
+    int i = cvRound(value);
+    float diff = (float)(i - value);
+    return i + (diff < 0);
+#endif
+}
+
+/** @overload
+
+ Integer variant: the argument is returned unchanged. */
+CV_INLINE int cvCeil( int value )
+{
+    return value;
+}
+
+/** @overload
+
+ Single-precision variant: the bit pattern with the sign bit masked off is greater than
+ 0x7f800000 only when all exponent bits are set and the mantissa is non-zero. */
+CV_INLINE int cvIsNaN( float value )
+{
+    Cv32suf ieee754;
+    ieee754.f = value;
+    return (ieee754.u & 0x7fffffff) > 0x7f800000;
+}
+
+/** @overload
+
+ Single-precision variant: the bit pattern with the sign bit masked off equals 0x7f800000
+ (all exponent bits set, zero mantissa) for plus and minus infinity only. */
+CV_INLINE int cvIsInf( float value )
+{
+    Cv32suf ieee754;
+    ieee754.f = value;
+    return (ieee754.u & 0x7fffffff) == 0x7f800000;
+}
+
+#endif // __cplusplus
+
+//! @} core_utils
+
+#endif
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@ -0,0 +1,218 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_HAL_HPP__
+#define __OPENCV_HAL_HPP__
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/hal/interface.h"
+
+//! @cond IGNORED
+// CALL_HAL(name, fun, ...): calls fun(__VA_ARGS__) and stores the status in a local 'int res'.
+//  - CV_HAL_ERROR_OK: executes a bare 'return;', leaving the enclosing function
+//  - CV_HAL_ERROR_NOT_IMPLEMENTED: falls through, so the code following the macro runs
+//  - any other value: raises cv::Error::StsInternal through CV_Error_, reporting name, fun and the code
+// Because it declares 'res' and uses 'return;' without a value, the macro as written fits
+// functions returning void and can appear once per scope.
+#define CALL_HAL(name, fun, ...) \
+    int res = fun(__VA_ARGS__); \
+    if (res == CV_HAL_ERROR_OK) \
+        return; \
+    else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \
+        CV_Error_(cv::Error::StsInternal, \
+            ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res));
+//! @endcond
+
+
+namespace cv { namespace hal {
+
+//! @addtogroup core_hal_functions
+//! @{
+
+CV_EXPORTS int normHamming(const uchar* a, int n);
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
+
+CV_EXPORTS int normHamming(const uchar* a, int n, int cellSize);
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
+
+CV_EXPORTS int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+
+CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
+CV_EXPORTS float normL1_(const float* a, const float* b, int n);
+CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
+
+CV_EXPORTS void exp32f(const float* src, float* dst, int n);
+CV_EXPORTS void exp64f(const double* src, double* dst, int n);
+CV_EXPORTS void log32f(const float* src, float* dst, int n);
+CV_EXPORTS void log64f(const double* src, double* dst, int n);
+
+CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
+CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n);
+CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n);
+CV_EXPORTS void sqrt32f(const float* src, float* dst, int len);
+CV_EXPORTS void sqrt64f(const double* src, double* dst, int len);
+CV_EXPORTS void invSqrt32f(const float* src, float* dst, int len);
+CV_EXPORTS void invSqrt64f(const double* src, double* dst, int len);
+
+CV_EXPORTS void split8u(const uchar* src, uchar** dst, int len, int cn );
+CV_EXPORTS void split16u(const ushort* src, ushort** dst, int len, int cn );
+CV_EXPORTS void split32s(const int* src, int** dst, int len, int cn );
+CV_EXPORTS void split64s(const int64* src, int64** dst, int len, int cn );
+
+CV_EXPORTS void merge8u(const uchar** src, uchar* dst, int len, int cn );
+CV_EXPORTS void merge16u(const ushort** src, ushort* dst, int len, int cn );
+CV_EXPORTS void merge32s(const int** src, int* dst, int len, int cn );
+CV_EXPORTS void merge64s(const int64** src, int64* dst, int len, int cn );
+
+CV_EXPORTS void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+
+CV_EXPORTS void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+
+CV_EXPORTS void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+
+CV_EXPORTS void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+
+CV_EXPORTS void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
+CV_EXPORTS void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
+
+//! @} core_hal
+
+//=============================================================================
+// for binary compatibility with 3.0
+
+//! @cond IGNORED
+
+CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+
+CV_EXPORTS void exp(const float* src, float* dst, int n);
+CV_EXPORTS void exp(const double* src, double* dst, int n);
+CV_EXPORTS void log(const float* src, float* dst, int n);
+CV_EXPORTS void log(const double* src, double* dst, int n);
+
+CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
+CV_EXPORTS void magnitude(const double* x, const double* y, double* dst, int n);
+CV_EXPORTS void sqrt(const float* src, float* dst, int len);
+CV_EXPORTS void sqrt(const double* src, double* dst, int len);
+CV_EXPORTS void invSqrt(const float* src, float* dst, int len);
+CV_EXPORTS void invSqrt(const double* src, double* dst, int len);
+
+//! @endcond
+
+}} //cv::hal
+
+#endif //__OPENCV_HAL_HPP__
--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@ -1,8 +1,11 @@
 #ifndef _HAL_INTERFACE_HPP_INCLUDED_
 #define _HAL_INTERFACE_HPP_INCLUDED_

+//! @addtogroup core_hal_interface
+//! @{
+
 #define CV_HAL_ERROR_OK 0
-#define CV_HAL_ERROR_NI 1
+#define CV_HAL_ERROR_NOT_IMPLEMENTED 1
 #define CV_HAL_ERROR_UNKNOWN -1

 #define CV_HAL_CMP_EQ 0
@ -12,33 +15,6 @@
 #define CV_HAL_CMP_LE 4
 #define CV_HAL_CMP_NE 5

-#ifdef __cplusplus
-namespace cv { namespace hal {
-
-namespace Error {
-
-enum
-{
-    Ok = 0,
-    NotImplemented = 1,
-    Unknown = -1
-};
-
-}
-
-enum
-{
-    CMP_EQ = 0,
-    CMP_GT = 1,
-    CMP_GE = 2,
-    CMP_LT = 3,
-    CMP_LE = 4,
-    CMP_NE = 5
-};
-
-}}
-#endif
-
 #ifdef __cplusplus
 #include <cstddef>
 #else
@ -88,4 +64,6 @@ typedef signed char schar;
 #  define CV_BIG_UINT(n)  n##ULL
 #endif

+//! @}
+
 #endif
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@ -48,7 +48,7 @@
 #include <cmath>
 #include <float.h>
 #include <stdlib.h>
-#include "opencv2/hal/defs.h"
+#include "opencv2/core/cvdef.h"

 #define OPENCV_HAL_ADD(a, b) ((a) + (b))
 #define OPENCV_HAL_AND(a, b) ((a) & (b))
@ -60,7 +60,7 @@
 // access from within opencv code more accessible
 namespace cv {

-//! @addtogroup hal_intrin
+//! @addtogroup core_hal_intrin
 //! @{

 //! @cond IGNORED
@ -290,19 +290,19 @@ template <typename T> struct V_SIMD128Traits

 #if CV_SSE2

-#include "opencv2/hal/intrin_sse.hpp"
+#include "opencv2/core/hal/intrin_sse.hpp"

 #elif CV_NEON

-#include "opencv2/hal/intrin_neon.hpp"
+#include "opencv2/core/hal/intrin_neon.hpp"

 #else

-#include "opencv2/hal/intrin_cpp.hpp"
+#include "opencv2/core/hal/intrin_cpp.hpp"

 #endif

-//! @addtogroup hal_intrin
+//! @addtogroup core_hal_intrin
 //! @{

 #ifndef CV_SIMD128
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@ -47,11 +47,13 @@

 #include <limits>
 #include <cstring>
+#include <algorithm>
+#include "opencv2/core/saturate.hpp"

 namespace cv
 {

-/** @addtogroup hal_intrin
+/** @addtogroup core_hal_intrin

 "Universal intrinsics" is a types and functions set intended to simplify vectorization of code on
 different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
@ -370,7 +372,7 @@ typedef v_reg<uint64, 2> v_uint64x2;
 typedef v_reg<int64, 2> v_int64x2;

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
 template<typename _Tp, int n> inline v_reg<_Tp, n> \
    operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
@ -409,7 +411,7 @@ For floating types only. */
 OPENCV_HAL_IMPL_BIN_OP(/)

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
    (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
@ -458,7 +460,7 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp,
 }

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
 { \
@ -507,7 +509,7 @@ Only for floating point types.*/
 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 { \
@ -518,7 +520,7 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a,
 }

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
 template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
 { \
@ -584,7 +586,7 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
 //! @endcond

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
 template<typename _Tp, int n> \
 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
@ -627,7 +629,7 @@ For all types except 64-bit integer values. */
 OPENCV_HAL_IMPL_CMP_OP(!=)

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
 template<typename _Tp, int n> \
 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
@ -821,7 +823,7 @@ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
 //! @endcond

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
 { \
@ -1465,7 +1467,7 @@ inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
 }

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }

@ -1485,7 +1487,7 @@ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
 //! @}

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }

@ -1505,7 +1507,7 @@ OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
 //! @}

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
 template<typename _Tp0, int n0> inline _Tpvec \
    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
@ -1527,7 +1529,7 @@ OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
 //! @}

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
 { return a << n; }
@ -1544,7 +1546,7 @@ OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
 //! @}

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
 { return a >> n; }
@ -1561,7 +1563,7 @@ OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
 //! @}

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
 { \
@ -1583,7 +1585,7 @@ OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
 //! @}

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \
 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
 { \
@ -1616,7 +1618,7 @@ OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u)
 //! @}

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
 { \
@ -1649,7 +1651,7 @@ OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
 //! @}

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
 { \
@ -1677,7 +1679,7 @@ OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
 //! @}

 //! @brief Helper macro
-//! @ingroup hal_intrin_impl
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
 { \
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@ -45,6 +45,8 @@
 #ifndef __OPENCV_HAL_INTRIN_NEON_HPP__
 #define __OPENCV_HAL_INTRIN_NEON_HPP__

+#include <algorithm>
+
 namespace cv
 {

--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@ -45,6 +45,8 @@
 #ifndef __OPENCV_HAL_SSE_HPP__
 #define __OPENCV_HAL_SSE_HPP__

+#include <algorithm>
+
 #define CV_SIMD128 1
 #define CV_SIMD128_64F 1

--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@ -51,6 +51,7 @@
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/base.hpp"
 #include "opencv2/core/traits.hpp"
+#include "opencv2/core/saturate.hpp"

 namespace cv
 {
--- a/modules/core/include/opencv2/core/neon_utils.hpp
+++ b/modules/core/include/opencv2/core/neon_utils.hpp
@ -42,9 +42,10 @@
 #ifndef __OPENCV_HAL_NEON_UTILS_HPP__
 #define __OPENCV_HAL_NEON_UTILS_HPP__

-#include "opencv2/hal/defs.h"
+#include "opencv2/core/cvdef.h"

-namespace cv {
+//! @addtogroup core_utils_neon
+//! @{

 #if CV_NEON

@ -122,6 +123,6 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val)

 #endif

-}
+//! @}

 #endif // __OPENCV_HAL_NEON_UTILS_HPP__
--- a/modules/core/include/opencv2/core/saturate.hpp
+++ b/modules/core/include/opencv2/core/saturate.hpp
@ -0,0 +1,150 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_CORE_SATURATE_HPP__
+#define __OPENCV_CORE_SATURATE_HPP__
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/fast_math.hpp"
+
+namespace cv
+{
+
+//! @addtogroup core_utils
+//! @{
+
+/////////////// saturate_cast (used in image & signal processing) ///////////////////
+
+/** @brief Template function for accurate conversion from one primitive type to another.
+
+ The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
+ and others. They perform an efficient and accurate conversion from one primitive type to another
+ (see the introduction chapter). saturate in the name means that when the input value v is out of the
+ range of the target type, the result is not formed just by taking low bits of the input, but instead
+ the value is clipped. For example:
+ @code
+ uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
+ short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
+ @endcode
+ Such clipping is done when the target type is unsigned char, signed char, unsigned short or
+ signed short. For 32-bit integers, no clipping is done.
+
+ When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
+ the floating-point value is first rounded to the nearest integer and then clipped if needed (when
+ the target type is 8- or 16-bit).
+
+ This operation is used in the simplest or most complex image processing functions in OpenCV.
+
+ @param v Function parameter.
+ @sa add, subtract, multiply, divide, Mat::convertTo
+ */
+template<typename _Tp> static inline _Tp saturate_cast(uchar v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(schar v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(ushort v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(short v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
+
+// Specializations targeting uchar: the result is limited to [0, UCHAR_MAX].
+// - schar only needs the lower bound, unsigned sources only the upper bound.
+// - int/int64 test both bounds with one unsigned comparison (a negative v becomes a huge
+//   unsigned value), then pick the bound from the sign of v.
+// - short is widened to int; float/double are rounded with cvRound() and reuse the int overload.
+template<> inline uchar saturate_cast<uchar>(schar v)        { return (uchar)std::max((int)v, 0); }
+template<> inline uchar saturate_cast<uchar>(ushort v)       { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
+template<> inline uchar saturate_cast<uchar>(int v)          { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_cast<uchar>((int)v); }
+template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
+template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(uint64 v)       { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
+
+// Specializations targeting schar: the result is limited to [SCHAR_MIN, SCHAR_MAX].
+// - Unsigned sources only need the upper bound.
+// - int/int64 test both bounds with one unsigned comparison of (v - SCHAR_MIN) against UCHAR_MAX.
+//   The offset is applied in unsigned arithmetic (which wraps by definition), so inputs close
+//   to INT_MAX / INT64_MAX cannot trigger signed-overflow undefined behaviour.
+// - short is widened to int; float/double are rounded with cvRound() and reuse the int overload.
+template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(int v)          { return (schar)(((unsigned)v - (unsigned)SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline schar saturate_cast<schar>(short v)        { return saturate_cast<schar>((int)v); }
+template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)(((uint64)v - (uint64)SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
+
+// Specializations targeting ushort: the result is limited to [0, USHRT_MAX].
+// - schar/short only need the lower bound, unsigned/uint64 only the upper bound.
+// - int/int64 test both bounds with one unsigned comparison (a negative v becomes a huge
+//   unsigned value), then pick the bound from the sign of v.
+// - float/double are rounded with cvRound() and reuse the int overload.
+template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
+template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
+template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
+template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(uint64 v)     { return (ushort)std::min(v, (uint64)USHRT_MAX); }
+
+// Specializations targeting short: the result is limited to [SHRT_MIN, SHRT_MAX].
+// - Unsigned sources only need the upper bound.
+// - int/int64 test both bounds with one unsigned comparison of (v - SHRT_MIN) against USHRT_MAX.
+//   The offset is applied in unsigned arithmetic (which wraps by definition), so inputs close
+//   to INT_MAX / INT64_MAX cannot trigger signed-overflow undefined behaviour.
+// - float/double are rounded with cvRound() and reuse the int overload.
+template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
+template<> inline short saturate_cast<short>(int v)          { return (short)(((unsigned)v - (unsigned)SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
+template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(int64 v)        { return (short)(((uint64)v - (uint64)SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
+
+// 32-bit targets: floating-point input is rounded with cvRound() and returned as is;
+// no additional range check is applied in these overloads.
+template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
+template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }
+
+// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
+// (the cvRound() result is implicitly converted to unsigned on return)
+template<> inline unsigned saturate_cast<unsigned>(float v)  { return cvRound(v); }
+template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
+
+//! @}
+
+} // cv
+
+#endif // __OPENCV_CORE_SATURATE_HPP__
--- a/modules/core/include/opencv2/core/sse_utils.hpp
+++ b/modules/core/include/opencv2/core/sse_utils.hpp
@ -46,7 +46,10 @@
 #  error sse_utils.hpp header must be compiled as C++
 #endif

-#include "opencv2/hal/defs.h"
+#include "opencv2/core/cvdef.h"
+
+//! @addtogroup core_utils_sse
+//! @{

 #if CV_SSE2

@ -644,4 +647,6 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12

 #endif // CV_SSE2

+//! @}
+
 #endif //__OPENCV_CORE_SSE_UTILS_HPP__
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
--- a/modules/core/src/arithm_core.hpp
+++ b/modules/core/src/arithm_core.hpp
@ -42,144 +42,94 @@
 //
 //M*/

-#ifndef __OPENCV_HAL_ARITHM_CORE_HPP__
-#define __OPENCV_HAL_ARITHM_CORE_HPP__
+#ifndef __OPENCV_ARITHM_CORE_HPP__
+#define __OPENCV_ARITHM_CORE_HPP__

 #include "arithm_simd.hpp"

-const uchar g_Saturate8u[] =
-{
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
-     16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
-     32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
-     48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
-     64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
-     80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
-     96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
-    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
-    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
-    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
-    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
-    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
-    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
-    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255
-};
-
-
-#define CV_FAST_CAST_8U(t)   (assert(-256 <= (t) && (t) <= 512), g_Saturate8u[(t)+256])
-#define CV_MIN_8U(a,b)       ((a) - CV_FAST_CAST_8U((a) - (b)))
-#define CV_MAX_8U(a,b)       ((a) + CV_FAST_CAST_8U((b) - (a)))
-
-const float g_8x32fTab[] =
-{
-    -128.f, -127.f, -126.f, -125.f, -124.f, -123.f, -122.f, -121.f,
-    -120.f, -119.f, -118.f, -117.f, -116.f, -115.f, -114.f, -113.f,
-    -112.f, -111.f, -110.f, -109.f, -108.f, -107.f, -106.f, -105.f,
-    -104.f, -103.f, -102.f, -101.f, -100.f,  -99.f,  -98.f,  -97.f,
-     -96.f,  -95.f,  -94.f,  -93.f,  -92.f,  -91.f,  -90.f,  -89.f,
-     -88.f,  -87.f,  -86.f,  -85.f,  -84.f,  -83.f,  -82.f,  -81.f,
-     -80.f,  -79.f,  -78.f,  -77.f,  -76.f,  -75.f,  -74.f,  -73.f,
-     -72.f,  -71.f,  -70.f,  -69.f,  -68.f,  -67.f,  -66.f,  -65.f,
-     -64.f,  -63.f,  -62.f,  -61.f,  -60.f,  -59.f,  -58.f,  -57.f,
-     -56.f,  -55.f,  -54.f,  -53.f,  -52.f,  -51.f,  -50.f,  -49.f,
-     -48.f,  -47.f,  -46.f,  -45.f,  -44.f,  -43.f,  -42.f,  -41.f,
-     -40.f,  -39.f,  -38.f,  -37.f,  -36.f,  -35.f,  -34.f,  -33.f,
-     -32.f,  -31.f,  -30.f,  -29.f,  -28.f,  -27.f,  -26.f,  -25.f,
-     -24.f,  -23.f,  -22.f,  -21.f,  -20.f,  -19.f,  -18.f,  -17.f,
-     -16.f,  -15.f,  -14.f,  -13.f,  -12.f,  -11.f,  -10.f,   -9.f,
-      -8.f,   -7.f,   -6.f,   -5.f,   -4.f,   -3.f,   -2.f,   -1.f,
-       0.f,    1.f,    2.f,    3.f,    4.f,    5.f,    6.f,    7.f,
-       8.f,    9.f,   10.f,   11.f,   12.f,   13.f,   14.f,   15.f,
-      16.f,   17.f,   18.f,   19.f,   20.f,   21.f,   22.f,   23.f,
-      24.f,   25.f,   26.f,   27.f,   28.f,   29.f,   30.f,   31.f,
-      32.f,   33.f,   34.f,   35.f,   36.f,   37.f,   38.f,   39.f,
-      40.f,   41.f,   42.f,   43.f,   44.f,   45.f,   46.f,   47.f,
-      48.f,   49.f,   50.f,   51.f,   52.f,   53.f,   54.f,   55.f,
-      56.f,   57.f,   58.f,   59.f,   60.f,   61.f,   62.f,   63.f,
-      64.f,   65.f,   66.f,   67.f,   68.f,   69.f,   70.f,   71.f,
-      72.f,   73.f,   74.f,   75.f,   76.f,   77.f,   78.f,   79.f,
-      80.f,   81.f,   82.f,   83.f,   84.f,   85.f,   86.f,   87.f,
-      88.f,   89.f,   90.f,   91.f,   92.f,   93.f,   94.f,   95.f,
-      96.f,   97.f,   98.f,   99.f,  100.f,  101.f,  102.f,  103.f,
-     104.f,  105.f,  106.f,  107.f,  108.f,  109.f,  110.f,  111.f,
-     112.f,  113.f,  114.f,  115.f,  116.f,  117.f,  118.f,  119.f,
-     120.f,  121.f,  122.f,  123.f,  124.f,  125.f,  126.f,  127.f,
-     128.f,  129.f,  130.f,  131.f,  132.f,  133.f,  134.f,  135.f,
-     136.f,  137.f,  138.f,  139.f,  140.f,  141.f,  142.f,  143.f,
-     144.f,  145.f,  146.f,  147.f,  148.f,  149.f,  150.f,  151.f,
-     152.f,  153.f,  154.f,  155.f,  156.f,  157.f,  158.f,  159.f,
-     160.f,  161.f,  162.f,  163.f,  164.f,  165.f,  166.f,  167.f,
-     168.f,  169.f,  170.f,  171.f,  172.f,  173.f,  174.f,  175.f,
-     176.f,  177.f,  178.f,  179.f,  180.f,  181.f,  182.f,  183.f,
-     184.f,  185.f,  186.f,  187.f,  188.f,  189.f,  190.f,  191.f,
-     192.f,  193.f,  194.f,  195.f,  196.f,  197.f,  198.f,  199.f,
-     200.f,  201.f,  202.f,  203.f,  204.f,  205.f,  206.f,  207.f,
-     208.f,  209.f,  210.f,  211.f,  212.f,  213.f,  214.f,  215.f,
-     216.f,  217.f,  218.f,  219.f,  220.f,  221.f,  222.f,  223.f,
-     224.f,  225.f,  226.f,  227.f,  228.f,  229.f,  230.f,  231.f,
-     232.f,  233.f,  234.f,  235.f,  236.f,  237.f,  238.f,  239.f,
-     240.f,  241.f,  242.f,  243.f,  244.f,  245.f,  246.f,  247.f,
-     248.f,  249.f,  250.f,  251.f,  252.f,  253.f,  254.f,  255.f
-};
-
-#define CV_8TO32F(x)  g_8x32fTab[(x)+128]
-
 namespace cv {

-template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
-{ return CV_FAST_CAST_8U(a + b); }
+// Binary functor: computes a + b and converts the sum to T3 through saturate_cast.
+// type1/type2 name the operand types, rtype the result type.
+template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
+{
+    typedef T3 rtype;
+    typedef T2 type2;
+    typedef T1 type1;
+
+    T3 operator ()(const T1 a, const T2 b) const
+    {
+        return saturate_cast<T3>(a + b);
+    }
+};

-template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
-{ return CV_FAST_CAST_8U(a - b); }
+// Binary functor: computes a - b and converts the difference to T3 through saturate_cast.
+// type1/type2 name the operand types, rtype the result type.
+template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
+{
+    typedef T3 rtype;
+    typedef T2 type2;
+    typedef T1 type1;
+
+    T3 operator ()(const T1 a, const T2 b) const
+    {
+        return saturate_cast<T3>(a - b);
+    }
+};

-template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
-{ return saturate_cast<short>(std::abs(a - b)); }
+// Binary functor: reversed subtraction, i.e. computes b - a (second operand minus first)
+// and converts the difference to T3 through saturate_cast.
+template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
+{
+    typedef T3 rtype;
+    typedef T2 type2;
+    typedef T1 type1;
+
+    T3 operator ()(const T1 a, const T2 b) const
+    {
+        return saturate_cast<T3>(b - a);
+    }
+};

-template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
-{ return saturate_cast<schar>(std::abs(a - b)); }
+// Binary functor: returns the smaller of a and b (a when neither compares less than the other).
+template<typename T> struct OpMin
+{
+    typedef T rtype;
+    typedef T type2;
+    typedef T type1;
+
+    T operator ()(const T a, const T b) const
+    {
+        // same selection rule as std::min(a, b)
+        return b < a ? b : a;
+    }
+};

-template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
+// Binary functor: returns the larger of a and b (a when neither compares less than the other).
+template<typename T> struct OpMax
+{
+    typedef T rtype;
+    typedef T type2;
+    typedef T type1;
+
+    T operator ()(const T a, const T b) const
+    {
+        // same selection rule as std::max(a, b)
+        return a < b ? b : a;
+    }
+};

-template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
+// Binary functor: absolute difference, computed as (larger - smaller).
+// No saturate_cast is applied in this generic version; the difference is converted to T
+// by the return statement.
+// NOTE(review): narrow signed types presumably rely on specializations defined elsewhere - confirm.
+template<typename T> struct OpAbsDiff
+{
+    typedef T rtype;
+    typedef T type2;
+    typedef T type1;
+
+    T operator()(T a, T b) const
+    {
+        if( a > b )
+            return a - b;
+        return b - a;
+    }
+};

-}
+// Binary functor: bitwise AND of the two operands.
+template<typename T> struct OpAnd
+{
+    typedef T rtype;
+    typedef T type2;
+    typedef T type1;
+
+    T operator()( T a, T b ) const
+    {
+        return a & b;
+    }
+};

-namespace cv { namespace hal {
+// Binary functor: bitwise OR of the two operands.
+template<typename T> struct OpOr
+{
+    typedef T rtype;
+    typedef T type2;
+    typedef T type1;
+
+    T operator()( T a, T b ) const
+    {
+        return a | b;
+    }
+};
+
+// Binary functor: bitwise exclusive OR of the two operands.
+template<typename T> struct OpXor
+{
+    typedef T rtype;
+    typedef T type2;
+    typedef T type1;
+
+    T operator()( T a, T b ) const
+    {
+        return a ^ b;
+    }
+};
+
+// Functor with the binary call shape of its siblings: returns the bitwise complement of
+// the first operand; the second operand is accepted but ignored.
+template<typename T> struct OpNot
+{
+    typedef T rtype;
+    typedef T type2;
+    typedef T type1;
+
+    T operator()( T a, T ) const
+    {
+        return ~a;
+    }
+};
+
+//=============================================================================

 template<typename T, class Op, class VOp>
 void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height)
@ -651,7 +601,7 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
    }
 }

-}} // cv::hal::
+} // cv::


-#endif // __OPENCV_HAL_ARITHM_CORE_HPP__
+#endif // __OPENCV_ARITHM_CORE_HPP__
--- a/modules/core/src/arithm_simd.hpp
+++ b/modules/core/src/arithm_simd.hpp
@ -42,10 +42,10 @@
 //
 //M*/

-#ifndef __OPENCV_HAL_ARITHM_SIMD_HPP__
-#define __OPENCV_HAL_ARITHM_SIMD_HPP__
+#ifndef __OPENCV_ARITHM_SIMD_HPP__
+#define __OPENCV_ARITHM_SIMD_HPP__

-namespace cv { namespace hal {
+namespace cv {

 struct NOP {};

@ -2020,6 +2020,6 @@ struct AddWeighted_SIMD<short, float>

 #endif

-}}
+}

-#endif // __OPENCV_HAL_ARITHM_SIMD_HPP__
+#endif // __OPENCV_ARITHM_SIMD_HPP__
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@ -0,0 +1,228 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_CORE_HAL_REPLACEMENT_HPP__
+#define __OPENCV_CORE_HAL_REPLACEMENT_HPP__
+
+#include "opencv2/core/hal/interface.h"
+
+// Default stubs for the element-wise add / sub / max / min / absdiff operations (one per
+// element type) and for the 8-bit and / or / xor / not operations. Every stub ignores its
+// arguments and returns CV_HAL_ERROR_NOT_IMPLEMENTED.
+// NOTE(review): the arguments are presumably (src1, step1, src2, step2, dst, step, width, height),
+// mirroring vBinOp in arithm_core.hpp; hal_ni_not8u takes a single source - confirm against
+// opencv2/core/hal/interface.h.
+inline int hal_ni_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_not8u(const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+// Default mapping: every cv_hal_* entry point of this group resolves to its stub.
+// NOTE(review): "custom_hal.hpp" is included at the end of this header, presumably so that a
+// custom HAL can redefine these macros - confirm.
+#define cv_hal_add8u hal_ni_add8u
+#define cv_hal_add8s hal_ni_add8s
+#define cv_hal_add16u hal_ni_add16u
+#define cv_hal_add16s hal_ni_add16s
+#define cv_hal_add32s hal_ni_add32s
+#define cv_hal_add32f hal_ni_add32f
+#define cv_hal_add64f hal_ni_add64f
+#define cv_hal_sub8u hal_ni_sub8u
+#define cv_hal_sub8s hal_ni_sub8s
+#define cv_hal_sub16u hal_ni_sub16u
+#define cv_hal_sub16s hal_ni_sub16s
+#define cv_hal_sub32s hal_ni_sub32s
+#define cv_hal_sub32f hal_ni_sub32f
+#define cv_hal_sub64f hal_ni_sub64f
+#define cv_hal_max8u hal_ni_max8u
+#define cv_hal_max8s hal_ni_max8s
+#define cv_hal_max16u hal_ni_max16u
+#define cv_hal_max16s hal_ni_max16s
+#define cv_hal_max32s hal_ni_max32s
+#define cv_hal_max32f hal_ni_max32f
+#define cv_hal_max64f hal_ni_max64f
+#define cv_hal_min8u hal_ni_min8u
+#define cv_hal_min8s hal_ni_min8s
+#define cv_hal_min16u hal_ni_min16u
+#define cv_hal_min16s hal_ni_min16s
+#define cv_hal_min32s hal_ni_min32s
+#define cv_hal_min32f hal_ni_min32f
+#define cv_hal_min64f hal_ni_min64f
+#define cv_hal_absdiff8u hal_ni_absdiff8u
+#define cv_hal_absdiff8s hal_ni_absdiff8s
+#define cv_hal_absdiff16u hal_ni_absdiff16u
+#define cv_hal_absdiff16s hal_ni_absdiff16s
+#define cv_hal_absdiff32s hal_ni_absdiff32s
+#define cv_hal_absdiff32f hal_ni_absdiff32f
+#define cv_hal_absdiff64f hal_ni_absdiff64f
+#define cv_hal_and8u hal_ni_and8u
+#define cv_hal_or8u hal_ni_or8u
+#define cv_hal_xor8u hal_ni_xor8u
+#define cv_hal_not8u hal_ni_not8u
+
+// Default stubs for element-wise comparison; the destination is always uchar regardless of
+// the source element type. Every stub ignores its arguments and returns
+// CV_HAL_ERROR_NOT_IMPLEMENTED.
+// NOTE(review): the extra trailing int is presumably the comparison operation code - confirm
+// against opencv2/core/hal/interface.h.
+inline int hal_ni_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+// Default mapping of the cv_hal_cmp* entry points to the stubs.
+#define cv_hal_cmp8u hal_ni_cmp8u
+#define cv_hal_cmp8s hal_ni_cmp8s
+#define cv_hal_cmp16u hal_ni_cmp16u
+#define cv_hal_cmp16s hal_ni_cmp16s
+#define cv_hal_cmp32s hal_ni_cmp32s
+#define cv_hal_cmp32f hal_ni_cmp32f
+#define cv_hal_cmp64f hal_ni_cmp64f
+
+// Default stubs for element-wise multiply, divide and reciprocal (one per element type).
+// Every stub ignores its arguments and returns CV_HAL_ERROR_NOT_IMPLEMENTED.
+// NOTE(review): the trailing double is presumably a scale factor - confirm against
+// opencv2/core/hal/interface.h.
+inline int hal_ni_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+// Default mapping of the cv_hal_mul* / cv_hal_div* / cv_hal_recip* entry points to the stubs.
+#define cv_hal_mul8u hal_ni_mul8u
+#define cv_hal_mul8s hal_ni_mul8s
+#define cv_hal_mul16u hal_ni_mul16u
+#define cv_hal_mul16s hal_ni_mul16s
+#define cv_hal_mul32s hal_ni_mul32s
+#define cv_hal_mul32f hal_ni_mul32f
+#define cv_hal_mul64f hal_ni_mul64f
+#define cv_hal_div8u hal_ni_div8u
+#define cv_hal_div8s hal_ni_div8s
+#define cv_hal_div16u hal_ni_div16u
+#define cv_hal_div16s hal_ni_div16s
+#define cv_hal_div32s hal_ni_div32s
+#define cv_hal_div32f hal_ni_div32f
+#define cv_hal_div64f hal_ni_div64f
+#define cv_hal_recip8u hal_ni_recip8u
+#define cv_hal_recip8s hal_ni_recip8s
+#define cv_hal_recip16u hal_ni_recip16u
+#define cv_hal_recip16s hal_ni_recip16s
+#define cv_hal_recip32s hal_ni_recip32s
+#define cv_hal_recip32f hal_ni_recip32f
+#define cv_hal_recip64f hal_ni_recip64f
+
+// Default stubs for the weighted sum of two arrays (one per element type). Every stub
+// ignores its arguments and returns CV_HAL_ERROR_NOT_IMPLEMENTED.
+// NOTE(review): the trailing const double* presumably points at the weighting scalars -
+// confirm the expected layout against opencv2/core/hal/interface.h.
+inline int hal_ni_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+// Default mapping of the cv_hal_addWeighted* entry points to the stubs.
+#define cv_hal_addWeighted8u hal_ni_addWeighted8u
+#define cv_hal_addWeighted8s hal_ni_addWeighted8s
+#define cv_hal_addWeighted16u hal_ni_addWeighted16u
+#define cv_hal_addWeighted16s hal_ni_addWeighted16s
+#define cv_hal_addWeighted32s hal_ni_addWeighted32s
+#define cv_hal_addWeighted32f hal_ni_addWeighted32f
+#define cv_hal_addWeighted64f hal_ni_addWeighted64f
+
+// Default stubs for channel splitting, provided for 8-, 16-, 32- and 64-bit elements.
+// Every stub ignores its arguments and returns CV_HAL_ERROR_NOT_IMPLEMENTED.
+// NOTE(review): the arguments are presumably (interleaved source, array of per-channel
+// destinations, length, channel count) - confirm against opencv2/core/hal/interface.h.
+inline int hal_ni_split8u(const uchar*, uchar**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_split16u(const ushort*, ushort**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_split32s(const int*, int**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_split64s(const int64*, int64**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+// Default mapping of the cv_hal_split* entry points to the stubs.
+#define cv_hal_split8u hal_ni_split8u
+#define cv_hal_split16u hal_ni_split16u
+#define cv_hal_split32s hal_ni_split32s
+#define cv_hal_split64s hal_ni_split64s
+
+// Default stubs for channel merging, provided for 8-, 16-, 32- and 64-bit elements.
+// Every stub ignores its arguments and returns CV_HAL_ERROR_NOT_IMPLEMENTED.
+// NOTE(review): the arguments are presumably (array of per-channel sources, interleaved
+// destination, length, channel count) - confirm against opencv2/core/hal/interface.h.
+inline int hal_ni_merge8u(const uchar**, uchar*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_merge16u(const ushort**, ushort*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_merge32s(const int**, int*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_merge64s(const int64**, int64*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+// Default mapping of the cv_hal_merge* entry points to the stubs.
+#define cv_hal_merge8u hal_ni_merge8u
+#define cv_hal_merge16u hal_ni_merge16u
+#define cv_hal_merge32s hal_ni_merge32s
+#define cv_hal_merge64s hal_ni_merge64s
+
+// Included after all default cv_hal_* macros have been defined.
+// NOTE(review): presumably this lets a custom HAL header override those defaults - confirm
+// against cmake/templates/custom_hal.hpp.in.
+#include "custom_hal.hpp"
+
+#endif // __OPENCV_CORE_HAL_REPLACEMENT_HPP__
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@ -52,22 +52,22 @@ namespace cv

+// LU overload for float data: forwards all arguments unchanged to hal::LU32f and returns its result.
 int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
-    return hal::LU(A, astep, m, b, bstep, n);
+    return hal::LU32f(A, astep, m, b, bstep, n);
 }

+// LU overload for double data: forwards all arguments unchanged to hal::LU64f and returns its result.
 int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
 {
-    return hal::LU(A, astep, m, b, bstep, n);
+    return hal::LU64f(A, astep, m, b, bstep, n);
 }

+// Cholesky overload for float data: forwards all arguments unchanged to hal::Cholesky32f and returns its result.
 bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
-    return hal::Cholesky(A, astep, m, b, bstep, n);
+    return hal::Cholesky32f(A, astep, m, b, bstep, n);
 }

+// Cholesky overload for double data: forwards all arguments unchanged to hal::Cholesky64f and returns its result.
 bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
 {
-    return hal::Cholesky(A, astep, m, b, bstep, n);
+    return hal::Cholesky64f(A, astep, m, b, bstep, n);
 }

 template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
@ -740,7 +740,7 @@ double cv::determinant( InputArray _mat )
            Mat a(rows, rows, CV_32F, (uchar*)buffer);
            mat.copyTo(a);

-            result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
+            result = hal::LU32f(a.ptr<float>(), a.step, rows, 0, 0, 0);
            if( result )
            {
                for( int i = 0; i < rows; i++ )
@ -764,7 +764,7 @@ double cv::determinant( InputArray _mat )
            Mat a(rows, rows, CV_64F, (uchar*)buffer);
            mat.copyTo(a);

-            result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
+            result = hal::LU64f(a.ptr<double>(), a.step, rows, 0, 0, 0);
            if( result )
            {
                for( int i = 0; i < rows; i++ )
@ -1027,13 +1027,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
    setIdentity(dst);

    if( method == DECOMP_LU && type == CV_32F )
-        result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
+        result = hal::LU32f(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
    else if( method == DECOMP_LU && type == CV_64F )
-        result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
+        result = hal::LU64f(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
    else if( method == DECOMP_CHOLESKY && type == CV_32F )
-        result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
+        result = hal::Cholesky32f(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
    else
-        result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
+        result = hal::Cholesky64f(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);

    if( !result )
        dst = Scalar(0);
@ -1265,16 +1265,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
    if( method == DECOMP_LU )
    {
        if( type == CV_32F )
-            result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
+            result = hal::LU32f(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
        else
-            result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
+            result = hal::LU64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
    }
    else if( method == DECOMP_CHOLESKY )
    {
        if( type == CV_32F )
-            result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
+            result = hal::Cholesky32f(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
        else
-            result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
+            result = hal::Cholesky64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
    }
    else
    {
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@ -191,13 +191,13 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst )
        {
            const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
            float *mag = (float*)ptrs[2];
-            hal::magnitude( x, y, mag, len );
+            hal::magnitude32f( x, y, mag, len );
        }
        else
        {
            const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
            double *mag = (double*)ptrs[2];
-            hal::magnitude( x, y, mag, len );
+            hal::magnitude64f( x, y, mag, len );
        }
    }
 }
@ -374,7 +374,7 @@ void cartToPolar( InputArray src1, InputArray src2,
            {
                const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
                float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3];
-                hal::magnitude( x, y, mag, len );
+                hal::magnitude32f( x, y, mag, len );
                hal::fastAtan2( y, x, angle, len, angleInDegrees );
            }
            else
@ -382,7 +382,7 @@ void cartToPolar( InputArray src1, InputArray src2,
                const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
                double *angle = (double*)ptrs[3];

-                hal::magnitude(x, y, (double*)ptrs[2], len);
+                hal::magnitude64f(x, y, (double*)ptrs[2], len);
                k = 0;

 #if CV_SSE2
@ -760,7 +760,7 @@ static void Exp_32f_ipp(const float *x, float *y, int n)
        }
        setIppErrorStatus();
    }
-    hal::exp(x, y, n);
+    hal::exp32f(x, y, n);
 }

 static void Exp_64f_ipp(const double *x, double *y, int n)
@ -774,14 +774,14 @@ static void Exp_64f_ipp(const double *x, double *y, int n)
        }
        setIppErrorStatus();
    }
-    hal::exp(x, y, n);
+    hal::exp64f(x, y, n);
 }

 #define Exp_32f Exp_32f_ipp
 #define Exp_64f Exp_64f_ipp
 #else
-#define Exp_32f hal::exp
-#define Exp_64f hal::exp
+#define Exp_32f hal::exp32f
+#define Exp_64f hal::exp64f
 #endif


@ -828,7 +828,7 @@ static void Log_32f_ipp(const float *x, float *y, int n)
        }
        setIppErrorStatus();
    }
-    hal::log(x, y, n);
+    hal::log32f(x, y, n);
 }

 static void Log_64f_ipp(const double *x, double *y, int n)
@ -842,14 +842,14 @@ static void Log_64f_ipp(const double *x, double *y, int n)
        }
        setIppErrorStatus();
    }
-    hal::log(x, y, n);
+    hal::log64f(x, y, n);
 }

 #define Log_32f Log_32f_ipp
 #define Log_64f Log_64f_ipp
 #else
-#define Log_32f hal::log
-#define Log_64f hal::log
+#define Log_32f hal::log32f
+#define Log_64f hal::log64f
 #endif

 void log( InputArray _src, OutputArray _dst )
@ -1356,10 +1356,10 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,

 #endif

-static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt(src, dst, n); }
-static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt(src, dst, n); }
-static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt(src, dst, n); }
-static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt(src, dst, n); }
+static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt32f(src, dst, n); }
+static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt64f(src, dst, n); }
+static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt32f(src, dst, n); }
+static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt64f(src, dst, n); }

 void pow( InputArray _src, double power, OutputArray _dst )
 {
--- a/modules/core/src/mathfuncs_core.cpp
+++ b/modules/core/src/mathfuncs_core.cpp
@ -52,16 +52,6 @@ static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);
 static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);
 static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);

-#if CV_NEON
-static inline float32x4_t cv_vrecpq_f32(float32x4_t val)
-{
-    float32x4_t reciprocal = vrecpeq_f32(val);
-    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
-    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
-    return reciprocal;
-}
-#endif
-
 void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
 {
    int i = 0;
@ -160,7 +150,7 @@ void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angle
 }


-void magnitude(const float* x, const float* y, float* mag, int len)
+void magnitude32f(const float* x, const float* y, float* mag, int len)
 {
 #if defined HAVE_IPP
    CV_IPP_CHECK()
@ -196,7 +186,7 @@ void magnitude(const float* x, const float* y, float* mag, int len)
    }
 }

-void magnitude(const double* x, const double* y, double* mag, int len)
+void magnitude64f(const double* x, const double* y, double* mag, int len)
 {
 #if defined(HAVE_IPP)
    CV_IPP_CHECK()
@ -233,7 +223,7 @@ void magnitude(const double* x, const double* y, double* mag, int len)
 }


-void invSqrt(const float* src, float* dst, int len)
+void invSqrt32f(const float* src, float* dst, int len)
 {
 #if defined(HAVE_IPP)
    CV_IPP_CHECK()
@ -264,7 +254,7 @@ void invSqrt(const float* src, float* dst, int len)
 }


-void invSqrt(const double* src, double* dst, int len)
+void invSqrt64f(const double* src, double* dst, int len)
 {
    int i = 0;

@ -279,7 +269,7 @@ void invSqrt(const double* src, double* dst, int len)
 }


-void sqrt(const float* src, float* dst, int len)
+void sqrt32f(const float* src, float* dst, int len)
 {
 #if defined(HAVE_IPP)
    CV_IPP_CHECK()
@ -310,7 +300,7 @@ void sqrt(const float* src, float* dst, int len)
 }


-void sqrt(const double* src, double* dst, int len)
+void sqrt64f(const double* src, double* dst, int len)
 {
 #if defined(HAVE_IPP)
    CV_IPP_CHECK()
@ -441,7 +431,7 @@ static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXP
 static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
 static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000

-void exp( const float *_x, float *y, int n )
+void exp32f( const float *_x, float *y, int n )
 {
    static const float
    A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
@ -640,7 +630,7 @@ void exp( const float *_x, float *y, int n )
    }
 }

-void exp( const double *_x, double *y, int n )
+void exp64f( const double *_x, double *y, int n )
 {
    static const double
    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
@ -1084,7 +1074,7 @@ static const double CV_DECL_ALIGNED(16) icvLogTab[] = {
 #define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1])
 static const double ln_2 = 0.69314718055994530941723212145818;

-void log( const float *_x, float *y, int n )
+void log32f( const float *_x, float *y, int n )
 {
    static const float shift[] = { 0, -1.f/512 };
    static const float
@ -1228,7 +1218,7 @@ void log( const float *_x, float *y, int n )
    }
 }

-void log( const double *x, double *y, int n )
+void log64f( const double *x, double *y, int n )
 {
    static const double shift[] = { 0, -1./512 };
    static const double
@ -1413,4 +1403,58 @@ void log( const double *x, double *y, int n )
    }
 }

-}}
+//=============================================================================
+// for compatibility with 3.0
+
+// Type-overloaded entry points retained for 3.0 source compatibility.
+// Each overload only forwards to the depth-suffixed function (…32f for float,
+// …64f for double) defined earlier in this file; no work is done here.
+void exp(const float* src, float* dst, int n)
+{
+    exp32f(src, dst, n);
+}
+
+void exp(const double* src, double* dst, int n)
+{
+    exp64f(src, dst, n);
+}
+
+void log(const float* src, float* dst, int n)
+{
+    log32f(src, dst, n);
+}
+
+void log(const double* src, double* dst, int n)
+{
+    log64f(src, dst, n);
+}
+
+// x and y are the two input arrays, dst receives the result for n elements.
+void magnitude(const float* x, const float* y, float* dst, int n)
+{
+    magnitude32f(x, y, dst, n);
+}
+
+void magnitude(const double* x, const double* y, double* dst, int n)
+{
+    magnitude64f(x, y, dst, n);
+}
+
+void sqrt(const float* src, float* dst, int len)
+{
+    sqrt32f(src, dst, len);
+}
+
+void sqrt(const double* src, double* dst, int len)
+{
+    sqrt64f(src, dst, len);
+}
+
+void invSqrt(const float* src, float* dst, int len)
+{
+    invSqrt32f(src, dst, len);
+}
+
+void invSqrt(const double* src, double* dst, int len)
+{
+    invSqrt64f(src, dst, len);
+}
+
+
+}} // cv::hal::
--- a/modules/core/src/matrix_decomp.cpp
+++ b/modules/core/src/matrix_decomp.cpp
@ -109,18 +109,17 @@ LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps)
 }


-int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
+int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
    return LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10);
 }


-int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
+int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
 {
    return LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100);
 }

-
 template<typename _Tp> static inline bool
 CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
 {
@ -195,6 +194,29 @@ CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
 }


+// Single-precision entry point: instantiates the CholImpl template above for float
+// and returns its bool result unchanged.
+bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
+{
+    return CholImpl(A, astep, m, b, bstep, n);
+}
+
+// Double-precision counterpart of Cholesky32f (CholImpl instantiated for double).
+bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
+{
+    return CholImpl(A, astep, m, b, bstep, n);
+}
+
+//=============================================================================
+// for compatibility with 3.0
+
+// Overloaded LU() names kept for 3.0 source compatibility.
+// They forward to LU32f / LU64f (defined earlier in this file) so that the
+// eps value handed to LUImpl is specified in exactly one place per type.
+int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
+{
+    return LU32f(A, astep, m, b, bstep, n);
+}
+
+int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
+{
+    return LU64f(A, astep, m, b, bstep, n);
+}
+
 bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
    return CholImpl(A, astep, m, b, bstep, n);
@ -205,4 +227,5 @@ bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
    return CholImpl(A, astep, m, b, bstep, n);
 }

+
 }}
--- a/modules/core/src/merge.cpp
+++ b/modules/core/src/merge.cpp
@ -387,21 +387,25 @@ merge_( const T** src, T* dst, int len, int cn )

 void merge8u(const uchar** src, uchar* dst, int len, int cn )
 {
+    CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn) // NOTE(review): CALL_HAL is defined outside this view; presumably tries the custom HAL hook before the generic code below - confirm
    merge_(src, dst, len, cn);
 }

 void merge16u(const ushort** src, ushort* dst, int len, int cn )
 {
+    CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn) // same dispatch pattern as merge8u, 16-bit elements
    merge_(src, dst, len, cn);
 }

 void merge32s(const int** src, int* dst, int len, int cn )
 {
+    CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn) // same dispatch pattern as merge8u, 32-bit elements
    merge_(src, dst, len, cn);
 }

 void merge64s(const int64** src, int64* dst, int len, int cn )
 {
+    CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn) // same dispatch pattern as merge8u, 64-bit elements
    merge_(src, dst, len, cn);
 }

--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@ -58,8 +58,6 @@
 #include "opencv2/core/ocl.hpp"
 #endif

-#include "opencv2/hal.hpp"
-
 #include <assert.h>
 #include <ctype.h>
 #include <float.h>
@ -69,6 +67,27 @@
 #include <stdlib.h>
 #include <string.h>

+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <limits>
+#include <float.h>
+#include <cstring>
+#include <cassert>
+
+// Run-time CPU capability checks used to guard SIMD code paths.
+// Each macro queries the feature id that matches its own name.
+#define USE_SSE2  (cv::checkHardwareSupport(CV_CPU_SSE2))
+#define USE_SSE4_2  (cv::checkHardwareSupport(CV_CPU_SSE4_2))
+#define USE_AVX  (cv::checkHardwareSupport(CV_CPU_AVX))
+#define USE_AVX2  (cv::checkHardwareSupport(CV_CPU_AVX2))
+
+#include "opencv2/core/hal/hal.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/core/sse_utils.hpp"
+#include "opencv2/core/neon_utils.hpp"
+
+#include "arithm_core.hpp"
+#include "hal_replacement.hpp"
+
 #ifdef HAVE_TEGRA_OPTIMIZATION
 #include "opencv2/core/core_tegra.hpp"
 #else
@ -78,6 +97,34 @@
 namespace cv
 {

+// -128.f ... 255.f
+// Table is indexed with a +128 bias, so x may be negative down to -128.
+extern const float g_8x32fTab[];
+#define CV_8TO32F(x)  cv::g_8x32fTab[(x)+128]
+
+// Indexed with a +255 bias; presumably holds squares of 8-bit differences - confirm against the table definition.
+extern const ushort g_8x16uSqrTab[];
+#define CV_SQR_8U(x)  cv::g_8x16uSqrTab[(x)+255]
+
+// Indexed with a +256 bias; the assert documents the accepted argument range [-256, 512].
+// NOTE(review): the table presumably clamps t to [0, 255]; CV_MIN_8U / CV_MAX_8U rely on that - confirm.
+extern const uchar g_Saturate8u[];
+#define CV_FAST_CAST_8U(t)   (assert(-256 <= (t) && (t) <= 512), cv::g_Saturate8u[(t)+256])
+#define CV_MIN_8U(a,b)       ((a) - CV_FAST_CAST_8U((a) - (b)))
+#define CV_MAX_8U(a,b)       ((a) + CV_FAST_CAST_8U((b) - (a)))
+
+// uchar specialisations of the arithmetic functors (primary templates are declared outside this view):
+// add and sub go through the g_Saturate8u lookup instead of saturate_cast.
+template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
+{ return CV_FAST_CAST_8U(a + b); }
+
+template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
+{ return CV_FAST_CAST_8U(a - b); }
+
+// |a - b| is computed in int and then saturated back to the element type.
+template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
+{ return saturate_cast<short>(std::abs(a - b)); }
+
+template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
+{ return saturate_cast<schar>(std::abs(a - b)); }
+
+// min / max for uchar expressed through the table-based macros above.
+template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
+
+template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
+
 typedef void (*BinaryFunc)(const uchar* src1, size_t step1,
                       const uchar* src2, size_t step2,
                       uchar* dst, size_t step, Size sz,
@ -100,21 +147,6 @@ BinaryFunc getCopyMaskFunc(size_t esz);
 /* maximal average node_count/hash_size ratio beyond which hash table is resized */
 #define  CV_SPARSE_HASH_RATIO    3

-
-
-// -128.f ... 255.f
-extern const float g_8x32fTab[];
-#define CV_8TO32F(x)  cv::g_8x32fTab[(x)+128]
-
-extern const ushort g_8x16uSqrTab[];
-#define CV_SQR_8U(x)  cv::g_8x16uSqrTab[(x)+255]
-
-extern const uchar g_Saturate8u[];
-#define CV_FAST_CAST_8U(t)   (assert(-256 <= (t) && (t) <= 512), cv::g_Saturate8u[(t)+256])
-#define CV_MIN_8U(a,b)       ((a) - CV_FAST_CAST_8U((a) - (b)))
-#define CV_MAX_8U(a,b)       ((a) + CV_FAST_CAST_8U((b) - (a)))
-
-
 #if defined WIN32 || defined _WIN32
 void deleteThreadAllocData();
 #endif
@ -282,6 +314,4 @@ cv::Mutex& getInitializationMutex();

 }

-#include "opencv2/hal/intrin.hpp"
-
 #endif /*_CXCORE_INTERNAL_H_*/
--- a/modules/core/src/split.cpp
+++ b/modules/core/src/split.cpp
@ -403,21 +403,25 @@ split_( const T* src, T** dst, int len, int cn )

 void split8u(const uchar* src, uchar** dst, int len, int cn )
 {
+    CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn) // NOTE(review): CALL_HAL is defined outside this view; presumably tries the custom HAL hook before the generic code below - confirm
    split_(src, dst, len, cn);
 }

 void split16u(const ushort* src, ushort** dst, int len, int cn )
 {
+    CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn) // same dispatch pattern as split8u, 16-bit elements
    split_(src, dst, len, cn);
 }

 void split32s(const int* src, int** dst, int len, int cn )
 {
+    CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn) // same dispatch pattern as split8u, 32-bit elements
    split_(src, dst, len, cn);
 }

 void split64s(const int64* src, int64** dst, int len, int cn )
 {
+    CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn) // same dispatch pattern as split8u, 64-bit elements
    split_(src, dst, len, cn);
 }

--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@ -3996,3 +3996,266 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )

    return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
 }
+
+namespace cv { namespace hal {
+
+// popCountTable[v] = number of set bits in the byte value v (256 entries).
+static const uchar popCountTable[] =
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+};
+
+// popCountTable2[v] = number of non-zero 2-bit cells in the byte value v (0..4).
+static const uchar popCountTable2[] =
+{
+    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
+};
+
+// popCountTable4[v] = number of non-zero 4-bit cells (nibbles) in the byte value v (0..2).
+static const uchar popCountTable4[] =
+{
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+// Returns the total number of set bits in the n bytes starting at a.
+int normHamming(const uchar* a, int n)
+{
+    int i = 0;
+    int result = 0;
+#if CV_NEON
+    {
+        // 16 bytes per iteration: per-byte bit counts are pairwise-widened
+        // (8 -> 16 -> 32 bit) and accumulated in four 32-bit lanes.
+        uint32x4_t bits = vmovq_n_u32(0);
+        for (; i <= n - 16; i += 16) {
+            uint8x16_t A_vec = vld1q_u8 (a + i);
+            uint8x16_t bitsSet = vcntq_u8 (A_vec);
+            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
+            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+            bits = vaddq_u32(bits, bitSet4);
+        }
+        // Fold to two 64-bit lanes, then add the low 32-bit word of each (lanes 0 and 2).
+        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
+    }
+#endif
+        // Table-driven path for whatever the vector loop left over, four bytes at a time.
+        for( ; i <= n - 4; i += 4 )
+            result += popCountTable[a[i]] + popCountTable[a[i+1]] +
+            popCountTable[a[i+2]] + popCountTable[a[i+3]];
+    // Remaining 0..3 bytes.
+    for( ; i < n; i++ )
+        result += popCountTable[a[i]];
+    return result;
+}
+
+// Returns the number of bit positions in which the n-byte buffers a and b differ
+// (the bit count of a[i] ^ b[i], summed over all bytes).
+int normHamming(const uchar* a, const uchar* b, int n)
+{
+    int i = 0;
+    int result = 0;
+#if CV_NEON
+    {
+        // 16 bytes per iteration: XOR, per-byte bit count, then pairwise widening
+        // (8 -> 16 -> 32 bit) into four 32-bit accumulator lanes.
+        uint32x4_t bits = vmovq_n_u32(0);
+        for (; i <= n - 16; i += 16) {
+            uint8x16_t A_vec = vld1q_u8 (a + i);
+            uint8x16_t B_vec = vld1q_u8 (b + i);
+            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
+            uint8x16_t bitsSet = vcntq_u8 (AxorB);
+            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
+            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+            bits = vaddq_u32(bits, bitSet4);
+        }
+        // Fold to two 64-bit lanes, then add the low 32-bit word of each (lanes 0 and 2).
+        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
+    }
+#endif
+        // Table-driven path for whatever the vector loop left over, four bytes at a time.
+        for( ; i <= n - 4; i += 4 )
+            result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
+                    popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
+    // Remaining 0..3 bytes.
+    for( ; i < n; i++ )
+        result += popCountTable[a[i] ^ b[i]];
+    return result;
+}
+
+// Cell-based variant: counts non-zero cells of cellSize bits in the n bytes at a.
+// cellSize 1 defers to the plain bit-count overload; 2 and 4 use the matching
+// lookup table; any other value yields -1.
+int normHamming(const uchar* a, int n, int cellSize)
+{
+    const uchar* lut = 0;
+    switch( cellSize )
+    {
+    case 1:
+        return normHamming(a, n);
+    case 2:
+        lut = popCountTable2;
+        break;
+    case 4:
+        lut = popCountTable4;
+        break;
+    default:
+        return -1;
+    }
+
+    int total = 0;
+    int k = 0;
+#if CV_ENABLE_UNROLLED
+    // Four bytes per iteration.
+    for( ; k <= n - 4; k += 4 )
+    {
+        total += lut[a[k]];
+        total += lut[a[k+1]];
+        total += lut[a[k+2]];
+        total += lut[a[k+3]];
+    }
+#endif
+    // Tail (or the whole buffer when unrolling is disabled).
+    while( k < n )
+    {
+        total += lut[a[k]];
+        k++;
+    }
+    return total;
+}
+
+// Cell-based variant for two buffers: counts non-zero cells of cellSize bits in
+// a[i] ^ b[i] over n bytes. cellSize 1 defers to the plain two-buffer overload;
+// 2 and 4 use the matching lookup table; any other value yields -1.
+int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
+{
+    const uchar* lut = 0;
+    switch( cellSize )
+    {
+    case 1:
+        return normHamming(a, b, n);
+    case 2:
+        lut = popCountTable2;
+        break;
+    case 4:
+        lut = popCountTable4;
+        break;
+    default:
+        return -1;
+    }
+
+    int total = 0;
+    int k = 0;
+    #if CV_ENABLE_UNROLLED
+    // Four bytes per iteration.
+    for( ; k <= n - 4; k += 4 )
+    {
+        total += lut[a[k] ^ b[k]];
+        total += lut[a[k+1] ^ b[k+1]];
+        total += lut[a[k+2] ^ b[k+2]];
+        total += lut[a[k+3] ^ b[k+3]];
+    }
+    #endif
+    // Tail (or the whole buffer when unrolling is disabled).
+    while( k < n )
+    {
+        total += lut[a[k] ^ b[k]];
+        k++;
+    }
+    return total;
+}
+
+// Returns the sum over n elements of (a[j] - b[j])^2, accumulated in float.
+float normL2Sqr_(const float* a, const float* b, int n)
+{
+    int j = 0; float d = 0.f;
+#if CV_SSE
+    float CV_DECL_ALIGNED(16) buf[4];
+    // Two independent 4-lane accumulators, 8 elements per iteration; loads are unaligned.
+    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
+
+    for( ; j <= n - 8; j += 8 )
+    {
+        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
+        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
+    }
+    // Horizontal sum of the combined accumulators via the aligned scratch buffer.
+    _mm_store_ps(buf, _mm_add_ps(d0, d1));
+    d = buf[0] + buf[1] + buf[2] + buf[3];
+#endif
+    {
+        // Scalar path, four elements at a time, for what the vector loop left over.
+        for( ; j <= n - 4; j += 4 )
+        {
+            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
+            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
+        }
+    }
+
+    // Remaining 0..3 elements.
+    for( ; j < n; j++ )
+    {
+        float t = a[j] - b[j];
+        d += t*t;
+    }
+    return d;
+}
+
+
+// Returns the sum over n elements of |a[j] - b[j]|, accumulated in float.
+float normL1_(const float* a, const float* b, int n)
+{
+    int j = 0; float d = 0.f;
+#if CV_SSE
+    float CV_DECL_ALIGNED(16) buf[4];
+    // 0x7fffffff clears the sign bit of each float lane, i.e. takes the absolute value.
+    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
+    __m128 absmask = _mm_load_ps((const float*)absbuf);
+
+    // Two independent 4-lane accumulators, 8 elements per iteration; loads are unaligned.
+    for( ; j <= n - 8; j += 8 )
+    {
+        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
+        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
+    }
+    // Horizontal sum via the aligned scratch buffer.
+    _mm_store_ps(buf, _mm_add_ps(d0, d1));
+    d = buf[0] + buf[1] + buf[2] + buf[3];
+#elif CV_NEON
+    // Absolute difference of four lanes per iteration, accumulated in v_sum.
+    float32x4_t v_sum = vdupq_n_f32(0.0f);
+    for ( ; j <= n - 4; j += 4)
+        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
+
+    float CV_DECL_ALIGNED(16) buf[4];
+    vst1q_f32(buf, v_sum);
+    d = buf[0] + buf[1] + buf[2] + buf[3];
+#endif
+    {
+        // Scalar path, four elements at a time, for what the vector loop left over.
+        for( ; j <= n - 4; j += 4 )
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+
+    // Remaining 0..3 elements.
+    for( ; j < n; j++ )
+        d += std::abs(a[j] - b[j]);
+    return d;
+}
+
+// Returns the sum over n bytes of |a[j] - b[j]| as an int.
+int normL1_(const uchar* a, const uchar* b, int n)
+{
+    int j = 0, d = 0;
+#if CV_SSE
+    __m128i d0 = _mm_setzero_si128();
+
+    // 16 bytes per iteration; _mm_sad_epu8 yields one partial sum per 64-bit half.
+    for( ; j <= n - 16; j += 16 )
+    {
+        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
+        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
+
+        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
+    }
+
+    // 4 bytes per iteration, loaded into the low 32 bits of a zeroed register.
+    for( ; j <= n - 4; j += 4 )
+    {
+        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
+        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
+
+        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
+    }
+    // Add the high-half partial sum onto the low half and extract it.
+    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
+#elif CV_NEON
+    // Integer zero: the accumulator lanes are uint32, not float.
+    uint32x4_t v_sum = vdupq_n_u32(0);
+    for ( ; j <= n - 16; j += 16)
+    {
+        // Byte-wise absolute difference, widened to 16 bit and then summed into 32-bit lanes.
+        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
+        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
+        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
+        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
+    }
+
+    uint CV_DECL_ALIGNED(16) buf[4];
+    vst1q_u32(buf, v_sum);
+    d = buf[0] + buf[1] + buf[2] + buf[3];
+#endif
+    {
+        // Scalar path, four bytes at a time, for what the vector loops left over.
+        for( ; j <= n - 4; j += 4 )
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+    // Remaining 0..3 bytes.
+    for( ; j < n; j++ )
+        d += std::abs(a[j] - b[j]);
+    return d;
+}
+
+}} //cv::hal
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -86,6 +86,45 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex();
 #undef max
 #undef abs
 #include <tchar.h>
+// MSVC: compilers with _MSC_VER >= 1400 get __cpuid / __cpuidex from <intrin.h>;
+// for older 32-bit x86 compilers the two functions are provided here with inline assembly.
+#if defined _MSC_VER
+  #if _MSC_VER >= 1400
+    #include <intrin.h>
+  #elif defined _M_IX86
+    // Fallback __cpuid: the second argument is ignored, leaf 1 is always queried.
+    // Stores EAX, EBX, ECX, EDX into cpuid_data[0..3].
+    static void __cpuid(int* cpuid_data, int)
+    {
+        __asm
+        {
+            push ebx
+            push edi
+            mov edi, cpuid_data
+            mov eax, 1
+            cpuid
+            mov [edi], eax
+            mov [edi + 4], ebx
+            mov [edi + 8], ecx
+            mov [edi + 12], edx
+            pop edi
+            pop ebx
+        }
+    }
+    // Fallback __cpuidex: both selector arguments are ignored, leaf 7 / sub-leaf 0 is always queried.
+    // Stores EAX, EBX, ECX, EDX into cpuid_data[0..3].
+    static void __cpuidex(int* cpuid_data, int, int)
+    {
+        __asm
+        {
+            push edi
+            mov edi, cpuid_data
+            mov eax, 7
+            mov ecx, 0
+            cpuid
+            mov [edi], eax
+            mov [edi + 4], ebx
+            mov [edi + 8], ecx
+            mov [edi + 12], edx
+            pop edi
+        }
+    }
+  #endif
+#endif

 #ifdef WINRT
 #include <wrl/client.h>
@ -198,15 +237,154 @@ void Exception::formatMessage()
        msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str());
 }

+// Snapshot of the CPU features available at run time.
+// have[] is indexed by the CV_CPU_* feature ids; x86_family holds the family
+// field of CPUID leaf 1 (it stays 0 when no CPUID path is compiled in).
+struct HWFeatures
+{
+    enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
+
+    // Default state: every feature reported as unavailable.
+    HWFeatures(void)
+    {
+        memset( have, 0, sizeof(have) );
+        x86_family = 0;
+    }
+
+    // Probes the running CPU and returns the detected feature set.
+    static HWFeatures initialize(void)
+    {
+        HWFeatures f;
+        int cpuid_data[4] = { 0, 0, 0, 0 };
+
+        // CPUID leaf 1: basic feature flags (EAX/EBX/ECX/EDX -> cpuid_data[0..3]).
+    #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+        __cpuid(cpuid_data, 1);
+    #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+        #ifdef __x86_64__
+        asm __volatile__
+        (
+         "movl $1, %%eax\n\t"
+         "cpuid\n\t"
+         :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+         :
+         : "cc"
+        );
+        #else
+        // 32-bit: EBX is saved and restored by hand, so its value is not captured here.
+        asm volatile
+        (
+         "pushl %%ebx\n\t"
+         "movl $1,%%eax\n\t"
+         "cpuid\n\t"
+         "popl %%ebx\n\t"
+         : "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3])
+         :
+         : "cc"
+        );
+        #endif
+    #endif
+
+        f.x86_family = (cpuid_data[0] >> 8) & 15;
+        if( f.x86_family >= 6 )
+        {
+            f.have[CV_CPU_MMX]    = (cpuid_data[3] & (1 << 23)) != 0;
+            f.have[CV_CPU_SSE]    = (cpuid_data[3] & (1<<25)) != 0;
+            f.have[CV_CPU_SSE2]   = (cpuid_data[3] & (1<<26)) != 0;
+            f.have[CV_CPU_SSE3]   = (cpuid_data[2] & (1<<0)) != 0;
+            f.have[CV_CPU_SSSE3]  = (cpuid_data[2] & (1<<9)) != 0;
+            f.have[CV_CPU_FMA3]  = (cpuid_data[2] & (1<<12)) != 0;
+            f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
+            f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
+            f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
+            f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
+
+            // make the second call to the cpuid command in order to get
+            // information about extended features like AVX2
+        #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+            __cpuidex(cpuid_data, 7, 0);
+        #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+            #ifdef __x86_64__
+            asm __volatile__
+            (
+             "movl $7, %%eax\n\t"
+             "movl $0, %%ecx\n\t"
+             "cpuid\n\t"
+             :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+             :
+             : "cc"
+            );
+            #else
+            // 32-bit: EBX is copied out before being restored. The copy target is
+            // pinned to EDI ("=D") so it can never be EBX itself (the popl would
+            // overwrite it), and EAX/EDX are declared clobbered because the
+            // sequence modifies them.
+            asm volatile
+            (
+             "pushl %%ebx\n\t"
+             "movl $7,%%eax\n\t"
+             "movl $0,%%ecx\n\t"
+             "cpuid\n\t"
+             "movl %%ebx, %0\n\t"
+             "popl %%ebx\n\t"
+             : "=D"(cpuid_data[1]), "=c"(cpuid_data[2])
+             :
+             : "cc", "eax", "edx"
+            );
+            #endif
+        #endif
+            f.have[CV_CPU_AVX2]   = (cpuid_data[1] & (1<<5)) != 0;
+
+            f.have[CV_CPU_AVX_512F]       = (cpuid_data[1] & (1<<16)) != 0;
+            f.have[CV_CPU_AVX_512DQ]      = (cpuid_data[1] & (1<<17)) != 0;
+            f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
+            f.have[CV_CPU_AVX_512PF]      = (cpuid_data[1] & (1<<26)) != 0;
+            f.have[CV_CPU_AVX_512ER]      = (cpuid_data[1] & (1<<27)) != 0;
+            f.have[CV_CPU_AVX_512CD]      = (cpuid_data[1] & (1<<28)) != 0;
+            f.have[CV_CPU_AVX_512BW]      = (cpuid_data[1] & (1<<30)) != 0;
+            // unsigned literal: shifting a signed 1 into bit 31 overflows int
+            f.have[CV_CPU_AVX_512VL]      = (cpuid_data[1] & (1u<<31)) != 0;
+            f.have[CV_CPU_AVX_512VBMI]    = (cpuid_data[2] &  (1<<1)) != 0;
+        }
+
+    #if defined ANDROID || defined __linux__
+    #ifdef __aarch64__
+        f.have[CV_CPU_NEON] = true;
+    #elif defined __arm__
+        // 32-bit ARM only: scan the ELF auxiliary vector for AT_HWCAP and test
+        // bit 12 (4096). On other architectures that bit has a different meaning,
+        // so the probe must not run there.
+        int cpufile = open("/proc/self/auxv", O_RDONLY);
+
+        if (cpufile >= 0)
+        {
+            Elf32_auxv_t auxv;
+            const size_t size_auxv_t = sizeof(auxv);
+
+            while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
+            {
+                if (auxv.a_type == AT_HWCAP)
+                {
+                    f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
+                    break;
+                }
+            }
+
+            close(cpufile);
+        }
+    #endif
+    #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
+        f.have[CV_CPU_NEON] = true;
+    #endif
+
+        return f;
+    }
+
+    int x86_family;            // family field from CPUID leaf 1, EAX bits 8..11
+    bool have[MAX_FEATURE+1];  // have[id] is true when feature id was detected
+};
+
+static HWFeatures  featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures();
+static HWFeatures* currentFeatures = &featuresEnabled;
+
 bool checkHardwareSupport(int feature)
 {
    CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
-    return cv::hal::checkHardwareSupport(feature);
+    return currentFeatures->have[feature]; // currentFeatures points at the detected set, or at the all-false set after setUseOptimized(false)
 }

+
+volatile bool useOptimizedFlag = true;
+
 void setUseOptimized( bool flag )
 {
-    cv::hal::setUseOptimized(flag);
+    useOptimizedFlag = flag;
+    currentFeatures = flag ? &featuresEnabled : &featuresDisabled;

    ipp::setUseIPP(flag);
 #ifdef HAVE_OPENCL
@ -219,7 +397,7 @@ void setUseOptimized( bool flag )

 bool useOptimized(void)
 {
-    return cv::hal::useOptimized();
+    return useOptimizedFlag; // value stored by the last setUseOptimized() call (true until then)
 }

 int64 getTickCount(void)
@ -499,12 +677,12 @@ redirectError( CvErrorCallback errCallback, void* userdata, void** prevUserdata)
 CV_IMPL int cvCheckHardwareSupport(int feature)
 {
    CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
-    return cv::hal::checkHardwareSupport(feature);
+    return cv::currentFeatures->have[feature]; // bool entry of the active feature table, converted to int for the C API
 }

 CV_IMPL int cvUseOptimized( int flag )
 {
-    int prevMode = cv::useOptimized();
+    int prevMode = cv::useOptimizedFlag; // capture the old setting before setUseOptimized() overwrites it
    cv::setUseOptimized( flag != 0 );
    return prevMode;
 }
--- a/modules/core/test/test_hal_core.cpp
+++ b/modules/core/test/test_hal_core.cpp
@ -40,7 +40,6 @@
 //M*/

 #include "test_precomp.hpp"
-#include "opencv2/hal.hpp"

 using namespace cv;

@ -72,21 +71,21 @@ TEST(Core_HAL, mathfuncs)
            {
            case HAL_EXP:
                if( depth == CV_32F )
-                    hal::exp(src.ptr<float>(), dst.ptr<float>(), n);
+                    hal::exp32f(src.ptr<float>(), dst.ptr<float>(), n);
                else
-                    hal::exp(src.ptr<double>(), dst.ptr<double>(), n);
+                    hal::exp64f(src.ptr<double>(), dst.ptr<double>(), n);
                break;
            case HAL_LOG:
                if( depth == CV_32F )
-                    hal::log(src.ptr<float>(), dst.ptr<float>(), n);
+                    hal::log32f(src.ptr<float>(), dst.ptr<float>(), n);
                else
-                    hal::log(src.ptr<double>(), dst.ptr<double>(), n);
+                    hal::log64f(src.ptr<double>(), dst.ptr<double>(), n);
                break;
            case HAL_SQRT:
                if( depth == CV_32F )
-                    hal::sqrt(src.ptr<float>(), dst.ptr<float>(), n);
+                    hal::sqrt32f(src.ptr<float>(), dst.ptr<float>(), n);
                else
-                    hal::sqrt(src.ptr<double>(), dst.ptr<double>(), n);
+                    hal::sqrt64f(src.ptr<double>(), dst.ptr<double>(), n);
                break;
            default:
                CV_Error(Error::StsBadArg, "unknown function");
@ -159,15 +158,15 @@ TEST(Core_HAL, mat_decomp)
            {
            case HAL_LU:
                if( depth == CV_32F )
-                    hal::LU(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
+                    hal::LU32f(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
                else
-                    hal::LU(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
+                    hal::LU64f(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
                break;
            case HAL_CHOL:
                if( depth == CV_32F )
-                    hal::Cholesky(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
+                    hal::Cholesky32f(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
                else
-                    hal::Cholesky(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
+                    hal::Cholesky64f(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
                break;
            default:
                CV_Error(Error::StsBadArg, "unknown function");
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@ -1,7 +1,7 @@
 #ifndef _TEST_UTILS_HPP_
 #define _TEST_UTILS_HPP_

-#include "opencv2/hal/intrin.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include "opencv2/ts.hpp"
 #include <ostream>
 #include <algorithm>
--- a/modules/core/test/test_precomp.hpp
+++ b/modules/core/test/test_precomp.hpp
@ -13,6 +13,9 @@
 #include "opencv2/ts.hpp"
 #include "opencv2/core/core_c.h"

+#include "opencv2/core/cvdef.h"
 #include "opencv2/core/private.hpp"
+#include "opencv2/core/hal/hal.hpp"
+#include "opencv2/core/hal/intrin.hpp"

 #endif
--- a/modules/features2d/src/precomp.hpp
+++ b/modules/features2d/src/precomp.hpp
@ -49,6 +49,7 @@
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.hpp"
 #include "opencv2/core/ocl.hpp"
+#include "opencv2/core/hal/hal.hpp"

 #include <algorithm>

--- a/modules/hal/CMakeLists.txt
+++ b/modules/hal/CMakeLists.txt
@ -1,21 +0,0 @@
-set(the_description "The Hardware Acceleration Layer (HAL) module")
-
-set(OPENCV_MODULE_TYPE STATIC)
-
-if(OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS)
-    set(OPENCV_HAL_HEADERS_INCLUDES "#include \"${OPENCV_HAL_HEADERS}\"")
-    set(DEPS "${OPENCV_HAL_LIBS}")
-else()
-    set(OPENCV_HAL_HEADERS_INCLUDES "// using default HAL")
-    set(DEPS "")
-endif()
-
-configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY)
-
-if(UNIX)
-  if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
-  endif()
-endif()
-
-ocv_define_module(hal ${DEPS})
--- a/modules/hal/include/opencv2/hal.hpp
+++ b/modules/hal/include/opencv2/hal.hpp
@ -1,287 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Copyright (C) 2015, Itseez Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_HAL_HPP__
-#define __OPENCV_HAL_HPP__
-
-#include "opencv2/hal/defs.h"
-#include "opencv2/hal/interface.hpp"
-
-/**
-  @defgroup hal Hardware Acceleration Layer
-  @{
-    @defgroup hal_intrin Universal intrinsics
-    @{
-      @defgroup hal_intrin_impl Private implementation helpers
-    @}
-    @defgroup hal_utils Platform-dependent utils
-  @}
-*/
-
-namespace cv { namespace hal {
-
-//! @addtogroup hal
-//! @{
-
-class Failure
-{
-public:
-    Failure(int code_ = Error::Unknown) : code(code_) {}
-public:
-    int code;
-};
-
-int normHamming(const uchar* a, int n);
-int normHamming(const uchar* a, const uchar* b, int n);
-
-int normHamming(const uchar* a, int n, int cellSize);
-int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
-
-//////////////////////////////// low-level functions ////////////////////////////////
-
-int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
-int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
-bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
-bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
-
-int normL1_(const uchar* a, const uchar* b, int n);
-float normL1_(const float* a, const float* b, int n);
-float normL2Sqr_(const float* a, const float* b, int n);
-
-void exp(const float* src, float* dst, int n);
-void exp(const double* src, double* dst, int n);
-void log(const float* src, float* dst, int n);
-void log(const double* src, double* dst, int n);
-
-void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
-void magnitude(const float* x, const float* y, float* dst, int n);
-void magnitude(const double* x, const double* y, double* dst, int n);
-void sqrt(const float* src, float* dst, int len);
-void sqrt(const double* src, double* dst, int len);
-void invSqrt(const float* src, float* dst, int len);
-void invSqrt(const double* src, double* dst, int len);
-
-void split8u(const uchar* src, uchar** dst, int len, int cn );
-void split16u(const ushort* src, ushort** dst, int len, int cn );
-void split32s(const int* src, int** dst, int len, int cn );
-void split64s(const int64* src, int64** dst, int len, int cn );
-
-void merge8u(const uchar** src, uchar* dst, int len, int cn );
-void merge16u(const ushort** src, ushort* dst, int len, int cn );
-void merge32s(const int** src, int* dst, int len, int cn );
-void merge64s(const int64** src, int64* dst, int len, int cn );
-
-void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
-void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
-void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
-void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
-void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
-void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
-void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
-
-void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
-void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
-void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
-void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
-void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
-void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
-void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
-
-void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
-void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
-void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
-void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
-void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
-void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
-void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
-
-void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
-void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
-void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
-void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
-void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
-void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
-void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
-
-void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
-void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
-void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
-void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
-void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
-void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
-void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
-
-void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
-void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
-void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
-void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
-
-void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
-void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
-void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
-void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
-void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
-void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
-void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
-
-void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
-void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
-void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
-void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
-void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
-void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
-void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
-
-void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
-void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
-void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
-void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
-void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
-void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
-void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
-
-void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
-void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
-void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
-void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
-void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
-void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
-void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
-
-void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
-void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
-void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
-void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
-void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
-void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
-void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
-//! @}
-
-}} //cv::hal
-
-namespace cv {
-
-template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
-{
-    typedef T1 type1;
-    typedef T2 type2;
-    typedef T3 rtype;
-    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
-};
-
-template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
-{
-    typedef T1 type1;
-    typedef T2 type2;
-    typedef T3 rtype;
-    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
-};
-
-template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
-{
-    typedef T1 type1;
-    typedef T2 type2;
-    typedef T3 rtype;
-    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
-};
-
-template<typename T> struct OpMin
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator ()(const T a, const T b) const { return std::min(a, b); }
-};
-
-template<typename T> struct OpMax
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator ()(const T a, const T b) const { return std::max(a, b); }
-};
-
-template<typename T> struct OpAbsDiff
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()(T a, T b) const { return a > b ? a - b : b - a; }
-};
-
-template<typename T> struct OpAnd
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()( T a, T b ) const { return a & b; }
-};
-
-template<typename T> struct OpOr
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()( T a, T b ) const { return a | b; }
-};
-
-template<typename T> struct OpXor
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()( T a, T b ) const { return a ^ b; }
-};
-
-template<typename T> struct OpNot
-{
-    typedef T type1;
-    typedef T type2;
-    typedef T rtype;
-    T operator()( T a, T ) const { return ~a; }
-};
-
-}
-
-#endif //__OPENCV_HAL_HPP__
--- a/modules/hal/include/opencv2/hal/defs.h
+++ b/modules/hal/include/opencv2/hal/defs.h
@ -1,675 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Copyright (C) 2015, Itseez Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_DEF_H__
-#define __OPENCV_DEF_H__
-
-//! @addtogroup hal_utils
-//! @{
-
-#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
-#  define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
-#endif
-
-#include <limits.h>
-#include "opencv2/hal/interface.hpp"
-
-#if defined __ICL
-#  define CV_ICC   __ICL
-#elif defined __ICC
-#  define CV_ICC   __ICC
-#elif defined __ECL
-#  define CV_ICC   __ECL
-#elif defined __ECC
-#  define CV_ICC   __ECC
-#elif defined __INTEL_COMPILER
-#  define CV_ICC   __INTEL_COMPILER
-#endif
-
-#ifndef CV_INLINE
-#  if defined __cplusplus
-#    define CV_INLINE static inline
-#  elif defined _MSC_VER
-#    define CV_INLINE __inline
-#  else
-#    define CV_INLINE static
-#  endif
-#endif
-
-#if defined CV_ICC && !defined CV_ENABLE_UNROLLED
-#  define CV_ENABLE_UNROLLED 0
-#else
-#  define CV_ENABLE_UNROLLED 1
-#endif
-
-#ifdef __GNUC__
-#  define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
-#elif defined _MSC_VER
-#  define CV_DECL_ALIGNED(x) __declspec(align(x))
-#else
-#  define CV_DECL_ALIGNED(x)
-#endif
-
-/* CPU features and intrinsics support */
-#define CV_CPU_NONE             0
-#define CV_CPU_MMX              1
-#define CV_CPU_SSE              2
-#define CV_CPU_SSE2             3
-#define CV_CPU_SSE3             4
-#define CV_CPU_SSSE3            5
-#define CV_CPU_SSE4_1           6
-#define CV_CPU_SSE4_2           7
-#define CV_CPU_POPCNT           8
-
-#define CV_CPU_AVX              10
-#define CV_CPU_AVX2             11
-#define CV_CPU_FMA3             12
-
-#define CV_CPU_AVX_512F         13
-#define CV_CPU_AVX_512BW        14
-#define CV_CPU_AVX_512CD        15
-#define CV_CPU_AVX_512DQ        16
-#define CV_CPU_AVX_512ER        17
-#define CV_CPU_AVX_512IFMA512   18
-#define CV_CPU_AVX_512PF        19
-#define CV_CPU_AVX_512VBMI      20
-#define CV_CPU_AVX_512VL        21
-
-#define CV_CPU_NEON   100
-
-// when adding to this list remember to update the following enum
-#define CV_HARDWARE_MAX_FEATURE 255
-
-/** @brief Available CPU features.
-*/
-enum CpuFeatures {
-    CPU_MMX             = 1,
-    CPU_SSE             = 2,
-    CPU_SSE2            = 3,
-    CPU_SSE3            = 4,
-    CPU_SSSE3           = 5,
-    CPU_SSE4_1          = 6,
-    CPU_SSE4_2          = 7,
-    CPU_POPCNT          = 8,
-
-    CPU_AVX             = 10,
-    CPU_AVX2            = 11,
-    CPU_FMA3            = 12,
-
-    CPU_AVX_512F        = 13,
-    CPU_AVX_512BW       = 14,
-    CPU_AVX_512CD       = 15,
-    CPU_AVX_512DQ       = 16,
-    CPU_AVX_512ER       = 17,
-    CPU_AVX_512IFMA512  = 18,
-    CPU_AVX_512PF       = 19,
-    CPU_AVX_512VBMI     = 20,
-    CPU_AVX_512VL       = 21,
-
-    CPU_NEON            = 100
-};
-
-// do not include SSE/AVX/NEON headers for NVCC compiler
-#ifndef __CUDACC__
-
-#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
-#  include <emmintrin.h>
-#  define CV_MMX 1
-#  define CV_SSE 1
-#  define CV_SSE2 1
-#  if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <pmmintrin.h>
-#    define CV_SSE3 1
-#  endif
-#  if defined __SSSE3__  || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <tmmintrin.h>
-#    define CV_SSSE3 1
-#  endif
-#  if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <smmintrin.h>
-#    define CV_SSE4_1 1
-#  endif
-#  if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <nmmintrin.h>
-#    define CV_SSE4_2 1
-#  endif
-#  if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    ifdef _MSC_VER
-#      include <nmmintrin.h>
-#    else
-#      include <popcntintrin.h>
-#    endif
-#    define CV_POPCNT 1
-#  endif
-#  if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
-// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
-// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
-#    include <immintrin.h>
-#    define CV_AVX 1
-#    if defined(_XCR_XFEATURE_ENABLED_MASK)
-#      define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
-#    else
-#      define __xgetbv() 0
-#    endif
-#  endif
-#  if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
-#    include <immintrin.h>
-#    define CV_AVX2 1
-#    if defined __FMA__
-#      define CV_FMA3 1
-#    endif
-#  endif
-#endif
-
-#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
-# include <Intrin.h>
-# include "arm_neon.h"
-# define CV_NEON 1
-# define CPU_HAS_NEON_FEATURE (true)
-#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
-#  include <arm_neon.h>
-#  define CV_NEON 1
-#endif
-
-#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
-#  define CV_VFP 1
-#endif
-
-#endif // __CUDACC__
-
-#ifndef CV_POPCNT
-#define CV_POPCNT 0
-#endif
-#ifndef CV_MMX
-#  define CV_MMX 0
-#endif
-#ifndef CV_SSE
-#  define CV_SSE 0
-#endif
-#ifndef CV_SSE2
-#  define CV_SSE2 0
-#endif
-#ifndef CV_SSE3
-#  define CV_SSE3 0
-#endif
-#ifndef CV_SSSE3
-#  define CV_SSSE3 0
-#endif
-#ifndef CV_SSE4_1
-#  define CV_SSE4_1 0
-#endif
-#ifndef CV_SSE4_2
-#  define CV_SSE4_2 0
-#endif
-#ifndef CV_AVX
-#  define CV_AVX 0
-#endif
-#ifndef CV_AVX2
-#  define CV_AVX2 0
-#endif
-#ifndef CV_FMA3
-#  define CV_FMA3 0
-#endif
-#ifndef CV_AVX_512F
-#  define CV_AVX_512F 0
-#endif
-#ifndef CV_AVX_512BW
-#  define CV_AVX_512BW 0
-#endif
-#ifndef CV_AVX_512CD
-#  define CV_AVX_512CD 0
-#endif
-#ifndef CV_AVX_512DQ
-#  define CV_AVX_512DQ 0
-#endif
-#ifndef CV_AVX_512ER
-#  define CV_AVX_512ER 0
-#endif
-#ifndef CV_AVX_512IFMA512
-#  define CV_AVX_512IFMA512 0
-#endif
-#ifndef CV_AVX_512PF
-#  define CV_AVX_512PF 0
-#endif
-#ifndef CV_AVX_512VBMI
-#  define CV_AVX_512VBMI 0
-#endif
-#ifndef CV_AVX_512VL
-#  define CV_AVX_512VL 0
-#endif
-
-#ifndef CV_NEON
-#  define CV_NEON 0
-#endif
-
-#ifndef CV_VFP
-#  define CV_VFP 0
-#endif
-
-/* fundamental constants */
-#define CV_PI   3.1415926535897932384626433832795
-#define CV_2PI 6.283185307179586476925286766559
-#define CV_LOG2 0.69314718055994530941723212145818
-
-typedef union Cv32suf
-{
-    int i;
-    unsigned u;
-    float f;
-}
-Cv32suf;
-
-typedef union Cv64suf
-{
-    int64 i;
-    uint64 u;
-    double f;
-}
-Cv64suf;
-
-namespace cv { namespace hal {
-
-bool checkHardwareSupport(int feature);
-void setUseOptimized(bool onoff);
-bool useOptimized();
-
-}}
-
-#define USE_SSE2  (cv::hal::checkHardwareSupport(CV_CPU_SSE))
-#define USE_SSE4_2  (cv::hal::checkHardwareSupport(CV_CPU_SSE4_2))
-#define USE_AVX  (cv::hal::checkHardwareSupport(CV_CPU_AVX))
-#define USE_AVX2  (cv::hal::checkHardwareSupport(CV_CPU_AVX2))
-
-
-/****************************************************************************************\
-*                                      fast math                                         *
-\****************************************************************************************/
-
-#if defined __BORLANDC__
-#  include <fastmath.h>
-#elif defined __cplusplus
-#  include <cmath>
-#else
-#  include <math.h>
-#endif
-
-#ifdef HAVE_TEGRA_OPTIMIZATION
-#  include "tegra_round.hpp"
-#endif
-
-// Rounding helpers for ARM builds with CV_VFP enabled. ARM_ROUND expands to a
-// complete function tail: it declares the locals `res` and `temp`, runs the given
-// asm string and then executes `return res`. It therefore has to be the last
-// statement of the function that uses it (see cvRound()).
-#if CV_VFP
-    // 1. general scheme
-    #define ARM_ROUND(_value, _asm_string) \
-        int res; \
-        float temp; \
-        asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
-        return res
-    // 2. version for double
-    //    (clang and other compilers differ only in how the double operand is
-    //    spelled inside the asm string: %[value] vs. %P[value])
-    #ifdef __clang__
-        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
-    #else
-        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
-    #endif
-    // 3. version for float
-    #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
-#endif // CV_VFP
-
-/** @brief Rounds floating-point number to the nearest integer
-
- The implementation is chosen at compile time from the compiler/target macros:
- SSE2 intrinsics, MSVC x87 inline assembly, the Tegra helper, the ARM VFP helper,
- lrint(), or a portable add-0.5-and-truncate fallback.
-
- @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
- result is not defined.
- */
-CV_INLINE int
-cvRound( double value )
-{
-#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
-    && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    // 64-bit x86 (MSVC, or GCC-compatible with SSE2; not Apple, not CUDA):
-    // scalar SSE2 double -> int conversion
-    __m128d t = _mm_set_sd( value );
-    return _mm_cvtsd_si32(t);
-#elif defined _MSC_VER && defined _M_IX86
-    // 32-bit MSVC: load onto the x87 stack and store back as an integer
-    int t;
-    __asm
-    {
-        fld value;
-        fistp t;
-    }
-    return t;
-#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
-        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    // TEGRA_ROUND_DBL comes from tegra_round.hpp (not visible here); presumably it
-    // contains its own return statement, since nothing follows it -- TODO confirm
-    TEGRA_ROUND_DBL(value);
-#elif defined CV_ICC || defined __GNUC__
-# if CV_VFP
-    // expands to inline asm followed by `return res` (see ARM_ROUND)
-    ARM_ROUND_DBL(value);
-# else
-    return (int)lrint(value);
-# endif
-#else
-    /* it's ok if round does not comply with IEEE754 standard;
-       the tests should allow +/-1 difference when the tested functions use round */
-    // portable fallback: shift by half a unit away from zero, then truncate
-    return (int)(value + (value >= 0 ? 0.5 : -0.5));
-#endif
-}
-
-
-/** @brief Rounds floating-point number to the nearest integer not larger than the original.
-
- The function computes an integer i such that:
- \f[i \le \texttt{value} < i+1\f]
- @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
- result is not defined.
- */
-CV_INLINE int cvFloor( double value )
-{
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    // SSE2 path: convert to an integer first, then subtract the 0/1 mask that is
-    // set when value is smaller than that integer (i.e. the conversion rounded up).
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
-#elif defined __GNUC__
-    // Truncate toward zero, then step down by one when truncation went above value
-    // (negative non-integers).
-    int i = (int)value;
-    return i - (i > value);
-#else
-    int i = cvRound(value);
-    // Keep the difference in double precision: narrowing it to float turns a tiny
-    // negative difference (e.g. value = -1e-50) into -0.0f, the `< 0` test then
-    // fails and 0 would be returned instead of -1.
-    double diff = value - i;
-    return i - (diff < 0);
-#endif
-}
-
-/** @brief Rounds floating-point number to the nearest integer not smaller than the original.
-
- The function computes an integer i such that:
- \f[i-1 < \texttt{value} \le i\f]
- @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
- result is not defined.
- */
-CV_INLINE int cvCeil( double value )
-{
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
-    // SSE2 path: convert to an integer first, then add the 0/1 mask that is set
-    // when that integer is smaller than value (i.e. the conversion rounded down).
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
-#elif defined __GNUC__
-    // Truncate toward zero, then step up by one when truncation went below value
-    // (positive non-integers).
-    int i = (int)value;
-    return i + (i < value);
-#else
-    int i = cvRound(value);
-    // Keep the difference in double precision: narrowing it to float turns a tiny
-    // negative difference (e.g. value = 1e-50) into -0.0f, the `< 0` test then
-    // fails and 0 would be returned instead of 1.
-    double diff = i - value;
-    return i + (diff < 0);
-#endif
-}
-
-/** @brief Tests whether a double-precision value is Not A Number.
-
- @param value The input floating-point value
-
- Returns 1 when the argument is a NaN (as defined by the IEEE754 standard) and 0
- for every other input. The check works on the raw bit pattern of the value. */
-CV_INLINE int cvIsNaN( double value )
-{
-    Cv64suf bits;
-    unsigned hi, lo;
-
-    bits.f = value;
-    /* upper 32 bits with the sign bit masked off, and the lower 32 bits */
-    hi = (unsigned)(bits.u >> 32) & 0x7fffffff;
-    lo = (unsigned)bits.u;
-    /* a non-zero lower word adds 1, so "hi == 0x7ff00000 with low bits set" also counts */
-    return hi + (lo != 0) > 0x7ff00000;
-}
-
-/** @brief Tests whether a double-precision value is an infinity.
-
- @param value The input floating-point value
-
- Returns 1 when the argument is plus or minus infinity (as defined by the IEEE754
- standard) and 0 otherwise. The check works on the raw bit pattern of the value. */
-CV_INLINE int cvIsInf( double value )
-{
-    Cv64suf bits;
-    unsigned hi, lo;
-
-    bits.f = value;
-    /* upper 32 bits with the sign bit masked off, and the lower 32 bits */
-    hi = (unsigned)(bits.u >> 32) & 0x7fffffff;
-    lo = (unsigned)bits.u;
-    return hi == 0x7ff00000 && lo == 0;
-}
-
-#ifdef __cplusplus
-
-/** @overload
- Single-precision variant; the compile-time selection mirrors cvRound(double),
- using the float forms of each primitive. */
-CV_INLINE int cvRound(float value)
-{
-#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \
-      defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    // 64-bit x86: scalar SSE float -> int conversion
-    __m128 t = _mm_set_ss( value );
-    return _mm_cvtss_si32(t);
-#elif defined _MSC_VER && defined _M_IX86
-    // 32-bit MSVC: load onto the x87 stack and store back as an integer
-    int t;
-    __asm
-    {
-        fld value;
-        fistp t;
-    }
-    return t;
-#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
-        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    // TEGRA_ROUND_FLT comes from tegra_round.hpp (not visible here); presumably it
-    // contains its own return statement -- TODO confirm
-    TEGRA_ROUND_FLT(value);
-#elif defined CV_ICC || defined __GNUC__
-# if CV_VFP
-    // expands to inline asm followed by `return res` (see ARM_ROUND)
-    ARM_ROUND_FLT(value);
-# else
-    return (int)lrintf(value);
-# endif
-#else
-    /* it's ok if round does not comply with IEEE754 standard;
-     the tests should allow +/-1 difference when the tested functions use round */
-    // portable fallback: shift by half a unit away from zero, then truncate
-    return (int)(value + (value >= 0 ? 0.5f : -0.5f));
-#endif
-}
-
-/** @overload
- An integer argument is returned unchanged. */
-CV_INLINE int cvRound( int value ) { return value; }
-
-/** @overload
- Single-precision variant of cvFloor(double). */
-CV_INLINE int cvFloor( float value )
-{
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    // SSE path: convert to an integer, then subtract the 0/1 mask that is set when
-    // value is smaller than that integer
-    __m128 t = _mm_set_ss( value );
-    int i = _mm_cvtss_si32(t);
-    return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i)));
-#elif defined __GNUC__
-    // truncate toward zero, then step down by one if truncation went above value
-    int i = (int)value;
-    return i - (i > value);
-#else
-    // round first, then step down by one if the rounded result is above value
-    int i = cvRound(value);
-    float diff = (float)(value - i);
-    return i - (diff < 0);
-#endif
-}
-
-/** @overload
- An integer argument is returned unchanged. */
-CV_INLINE int cvFloor( int value ) { return value; }
-
-/** @overload
- Single-precision variant of cvCeil(double). */
-CV_INLINE int cvCeil( float value )
-{
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
-    // SSE path: convert to an integer, then add the 0/1 mask that is set when that
-    // integer is smaller than value
-    __m128 t = _mm_set_ss( value );
-    int i = _mm_cvtss_si32(t);
-    return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
-#elif defined __GNUC__
-    // truncate toward zero, then step up by one if truncation went below value
-    int i = (int)value;
-    return i + (i < value);
-#else
-    // round first, then step up by one if the rounded result is below value
-    int i = cvRound(value);
-    float diff = (float)(i - value);
-    return i + (diff < 0);
-#endif
-}
-
-/** @overload
- An integer argument is returned unchanged. */
-CV_INLINE int cvCeil( int value ) { return value; }
-
-/** @overload
- Single-precision variant: inspects the bit pattern with the sign bit masked off. */
-CV_INLINE int cvIsNaN( float value )
-{
-    Cv32suf bits;
-    bits.f = value;
-    const unsigned magnitude = bits.u & 0x7fffffff;
-    // anything above the infinity pattern is a NaN
-    return magnitude > 0x7f800000;
-}
-
-/** @overload
- Single-precision variant: inspects the bit pattern with the sign bit masked off. */
-CV_INLINE int cvIsInf( float value )
-{
-    Cv32suf bits;
-    bits.f = value;
-    const unsigned magnitude = bits.u & 0x7fffffff;
-    // exactly the infinity pattern (either sign)
-    return magnitude == 0x7f800000;
-}
-
-//! @}
-
-#include <algorithm>
-
-namespace cv
-{
-
-//! @addtogroup hal_utils
-//! @{
-
-/////////////// saturate_cast (used in image & signal processing) ///////////////////
-
-/** @brief Template function for accurate conversion from one primitive type to another.
-
- The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
- and others. They perform an efficient and accurate conversion from one primitive type to another
- (see the introduction chapter). saturate in the name means that when the input value v is out of the
- range of the target type, the result is not formed just by taking low bits of the input, but instead
- the value is clipped. For example:
- @code
- uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
- short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
- @endcode
- Such clipping is done when the target type is unsigned char , signed char , unsigned short or
- signed short . For 32-bit integers, no clipping is done.
-
- When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
- the floating-point value is first rounded to the nearest integer and then clipped if needed (when
- the target type is 8- or 16-bit).
-
- This operation is used in the simplest or most complex image processing functions in OpenCV.
-
- The generic templates below perform a plain conversion `_Tp(v)`; the clipping behaviour comes
- from the explicit specializations that follow them.
-
- @param v Function parameter.
- @sa add, subtract, multiply, divide, Mat::convertTo
- */
-template<typename _Tp> static inline _Tp saturate_cast(uchar v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(schar v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(ushort v)   { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(short v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
-
-// ---- target type: uchar ----
-// For signed sources a single unsigned comparison detects both failure cases:
-// a negative v converts to a large unsigned value and so fails `<= UCHAR_MAX`
-// just like a too-large positive v; the sign of v then picks the bound.
-template<> inline uchar saturate_cast<uchar>(schar v)        { return (uchar)std::max((int)v, 0); }
-template<> inline uchar saturate_cast<uchar>(ushort v)       { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
-template<> inline uchar saturate_cast<uchar>(int v)          { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
-template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_cast<uchar>((int)v); }
-template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
-template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
-template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
-template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
-template<> inline uchar saturate_cast<uchar>(uint64 v)       { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
-
-// ---- target type: schar ----
-// Same unsigned-comparison idea, applied after shifting the range by SCHAR_MIN
-// so that the valid interval starts at zero.
-template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
-template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
-template<> inline schar saturate_cast<schar>(int v)          { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
-template<> inline schar saturate_cast<schar>(short v)        { return saturate_cast<schar>((int)v); }
-template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
-template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
-template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
-template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
-template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
-
-// ---- target type: ushort ----
-template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
-template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
-template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
-template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
-template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
-template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
-template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
-template<> inline ushort saturate_cast<ushort>(uint64 v)     { return (ushort)std::min(v, (uint64)USHRT_MAX); }
-
-// ---- target type: short ----
-template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
-template<> inline short saturate_cast<short>(int v)          { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
-template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
-template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
-template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
-template<> inline short saturate_cast<short>(int64 v)        { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
-template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
-
-// ---- target type: int ---- (rounding only, no clipping)
-template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
-template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }
-
-// ---- target type: unsigned ---- (rounding only, no clipping)
-// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
-template<> inline unsigned saturate_cast<unsigned>(float v)  { return cvRound(v); }
-template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
-
-//! @}
-
-}
-
-#endif // __cplusplus
-
-#endif //__OPENCV_HAL_H__
--- a/modules/hal/samples/simple_hal/CMakeLists.txt
+++ b/modules/hal/samples/simple_hal/CMakeLists.txt
@ -1,11 +0,0 @@
-# Builds the sample HAL replacement library `simple_hal` from simple.cpp.
-#
-# target_include_directories() requires CMake 2.8.11 and the
-# POSITION_INDEPENDENT_CODE target property requires 2.8.9, so the declared
-# minimum has to be at least 2.8.11.
-cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
-project(simple_hal CXX)
-
-add_library(simple_hal simple.cpp)
-
-# Request position-independent code on the target itself instead of appending
-# -fPIC to the global CMAKE_CXX_FLAGS; CMake picks the right flag per compiler.
-set_target_properties(simple_hal PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-# Root of the HAL module, two levels above this sample; its public headers are
-# exported to consumers together with this directory.
-set(OPENCV_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
-target_include_directories(simple_hal
-  PUBLIC
-    "${CMAKE_CURRENT_SOURCE_DIR}"
-    "${OPENCV_HAL_DIR}/include"
-)
--- a/modules/hal/src/arithm.cpp
+++ b/modules/hal/src/arithm.cpp
--- a/modules/hal/src/color.cpp
+++ b/modules/hal/src/color.cpp
@ -1,47 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-namespace cv {
-namespace hal {
-
-// This translation unit defines nothing inside cv::hal.
-
-} // namespace hal
-} // namespace cv
--- a/modules/hal/src/filter.cpp
+++ b/modules/hal/src/filter.cpp
@ -1,47 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-namespace cv {
-namespace hal {
-
-// This translation unit defines nothing inside cv::hal.
-
-} // namespace hal
-} // namespace cv
--- a/modules/hal/src/hardware.cpp
+++ b/modules/hal/src/hardware.cpp
@ -1,221 +0,0 @@
-#include "precomp.hpp"
-
-#if defined WIN32 || defined _WIN32 || defined WINCE
-#include <windows.h>
-#if defined _MSC_VER
-  #if _MSC_VER >= 1400
-    #include <intrin.h>
-  #elif defined _M_IX86
-    // Stand-in for the <intrin.h> __cpuid intrinsic, compiled only for 32-bit MSVC
-    // older than _MSC_VER 1400. The second (leaf) argument is ignored: leaf 1 is
-    // always queried. Writes EAX, EBX, ECX, EDX to cpuid_data[0..3].
-    static void __cpuid(int* cpuid_data, int)
-    {
-        __asm
-        {
-            push ebx
-            push edi
-            mov edi, cpuid_data
-            mov eax, 1
-            cpuid
-            mov [edi], eax
-            mov [edi + 4], ebx
-            mov [edi + 8], ecx
-            mov [edi + 12], edx
-            pop edi
-            pop ebx
-        }
-    }
-    // Stand-in for the <intrin.h> __cpuidex intrinsic, compiled only for 32-bit MSVC
-    // older than _MSC_VER 1400. Both integer arguments are ignored: leaf 7 with
-    // sub-leaf 0 is always queried. Writes EAX, EBX, ECX, EDX to cpuid_data[0..3].
-    // NOTE(review): unlike __cpuid above, EBX is not pushed/popped around the
-    // cpuid instruction here -- confirm this asymmetry is intended.
-    static void __cpuidex(int* cpuid_data, int, int)
-    {
-        __asm
-        {
-            push edi
-            mov edi, cpuid_data
-            mov eax, 7
-            mov ecx, 0
-            cpuid
-            mov [edi], eax
-            mov [edi + 4], ebx
-            mov [edi + 8], ecx
-            mov [edi + 12], edx
-            pop edi
-        }
-    }
-  #endif
-#endif
-#endif
-
-#if defined ANDROID || defined __linux__
-#  include <unistd.h>
-#  include <fcntl.h>
-#  include <elf.h>
-#  include <linux/auxvec.h>
-#endif
-
-#if defined __linux__ || defined __APPLE__ || defined __EMSCRIPTEN__
-#include <unistd.h>
-#include <stdio.h>
-#include <sys/types.h>
-#if defined ANDROID
-#include <sys/sysconf.h>
-#endif
-#endif
-
-#ifdef ANDROID
-# include <android/log.h>
-#endif
-
-// Table of CPU feature flags indexed by CV_CPU_* identifiers.
-// A default-constructed object has every flag cleared; initialize() returns an
-// object filled in from CPUID (x86) and from the Linux auxiliary vector or
-// compiler macros (NEON).
-struct HWFeatures
-{
-    enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
-
-    // Clears all flags and the CPU family.
-    HWFeatures(void)
-    {
-        memset( have, 0, sizeof(have) );
-        x86_family = 0;
-    }
-
-    // Probes the CPU once and returns the resulting feature table.
-    static HWFeatures initialize(void)
-    {
-        HWFeatures f;
-        // EAX, EBX, ECX, EDX of the last CPUID query; stays all-zero on non-x86
-        // builds, so x86_family is 0 there and the x86 block below is skipped.
-        int cpuid_data[4] = { 0, 0, 0, 0 };
-
-    #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-        __cpuid(cpuid_data, 1);
-    #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-        #ifdef __x86_64__
-        asm __volatile__
-        (
-         "movl $1, %%eax\n\t"
-         "cpuid\n\t"
-         :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
-         :
-         : "cc"
-        );
-        #else
-        // 32-bit: EBX is saved and restored around cpuid and is not captured,
-        // so cpuid_data[1] keeps its initial 0 (it is not read for leaf 1).
-        asm volatile
-        (
-         "pushl %%ebx\n\t"
-         "movl $1,%%eax\n\t"
-         "cpuid\n\t"
-         "popl %%ebx\n\t"
-         : "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3])
-         :
-         : "cc"
-        );
-        #endif
-    #endif
-
-        // family field: bits 8..11 of EAX from leaf 1
-        f.x86_family = (cpuid_data[0] >> 8) & 15;
-        if( f.x86_family >= 6 )
-        {
-            // leaf 1: EDX is cpuid_data[3], ECX is cpuid_data[2]
-            f.have[CV_CPU_MMX]    = (cpuid_data[3] & (1 << 23)) != 0;
-            f.have[CV_CPU_SSE]    = (cpuid_data[3] & (1<<25)) != 0;
-            f.have[CV_CPU_SSE2]   = (cpuid_data[3] & (1<<26)) != 0;
-            f.have[CV_CPU_SSE3]   = (cpuid_data[2] & (1<<0)) != 0;
-            f.have[CV_CPU_SSSE3]  = (cpuid_data[2] & (1<<9)) != 0;
-            f.have[CV_CPU_FMA3]  = (cpuid_data[2] & (1<<12)) != 0;
-            f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
-            f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
-            f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
-            f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
-
-            // make the second call to the cpuid command in order to get
-            // information about extended features like AVX2
-        #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-            __cpuidex(cpuid_data, 7, 0);
-        #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
-            #ifdef __x86_64__
-            asm __volatile__
-            (
-             "movl $7, %%eax\n\t"
-             "movl $0, %%ecx\n\t"
-             "cpuid\n\t"
-             :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
-             :
-             : "cc"
-            );
-            #else
-            // 32-bit: EBX is copied into operand 0 before being restored; only
-            // cpuid_data[1] and cpuid_data[2] are refreshed, [0] and [3] still hold
-            // the leaf-1 values.
-            // NOTE(review): the asm writes EAX and cpuid overwrites EDX, but neither
-            // appears as an output or clobber, and the "=r" operand is not
-            // constrained away from EBX -- confirm against the GCC extended-asm rules.
-            asm volatile
-            (
-             "pushl %%ebx\n\t"
-             "movl $7,%%eax\n\t"
-             "movl $0,%%ecx\n\t"
-             "cpuid\n\t"
-             "movl %%ebx, %0\n\t"
-             "popl %%ebx\n\t"
-             : "=r"(cpuid_data[1]), "=c"(cpuid_data[2])
-             :
-             : "cc"
-            );
-            #endif
-        #endif
-            // leaf 7, sub-leaf 0: EBX is cpuid_data[1], ECX is cpuid_data[2]
-            f.have[CV_CPU_AVX2]   = (cpuid_data[1] & (1<<5)) != 0;
-
-            f.have[CV_CPU_AVX_512F]       = (cpuid_data[1] & (1<<16)) != 0;
-            f.have[CV_CPU_AVX_512DQ]      = (cpuid_data[1] & (1<<17)) != 0;
-            f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
-            f.have[CV_CPU_AVX_512PF]      = (cpuid_data[1] & (1<<26)) != 0;
-            f.have[CV_CPU_AVX_512ER]      = (cpuid_data[1] & (1<<27)) != 0;
-            f.have[CV_CPU_AVX_512CD]      = (cpuid_data[1] & (1<<28)) != 0;
-            f.have[CV_CPU_AVX_512BW]      = (cpuid_data[1] & (1<<30)) != 0;
-            // NOTE(review): 1<<31 shifts into the sign bit of a signed int -- confirm
-            // this is acceptable for the supported compilers.
-            f.have[CV_CPU_AVX_512VL]      = (cpuid_data[1] & (1<<31)) != 0;
-            f.have[CV_CPU_AVX_512VBMI]    = (cpuid_data[2] &  (1<<1)) != 0;
-        }
-
-    #if defined ANDROID || defined __linux__
-    #ifdef __aarch64__
-        // aarch64 builds report NEON unconditionally
-        f.have[CV_CPU_NEON] = true;
-    #else
-        // Scan the process auxiliary vector for the AT_HWCAP entry and test bit
-        // 4096 (1<<12) of its value; presumably the NEON capability bit -- TODO confirm.
-        // NOTE(review): this branch is compiled for every non-aarch64 Linux/Android
-        // target and always parses entries as Elf32_auxv_t, including on 64-bit
-        // x86 -- confirm that is intended.
-        int cpufile = open("/proc/self/auxv", O_RDONLY);
-
-        if (cpufile >= 0)
-        {
-            Elf32_auxv_t auxv;
-            const size_t size_auxv_t = sizeof(auxv);
-
-            while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
-            {
-                if (auxv.a_type == AT_HWCAP)
-                {
-                    f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
-                    break;
-                }
-            }
-
-            close(cpufile);
-        }
-    #endif
-    #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
-        // no runtime probe here: NEON is taken from the compiler's target macros
-        f.have[CV_CPU_NEON] = true;
-    #endif
-
-        return f;
-    }
-
-    int x86_family;             // family field from CPUID leaf 1; 0 when CPUID was not queried
-    bool have[MAX_FEATURE+1];   // one flag per feature id, valid indices 0 ... MAX_FEATURE
-};
-
-// featuresEnabled holds the detected CPU features (probed once during static
-// initialization); featuresDisabled is an all-false table.
-static HWFeatures  featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures();
-// Table consulted by checkHardwareSupport(); setUseOptimized() switches it
-// between the two objects above.
-static HWFeatures* currentFeatures = &featuresEnabled;
-// Last value passed to setUseOptimized(); not static, so it has external linkage.
-volatile bool useOptimizedFlag = true;
-
-namespace cv { namespace hal {
-
-/** Reports whether a CPU feature is flagged in the currently active feature table.
-
- @param feature one of the CV_CPU_* identifiers, expected in 0 ... CV_HARDWARE_MAX_FEATURE.
- @return the stored flag for that feature; false for an identifier outside the table.
- After setUseOptimized(false) every feature reports false.
- */
-bool checkHardwareSupport(int feature)
-{
-    // The table holds MAX_FEATURE+1 entries; an identifier outside that range is
-    // reported as unsupported instead of being used as an out-of-bounds index.
-    if( feature < 0 || feature > HWFeatures::MAX_FEATURE )
-        return false;
-    return currentFeatures->have[feature];
-}
-
-/** Enables or disables the use of detected CPU features.
-
- @param flag true selects the detected feature table, false selects the all-false table.
- */
-void setUseOptimized( bool flag )
-{
-    useOptimizedFlag = flag;
-    currentFeatures = flag ? &featuresEnabled : &featuresDisabled;
-}
-
-/** Returns the value last passed to setUseOptimized() (true until it is first called). */
-bool useOptimized(void)
-{
-    return useOptimizedFlag;
-}
-
-}}
--- a/modules/hal/src/precomp.hpp
+++ b/modules/hal/src/precomp.hpp
@ -1,60 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/hal.hpp"
-#include "opencv2/hal/intrin.hpp"
-#include <algorithm>
-#include <cmath>
-#include <cstdlib>
-#include <limits>
-#include <float.h>
-#include <cstring>
-#include <cassert>
-
-#include "opencv2/hal/sse_utils.hpp"
-#include "opencv2/hal/neon_utils.hpp"
-
-// ARITHM_USE_IPP is 1 only when IPP is available with IPP_VERSION_X100 >= 700,
-// and 0 otherwise.
-#if defined(HAVE_IPP) && (IPP_VERSION_X100 >= 700)
-#  define ARITHM_USE_IPP 1
-#else
-#  define ARITHM_USE_IPP 0
-#endif
--- a/modules/hal/src/replacement.hpp
+++ b/modules/hal/src/replacement.hpp
@ -1,208 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Copyright (C) 2015, Itseez Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_HAL_REPLACEMENT_HPP__
-#define __OPENCV_HAL_REPLACEMENT_HPP__
-
-#include "opencv2/hal.hpp"
-
-inline int hal_t_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_not8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
-
-#define hal_add8u hal_t_add8u
-#define hal_add8s hal_t_add8s
-#define hal_add16u hal_t_add16u
-#define hal_add16s hal_t_add16s
-#define hal_add32s hal_t_add32s
-#define hal_add32f hal_t_add32f
-#define hal_add64f hal_t_add64f
-#define hal_sub8u hal_t_sub8u
-#define hal_sub8s hal_t_sub8s
-#define hal_sub16u hal_t_sub16u
-#define hal_sub16s hal_t_sub16s
-#define hal_sub32s hal_t_sub32s
-#define hal_sub32f hal_t_sub32f
-#define hal_sub64f hal_t_sub64f
-#define hal_max8u hal_t_max8u
-#define hal_max8s hal_t_max8s
-#define hal_max16u hal_t_max16u
-#define hal_max16s hal_t_max16s
-#define hal_max32s hal_t_max32s
-#define hal_max32f hal_t_max32f
-#define hal_max64f hal_t_max64f
-#define hal_min8u hal_t_min8u
-#define hal_min8s hal_t_min8s
-#define hal_min16u hal_t_min16u
-#define hal_min16s hal_t_min16s
-#define hal_min32s hal_t_min32s
-#define hal_min32f hal_t_min32f
-#define hal_min64f hal_t_min64f
-#define hal_absdiff8u hal_t_absdiff8u
-#define hal_absdiff8s hal_t_absdiff8s
-#define hal_absdiff16u hal_t_absdiff16u
-#define hal_absdiff16s hal_t_absdiff16s
-#define hal_absdiff32s hal_t_absdiff32s
-#define hal_absdiff32f hal_t_absdiff32f
-#define hal_absdiff64f hal_t_absdiff64f
-#define hal_and8u hal_t_and8u
-#define hal_or8u hal_t_or8u
-#define hal_xor8u hal_t_xor8u
-#define hal_not8u hal_t_not8u
-
-inline int hal_t_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
-
-#define hal_cmp8u hal_t_cmp8u
-#define hal_cmp8s hal_t_cmp8s
-#define hal_cmp16u hal_t_cmp16u
-#define hal_cmp16s hal_t_cmp16s
-#define hal_cmp32s hal_t_cmp32s
-#define hal_cmp32f hal_t_cmp32f
-#define hal_cmp64f hal_t_cmp64f
-
-inline int hal_t_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
-
-#define hal_mul8u hal_t_mul8u
-#define hal_mul8s hal_t_mul8s
-#define hal_mul16u hal_t_mul16u
-#define hal_mul16s hal_t_mul16s
-#define hal_mul32s hal_t_mul32s
-#define hal_mul32f hal_t_mul32f
-#define hal_mul64f hal_t_mul64f
-#define hal_div8u hal_t_div8u
-#define hal_div8s hal_t_div8s
-#define hal_div16u hal_t_div16u
-#define hal_div16s hal_t_div16s
-#define hal_div32s hal_t_div32s
-#define hal_div32f hal_t_div32f
-#define hal_div64f hal_t_div64f
-#define hal_recip8u hal_t_recip8u
-#define hal_recip8s hal_t_recip8s
-#define hal_recip16u hal_t_recip16u
-#define hal_recip16s hal_t_recip16s
-#define hal_recip32s hal_t_recip32s
-#define hal_recip32f hal_t_recip32f
-#define hal_recip64f hal_t_recip64f
-
-inline int hal_t_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
-inline int hal_t_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
-
-#define hal_addWeighted8u hal_t_addWeighted8u
-#define hal_addWeighted8s hal_t_addWeighted8s
-#define hal_addWeighted16u hal_t_addWeighted16u
-#define hal_addWeighted16s hal_t_addWeighted16s
-#define hal_addWeighted32s hal_t_addWeighted32s
-#define hal_addWeighted32f hal_t_addWeighted32f
-#define hal_addWeighted64f hal_t_addWeighted64f
-
-#include "custom_hal.hpp"
-
-#endif
--- a/modules/hal/src/resize.cpp
+++ b/modules/hal/src/resize.cpp
@ -1,47 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-namespace cv { namespace hal {
-
-}}
--- a/modules/hal/src/stat.cpp
+++ b/modules/hal/src/stat.cpp
@ -1,306 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-namespace cv { namespace hal {
-
-static const uchar popCountTable[] =
-{
-    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
-static const uchar popCountTable2[] =
-{
-    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
-    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
-    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
-};
-
-static const uchar popCountTable4[] =
-{
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-};
-
-int normHamming(const uchar* a, int n)
-{
-    int i = 0;
-    int result = 0;
-#if CV_NEON
-    {
-        uint32x4_t bits = vmovq_n_u32(0);
-        for (; i <= n - 16; i += 16) {
-            uint8x16_t A_vec = vld1q_u8 (a + i);
-            uint8x16_t bitsSet = vcntq_u8 (A_vec);
-            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
-            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
-            bits = vaddq_u32(bits, bitSet4);
-        }
-        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
-        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
-        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
-    }
-#endif
-        for( ; i <= n - 4; i += 4 )
-            result += popCountTable[a[i]] + popCountTable[a[i+1]] +
-            popCountTable[a[i+2]] + popCountTable[a[i+3]];
-    for( ; i < n; i++ )
-        result += popCountTable[a[i]];
-    return result;
-}
-
-int normHamming(const uchar* a, const uchar* b, int n)
-{
-    int i = 0;
-    int result = 0;
-#if CV_NEON
-    {
-        uint32x4_t bits = vmovq_n_u32(0);
-        for (; i <= n - 16; i += 16) {
-            uint8x16_t A_vec = vld1q_u8 (a + i);
-            uint8x16_t B_vec = vld1q_u8 (b + i);
-            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
-            uint8x16_t bitsSet = vcntq_u8 (AxorB);
-            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
-            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
-            bits = vaddq_u32(bits, bitSet4);
-        }
-        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
-        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
-        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
-    }
-#endif
-        for( ; i <= n - 4; i += 4 )
-            result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
-                    popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
-    for( ; i < n; i++ )
-        result += popCountTable[a[i] ^ b[i]];
-    return result;
-}
-
-int normHamming(const uchar* a, int n, int cellSize)
-{
-    if( cellSize == 1 )
-        return normHamming(a, n);
-    const uchar* tab = 0;
-    if( cellSize == 2 )
-        tab = popCountTable2;
-    else if( cellSize == 4 )
-        tab = popCountTable4;
-    else
-        return -1;
-    int i = 0;
-    int result = 0;
-#if CV_ENABLE_UNROLLED
-    for( ; i <= n - 4; i += 4 )
-        result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
-#endif
-    for( ; i < n; i++ )
-        result += tab[a[i]];
-    return result;
-}
-
-int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
-{
-    if( cellSize == 1 )
-        return normHamming(a, b, n);
-    const uchar* tab = 0;
-    if( cellSize == 2 )
-        tab = popCountTable2;
-    else if( cellSize == 4 )
-        tab = popCountTable4;
-    else
-        return -1;
-    int i = 0;
-    int result = 0;
-    #if CV_ENABLE_UNROLLED
-    for( ; i <= n - 4; i += 4 )
-        result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
-                tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
-    #endif
-    for( ; i < n; i++ )
-        result += tab[a[i] ^ b[i]];
-    return result;
-}
-
-float normL2Sqr_(const float* a, const float* b, int n)
-{
-    int j = 0; float d = 0.f;
-#if CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
-    }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
-
-    for( ; j < n; j++ )
-    {
-        float t = a[j] - b[j];
-        d += t*t;
-    }
-    return d;
-}
-
-
-float normL1_(const float* a, const float* b, int n)
-{
-    int j = 0; float d = 0.f;
-#if CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-    __m128 absmask = _mm_load_ps((const float*)absbuf);
-
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
-        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
-    }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#elif CV_NEON
-    float32x4_t v_sum = vdupq_n_f32(0.0f);
-    for ( ; j <= n - 4; j += 4)
-        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
-
-    float CV_DECL_ALIGNED(16) buf[4];
-    vst1q_f32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
-
-    for( ; j < n; j++ )
-        d += std::abs(a[j] - b[j]);
-    return d;
-}
-
-int normL1_(const uchar* a, const uchar* b, int n)
-{
-    int j = 0, d = 0;
-#if CV_SSE
-    __m128i d0 = _mm_setzero_si128();
-
-    for( ; j <= n - 16; j += 16 )
-    {
-        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
-        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
-
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-
-    for( ; j <= n - 4; j += 4 )
-    {
-        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
-        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
-
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
-#elif CV_NEON
-    uint32x4_t v_sum = vdupq_n_u32(0.0f);
-    for ( ; j <= n - 16; j += 16)
-    {
-        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
-        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
-    }
-
-    uint CV_DECL_ALIGNED(16) buf[4];
-    vst1q_u32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
-    for( ; j < n; j++ )
-        d += std::abs(a[j] - b[j]);
-    return d;
-}
-
-}} //cv::hal
--- a/modules/hal/src/warp.cpp
+++ b/modules/hal/src/warp.cpp
@ -1,47 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-namespace cv { namespace hal {
-
-}}
--- a/modules/hal/test/test_main.cpp
+++ b/modules/hal/test/test_main.cpp
@ -1,3 +0,0 @@
-#include "opencv2/ts.hpp"
-
-CV_TEST_MAIN("cv")
--- a/modules/hal/test/test_precomp.hpp
+++ b/modules/hal/test/test_precomp.hpp
@ -1,11 +0,0 @@
-#ifndef __OPENCV_HAL_TEST_PRECOMP_HPP__
-#define __OPENCV_HAL_TEST_PRECOMP_HPP__
-
-#include <iostream>
-#include <limits>
-#include "opencv2/ts.hpp"
-#include "opencv2/hal.hpp"
-#include "opencv2/hal/defs.h"
-#include "opencv2/hal/intrin.hpp"
-
-#endif
--- a/modules/imgproc/src/precomp.hpp
+++ b/modules/imgproc/src/precomp.hpp
@ -49,7 +49,7 @@
 #include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/core/private.hpp"
 #include "opencv2/core/ocl.hpp"
-#include "opencv2/hal.hpp"
+#include "opencv2/core/hal/hal.hpp"

 #include <math.h>
 #include <assert.h>
@ -94,6 +94,6 @@ extern const float icv8x32fSqrTab[];
 #include "_geom.h"
 #include "filterengine.hpp"

-#include "opencv2/hal/sse_utils.hpp"
+#include "opencv2/core/sse_utils.hpp"

 #endif /*__OPENCV_CV_INTERNAL_H_*/
--- a/modules/imgproc/src/spatialgradient.cpp
+++ b/modules/imgproc/src/spatialgradient.cpp
@ -41,7 +41,7 @@
 //M*/

 #include "precomp.hpp"
-#include "opencv2/hal/intrin.hpp"
+#include "opencv2/core/hal/intrin.hpp"

 #include <iostream>
 namespace cv
--- a/modules/python/common.cmake
+++ b/modules/python/common.cmake
@ -28,6 +28,7 @@ endforeach(m)
 ocv_list_filterout(opencv_hdrs ".h$")
 ocv_list_filterout(opencv_hdrs "cuda")
 ocv_list_filterout(opencv_hdrs "cudev")
+ocv_list_filterout(opencv_hdrs "/hal/")
 ocv_list_filterout(opencv_hdrs "detection_based_tracker.hpp") # Conditional compilation

 set(cv2_generated_hdrs
--- a/modules/stitching/src/autocalib.cpp
+++ b/modules/stitching/src/autocalib.cpp
@ -41,19 +41,19 @@
 //M*/

 #include "precomp.hpp"
+#include "opencv2/core/hal/hal.hpp"

 using namespace cv;

 namespace {

-template<typename _Tp> static inline bool
-decomposeCholesky(_Tp* A, size_t astep, int m)
+static inline bool decomposeCholesky(double* A, size_t astep, int m)
 {
-    if (!hal::Cholesky(A, astep, m, 0, 0, 0))
+    if (!hal::Cholesky64f(A, astep, m, 0, 0, 0))
        return false;
    astep /= sizeof(A[0]);
    for (int i = 0; i < m; ++i)
-        A[i*astep + i] = (_Tp)(1./A[i*astep + i]);
+        A[i*astep + i] = (double)(1./A[i*astep + i]);
    return true;
 }

--- a/samples/hal/README.md
+++ b/samples/hal/README.md
@ -0,0 +1,34 @@
+Custom HAL samples
+==================
+
+Samples in this folder are intended to demonstrate the functionality replacement mechanism in the OpenCV library.
+
+The __c_hal__ sample is an example of a pure C replacement library in which every function returns an error. It can be used to verify error handling in the function switching code.
+
+The __slow_hal__ sample contains naive C++ implementations of the element-wise logical array operations (and, or, xor, not), making them twice as slow as the default implementations.
+
+Build custom HAL replacement library
+------------------------------------
+
+1. Create a folder for the build (for example `<home-dir>/my-hal-build`)
+2. Go to the created folder and run cmake: `cmake <opencv-src>/samples/hal/slow_hal`
+3. Run make
+
+After the build you will find the static library in the build folder: `libslow_hal.a`
+
+Build OpenCV with HAL replacement
+---------------------------------
+
+1. Create a folder for the build (for example `<home-dir>/my-opencv-build`)
+2. Go to the created folder and run cmake:
+    ```
+    cmake \
+        -DOPENCV_HAL_HEADERS="<opencv-src>/samples/hal/slow_hal/impl.hpp" \
+        -DOPENCV_HAL_LIBS="<home-dir>/my-hal-build/libslow_hal.a" \
+        <opencv-src>
+    ```
+3. Run make (or `make opencv_perf_core` to build the demonstration test executable only)
+4. After the build you can run the tests and verify that some functions work slower:
+    ```
+    ./bin/opencv_perf_core --gtest_filter=*bitwise_and*
+    ```
--- a/samples/hal/c_hal/CMakeLists.txt
+++ b/samples/hal/c_hal/CMakeLists.txt
@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+
+# Builds the sample pure-C HAL replacement library `c_hal` from impl.c.
+if(UNIX)
+  # CMAKE_COMPILER_IS_GNUCC is the variable CMake sets for a GNU C compiler;
+  # the previously tested CMAKE_COMPILER_IS_GNUC is never defined, so -fPIC
+  # was silently skipped on GCC.
+  # NOTE(review): CV_ICC is not set anywhere in this file; presumably it is only
+  # available when configured from within the OpenCV build - confirm.
+  if(CMAKE_COMPILER_IS_GNUCC OR CV_ICC)
+    # impl.c is compiled as C, so extend the C flags from themselves rather than
+    # seeding them from the C++ flags.
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+  endif()
+endif()
+
+add_library(c_hal impl.c)
+# Root of the OpenCV source tree (this directory is samples/hal/c_hal).
+set(OPENCV_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+# impl.h includes "opencv2/core/hal/interface.h", resolved through the core module's include directory.
+target_include_directories(c_hal PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_SRC_DIR}/modules/core/include)
--- a/samples/hal/c_hal/impl.c
+++ b/samples/hal/c_hal/impl.c
@ -0,0 +1,371 @@
+#include "impl.h"
+
+/*
+ * Every function in this file is a stub: it ignores its arguments and returns
+ * CV_HAL_ERROR_UNKNOWN, so that OpenCV's handling of a failing external HAL
+ * can be exercised.
+ */
+int wrong_add8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_add8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_add16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_add16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_add32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_add32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_add64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_sub8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_sub8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_sub16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_sub16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_sub32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_sub32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_sub64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_max8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_max8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_max16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_max16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_max32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_max32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_max64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_min8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_min8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_min16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_min16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_min32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_min32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_min64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_absdiff8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_absdiff8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_absdiff16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_absdiff16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_absdiff32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_absdiff32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_absdiff64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_and8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_or8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_xor8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_not8u(const uchar* src1, size_t sz1, uchar* dst, size_t sz, int w, int h)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_cmp8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_cmp8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_cmp16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_cmp16s(const short* src1, size_t sz1, const short* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_cmp32s(const int* src1, size_t sz1, const int* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_cmp32f(const float* src1, size_t sz1, const float* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_cmp64f(const double* src1, size_t sz1, const double* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_mul8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_mul8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_mul16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_mul16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_mul32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_mul32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_mul64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_div8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_div8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_div16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_div16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_div32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_div32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_div64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_recip8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_recip8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_recip16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_recip16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_recip32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_recip32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_recip64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h, double scale)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_addWeighted8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, const double* scales)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_addWeighted8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h, const double* scales)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_addWeighted16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h, const double* scales)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_addWeighted16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h, const double* scales)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_addWeighted32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h, const double* scales)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_addWeighted32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h, const double* scales)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
+
+int wrong_addWeighted64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h, const double* scales)
+{
+    return CV_HAL_ERROR_UNKNOWN; // to test how OpenCV handles errors from external HAL
+}
--- a/samples/hal/c_hal/impl.h
+++ b/samples/hal/c_hal/impl.h
@ -0,0 +1,245 @@
+#ifndef _wrong_H_INCLUDED_
+#define _wrong_H_INCLUDED_
+
+/*
+ * Declares the wrong_* stub functions of the sample C HAL and, after each group
+ * of prototypes, redefines the matching cv_hal_* macros to point at them.
+ */
+#include "opencv2/core/hal/interface.h"
+
+/* Give the declarations C linkage when this header is included from C++. */
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+int wrong_add8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h);
+int wrong_add8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h);
+int wrong_add16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h);
+int wrong_add16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h);
+int wrong_add32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h);
+int wrong_add32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h);
+int wrong_add64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h);
+int wrong_sub8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h);
+int wrong_sub8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h);
+int wrong_sub16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h);
+int wrong_sub16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h);
+int wrong_sub32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h);
+int wrong_sub32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h);
+int wrong_sub64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h);
+int wrong_max8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h);
+int wrong_max8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h);
+int wrong_max16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h);
+int wrong_max16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h);
+int wrong_max32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h);
+int wrong_max32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h);
+int wrong_max64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h);
+int wrong_min8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h);
+int wrong_min8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h);
+int wrong_min16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h);
+int wrong_min16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h);
+int wrong_min32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h);
+int wrong_min32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h);
+int wrong_min64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h);
+int wrong_absdiff8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h);
+int wrong_absdiff8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h);
+int wrong_absdiff16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h);
+int wrong_absdiff16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h);
+int wrong_absdiff32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h);
+int wrong_absdiff32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h);
+int wrong_absdiff64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h);
+int wrong_and8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h);
+int wrong_or8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h);
+int wrong_xor8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h);
+int wrong_not8u(const uchar* src1, size_t sz1, uchar* dst, size_t sz, int w, int h);
+
+#undef cv_hal_add8u
+#define cv_hal_add8u wrong_add8u
+#undef cv_hal_add8s
+#define cv_hal_add8s wrong_add8s
+#undef cv_hal_add16u
+#define cv_hal_add16u wrong_add16u
+#undef cv_hal_add16s
+#define cv_hal_add16s wrong_add16s
+#undef cv_hal_add32s
+#define cv_hal_add32s wrong_add32s
+#undef cv_hal_add32f
+#define cv_hal_add32f wrong_add32f
+#undef cv_hal_add64f
+#define cv_hal_add64f wrong_add64f
+#undef cv_hal_sub8u
+#define cv_hal_sub8u wrong_sub8u
+#undef cv_hal_sub8s
+#define cv_hal_sub8s wrong_sub8s
+#undef cv_hal_sub16u
+#define cv_hal_sub16u wrong_sub16u
+#undef cv_hal_sub16s
+#define cv_hal_sub16s wrong_sub16s
+#undef cv_hal_sub32s
+#define cv_hal_sub32s wrong_sub32s
+#undef cv_hal_sub32f
+#define cv_hal_sub32f wrong_sub32f
+#undef cv_hal_sub64f
+#define cv_hal_sub64f wrong_sub64f
+#undef cv_hal_max8u
+#define cv_hal_max8u wrong_max8u
+#undef cv_hal_max8s
+#define cv_hal_max8s wrong_max8s
+#undef cv_hal_max16u
+#define cv_hal_max16u wrong_max16u
+#undef cv_hal_max16s
+#define cv_hal_max16s wrong_max16s
+#undef cv_hal_max32s
+#define cv_hal_max32s wrong_max32s
+#undef cv_hal_max32f
+#define cv_hal_max32f wrong_max32f
+#undef cv_hal_max64f
+#define cv_hal_max64f wrong_max64f
+#undef cv_hal_min8u
+#define cv_hal_min8u wrong_min8u
+#undef cv_hal_min8s
+#define cv_hal_min8s wrong_min8s
+#undef cv_hal_min16u
+#define cv_hal_min16u wrong_min16u
+#undef cv_hal_min16s
+#define cv_hal_min16s wrong_min16s
+#undef cv_hal_min32s
+#define cv_hal_min32s wrong_min32s
+#undef cv_hal_min32f
+#define cv_hal_min32f wrong_min32f
+#undef cv_hal_min64f
+#define cv_hal_min64f wrong_min64f
+#undef cv_hal_absdiff8u
+#define cv_hal_absdiff8u wrong_absdiff8u
+#undef cv_hal_absdiff8s
+#define cv_hal_absdiff8s wrong_absdiff8s
+#undef cv_hal_absdiff16u
+#define cv_hal_absdiff16u wrong_absdiff16u
+#undef cv_hal_absdiff16s
+#define cv_hal_absdiff16s wrong_absdiff16s
+#undef cv_hal_absdiff32s
+#define cv_hal_absdiff32s wrong_absdiff32s
+#undef cv_hal_absdiff32f
+#define cv_hal_absdiff32f wrong_absdiff32f
+#undef cv_hal_absdiff64f
+#define cv_hal_absdiff64f wrong_absdiff64f
+#undef cv_hal_and8u
+#define cv_hal_and8u wrong_and8u
+#undef cv_hal_or8u
+#define cv_hal_or8u wrong_or8u
+#undef cv_hal_xor8u
+#define cv_hal_xor8u wrong_xor8u
+#undef cv_hal_not8u
+#define cv_hal_not8u wrong_not8u
+
+int wrong_cmp8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op);
+int wrong_cmp8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op);
+int wrong_cmp16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op);
+int wrong_cmp16s(const short* src1, size_t sz1, const short* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op);
+int wrong_cmp32s(const int* src1, size_t sz1, const int* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op);
+int wrong_cmp32f(const float* src1, size_t sz1, const float* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op);
+int wrong_cmp64f(const double* src1, size_t sz1, const double* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, int op);
+
+#undef cv_hal_cmp8u
+#define cv_hal_cmp8u wrong_cmp8u
+#undef cv_hal_cmp8s
+#define cv_hal_cmp8s wrong_cmp8s
+#undef cv_hal_cmp16u
+#define cv_hal_cmp16u wrong_cmp16u
+#undef cv_hal_cmp16s
+#define cv_hal_cmp16s wrong_cmp16s
+#undef cv_hal_cmp32s
+#define cv_hal_cmp32s wrong_cmp32s
+#undef cv_hal_cmp32f
+#define cv_hal_cmp32f wrong_cmp32f
+#undef cv_hal_cmp64f
+#define cv_hal_cmp64f wrong_cmp64f
+
+int wrong_mul8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, double scale);
+int wrong_mul8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h, double scale);
+int wrong_mul16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h, double scale);
+int wrong_mul16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h, double scale);
+int wrong_mul32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h, double scale);
+int wrong_mul32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h, double scale);
+int wrong_mul64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h, double scale);
+int wrong_div8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, double scale);
+int wrong_div8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h, double scale);
+int wrong_div16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h, double scale);
+int wrong_div16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h, double scale);
+int wrong_div32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h, double scale);
+int wrong_div32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h, double scale);
+int wrong_div64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h, double scale);
+int wrong_recip8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, double scale);
+int wrong_recip8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h, double scale);
+int wrong_recip16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h, double scale);
+int wrong_recip16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h, double scale);
+int wrong_recip32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h, double scale);
+int wrong_recip32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h, double scale);
+int wrong_recip64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h, double scale);
+
+#undef cv_hal_mul8u
+#define cv_hal_mul8u wrong_mul8u
+#undef cv_hal_mul8s
+#define cv_hal_mul8s wrong_mul8s
+#undef cv_hal_mul16u
+#define cv_hal_mul16u wrong_mul16u
+#undef cv_hal_mul16s
+#define cv_hal_mul16s wrong_mul16s
+#undef cv_hal_mul32s
+#define cv_hal_mul32s wrong_mul32s
+#undef cv_hal_mul32f
+#define cv_hal_mul32f wrong_mul32f
+#undef cv_hal_mul64f
+#define cv_hal_mul64f wrong_mul64f
+#undef cv_hal_div8u
+#define cv_hal_div8u wrong_div8u
+#undef cv_hal_div8s
+#define cv_hal_div8s wrong_div8s
+#undef cv_hal_div16u
+#define cv_hal_div16u wrong_div16u
+#undef cv_hal_div16s
+#define cv_hal_div16s wrong_div16s
+#undef cv_hal_div32s
+#define cv_hal_div32s wrong_div32s
+#undef cv_hal_div32f
+#define cv_hal_div32f wrong_div32f
+#undef cv_hal_div64f
+#define cv_hal_div64f wrong_div64f
+#undef cv_hal_recip8u
+#define cv_hal_recip8u wrong_recip8u
+#undef cv_hal_recip8s
+#define cv_hal_recip8s wrong_recip8s
+#undef cv_hal_recip16u
+#define cv_hal_recip16u wrong_recip16u
+#undef cv_hal_recip16s
+#define cv_hal_recip16s wrong_recip16s
+#undef cv_hal_recip32s
+#define cv_hal_recip32s wrong_recip32s
+#undef cv_hal_recip32f
+#define cv_hal_recip32f wrong_recip32f
+#undef cv_hal_recip64f
+#define cv_hal_recip64f wrong_recip64f
+
+int wrong_addWeighted8u(const uchar* src1, size_t sz1, const uchar* src2, size_t sz2, uchar* dst, size_t sz, int w, int h, const double* scales);
+int wrong_addWeighted8s(const schar* src1, size_t sz1, const schar* src2, size_t sz2, schar* dst, size_t sz, int w, int h, const double* scales);
+int wrong_addWeighted16u(const ushort* src1, size_t sz1, const ushort* src2, size_t sz2, ushort* dst, size_t sz, int w, int h, const double* scales);
+int wrong_addWeighted16s(const short* src1, size_t sz1, const short* src2, size_t sz2, short* dst, size_t sz, int w, int h, const double* scales);
+int wrong_addWeighted32s(const int* src1, size_t sz1, const int* src2, size_t sz2, int* dst, size_t sz, int w, int h, const double* scales);
+int wrong_addWeighted32f(const float* src1, size_t sz1, const float* src2, size_t sz2, float* dst, size_t sz, int w, int h, const double* scales);
+int wrong_addWeighted64f(const double* src1, size_t sz1, const double* src2, size_t sz2, double* dst, size_t sz, int w, int h, const double* scales);
+
+#undef cv_hal_addWeighted8u
+#define cv_hal_addWeighted8u wrong_addWeighted8u
+#undef cv_hal_addWeighted8s
+#define cv_hal_addWeighted8s wrong_addWeighted8s
+#undef cv_hal_addWeighted16u
+#define cv_hal_addWeighted16u wrong_addWeighted16u
+#undef cv_hal_addWeighted16s
+#define cv_hal_addWeighted16s wrong_addWeighted16s
+#undef cv_hal_addWeighted32s
+#define cv_hal_addWeighted32s wrong_addWeighted32s
+#undef cv_hal_addWeighted32f
+#define cv_hal_addWeighted32f wrong_addWeighted32f
+#undef cv_hal_addWeighted64f
+#define cv_hal_addWeighted64f wrong_addWeighted64f
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/samples/hal/slow_hal/CMakeLists.txt
+++ b/samples/hal/slow_hal/CMakeLists.txt
@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
+
+# Sample HAL replacement: builds impl.cpp into the slow_hal library.
+# target_include_directories() below first appeared in CMake 2.8.11,
+# so that release (not 2.8.8) is the true minimum for this file.
+add_library(slow_hal impl.cpp)
+# Ask for position-independent code via the target property rather than appending -fPIC to CMAKE_CXX_FLAGS for selected compilers.
+set_target_properties(slow_hal PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+set(OPENCV_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+target_include_directories(slow_hal PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}" "${OPENCV_SRC_DIR}/modules/core/include")
--- a/modules/hal/samples/simple_hal/simple.cpp
+++ b/modules/hal/samples/simple_hal/simple.cpp
@ -1,11 +1,11 @@
-#include "simple.hpp"
+#include "impl.hpp"

 int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
 {
    for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step)
        for(int x = 0 ; x < width; x++ )
            dst[x] = src1[x] & src2[x];
-    return cv::hal::Error::Ok;
+    return CV_HAL_ERROR_OK; // presumably the success status; identifier switched from cv::hal::Error::Ok to CV_HAL_ERROR_OK
 }

 int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
@ -13,7 +13,7 @@ int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
    for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step)
        for(int x = 0 ; x < width; x++ )
            dst[x] = src1[x] | src2[x];
-    return cv::hal::Error::Ok;
+    return CV_HAL_ERROR_OK;
 }

 int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
@ -21,13 +21,13 @@ int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
    for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step)
        for(int x = 0 ; x < width; x++ )
            dst[x] = src1[x] ^ src2[x];
-    return cv::hal::Error::Ok;
+    return CV_HAL_ERROR_OK;
 }

-int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
+int slow_not8u(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height) // bitwise NOT reads only src1, so the src2/step2 pair is dropped
 {
-    for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step)
+    for(; height--; src1 = src1 + step1, dst = dst + step) // per row: advance the source and destination pointers by their steps
         for(int x = 0 ; x < width; x++ )
             dst[x] = ~src1[x];
-    return cv::hal::Error::Ok;
+    return CV_HAL_ERROR_OK; // presumably the success status; identifier switched from cv::hal::Error::Ok to CV_HAL_ERROR_OK
 }
--- a/modules/hal/samples/simple_hal/simple.hpp
+++ b/modules/hal/samples/simple_hal/simple.hpp
@ -1,20 +1,20 @@
 #ifndef _SIMPLE_HPP_INCLUDED_
 #define _SIMPLE_HPP_INCLUDED_

-#include "opencv2/hal/interface.hpp"
+#include "opencv2/core/hal/interface.h"

 int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
 int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
 int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
-int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
+int slow_not8u(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height); // single source/step pair, unlike the two-source and/or/xor prototypes above

-#undef hal_and8u
-#define hal_and8u slow_and8u
-#undef hal_or8u
-#define hal_or8u slow_or8u
-#undef hal_xor8u
-#define hal_xor8u slow_xor8u
-#undef hal_not8u
-#define hal_not8u slow_not8u
+#undef cv_hal_and8u // names now carry the cv_hal_ prefix (formerly hal_); each is undefined first, then defined as the matching slow_* function
+#define cv_hal_and8u slow_and8u
+#undef cv_hal_or8u
+#define cv_hal_or8u slow_or8u
+#undef cv_hal_xor8u
+#define cv_hal_xor8u slow_xor8u
+#undef cv_hal_not8u
+#define cv_hal_not8u slow_not8u

 #endif