Merge remote-tracking branch 'upstream/3.4' into merge-3.4

2025-06-08 01:53:19 +08:00 · 2018-05-03 14:30:38 +00:00 · 2018-05-03 14:30:38 +00:00 · 000a13b6a3
commit 000a13b6a3
parent 84b3b5b4a4 9fa01a7b70
37 changed files with 2148 additions and 6905 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -277,6 +277,7 @@ OCV_OPTION(WITH_GDAL           "Include GDAL Support"                        OFF
 OCV_OPTION(WITH_GPHOTO2        "Include gPhoto2 library support"             ON   IF (UNIX AND NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_LAPACK         "Include Lapack library support"              (NOT CV_DISABLE_OPTIMIZATION)  IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_ITT            "Include Intel ITT support"                   ON   IF (NOT APPLE_FRAMEWORK) )
+OCV_OPTION(WITH_PROTOBUF       "Enable libprotobuf"                          ON )

 # OpenCV build components
 # ===================================================
@ -523,16 +524,7 @@ set(OPENCV_EXTRA_MODULES_PATH "" CACHE PATH "Where to look for additional OpenCV
 find_host_package(Git QUIET)

 if(NOT DEFINED OPENCV_VCSVERSION AND GIT_FOUND)
-  execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9].[0-9]*"
-    WORKING_DIRECTORY "${OpenCV_SOURCE_DIR}"
-    OUTPUT_VARIABLE OPENCV_VCSVERSION
-    RESULT_VARIABLE GIT_RESULT
-    ERROR_QUIET
-    OUTPUT_STRIP_TRAILING_WHITESPACE
-  )
-  if(NOT GIT_RESULT EQUAL 0)
-    set(OPENCV_VCSVERSION "unknown")
-  endif()
+  ocv_git_describe(OPENCV_VCSVERSION "${OpenCV_SOURCE_DIR}")
 elseif(NOT DEFINED OPENCV_VCSVERSION)
  # We don't have git:
  set(OPENCV_VCSVERSION "unknown")
@ -928,19 +920,7 @@ if(OPENCV_EXTRA_MODULES_PATH AND NOT BUILD_INFO_SKIP_EXTRA_MODULES)
      else()
        status("")
      endif()
-      set(EXTRA_MODULES_VCSVERSION "unknown")
-      if(GIT_FOUND)
-        execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9].[0-9]*"
-          WORKING_DIRECTORY "${p}"
-          OUTPUT_VARIABLE EXTRA_MODULES_VCSVERSION
-          RESULT_VARIABLE GIT_RESULT
-          ERROR_QUIET
-          OUTPUT_STRIP_TRAILING_WHITESPACE
-        )
-        if(NOT GIT_RESULT EQUAL 0)
-          set(EXTRA_MODULES_VCSVERSION "unknown")
-        endif()
-      endif()
+      ocv_git_describe(EXTRA_MODULES_VCSVERSION "${p}")
      status("    Location (extra):" ${p})
      status("    Version control (extra):" ${EXTRA_MODULES_VCSVERSION})
    endif()
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@ -234,6 +234,7 @@ if(X86 OR X86_64)
  elseif(MSVC)
    ocv_update(CPU_AVX2_FLAGS_ON "/arch:AVX2")
    ocv_update(CPU_AVX_FLAGS_ON "/arch:AVX")
+    ocv_update(CPU_FP16_FLAGS_ON "/arch:AVX")
    if(NOT MSVC64)
      # 64-bit MSVC compiler uses SSE/SSE2 by default
      ocv_update(CPU_SSE_FLAGS_ON "/arch:SSE")
@ -422,6 +423,9 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
  if(__is_disabled)
    set(__is_from_baseline 0)
  else()
+    if(CPU_${OPT}_SUPPORTED AND CPU_BASELINE_DETECT)
+      list(APPEND CPU_BASELINE_FINAL ${OPT})
+    endif()
    ocv_is_optimization_in_list(__is_from_baseline ${OPT} ${CPU_BASELINE_REQUIRE})
    if(NOT __is_from_baseline)
      ocv_is_optimization_in_list(__is_from_baseline ${OPT} ${CPU_BASELINE})
@ -441,7 +445,9 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
    if(";${CPU_DISPATCH};" MATCHES ";${OPT};" AND NOT __is_from_baseline)
      list(APPEND CPU_DISPATCH_FINAL ${OPT})
    elseif(__is_from_baseline)
+      if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
        list(APPEND CPU_BASELINE_FINAL ${OPT})
+      endif()
      ocv_append_optimization_flag(CPU_BASELINE_FLAGS ${OPT})
    endif()
  endif()
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@ -104,7 +104,7 @@ if(CV_GCC OR CV_CLANG)
  add_extra_compiler_option(-Wuninitialized)
  add_extra_compiler_option(-Winit-self)
  if(HAVE_CXX11)
-    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT ENABLE_PRECOMPILED_HEADERS)
      add_extra_compiler_option(-Wsuggest-override)
    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
      add_extra_compiler_option(-Winconsistent-missing-override)
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@ -200,6 +200,9 @@ if(CUDA_FOUND)
      string(REPLACE "-frtti" "" ${var} "${${var}}")

      string(REPLACE "-fvisibility-inlines-hidden" "" ${var} "${${var}}")
+
+      # cc1: warning: command line option '-Wsuggest-override' is valid for C++/ObjC++ but not for C
+      string(REPLACE "-Wsuggest-override" "" ${var} "${${var}}")
    endforeach()
  endmacro()

--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@ -261,6 +261,13 @@ endif(WITH_DSHOW)
 ocv_clear_vars(HAVE_MSMF)
 if(WITH_MSMF)
  check_include_file(Mfapi.h HAVE_MSMF)
+  check_include_file(D3D11.h D3D11_found)
+  check_include_file(D3d11_4.h D3D11_4_found)
+  if(D3D11_found AND D3D11_4_found)
+    set(HAVE_DXVA YES)
+  else()
+    set(HAVE_DXVA NO)
+  endif()
 endif(WITH_MSMF)

 # --- Extra HighGUI and VideoIO libs on Windows ---
--- a/cmake/OpenCVFindProtobuf.cmake
+++ b/cmake/OpenCVFindProtobuf.cmake
@ -1,15 +1,14 @@
 # If protobuf is found - libprotobuf target is available

-ocv_option(WITH_PROTOBUF "Enable libprotobuf" ON)
-ocv_option(BUILD_PROTOBUF "Force to build libprotobuf from sources" ON)
-ocv_option(PROTOBUF_UPDATE_FILES "Force rebuilding .proto files (protoc should be available)" OFF)
-
 set(HAVE_PROTOBUF FALSE)

 if(NOT WITH_PROTOBUF)
  return()
 endif()

+ocv_option(BUILD_PROTOBUF "Force to build libprotobuf from sources" ON)
+ocv_option(PROTOBUF_UPDATE_FILES "Force rebuilding .proto files (protoc should be available)" OFF)
+
 function(get_protobuf_version version include)
  file(STRINGS "${include}/google/protobuf/stubs/common.h" ver REGEX "#define GOOGLE_PROTOBUF_VERSION [0-9]+")
  string(REGEX MATCHALL "[0-9]+" ver ${ver})
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@ -1620,3 +1620,40 @@ if(NOT CMAKE_VERSION VERSION_LESS 3.1)
 else()
  set(compatible_MESSAGE_NEVER "")
 endif()
+
+
+# Describe the git revision of a source tree.
+#
+# Usage:
+#   ocv_git_describe(<var_name> <path>)
+#
+#   var_name - name of the variable that receives the description
+#   path     - working directory in which `git describe` is executed
+#
+# The result is "unknown" when GIT_FOUND is false or when every
+# `git describe` attempt below fails. Attempts, in order:
+#   1. a tag that points exactly at the checked-out commit;
+#   2. the nearest version-like tag, skipping "*-cvsdk" tags;
+#   3. the nearest tag that looks like a complete version number.
+#
+# NOTE: this is a macro, so GIT_RESULT is also set in the caller's scope.
+macro(ocv_git_describe var_name path)
+  if(GIT_FOUND)
+    # 1. Prefer an exact tag match for the current commit.
+    execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --exact-match --dirty
+      WORKING_DIRECTORY "${path}"
+      OUTPUT_VARIABLE ${var_name}
+      RESULT_VARIABLE GIT_RESULT
+      ERROR_QUIET
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if(NOT GIT_RESULT EQUAL 0)
+      # 2. No exact tag: describe relative to the nearest version-like tag.
+      execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9].[0-9]*" --exclude "[^-]*-cvsdk"
+        WORKING_DIRECTORY "${path}"
+        OUTPUT_VARIABLE ${var_name}
+        RESULT_VARIABLE GIT_RESULT
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+      )
+      if(NOT GIT_RESULT EQUAL 0)  # --exclude is not supported by 'git'
+        # 3. Fallback without --exclude:
+        # match only tags with complete OpenCV versions (ignores -alpha/-beta/-rc suffixes)
+        execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9]*[0-9]"
+          WORKING_DIRECTORY "${path}"
+          OUTPUT_VARIABLE ${var_name}
+          RESULT_VARIABLE GIT_RESULT
+          ERROR_QUIET
+          OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+        if(NOT GIT_RESULT EQUAL 0)
+          set(${var_name} "unknown")
+        endif()
+      endif()
+    endif()
+  else()
+    # We don't have git.
+    set(${var_name} "unknown")
+  endif()
+endmacro()
--- a/cmake/checks/cpu_fp16.cpp
+++ b/cmake/checks/cpu_fp16.cpp
@ -1,6 +1,6 @@
 #include <stdio.h>

-#if defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700) || (defined __INTEL_COMPILER && defined __AVX__)
+#if defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700 && defined __AVX__) || (defined __INTEL_COMPILER && defined __AVX__)
 #include <immintrin.h>
 int test()
 {
--- a/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md
+++ b/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md
@ -190,3 +190,37 @@ In our case resize's output shape will be stored in layer's `blobs[0]`.
 Next we register a layer and try to import the model.

@snippet dnn/custom_layers.cpp Register ResizeBilinearLayer
+
+## Define a custom layer in Python
+The following example shows how to customize OpenCV's layers in Python.
+
+Let's consider the [Holistically-Nested Edge Detection](https://arxiv.org/abs/1504.06375)
+deep learning model. It was trained with a version of the
+[Caffe framework](http://caffe.berkeleyvision.org/) that differs from the current one
+in a single respect: `Crop` layers, which receive two input blobs and crop the first one
+to match the spatial dimensions of the second one, used to crop from the center.
+Nowadays Caffe's layer crops from the top-left corner, so with the latest version of
+Caffe or OpenCV you'll get shifted results with filled borders.
+
+Next we're going to replace OpenCV's `Crop` layer, which crops from the top-left corner,
+with one that crops from the center.
+
+- Create a class with `getMemoryShapes` and `forward` methods
+
+@snippet dnn/edge_detection.py CropLayer
+
+@note Both methods should return lists.
+
+- Register a new layer.
+
+@snippet dnn/edge_detection.py Register
+
+That's it! We've replaced one of OpenCV's built-in layers with a custom one.
+You can find the full script in the [source code](https://github.com/opencv/opencv/tree/master/samples/dnn/edge_detection.py).
+
+<table border="0">
+<tr>
+<td>![](js_tutorials/js_assets/lena.jpg)</td>
+<td>![](images/lena_hed.jpg)</td>
+</tr>
+</table>
--- a/doc/tutorials/dnn/images/lena_hed.jpg
+++ b/doc/tutorials/dnn/images/lena_hed.jpg
--- a/modules/core/include/opencv2/core/cvstd.inl.hpp
+++ b/modules/core/include/opencv2/core/cvstd.inl.hpp
@ -265,10 +265,10 @@ std::ostream& operator << (std::ostream& out, const Rect_<_Tp>& rect)

 static inline std::ostream& operator << (std::ostream& out, const MatSize& msize)
 {
-    int i, dims = msize.p[-1];
+    int i, dims = msize.dims();
    for( i = 0; i < dims; i++ )
    {
-        out << msize.p[i];
+        out << msize[i];
        if( i < dims-1 )
            out << " x ";
    }
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@ -236,6 +236,7 @@ public:
    bool isUMatVector() const;
    bool isMatx() const;
    bool isVector() const;
+    bool isGpuMat() const;
    bool isGpuMatVector() const;
    ~_InputArray();

@ -544,10 +545,11 @@ struct CV_EXPORTS UMatData
 struct CV_EXPORTS MatSize
 {
    explicit MatSize(int* _p);
+    int dims() const;
    Size operator()() const;
    const int& operator[](int i) const;
    int& operator[](int i);
-    operator const int*() const;
+    operator const int*() const;  // TODO OpenCV 4.0: drop this
    bool operator == (const MatSize& sz) const;
    bool operator != (const MatSize& sz) const;

--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@ -155,6 +155,7 @@ inline bool _InputArray::isMatx() const { return kind() == _InputArray::MATX; }
 inline bool _InputArray::isVector() const { return kind() == _InputArray::STD_VECTOR ||
                                                   kind() == _InputArray::STD_BOOL_VECTOR ||
                                                   kind() == _InputArray::STD_ARRAY; }
+inline bool _InputArray::isGpuMat() const { return kind() == _InputArray::CUDA_GPU_MAT; }
 inline bool _InputArray::isGpuMatVector() const { return kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT; }

 ////////////////////////////////////////////////////////////////////////////////////////
@ -1391,22 +1392,36 @@ inline
 MatSize::MatSize(int* _p)
    : p(_p) {}

+inline
+int MatSize::dims() const
+{
+    return (p - 1)[0];
+}
+
 inline
 Size MatSize::operator()() const
 {
-    CV_DbgAssert(p[-1] <= 2);
+    CV_DbgAssert(dims() <= 2);
    return Size(p[1], p[0]);
 }

 inline
 const int& MatSize::operator[](int i) const
 {
+    CV_DbgAssert(i < dims());
+#ifdef __OPENCV_BUILD
+    CV_DbgAssert(i >= 0);
+#endif
    return p[i];
 }

 inline
 int& MatSize::operator[](int i)
 {
+    CV_DbgAssert(i < dims());
+#ifdef __OPENCV_BUILD
+    CV_DbgAssert(i >= 0);
+#endif
    return p[i];
 }

@ -1419,8 +1434,8 @@ MatSize::operator const int*() const
 inline
 bool MatSize::operator == (const MatSize& sz) const
 {
-    int d = p[-1];
-    int dsz = sz.p[-1];
+    int d = dims();
+    int dsz = sz.dims();
    if( d != dsz )
        return false;
    if( d == 2 )
--- a/modules/core/src/convert_scale.cpp
+++ b/modules/core/src/convert_scale.cpp
@ -1885,12 +1885,23 @@ void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b,
    CV_INSTRUMENT_REGION()

    double scale = 1, shift = 0;
+    int type = _src.type(), depth = CV_MAT_DEPTH(type);
+
+    if( rtype < 0 )
+        rtype = _dst.fixedType() ? _dst.depth() : depth;
+
    if( norm_type == CV_MINMAX )
    {
        double smin = 0, smax = 0;
        double dmin = MIN( a, b ), dmax = MAX( a, b );
        minMaxIdx( _src, &smin, &smax, 0, 0, _mask );
        scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
+        if( rtype == CV_32F )
+        {
+            scale = (float)scale;
+            shift = (float)dmin - (float)(smin*scale);
+        }
+        else
            shift = dmin - smin*scale;
    }
    else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
@ -1902,10 +1913,6 @@ void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b,
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );

-    int type = _src.type(), depth = CV_MAT_DEPTH(type);
-    if( rtype < 0 )
-        rtype = _dst.fixedType() ? _dst.depth() : depth;
-
    CV_OCL_RUN(_dst.isUMat(),
               ocl_normalize(_src, _dst, _mask, rtype, scale, shift))

--- a/modules/core/src/cuda_gpu_mat.cpp
+++ b/modules/core/src/cuda_gpu_mat.cpp
@ -344,13 +344,12 @@ void cv::cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)

 GpuMat cv::cuda::getInputMat(InputArray _src, Stream& stream)
 {
-    GpuMat src;
-
 #ifndef HAVE_CUDA
    (void) _src;
    (void) stream;
    throw_no_cuda();
 #else
+    GpuMat src;
    if (_src.kind() == _InputArray::CUDA_GPU_MAT)
    {
        src = _src.getGpuMat();
@ -361,15 +360,12 @@ GpuMat cv::cuda::getInputMat(InputArray _src, Stream& stream)
        src = pool.getBuffer(_src.size(), _src.type());
        src.upload(_src, stream);
    }
-#endif
-
    return src;
+#endif
 }

 GpuMat cv::cuda::getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream)
 {
-    GpuMat dst;
-
 #ifndef HAVE_CUDA
    (void) _dst;
    (void) rows;
@ -378,6 +374,7 @@ GpuMat cv::cuda::getOutputMat(OutputArray _dst, int rows, int cols, int type, St
    (void) stream;
    throw_no_cuda();
 #else
+    GpuMat dst;
    if (_dst.kind() == _InputArray::CUDA_GPU_MAT)
    {
        _dst.create(rows, cols, type);
@ -388,9 +385,8 @@ GpuMat cv::cuda::getOutputMat(OutputArray _dst, int rows, int cols, int type, St
        BufferPool pool(stream);
        dst = pool.getBuffer(rows, cols, type);
    }
-#endif
-
    return dst;
+#endif
 }

 void cv::cuda::syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream)
--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@ -1005,6 +1005,16 @@ static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArra
                type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR :
                0) : 0;
+            if (cv::ipp::getIppTopFeatures() & (
+#if IPP_VERSION_X100 >= 201700
+                    ippCPUID_AVX512F |
+#endif
+                    ippCPUID_AVX2)
+            ) // IPP_DISABLE_NORM_16UC3_mask_small (#11399)
+            {
+                if (normType == NORM_L1 && type == CV_16UC3 && sz.width < 16)
+                    return false;
+            }
            if( ippiNormDiff_C3CMR )
            {
                Ipp64f norm1, norm2, norm3;
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@ -129,8 +129,6 @@
 #    define CV__EXCEPTION_PTR 0  // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938
 #  elif defined(CV_CXX11)
 #    define CV__EXCEPTION_PTR 1
-#  elif defined(CV_ICC)
-#    define CV__EXCEPTION_PTR 1
 #  elif defined(_MSC_VER)
 #    define CV__EXCEPTION_PTR (_MSC_VER >= 1600)
 #  elif defined(__clang__)
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@ -1918,6 +1918,25 @@ TEST(Normalize, regression_5876_inplace_change_type)
    EXPECT_EQ(0, cvtest::norm(m, result, NORM_INF));
 }

+TEST(Normalize, regression_6125)
+{
+    float initial_values[] = {
+        1888, 1692, 369, 263, 199,
+        280, 326, 129, 143, 126,
+        233, 221, 130, 126, 150,
+        249, 575, 574, 63, 12
+    };
+
+    Mat src(Size(20, 1), CV_32F, initial_values);
+    float min = 0., max = 400.;
+    normalize(src, src, 0, 400, NORM_MINMAX, CV_32F);
+    for(int i = 0; i < 20; i++)
+    {
+        EXPECT_GE(src.at<float>(i), min) << "Value should be >= 0";
+        EXPECT_LE(src.at<float>(i), max) << "Value should be <= 400";
+    }
+}
+
 TEST(MinMaxLoc, regression_4955_nans)
 {
    cv::Mat one_mat(2, 2, CV_32F, cv::Scalar(1));
@ -2084,4 +2103,22 @@ TEST(Core_Set, regression_11044)
    EXPECT_EQ(std::numeric_limits<double>::infinity(), testDouble.at<double>(0, 0));
 }

+TEST(Core_Norm, IPP_regression_NORM_L1_16UC3_small)
+{
+    int cn = 3;
+    Size sz(9, 4);  // width < 16
+    Mat a(sz, CV_MAKE_TYPE(CV_16U, cn), Scalar::all(1));
+    Mat b(sz, CV_MAKE_TYPE(CV_16U, cn), Scalar::all(2));
+    uchar mask_[9*4] = {
+ 255, 255, 255,   0, 255, 255,   0, 255,   0,
+   0, 255,   0,   0, 255, 255, 255, 255,   0,
+   0,   0,   0, 255,   0, 255,   0, 255, 255,
+   0,   0, 255,   0, 255, 255, 255,   0, 255
+};
+    Mat mask(sz, CV_8UC1, mask_);
+
+    EXPECT_EQ((double)9*4*cn, cv::norm(a, b, NORM_L1)); // without mask, IPP works well
+    EXPECT_EQ((double)20*cn, cv::norm(a, b, NORM_L1, mask));
+}
+
 }} // namespace
--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@ -134,7 +134,7 @@ static inline MatShape shape(const Mat& mat)

 static inline MatShape shape(const MatSize& sz)
 {
-    return shape(sz.p, sz[-1]);
+    return shape(sz.p, sz.dims());
 }

 static inline MatShape shape(const UMat& mat)
--- a/modules/dnn/misc/python/pyopencv_dnn.hpp
+++ b/modules/dnn/misc/python/pyopencv_dnn.hpp
@ -40,4 +40,182 @@ bool pyopencv_to(PyObject *o, std::vector<Mat> &blobs, const char *name) //requi
  return pyopencvVecConverter<Mat>::to(o, blobs, ArgInfo(name, false));
 }

-#endif
+template<typename T>
+PyObject* pyopencv_from(const dnn::DictValue &dv)
+{
+    if (dv.size() > 1)
+    {
+        std::vector<T> vec(dv.size());
+        for (int i = 0; i < dv.size(); ++i)
+            vec[i] = dv.get<T>(i);
+        return pyopencv_from_generic_vec(vec);
+    }
+    else
+        return pyopencv_from(dv.get<T>());
+}
+
+template<>
+PyObject* pyopencv_from(const dnn::DictValue &dv)
+{
+    if (dv.isInt()) return pyopencv_from<int>(dv);
+    if (dv.isReal()) return pyopencv_from<float>(dv);
+    if (dv.isString()) return pyopencv_from<String>(dv);
+    CV_Error(Error::StsNotImplemented, "Unknown value type");
+    return NULL;
+}
+
+// Converts layer parameters into a new Python dict that maps each parameter
+// name to its converted value. Returns a new reference; throws cv::Exception
+// if a value cannot be converted or stored.
+template<>
+PyObject* pyopencv_from(const dnn::LayerParams& lp)
+{
+    PyObject* dict = PyDict_New();
+    CV_Assert(dict);
+    for (std::map<String, dnn::DictValue>::const_iterator it = lp.begin(); it != lp.end(); ++it)
+    {
+        PyObject* value = NULL;
+        try
+        {
+            value = pyopencv_from(it->second);
+        }
+        catch (...)
+        {
+            // Do not leak the partially filled dict when the conversion throws.
+            Py_DECREF(dict);
+            throw;
+        }
+        // PyDict_SetItemString() takes its own reference to the value (it does not
+        // steal ours), so the reference returned by pyopencv_from() must be dropped.
+        const bool stored = value && PyDict_SetItemString(dict, it->first.c_str(), value) == 0;
+        Py_XDECREF(value);
+        if (!stored)
+        {
+            Py_DECREF(dict);
+            CV_Error(Error::StsAssert, "Failed to store a layer parameter in the Python dict");
+        }
+    }
+    return dict;
+}
+
+// dnn::Layer implementation that forwards getMemoryShapes() and forward() to
+// an instance of a user-provided Python class. Python classes are kept in a
+// per-type stack (pyLayers); the most recently registered class is used.
+class pycvLayer CV_FINAL : public dnn::Layer
+{
+    // Holds the Python GIL for the lifetime of the object, so the GIL is
+    // released even when a cv::Exception unwinds the stack.
+    struct GILHolder
+    {
+        GILHolder() : state(PyGILState_Ensure()) {}
+        ~GILHolder() { PyGILState_Release(state); }
+        PyGILState_STATE state;
+    private:
+        GILHolder(const GILHolder&);
+        GILHolder& operator=(const GILHolder&);
+    };
+
+    // Owns one reference to a Python object (may be NULL) and drops it on
+    // destruction. Must be destroyed while the GIL is held.
+    struct OwnedRef
+    {
+        explicit OwnedRef(PyObject* p) : ptr(p) {}
+        ~OwnedRef() { Py_XDECREF(ptr); }
+        PyObject* ptr;
+    private:
+        OwnedRef(const OwnedRef&);
+        OwnedRef& operator=(const OwnedRef&);
+    };
+
+public:
+    // Instantiates the Python class `pyLayer` as pyLayer(params, blobs).
+    // Throws cv::Exception if the instance cannot be created.
+    pycvLayer(const dnn::LayerParams &params, PyObject* pyLayer) : Layer(params), o(NULL)
+    {
+        GILHolder gil;
+
+        OwnedRef args(PyTuple_New(2));
+        CV_Assert(args.ptr);
+        // PyTuple_SetItem() steals the reference to the inserted item.
+        CV_Assert(!PyTuple_SetItem(args.ptr, 0, pyopencv_from(params)));
+        CV_Assert(!PyTuple_SetItem(args.ptr, 1, pyopencv_from(params.blobs)));
+        o = PyObject_CallObject(pyLayer, args.ptr);
+
+        if (!o)
+            CV_Error(Error::StsError, "Failed to create an instance of custom layer");
+    }
+
+    ~pycvLayer()
+    {
+        // Drop the reference returned by PyObject_CallObject() in the constructor.
+        // Skipped once the interpreter is gone: the GIL API is unusable then.
+        if (o && Py_IsInitialized())
+        {
+            GILHolder gil;
+            Py_DECREF(o);
+        }
+    }
+
+    // Pushes the Python class `o` onto the stack of implementations of `type`.
+    // The registry keeps its own reference to the class.
+    // Must be called with the GIL held (it is invoked from a Python entry point).
+    static void registerLayer(const std::string& type, PyObject* o)
+    {
+        Py_INCREF(o);
+        std::map<std::string, std::vector<PyObject*> >::iterator it = pyLayers.find(type);
+        if (it != pyLayers.end())
+            it->second.push_back(o);
+        else
+            pyLayers[type] = std::vector<PyObject*>(1, o);
+    }
+
+    // Pops the most recently registered Python class of `type`, if any, and
+    // releases the registry's reference to it.
+    // Must be called with the GIL held (it is invoked from a Python entry point).
+    static void unregisterLayer(const std::string& type)
+    {
+        std::map<std::string, std::vector<PyObject*> >::iterator it = pyLayers.find(type);
+        if (it != pyLayers.end())
+        {
+            if (!it->second.empty())
+                Py_DECREF(it->second.back());
+            if (it->second.size() > 1)
+                it->second.pop_back();
+            else
+                pyLayers.erase(it);
+        }
+    }
+
+    // Factory callback: creates a layer backed by the most recently registered
+    // Python class for params.type.
+    static Ptr<dnn::Layer> create(dnn::LayerParams &params)
+    {
+        std::map<std::string, std::vector<PyObject*> >::iterator it = pyLayers.find(params.type);
+        if (it == pyLayers.end())
+            CV_Error(Error::StsNotImplemented, "Layer with a type \"" + params.type +
+                                               "\" is not implemented");
+        CV_Assert(!it->second.empty());
+        return Ptr<dnn::Layer>(new pycvLayer(params, it->second.back()));
+    }
+
+    // Calls the Python method getMemoryShapes(inputs) and stores the returned
+    // list of shapes in `outputs`. Always returns false.
+    virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
+                                 const int,
+                                 std::vector<std::vector<int> > &outputs,
+                                 std::vector<std::vector<int> > &) const CV_OVERRIDE
+    {
+        // The GIL stays held until the result has been converted and released.
+        GILHolder gil;
+
+        OwnedRef args(PyList_New(inputs.size()));
+        CV_Assert(args.ptr);
+        for(size_t i = 0; i < inputs.size(); ++i)
+            PyList_SET_ITEM(args.ptr, i, pyopencv_from_generic_vec(inputs[i]));
+
+        OwnedRef name(PyString_FromString("getMemoryShapes"));
+        OwnedRef res(PyObject_CallMethodObjArgs(o, name.ptr, args.ptr, NULL));
+        if (!res.ptr)
+            CV_Error(Error::StsNotImplemented, "Failed to call \"getMemoryShapes\" method");
+        pyopencv_to_generic_vec(res.ptr, outputs, ArgInfo("", 0));
+        return false;
+    }
+
+    // Calls the Python method forward(inputs) and copies the returned arrays
+    // into `outputs`. The returned arrays must match `outputs` in count, size
+    // and type.
+    virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &) CV_OVERRIDE
+    {
+        std::vector<Mat> inps(inputs.size());
+        for (size_t i = 0; i < inputs.size(); ++i)
+            inps[i] = *inputs[i];
+
+        // The GIL stays held while Python objects are created, converted and
+        // released; locals below are destroyed before the GIL is dropped.
+        GILHolder gil;
+
+        OwnedRef args(pyopencv_from(inps));
+        CV_Assert(args.ptr);
+        OwnedRef name(PyString_FromString("forward"));
+        OwnedRef res(PyObject_CallMethodObjArgs(o, name.ptr, args.ptr, NULL));
+        if (!res.ptr)
+            CV_Error(Error::StsNotImplemented, "Failed to call \"forward\" method");
+
+        std::vector<Mat> pyOutputs;
+        pyopencv_to(res.ptr, pyOutputs, ArgInfo("", 0));
+
+        CV_Assert(pyOutputs.size() == outputs.size());
+        for (size_t i = 0; i < outputs.size(); ++i)
+        {
+            CV_Assert(pyOutputs[i].size == outputs[i].size);
+            CV_Assert(pyOutputs[i].type() == outputs[i].type());
+            pyOutputs[i].copyTo(outputs[i]);
+        }
+    }
+
+    // The array-based overload is not supported for Python layers.
+    virtual void forward(InputArrayOfArrays, OutputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, "");
+    }
+
+private:
+    // The layer owns a reference to `o`, so copying is disabled.
+    pycvLayer(const pycvLayer&);
+    pycvLayer& operator=(const pycvLayer&);
+
+    // Map layers types to python classes.
+    static std::map<std::string, std::vector<PyObject*> > pyLayers;
+    PyObject* o;  // Instance of implemented python layer.
+};
+
+std::map<std::string, std::vector<PyObject*> > pycvLayer::pyLayers;
+
+static PyObject *pyopencv_cv_dnn_registerLayer(PyObject*, PyObject *args, PyObject *kw)
+{
+    const char *keywords[] = { "type", "class", NULL };
+    char* layerType;
+    PyObject *classInstance;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kw, "sO", (char**)keywords, &layerType, &classInstance))
+        return NULL;
+    if (!PyCallable_Check(classInstance)) {
+        PyErr_SetString(PyExc_TypeError, "class must be callable");
+        return NULL;
+    }
+
+    pycvLayer::registerLayer(layerType, classInstance);
+    dnn::LayerFactory::registerLayer(layerType, pycvLayer::create);
+    Py_RETURN_NONE;
+}
+
+static PyObject *pyopencv_cv_dnn_unregisterLayer(PyObject*, PyObject *args, PyObject *kw)
+{
+    const char *keywords[] = { "type", NULL };
+    char* layerType;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kw, "s", (char**)keywords, &layerType))
+        return NULL;
+
+    pycvLayer::unregisterLayer(layerType);
+    dnn::LayerFactory::unregisterLayer(layerType);
+    Py_RETURN_NONE;
+}
+
+#endif  // HAVE_OPENCV_DNN
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@ -329,6 +329,7 @@ struct ReLUFunctor
    {
        lp.type = "ReLU";
        std::shared_ptr<InferenceEngine::ReLULayer> ieLayer(new InferenceEngine::ReLULayer(lp));
+        ieLayer->negative_slope = slope;
        return ieLayer;
    }
 #endif  // HAVE_INF_ENGINE
--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@ -205,7 +205,9 @@ public:

        if (_explicitSizes)
        {
-            CV_Assert(_aspectRatios.empty(), !params.has("min_size"), !params.has("max_size"));
+            CV_Assert(_aspectRatios.empty());
+            CV_Assert(!params.has("min_size"));
+            CV_Assert(!params.has("max_size"));
            _boxWidths = widths;
            _boxHeights = heights;
        }
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -161,14 +161,14 @@ public:

        for (int i = 0; i < outputs.size(); ++i)
        {
-            CV_Assert(sliceRanges[i].size() <= inpShape[-1]);
+            CV_Assert(sliceRanges[i].size() <= inpShape.dims());
            // Clamp.
            for (int j = 0; j < sliceRanges[i].size(); ++j)
            {
                sliceRanges[i][j] = clamp(sliceRanges[i][j], inpShape[j]);
            }
            // Fill the rest of ranges.
-            for (int j = sliceRanges[i].size(); j < inpShape[-1]; ++j)
+            for (int j = sliceRanges[i].size(); j < inpShape.dims(); ++j)
            {
                sliceRanges[i].push_back(Range::all());
            }
--- a/modules/dnn/src/op_halide.cpp
+++ b/modules/dnn/src/op_halide.cpp
@ -6,6 +6,7 @@
 // Third party copyrights are property of their respective owners.

 #include "precomp.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
 #include "op_halide.hpp"

 #ifdef HAVE_HALIDE
@ -36,7 +37,7 @@ static MatShape getBufferShape(const MatShape& shape)

 static MatShape getBufferShape(const MatSize& size)
 {
-    return getBufferShape(MatShape(size.p, size.p + size[-1]));
+    return getBufferShape(shape(size));
 }

 Halide::Buffer<float> wrapToHalideBuffer(const Mat& mat)
@ -160,7 +161,7 @@ void HalideBackendWrapper::setHostDirty()

 void getCanonicalSize(const MatSize& size, int* w, int* h, int* c, int* n)
 {
-    getCanonicalSize(MatShape(size.p, size.p + size[-1]), w, h, c, n);
+    getCanonicalSize(shape(size), w, h, c, n);
 }

 void getCanonicalSize(const MatShape& shape, int* width, int* height,
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@ -54,16 +54,7 @@ namespace clahe
        const int tilesX, const int tilesY, const cv::Size tileSize,
        const int clipLimit, const float lutScale)
    {
-        cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
-
-        bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
-        cv::String opts;
-        if(is_cpu)
-            opts = "-D CPU ";
-        else
-            opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple());
-
-        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts);
+        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
        if(k.empty())
            return false;

--- a/modules/imgproc/src/fixedpoint.inl.hpp
+++ b/modules/imgproc/src/fixedpoint.inl.hpp
@ -55,9 +55,11 @@ public:
    CV_ALWAYS_INLINE fixedpoint64 operator * (const int32_t& val2) const { return operator *(fixedpoint64(val2)); }
    CV_ALWAYS_INLINE fixedpoint64 operator * (const fixedpoint64& val2) const
    {
-        uint64_t uval = (uint64_t)((val ^ (val >> 63)) - (val >> 63));
-        uint64_t umul = (uint64_t)((val2.val ^ (val2.val >> 63)) - (val2.val >> 63));
-        int64_t ressign = (val >> 63) ^ (val2.val >> 63);
+        bool sign_val = val < 0;
+        bool sign_mul = val2.val < 0;
+        uint64_t uval = sign_val ? (uint64_t)(-val) : (uint64_t)val;
+        uint64_t umul = sign_mul ? (uint64_t)(-val2.val) : (uint64_t)val2.val;
+        bool ressign = sign_val ^ sign_mul;

        uint64_t sh0   = fixedround((uval & 0xFFFFFFFF) * (umul & 0xFFFFFFFF));
        uint64_t sh1_0 = (uval >> 32)        * (umul & 0xFFFFFFFF);
@ -67,33 +69,37 @@ public:
        uint64_t val0_h = (sh2   & 0xFFFFFFFF) + (sh1_0 >> 32) + (sh1_1 >> 32) + (val0_l >> 32);
        val0_l &= 0xFFFFFFFF;

-        if ( (sh2 >> 32) || (val0_h >> ressign ? 32 : 31) )
-            return (ressign ? ~(int64_t)0x7FFFFFFFFFFFFFFF : (int64_t)0x7FFFFFFFFFFFFFFF);
+        if (sh2 > CV_BIG_INT(0x7FFFFFFF) || val0_h > CV_BIG_INT(0x7FFFFFFF))
+            return (int64_t)(ressign ? CV_BIG_UINT(0x8000000000000000) : CV_BIG_INT(0x7FFFFFFFFFFFFFFF));

        if (ressign)
        {
-            val0_l = (~val0_l + 1) & 0xFFFFFFFF;
-            val0_h = val0_l ? ~val0_h : (~val0_h + 1);
+            return -(int64_t)(val0_h << 32 | val0_l);
        }
        return (int64_t)(val0_h << 32 | val0_l);
    }
    CV_ALWAYS_INLINE fixedpoint64 operator + (const fixedpoint64& val2) const
    {
        int64_t res = val + val2.val;
-        return ((val ^ res) & (val2.val ^ res)) >> 63 ? ~(res & ~0x7FFFFFFFFFFFFFFF) : res;
+        return (int64_t)(((val ^ res) & (val2.val ^ res)) < 0 ? ~(res & CV_BIG_UINT(0x8000000000000000)) : res);
    }
    CV_ALWAYS_INLINE fixedpoint64 operator - (const fixedpoint64& val2) const
    {
        int64_t res = val - val2.val;
-        return ((val ^ val2.val) & (val ^ res)) >> 63 ? ~(res & ~0x7FFFFFFFFFFFFFFF) : res;
+        return (int64_t)(((val ^ val2.val) & (val ^ res)) < 0 ? ~(res & CV_BIG_UINT(0x8000000000000000)) : res);
    }
    CV_ALWAYS_INLINE fixedpoint64 operator >> (int n) const { return fixedpoint64(val >> n); }
    CV_ALWAYS_INLINE fixedpoint64 operator << (int n) const { return fixedpoint64(val << n); }
    CV_ALWAYS_INLINE bool operator == (const fixedpoint64& val2) const { return val == val2.val; }
    template <typename ET>
-    CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>((int64_t)fixedround((uint64_t)val) >> fixedShift); }
+    CV_ALWAYS_INLINE ET saturate_cast() const { return cv::saturate_cast<ET>((int64_t)fixedround((uint64_t)val) >> fixedShift); }
    CV_ALWAYS_INLINE operator double() const { return (double)val / (1LL << fixedShift); }
    CV_ALWAYS_INLINE operator float() const { return (float)val / (1LL << fixedShift); }
+    CV_ALWAYS_INLINE operator uint8_t() const { return saturate_cast<uint8_t>(); }
+    CV_ALWAYS_INLINE operator int8_t() const { return saturate_cast<int8_t>(); }
+    CV_ALWAYS_INLINE operator uint16_t() const { return saturate_cast<uint16_t>(); }
+    CV_ALWAYS_INLINE operator int16_t() const { return saturate_cast<int16_t>(); }
+    CV_ALWAYS_INLINE operator int32_t() const { return saturate_cast<int32_t>(); }
    CV_ALWAYS_INLINE bool isZero() { return val == 0; }
    static CV_ALWAYS_INLINE fixedpoint64 zero() { return fixedpoint64(); }
    static CV_ALWAYS_INLINE fixedpoint64 one() { return fixedpoint64((int64_t)(1LL << fixedShift)); }
@ -133,15 +139,15 @@ public:
        uint64_t val0_h = (sh2 & 0xFFFFFFFF) + (sh1_0 >> 32) + (sh1_1 >> 32) + (val0_l >> 32);
        val0_l &= 0xFFFFFFFF;

-        if ((sh2 >> 32) || (val0_h >> 32))
-            return ((uint64_t)0xFFFFFFFFFFFFFFFF);
+        if (sh2 > CV_BIG_INT(0xFFFFFFFF) || val0_h > CV_BIG_INT(0xFFFFFFFF))
+            return (uint64_t)CV_BIG_UINT(0xFFFFFFFFFFFFFFFF);

-        return val0_h << 32 | val0_l;
+        return (val0_h << 32 | val0_l);
    }
    CV_ALWAYS_INLINE ufixedpoint64 operator + (const ufixedpoint64& val2) const
    {
        uint64_t res = val + val2.val;
-        return (val > res) ? (uint64_t)0xFFFFFFFFFFFFFFFF : res;
+        return (uint64_t)((val > res) ? CV_BIG_UINT(0xFFFFFFFFFFFFFFFF) : res);
    }
    CV_ALWAYS_INLINE ufixedpoint64 operator - (const ufixedpoint64& val2) const
    {
@ -151,9 +157,14 @@ public:
    CV_ALWAYS_INLINE ufixedpoint64 operator << (int n) const { return ufixedpoint64(val << n); }
    CV_ALWAYS_INLINE bool operator == (const ufixedpoint64& val2) const { return val == val2.val; }
    template <typename ET>
-    CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>(fixedround(val) >> fixedShift); }
+    CV_ALWAYS_INLINE ET saturate_cast() const { return cv::saturate_cast<ET>(fixedround(val) >> fixedShift); }
    CV_ALWAYS_INLINE operator double() const { return (double)val / (1LL << fixedShift); }
    CV_ALWAYS_INLINE operator float() const { return (float)val / (1LL << fixedShift); }
+    CV_ALWAYS_INLINE operator uint8_t() const { return saturate_cast<uint8_t>(); }
+    CV_ALWAYS_INLINE operator int8_t() const { return saturate_cast<int8_t>(); }
+    CV_ALWAYS_INLINE operator uint16_t() const { return saturate_cast<uint16_t>(); }
+    CV_ALWAYS_INLINE operator int16_t() const { return saturate_cast<int16_t>(); }
+    CV_ALWAYS_INLINE operator int32_t() const { return saturate_cast<int32_t>(); }
    CV_ALWAYS_INLINE bool isZero() { return val == 0; }
    static CV_ALWAYS_INLINE ufixedpoint64 zero() { return ufixedpoint64(); }
    static CV_ALWAYS_INLINE ufixedpoint64 one() { return ufixedpoint64((uint64_t)(1ULL << fixedShift)); }
@ -187,21 +198,26 @@ public:
    CV_ALWAYS_INLINE fixedpoint32 operator + (const fixedpoint32& val2) const
    {
        int32_t res = val + val2.val;
-        return ((val ^ res) & (val2.val ^ res)) >> 31 ? ~(res & ~0x7FFFFFFF) : res;
+        // Overflow iff res differs in sign from both operands. A wrapped-negative res means the true
+        // sum exceeded the maximum (clamp to 0x7FFFFFFF); otherwise clamp to the minimum (~0x7FFFFFFF).
+        return (int64_t)((val ^ res) & (val2.val ^ res)) >> 31 ? (res < 0 ? 0x7FFFFFFF : ~0x7FFFFFFF) : res;
    }
    CV_ALWAYS_INLINE fixedpoint32 operator - (const fixedpoint32& val2) const
    {
        int32_t res = val - val2.val;
-        return ((val ^ val2.val) & (val ^ res)) >> 31 ? ~(res & ~0x7FFFFFFF) : res;
+        // Overflow iff the operands differ in sign and res differs in sign from val. A wrapped-negative
+        // res means the true difference exceeded the maximum; otherwise clamp to the minimum (~0x7FFFFFFF).
+        return (int64_t)((val ^ val2.val) & (val ^ res)) >> 31 ? (res < 0 ? 0x7FFFFFFF : ~0x7FFFFFFF) : res;
    }
    CV_ALWAYS_INLINE fixedpoint32 operator >> (int n) const { return fixedpoint32(val >> n); }
    CV_ALWAYS_INLINE fixedpoint32 operator << (int n) const { return fixedpoint32(val << n); }
    CV_ALWAYS_INLINE bool operator == (const fixedpoint32& val2) const { return val == val2.val; }
    template <typename ET>
-    CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>((int32_t)fixedround((uint32_t)val) >> fixedShift); }
+    CV_ALWAYS_INLINE ET saturate_cast() const { return cv::saturate_cast<ET>((int32_t)fixedround((uint32_t)val) >> fixedShift); }
    CV_ALWAYS_INLINE operator fixedpoint64() const { return (int64_t)val << (fixedpoint64::fixedShift - fixedShift); }
    CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); }
    CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); }
+    CV_ALWAYS_INLINE operator uint8_t() const { return saturate_cast<uint8_t>(); }
+    CV_ALWAYS_INLINE operator int8_t() const { return saturate_cast<int8_t>(); }
+    CV_ALWAYS_INLINE operator uint16_t() const { return saturate_cast<uint16_t>(); }
+    CV_ALWAYS_INLINE operator int16_t() const { return saturate_cast<int16_t>(); }
+    CV_ALWAYS_INLINE operator int32_t() const { return saturate_cast<int32_t>(); }
    CV_ALWAYS_INLINE bool isZero() { return val == 0; }
    static CV_ALWAYS_INLINE fixedpoint32 zero() { return fixedpoint32(); }
    static CV_ALWAYS_INLINE fixedpoint32 one() { return fixedpoint32((1 << fixedShift)); }
@ -242,10 +258,15 @@ public:
    CV_ALWAYS_INLINE ufixedpoint32 operator << (int n) const { return ufixedpoint32(val << n); }
    CV_ALWAYS_INLINE bool operator == (const ufixedpoint32& val2) const { return val == val2.val; }
    template <typename ET>
-    CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>(fixedround(val) >> fixedShift); }
+    CV_ALWAYS_INLINE ET saturate_cast() const { return cv::saturate_cast<ET>(fixedround(val) >> fixedShift); }
    CV_ALWAYS_INLINE operator ufixedpoint64() const { return (uint64_t)val << (ufixedpoint64::fixedShift - fixedShift); }
    CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); }
    CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); }
+    CV_ALWAYS_INLINE operator uint8_t() const { return saturate_cast<uint8_t>(); }
+    CV_ALWAYS_INLINE operator int8_t() const { return saturate_cast<int8_t>(); }
+    CV_ALWAYS_INLINE operator uint16_t() const { return saturate_cast<uint16_t>(); }
+    CV_ALWAYS_INLINE operator int16_t() const { return saturate_cast<int16_t>(); }
+    CV_ALWAYS_INLINE operator int32_t() const { return saturate_cast<int32_t>(); }
    CV_ALWAYS_INLINE bool isZero() { return val == 0; }
    static CV_ALWAYS_INLINE ufixedpoint32 zero() { return ufixedpoint32(); }
    static CV_ALWAYS_INLINE ufixedpoint32 one() { return ufixedpoint32((1U << fixedShift)); }
@ -284,10 +305,15 @@ public:
    CV_ALWAYS_INLINE fixedpoint16 operator << (int n) const { return fixedpoint16((int16_t)(val << n)); }
    CV_ALWAYS_INLINE bool operator == (const fixedpoint16& val2) const { return val == val2.val; }
    template <typename ET>
-    CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>((int16_t)fixedround((uint16_t)val) >> fixedShift); }
+    CV_ALWAYS_INLINE ET saturate_cast() const { return cv::saturate_cast<ET>((int16_t)fixedround((uint16_t)val) >> fixedShift); }
    CV_ALWAYS_INLINE operator fixedpoint32() const { return (int32_t)val << (fixedpoint32::fixedShift - fixedShift); }
    CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); }
    CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); }
+    CV_ALWAYS_INLINE operator uint8_t() const { return saturate_cast<uint8_t>(); }
+    CV_ALWAYS_INLINE operator int8_t() const { return saturate_cast<int8_t>(); }
+    CV_ALWAYS_INLINE operator uint16_t() const { return saturate_cast<uint16_t>(); }
+    CV_ALWAYS_INLINE operator int16_t() const { return saturate_cast<int16_t>(); }
+    CV_ALWAYS_INLINE operator int32_t() const { return saturate_cast<int32_t>(); }
    CV_ALWAYS_INLINE bool isZero() { return val == 0; }
    static CV_ALWAYS_INLINE fixedpoint16 zero() { return fixedpoint16(); }
    static CV_ALWAYS_INLINE fixedpoint16 one() { return fixedpoint16((int16_t)(1 << fixedShift)); }
@ -324,10 +350,15 @@ public:
    CV_ALWAYS_INLINE ufixedpoint16 operator << (int n) const { return ufixedpoint16((uint16_t)(val << n)); }
    CV_ALWAYS_INLINE bool operator == (const ufixedpoint16& val2) const { return val == val2.val; }
    template <typename ET>
-    CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>(fixedround(val) >> fixedShift); }
+    CV_ALWAYS_INLINE ET saturate_cast() const { return cv::saturate_cast<ET>(fixedround(val) >> fixedShift); }
    CV_ALWAYS_INLINE operator ufixedpoint32() const { return (uint32_t)val << (ufixedpoint32::fixedShift - fixedShift); }
    CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); }
    CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); }
+    CV_ALWAYS_INLINE operator uint8_t() const { return saturate_cast<uint8_t>(); }
+    CV_ALWAYS_INLINE operator int8_t() const { return saturate_cast<int8_t>(); }
+    CV_ALWAYS_INLINE operator uint16_t() const { return saturate_cast<uint16_t>(); }
+    CV_ALWAYS_INLINE operator int16_t() const { return saturate_cast<int16_t>(); }
+    CV_ALWAYS_INLINE operator int32_t() const { return saturate_cast<int32_t>(); }
    CV_ALWAYS_INLINE bool isZero() { return val == 0; }
    static CV_ALWAYS_INLINE ufixedpoint16 zero() { return ufixedpoint16(); }
    static CV_ALWAYS_INLINE ufixedpoint16 one() { return ufixedpoint16((uint16_t)(1 << fixedShift)); }
--- a/modules/imgproc/src/opencl/clahe.cl
+++ b/modules/imgproc/src/opencl/clahe.cl
@ -43,10 +43,6 @@
 //
 //M*/

-#ifndef WAVE_SIZE
-#define WAVE_SIZE 1
-#endif
-
 inline int calc_lut(__local int* smem, int val, int tid)
 {
    smem[tid] = val;
@ -60,8 +56,7 @@ inline int calc_lut(__local int* smem, int val, int tid)
    return smem[tid];
 }

-#ifdef CPU
-inline void reduce(volatile __local int* smem, int val, int tid)
+inline int reduce(__local volatile int* smem, int val, int tid)
 {
    smem[tid] = val;
    barrier(CLK_LOCAL_MEM_FENCE);
@ -75,69 +70,39 @@ inline void reduce(volatile __local int* smem, int val, int tid)
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 32)
+    {
        smem[tid] += smem[tid + 32];
+    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 16)
+    {
        smem[tid] += smem[tid + 16];
+    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 8)
+    {
        smem[tid] += smem[tid + 8];
+    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 4)
+    {
        smem[tid] += smem[tid + 4];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 2)
-        smem[tid] += smem[tid + 2];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 1)
-        smem[256] = smem[tid] + smem[tid + 1];
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-#else
-
-inline void reduce(__local volatile int* smem, int val, int tid)
-{
-    smem[tid] = val;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 128)
-        smem[tid] = val += smem[tid + 128];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 64)
-        smem[tid] = val += smem[tid + 64];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 32)
-    {
-        smem[tid] += smem[tid + 32];
-#if WAVE_SIZE < 32
-    } barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 16)
-    {
-#endif
-        smem[tid] += smem[tid + 16];
-#if WAVE_SIZE < 16
    }
    barrier(CLK_LOCAL_MEM_FENCE);

-    if (tid < 8)
+    if (tid == 0)
    {
-#endif
-        smem[tid] += smem[tid + 8];
-        smem[tid] += smem[tid + 4];
-        smem[tid] += smem[tid + 2];
-        smem[tid] += smem[tid + 1];
+        smem[0] = (smem[0] + smem[1]) + (smem[2] + smem[3]);
    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    val = smem[0];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return val;
 }
-#endif

 __kernel void calcLut(__global __const uchar * src, const int srcStep,
                      const int src_offset, __global uchar * lut,
@ -179,14 +144,7 @@ __kernel void calcLut(__global __const uchar * src, const int srcStep,
        }

        // find number of overall clipped samples
-        reduce(smem, clipped, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef CPU
-        clipped = smem[256];
-#else
-        clipped = smem[0];
-#endif
-        barrier(CLK_LOCAL_MEM_FENCE);
+        clipped = reduce(smem, clipped, tid);

        // redistribute clipped samples evenly
        int redistBatch = clipped / 256;
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@ -1783,6 +1783,10 @@ static PyMethodDef special_methods[] = {
  {"createTrackbar", pycvCreateTrackbar, METH_VARARGS, "createTrackbar(trackbarName, windowName, value, count, onChange) -> None"},
  {"createButton", (PyCFunction)pycvCreateButton, METH_VARARGS | METH_KEYWORDS, "createButton(buttonName, onChange [, userData, buttonType, initialButtonState]) -> None"},
  {"setMouseCallback", (PyCFunction)pycvSetMouseCallback, METH_VARARGS | METH_KEYWORDS, "setMouseCallback(windowName, onMouse [, param]) -> None"},
+#endif
+#ifdef HAVE_OPENCV_DNN
+  {"dnn_registerLayer", (PyCFunction)pyopencv_cv_dnn_registerLayer, METH_VARARGS | METH_KEYWORDS, "registerLayer(type, class) -> None"},
+  {"dnn_unregisterLayer", (PyCFunction)pyopencv_cv_dnn_unregisterLayer, METH_VARARGS | METH_KEYWORDS, "unregisterLayer(type) -> None"},
 #endif
  {NULL, NULL},
 };
--- a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
@ -145,6 +145,18 @@ private:
 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
    std::vector<cuda::GpuMat> gpu_dst_pyr_laplace_;
    std::vector<cuda::GpuMat> gpu_dst_band_weights_;
+    std::vector<Point> gpu_tl_points_;
+    std::vector<cuda::GpuMat> gpu_imgs_with_border_;
+    std::vector<std::vector<cuda::GpuMat> > gpu_weight_pyr_gauss_vec_;
+    std::vector<std::vector<cuda::GpuMat> > gpu_src_pyr_laplace_vec_;
+    std::vector<std::vector<cuda::GpuMat> > gpu_ups_;
+    cuda::GpuMat gpu_dst_mask_;
+    cuda::GpuMat gpu_mask_;
+    cuda::GpuMat gpu_img_;
+    cuda::GpuMat gpu_weight_map_;
+    cuda::GpuMat gpu_add_mask_;
+    int gpu_feed_idx_;
+    bool gpu_initialized_;
 #endif
 };

--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@ -221,6 +221,7 @@ MultiBandBlender::MultiBandBlender(int try_gpu, int num_bands, int weight_type)

 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
    can_use_gpu_ = try_gpu && cuda::getCudaEnabledDeviceCount();
+    gpu_feed_idx_ = 0;
 #else
    (void) try_gpu;
    can_use_gpu_ = false;
@ -248,6 +249,15 @@ void MultiBandBlender::prepare(Rect dst_roi)
 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
    if (can_use_gpu_)
    {
+        gpu_initialized_ = false;
+        gpu_feed_idx_ = 0;
+
+        gpu_tl_points_.clear();
+        gpu_weight_pyr_gauss_vec_.clear();
+        gpu_src_pyr_laplace_vec_.clear();
+        gpu_ups_.clear();
+        gpu_imgs_with_border_.clear();
+
        gpu_dst_pyr_laplace_.resize(num_bands_ + 1);
        gpu_dst_pyr_laplace_[0].create(dst_roi.size(), CV_16SC3);
        gpu_dst_pyr_laplace_[0].setTo(Scalar::all(0));
@ -320,7 +330,37 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
    int64 t = getTickCount();
 #endif

-    UMat img = _img.getUMat();
+    UMat img;
+
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    // If using gpu save the top left coordinate when running first time after prepare
+    if (can_use_gpu_)
+    {
+        if (!gpu_initialized_)
+        {
+            gpu_tl_points_.push_back(tl);
+        }
+        else
+        {
+            tl = gpu_tl_points_[gpu_feed_idx_];
+        }
+    }
+    // If _img is not a GpuMat get it as UMat from the InputArray object.
+    // If it is GpuMat make a dummy object with right dimensions but no data and
+    // get _img as a GpuMat
+    if (!_img.isGpuMat())
+#endif
+    {
+        img = _img.getUMat();
+    }
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    else
+    {
+        gpu_img_ = _img.getGpuMat();
+        img = UMat(gpu_img_.rows, gpu_img_.cols, gpu_img_.type());
+    }
+#endif
+
    CV_Assert(img.type() == CV_16SC3 || img.type() == CV_8UC3);
    CV_Assert(mask.type() == CV_8U);

@ -357,42 +397,63 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
    if (can_use_gpu_)
    {
-        // Create the source image Laplacian pyramid
-        cuda::GpuMat gpu_img;
-        gpu_img.upload(img);
-        cuda::GpuMat img_with_border;
-        cuda::copyMakeBorder(gpu_img, img_with_border, top, bottom, left, right, BORDER_REFLECT);
-        std::vector<cuda::GpuMat> gpu_src_pyr_laplace(num_bands_ + 1);
-        img_with_border.convertTo(gpu_src_pyr_laplace[0], CV_16S);
-        for (int i = 0; i < num_bands_; ++i)
-            cuda::pyrDown(gpu_src_pyr_laplace[i], gpu_src_pyr_laplace[i + 1]);
-        for (int i = 0; i < num_bands_; ++i)
+        if (!gpu_initialized_)
        {
-            cuda::GpuMat up;
-            cuda::pyrUp(gpu_src_pyr_laplace[i + 1], up);
-            cuda::subtract(gpu_src_pyr_laplace[i], up, gpu_src_pyr_laplace[i]);
+            gpu_imgs_with_border_.push_back(cuda::GpuMat());
+            gpu_weight_pyr_gauss_vec_.push_back(std::vector<cuda::GpuMat>(num_bands_+1));
+            gpu_src_pyr_laplace_vec_.push_back(std::vector<cuda::GpuMat>(num_bands_+1));
+            gpu_ups_.push_back(std::vector<cuda::GpuMat>(num_bands_));
        }

-        // Create the weight map Gaussian pyramid
-        cuda::GpuMat gpu_mask;
-        gpu_mask.upload(mask);
-        cuda::GpuMat weight_map;
-        std::vector<cuda::GpuMat> gpu_weight_pyr_gauss(num_bands_ + 1);
+        // If _img is not GpuMat upload it to gpu else gpu_img_ was set already
+        if (!_img.isGpuMat())
+        {
+            gpu_img_.upload(img);
+        }
+
+        // Create the source image Laplacian pyramid
+        cuda::copyMakeBorder(gpu_img_, gpu_imgs_with_border_[gpu_feed_idx_], top, bottom,
+                             left, right, BORDER_REFLECT);
+        gpu_imgs_with_border_[gpu_feed_idx_].convertTo(gpu_src_pyr_laplace_vec_[gpu_feed_idx_][0], CV_16S);
+        for (int i = 0; i < num_bands_; ++i)
+            cuda::pyrDown(gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i],
+                          gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i + 1]);
+        for (int i = 0; i < num_bands_; ++i)
+        {
+            cuda::pyrUp(gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i + 1], gpu_ups_[gpu_feed_idx_][i]);
+            cuda::subtract(gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i],
+                           gpu_ups_[gpu_feed_idx_][i],
+                           gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i]);
+        }
+
+        // Create the weight map Gaussian pyramid only if not yet initialized
+        if (!gpu_initialized_)
+        {
+            if (mask.isGpuMat())
+            {
+                gpu_mask_ = mask.getGpuMat();
+            }
+            else
+            {
+                gpu_mask_.upload(mask);
+            }

            if (weight_type_ == CV_32F)
            {
-            gpu_mask.convertTo(weight_map, CV_32F, 1. / 255.);
+                gpu_mask_.convertTo(gpu_weight_map_, CV_32F, 1. / 255.);
            }
            else // weight_type_ == CV_16S
            {
-            gpu_mask.convertTo(weight_map, CV_16S);
-            cuda::GpuMat add_mask;
-            cuda::compare(gpu_mask, 0, add_mask, CMP_NE);
-            cuda::add(weight_map, Scalar::all(1), weight_map, add_mask);
+                gpu_mask_.convertTo(gpu_weight_map_, CV_16S);
+                cuda::compare(gpu_mask_, 0, gpu_add_mask_, CMP_NE);
+                cuda::add(gpu_weight_map_, Scalar::all(1), gpu_weight_map_, gpu_add_mask_);
            }
-        cuda::copyMakeBorder(weight_map, gpu_weight_pyr_gauss[0], top, bottom, left, right, BORDER_CONSTANT);
+            cuda::copyMakeBorder(gpu_weight_map_, gpu_weight_pyr_gauss_vec_[gpu_feed_idx_][0], top,
+                                 bottom, left, right, BORDER_CONSTANT);
            for (int i = 0; i < num_bands_; ++i)
-            cuda::pyrDown(gpu_weight_pyr_gauss[i], gpu_weight_pyr_gauss[i + 1]);
+                cuda::pyrDown(gpu_weight_pyr_gauss_vec_[gpu_feed_idx_][i],
+                              gpu_weight_pyr_gauss_vec_[gpu_feed_idx_][i + 1]);
+        }

        int y_tl = tl_new.y - dst_roi_.y;
        int y_br = br_new.y - dst_roi_.y;
@ -403,9 +464,9 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
        for (int i = 0; i <= num_bands_; ++i)
        {
            Rect rc(x_tl, y_tl, x_br - x_tl, y_br - y_tl);
-            cuda::GpuMat &_src_pyr_laplace = gpu_src_pyr_laplace[i];
+            cuda::GpuMat &_src_pyr_laplace = gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i];
            cuda::GpuMat _dst_pyr_laplace = gpu_dst_pyr_laplace_[i](rc);
-            cuda::GpuMat &_weight_pyr_gauss = gpu_weight_pyr_gauss[i];
+            cuda::GpuMat &_weight_pyr_gauss = gpu_weight_pyr_gauss_vec_[gpu_feed_idx_][i];
            cuda::GpuMat _dst_band_weights = gpu_dst_band_weights_[i](rc);

            using namespace cv::cuda::device::blend;
@ -420,6 +481,7 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
            x_tl /= 2; y_tl /= 2;
            x_br /= 2; y_br /= 2;
        }
+        ++gpu_feed_idx_;
        return;
    }
 #endif
@ -445,7 +507,7 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
    UMat weight_map;
    std::vector<UMat> weight_pyr_gauss(num_bands_ + 1);

-    if(weight_type_ == CV_32F)
+    if (weight_type_ == CV_32F)
    {
        mask.getUMat().convertTo(weight_map, CV_32F, 1./255.);
    }
@ -486,7 +548,7 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
            Mat _dst_pyr_laplace = dst_pyr_laplace_[i](rc).getMat(ACCESS_RW);
            Mat _weight_pyr_gauss = weight_pyr_gauss[i].getMat(ACCESS_READ);
            Mat _dst_band_weights = dst_band_weights_[i](rc).getMat(ACCESS_RW);
-            if(weight_type_ == CV_32F)
+            if (weight_type_ == CV_32F)
            {
                for (int y = 0; y < rc.height; ++y)
                {
@ -540,11 +602,15 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)

 void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
 {
-    cv::UMat dst_band_weights_0;
    Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height);
 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
    if (can_use_gpu_)
    {
+        if (!gpu_initialized_)
+        {
+            gpu_ups_.push_back(std::vector<cuda::GpuMat>(num_bands_+1));
+        }
+
        for (int i = 0; i <= num_bands_; ++i)
        {
            cuda::GpuMat dst_i = gpu_dst_pyr_laplace_[i];
@ -564,20 +630,50 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
        // Restore image from Laplacian pyramid
        for (size_t i = num_bands_; i > 0; --i)
        {
-            cuda::GpuMat up;
-            cuda::pyrUp(gpu_dst_pyr_laplace_[i], up);
-            cuda::add(up, gpu_dst_pyr_laplace_[i - 1], gpu_dst_pyr_laplace_[i - 1]);
+            cuda::pyrUp(gpu_dst_pyr_laplace_[i], gpu_ups_[gpu_ups_.size()-1][num_bands_-i]);
+            cuda::add(gpu_ups_[gpu_ups_.size()-1][num_bands_-i],
+                      gpu_dst_pyr_laplace_[i - 1],
+                      gpu_dst_pyr_laplace_[i - 1]);
        }

+        // If dst is GpuMat do masking on gpu and return dst as a GpuMat
+        // else download the image to cpu and return it as an ordinary Mat
+        if (dst.isGpuMat())
+        {
+            cuda::GpuMat &gpu_dst = dst.getGpuMatRef();
+
+            cuda::compare(gpu_dst_band_weights_[0](dst_rc), WEIGHT_EPS, gpu_dst_mask_, CMP_GT);
+
+            cuda::compare(gpu_dst_mask_, 0, gpu_mask_, CMP_EQ);
+
+            gpu_dst_pyr_laplace_[0](dst_rc).setTo(Scalar::all(0), gpu_mask_);
+            gpu_dst_pyr_laplace_[0](dst_rc).convertTo(gpu_dst, CV_16S);
+
+        }
+        else
+        {
            gpu_dst_pyr_laplace_[0](dst_rc).download(dst_);
+            Mat dst_band_weights_0;
            gpu_dst_band_weights_[0].download(dst_band_weights_0);

-        gpu_dst_pyr_laplace_.clear();
-        gpu_dst_band_weights_.clear();
+            compare(dst_band_weights_0(dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
+            Blender::blend(dst, dst_mask);
+        }
+
+        // Set destination Mats to 0 so new image can be blended
+        for (size_t i = 0; i < num_bands_ + 1; ++i)
+        {
+            gpu_dst_band_weights_[i].setTo(0);
+            gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0));
+        }
+        gpu_feed_idx_ = 0;
+        gpu_initialized_ = true;
    }
    else
 #endif
    {
+        cv::UMat dst_band_weights_0;
+
        for (int i = 0; i <= num_bands_; ++i)
            normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]);

@ -588,11 +684,11 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)

        dst_pyr_laplace_.clear();
        dst_band_weights_.clear();
-    }

        compare(dst_band_weights_0(dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);

        Blender::blend(dst, dst_mask);
+    }
 }


--- a/modules/stitching/test/test_blenders.cuda.cpp
+++ b/modules/stitching/test/test_blenders.cuda.cpp
@ -50,13 +50,17 @@ namespace opencv_test { namespace {
        detail::MultiBandBlender blender(try_cuda, 5);

        blender.prepare(Rect(0, 0, max(im1.cols, im2.cols), max(im1.rows, im2.rows)));
-        blender.feed(im1, mask1, Point(0,0));
-        blender.feed(im2, mask2, Point(0,0));
+
+        // If using cuda try blending multiple times without calling prepare in between
+        for (int i = 0; i < (try_cuda ? 10 : 1); ++i) {
+            blender.feed(im1, mask1, Point(0, 0));
+            blender.feed(im2, mask2, Point(0, 0));

            Mat result_s, result_mask;
            blender.blend(result_s, result_mask);
            result_s.convertTo(result, CV_8U);
        }
+    }

 TEST(CUDA_MultiBandBlender, Accuracy)
 {
--- a/modules/videoio/CMakeLists.txt
+++ b/modules/videoio/CMakeLists.txt
@ -86,6 +86,9 @@ endif()
 if (WIN32 AND HAVE_MSMF)
  list(APPEND videoio_srcs ${CMAKE_CURRENT_LIST_DIR}/src/cap_msmf.hpp)
  list(APPEND videoio_srcs ${CMAKE_CURRENT_LIST_DIR}/src/cap_msmf.cpp)
+  if (HAVE_DXVA)
+    add_definitions(-DHAVE_DXVA)
+  endif()
 endif()

 if (WIN32 AND HAVE_VFW)
--- a/modules/videoio/src/cap_msmf.cpp
+++ b/modules/videoio/src/cap_msmf.cpp
--- a/modules/videoio/src/cap_msmf.hpp
+++ b/modules/videoio/src/cap_msmf.hpp
--- a/modules/videoio/src/cap_openni2.cpp
+++ b/modules/videoio/src/cap_openni2.cpp
@ -490,6 +490,7 @@ double CvCapture_OpenNI2::getCommonProperty( int propIdx ) const
        break;
    case CV_CAP_PROP_OPENNI2_SYNC :
        propValue = const_cast<CvCapture_OpenNI2 *>(this)->device.getDepthColorSyncEnabled();
+        break;
    case CV_CAP_PROP_OPENNI2_MIRROR:
    {
        bool isMirroring = false;
--- a/platforms/android/build_sdk.py
+++ b/platforms/android/build_sdk.py
@ -317,6 +317,12 @@ if __name__ == "__main__":
    if os.path.realpath(args.work_dir) == os.path.realpath(args.opencv_dir):
        raise Fail("Specify workdir (building from OpenCV source directory is not supported)")

+    # Relative paths become invalid in sub-directories
+    if args.opencv_dir is not None and not os.path.isabs(args.opencv_dir):
+        args.opencv_dir = os.path.abspath(args.opencv_dir)
+    if args.extra_modules_path is not None and not os.path.isabs(args.extra_modules_path):
+        args.extra_modules_path = os.path.abspath(args.extra_modules_path)
+
    cpath = args.config
    if not os.path.exists(cpath):
        cpath = os.path.join(SCRIPT_DIR, cpath)
--- a/samples/dnn/edge_detection.py
+++ b/samples/dnn/edge_detection.py
@ -0,0 +1,69 @@
+import cv2 as cv
+import argparse
+
+parser = argparse.ArgumentParser(
+        description='This sample shows how to define custom OpenCV deep learning layers in Python. '
+                    'Holistically-Nested Edge Detection (https://arxiv.org/abs/1504.06375) neural network '
+                    'is used as an example model. Find a pre-trained model at https://github.com/s9xie/hed.')
+parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
+parser.add_argument('--prototxt', help='Path to deploy.prototxt', required=True)
+parser.add_argument('--caffemodel', help='Path to hed_pretrained_bsds.caffemodel', required=True)
+parser.add_argument('--width', help='Resize input image to a specific width', default=500, type=int)
+parser.add_argument('--height', help='Resize input image to a specific height', default=500, type=int)
+args = parser.parse_args()
+
+#! [CropLayer]
+class CropLayer(object):
+    def __init__(self, params, blobs):
+        # Crop window bounds; overwritten by getMemoryShapes().
+        self.xstart = 0
+        self.xend = 0
+        self.ystart = 0
+        self.yend = 0
+
+    # Our layer receives two inputs. We need to crop the first input blob
+    # to match a shape of the second one (keeping batch size and number of channels)
+    def getMemoryShapes(self, inputs):
+        inputShape, targetShape = inputs[0], inputs[1]
+        batchSize, numChannels = inputShape[0], inputShape[1]
+        height, width = targetShape[2], targetShape[3]
+
+        # Floor division keeps the offsets integral on Python 3, where "/" would
+        # yield floats that cannot be used as slice indices in forward().
+        self.ystart = (inputShape[2] - targetShape[2]) // 2
+        self.xstart = (inputShape[3] - targetShape[3]) // 2
+        self.yend = self.ystart + height
+        self.xend = self.xstart + width
+
+        return [[batchSize, numChannels, height, width]]
+
+    # Slice the first input down to the window computed in getMemoryShapes().
+    def forward(self, inputs):
+        return [inputs[0][:,:,self.ystart:self.yend,self.xstart:self.xend]]
+#! [CropLayer]
+
+#! [Register]
+cv.dnn_registerLayer('Crop', CropLayer)
+#! [Register]
+
+# Load the model.
+net = cv.dnn.readNet(args.prototxt, args.caffemodel)
+
+kWinName = 'Holistically-Nested Edge Detection'
+cv.namedWindow('Input', cv.WINDOW_NORMAL)
+cv.namedWindow(kWinName, cv.WINDOW_NORMAL)
+
+cap = cv.VideoCapture(args.input if args.input else 0)
+while cv.waitKey(1) < 0:
+    hasFrame, frame = cap.read()
+    if not hasFrame:
+        cv.waitKey()
+        break
+
+    cv.imshow('Input', frame)
+
+    inp = cv.dnn.blobFromImage(frame, scalefactor=1.0, size=(args.width, args.height),
+                               mean=(104.00698793, 116.66876762, 122.67891434),
+                               swapRB=False, crop=False)
+    net.setInput(inp)
+
+    out = net.forward()
+    out = out[0, 0]
+    out = cv.resize(out, (frame.shape[1], frame.shape[0]))
+    cv.imshow(kWinName, out)