Mirror of https://github.com/opencv/opencv.git
Merge remote-tracking branch 'upstream/3.4' into merge-3.4

commit 9787ab598b
@@ -1044,12 +1044,10 @@ void cvCreateTrainingSamples( const char* filename,
     output = fopen( filename, "wb" );
     if( output != NULL )
     {
-        int hasbg;
         int i;
         int inverse;
 
-        hasbg = 0;
-        hasbg = (bgfilename != NULL && icvInitBackgroundReaders( bgfilename,
+        const int hasbg = (bgfilename != NULL && icvInitBackgroundReaders( bgfilename,
                  Size( winwidth,winheight ) ) );
 
         Mat sample( winheight, winwidth, CV_8UC1 );
@@ -740,7 +740,7 @@ macro(ocv_compiler_optimization_fill_cpu_config)
 ")
 
 
-  set(__file "${CMAKE_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h")
+  set(__file "${OpenCV_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h")
   if(EXISTS "${__file}")
     file(READ "${__file}" __content)
   endif()
@@ -220,7 +220,7 @@ void CirclesGridClusterFinder::findOutsideCorners(const std::vector<cv::Point2f>
     CV_Assert(!corners.empty());
     outsideCorners.clear();
     //find two pairs of the most nearest corners
-    int i, j, n = (int)corners.size();
+    const size_t n = corners.size();
 
 #ifdef DEBUG_CIRCLES
     Mat cornersImage(1024, 1248, CV_8UC1, Scalar(0));
@@ -228,22 +228,22 @@ void CirclesGridClusterFinder::findOutsideCorners(const std::vector<cv::Point2f>
     imshow("corners", cornersImage);
 #endif
 
-    std::vector<Point2f> tangentVectors(corners.size());
-    for(size_t k=0; k<corners.size(); k++)
+    std::vector<Point2f> tangentVectors(n);
+    for(size_t k=0; k < n; k++)
     {
-        Point2f diff = corners[(k + 1) % corners.size()] - corners[k];
+        Point2f diff = corners[(k + 1) % n] - corners[k];
         tangentVectors[k] = diff * (1.0f / norm(diff));
     }
 
     //compute angles between all sides
-    Mat cosAngles(n, n, CV_32FC1, 0.0f);
-    for(i = 0; i < n; i++)
+    Mat cosAngles((int)n, (int)n, CV_32FC1, 0.0f);
+    for(size_t i = 0; i < n; i++)
     {
-        for(j = i + 1; j < n; j++)
+        for(size_t j = i + 1; j < n; j++)
         {
             float val = fabs(tangentVectors[i].dot(tangentVectors[j]));
-            cosAngles.at<float>(i, j) = val;
-            cosAngles.at<float>(j, i) = val;
+            cosAngles.at<float>((int)i, (int)j) = val;
+            cosAngles.at<float>((int)j, (int)i) = val;
         }
     }
 
@@ -272,10 +272,10 @@ void CirclesGridClusterFinder::findOutsideCorners(const std::vector<cv::Point2f>
     const int bigDiff = 4;
     if(maxIdx - minIdx == bigDiff)
     {
-        minIdx += n;
+        minIdx += (int)n;
         std::swap(maxIdx, minIdx);
     }
-    if(maxIdx - minIdx != n - bigDiff)
+    if(maxIdx - minIdx != (int)n - bigDiff)
     {
         return;
     }
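A note on the circlesgrid hunks above: the loop counters move from int to size_t to match std::vector::size(), and the narrowing back to int now happens explicitly at the cv::Mat boundary, whose constructor and at<>() accessor take int. A minimal sketch of the resulting convention (illustration only, not part of the patch):

    const size_t n = corners.size();                    // unsigned count from the container
    cv::Mat cosAngles((int)n, (int)n, CV_32FC1, 0.0f);  // cv::Mat dimensions are int
    for (size_t i = 0; i < n; i++)
        for (size_t j = i + 1; j < n; j++)
            cosAngles.at<float>((int)i, (int)j) = 0.f;  // cast only at the API edge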
@@ -206,7 +206,7 @@ void dls::run_kernel(const cv::Mat& pp)
 
 void dls::build_coeff_matrix(const cv::Mat& pp, cv::Mat& Mtilde, cv::Mat& D)
 {
-    CV_Assert(!pp.empty());
+    CV_Assert(!pp.empty() && N > 0);
     cv::Mat eye = cv::Mat::eye(3, 3, CV_64F);
 
     // build coeff matrix
@@ -334,19 +334,19 @@ bool validateData(const ChessBoardGenerator& cbg, const Size& imgSz,
 
             tmp = cv::norm(cur - mat(i + 1, j + 1)); // TODO cvtest
             if (tmp < minNeibDist)
-                tmp = minNeibDist;
+                minNeibDist = tmp;
 
             tmp = cv::norm(cur - mat(i - 1, j + 1)); // TODO cvtest
             if (tmp < minNeibDist)
-                tmp = minNeibDist;
+                minNeibDist = tmp;
 
             tmp = cv::norm(cur - mat(i + 1, j - 1)); // TODO cvtest
             if (tmp < minNeibDist)
-                tmp = minNeibDist;
+                minNeibDist = tmp;
 
             tmp = cv::norm(cur - mat(i - 1, j - 1)); // TODO cvtest
             if (tmp < minNeibDist)
-                tmp = minNeibDist;
+                minNeibDist = tmp;
         }
 
         const double threshold = 0.25;
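The four parallel fixes above correct an inverted assignment: the old `tmp = minNeibDist;` overwrote the freshly computed distance, so the running minimum never changed. The intended pattern, in isolation (illustration only):

    double minNeibDist = tmp;                 // seeded with the first distance
    tmp = cv::norm(cur - mat(i + 1, j + 1));  // next neighbour distance
    if (tmp < minNeibDist)
        minNeibDist = tmp;                    // keep the smaller value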
@@ -526,13 +526,13 @@ inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
 
 template<typename _Tpvec>
 inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
-{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0b11110000)); }
+{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }
 
 inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
-{ return v256_blend<0b11110000>(a, b); }
+{ return v256_blend<0xf0>(a, b); }
 
 inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
-{ return v256_blend<0b1100>(a, b); }
+{ return v256_blend<0xc>(a, b); }
 
 template<typename _Tpvec>
 inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
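The three replacements above only re-spell C++14 binary literals as hexadecimal constants of the same value, presumably to keep the header building on pre-C++14 compilers. A quick sanity check (a sketch, not part of the patch):

    // 0b11110000 == 0xf0 == 240, and 0b1100 == 0xc == 12
    static_assert(0xf0 == 240, "blend mask for the upper four 32-bit lanes");
    static_assert(0xc  == 12,  "blend mask for the upper two 64-bit lanes");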
@@ -1609,392 +1609,592 @@ OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
 OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
 
 
 /** Reinterpret **/
 // its up there with load and store operations
 
-/* de&interleave */
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
-inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
-{ return v256_load_deinterleave_##suffix(ptr, a, b); } \
-inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
-{ return v256_store_interleave_2ch(ptr, a, b); }
-
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
-inline void v_load_deinterleave \
-(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \
-{ return v256_load_deinterleave_##suffix(ptr, a, b, c); } \
-inline void v_store_interleave \
-(_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \
-{ return v256_store_interleave_##suffix(ptr, a, b, c); }
-
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \
-inline void v_load_deinterleave \
-(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \
-{ return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \
-inline void v_store_interleave \
-(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \
-{ return v256_store_interleave_##suffix(ptr, a, b, c, d); }
-
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix)
-
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix)
-
-/* **** */
-//
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b)
-{
-    _Tpvec ab0, ab1;
-    v_zip(a, b, ab0, ab1);
-    v_store(ptr, ab0);
-    v_store(ptr + _Tpvec::nlanes, ab1);
-}
-
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
-{
-    _Tpvec ab0 = v256_load(ptr);
-    _Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes);
-    _Tpvec ab00, ab11;
-    v_recombine(ab0, ab1, ab00, ab11);
-    v256_zip(ab00, ab11, a, b);
-}
-
-///
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
-{
-    _Tpvec abc0 = v256_load(ptr);
-    _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
-    _Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2);
-
-    _Tpvec ab0 = v256_combine_diagonal(abc0, abc1);
-    _Tpvec bc1 = v256_combine_diagonal(abc1, abc2);
-    _Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0));
-
-    a = v256_unpacklo(ab0, ac1);
-    c = v256_unpackhi(ac1, bc1);
-    b = v256_alignr_64(bc1, ab0);
-}
-
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
-{
-    _Tpvec ab0 = v256_unpacklo(a, b);
-    _Tpvec bc1 = v256_unpackhi(b, c);
-    _Tpvec ca10 = v256_swap_halves(v256_blend<0b1010>(c, a));
-
-    v_store(ptr, v256_combine_diagonal(ab0, ca10));
-    v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0));
-    v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1));
-}
-
-////
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
-{
-    _Tpvec abcd0 = v256_load(ptr);
-    _Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes);
-    _Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2);
-    _Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3);
-
-    _Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2);
-    _Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3);
-
-    _Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0);
-    _Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1);
-    _Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2);
-    _Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3);
-
-    v256_zip(ab0, ab1, a, b);
-    v256_zip(cd0, cd1, c, d);
-}
-
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
-{
-    _Tpvec ab0, ab1, cd0, cd1;
-    v256_zip(a, b, ab0, ab1);
-    v256_zip(c, d, cd0, cd1);
-
-    _Tpvec ab0cd0 = v256_alignr_128(ab0, cd0);
-    _Tpvec ab1cd1 = v256_alignr_128(ab1, cd1);
-
-    v_store(ptr, v256_combine_diagonal(ab0, ab0cd0));
-    v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1));
-    v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0));
-    v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1));
-}
-
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4)
-
-/* **** **** */
-//
-inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b)
-{
-    v_float32x8 ab0 = v256_load(ptr);
-    v_float32x8 ab1 = v256_load(ptr + 8);
-
-    v_float32x8 ab0ab2, ab1ab3;
-    v_recombine(ab0, ab1, ab0ab2, ab1ab3);
-
-    a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0));
-    b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1));
-}
-
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
-{
-    v_float32x8 fa, fb;
-    v256_load_deinterleave_l8((float*)ptr, fa, fb);
-    a.val = v_reinterpret_as_u32(fa).val;
-    b.val = v_reinterpret_as_u32(fb).val;
-}
-///
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
-{
-    _Tpvec ab0, ab1, bc0, bc1;
-    v256_zip(a, b, ab0, ab1);
-    v256_zip(b, c, bc0, bc1);
-
-    _Tpvec cazg = v256_blend<0b10101010>(c, a);
-    _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val));
-    _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val));
-    _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0b11001100>(ab1, bc0));
-
-    _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
-    _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
-    _Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
-
-    v_store(ptr, abc0);
-    v_store(ptr + _Tpvec::nlanes, abc1);
-    v_store(ptr + _Tpvec::nlanes * 2, abc2);
-}
-
-inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c)
-{
-    v_float32x8 ab0, ab1, bc0, bc1;
-    v256_zip(a, b, ab0, ab1);
-    v256_zip(b, c, bc0, bc1);
-
-    v_float32x8 cazg = v256_blend<0b10101010>(c, a);
-    v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0)));
-    v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2)));
-
-    v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2)));
-    v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2);
-
-    v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
-    v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
-    v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
-
-    v_store(ptr, abc0);
-    v_store(ptr + 8, abc1);
-    v_store(ptr + 16, abc2);
-}
-
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
-{
-    _Tpvec abc02 = v256_load(ptr);
-    _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
-    _Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2);
-
-    _Tpvec abc2 = v256_alignr_128(abc02, abc20);
-    _Tpvec abc0 = v256_combine_diagonal(abc02, abc20);
-
-    a = v256_blend<0b10010010>(abc0, abc1);
-    a = v256_blend<0b01000100>(a, abc2);
-
-    b = v256_blend<0b00100100>(abc0, abc1);
-    b = v256_blend<0b10011001>(b, abc2);
-
-    c = v256_blend<0b01001001>(abc0, abc1);
-    c = v256_blend<0b00100010>(c, abc2);
-
-    a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a);
-    b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b);
-    c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c);
-}
-/////
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
-{
-    _Tpvec ab0, ab1, cd0, cd1;
-    v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1);
-    v256_zip(ab0, ab1, a, b);
-    v256_zip(cd0, cd1, c, d);
-}
-
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
-{
-    _Tpvec ac0, ac1, bd0, bd1;
-    v256_zip(a, c, ac0, ac1);
-    v256_zip(b, d, bd0, bd1);
-
-    _Tpvec abcd0, abcd1, abcd2, abcd3;
-    v256_zip(ac0, bd0, abcd0, abcd1);
-    v256_zip(ac1, bd1, abcd2, abcd3);
-
-    _Tpvec abcd01, abcd23, abcd45, abcd67;
-    v_recombine(abcd0, abcd1, abcd01, abcd45);
-    v_recombine(abcd2, abcd3, abcd23, abcd67);
-
-    v_store(ptr, abcd01);
-    v_store(ptr + _Tpvec::nlanes, abcd23);
-    v_store(ptr + _Tpvec::nlanes * 2, abcd45);
-    v_store(ptr + _Tpvec::nlanes * 3, abcd67);
-}
-
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8)
-
-/* ******** ******** */
-//
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
-{
-    const __m256i sep = _mm256_setr_epi8(
-        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
-        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
-    );
-
-    _Tpvec ab0, ab1;
-    v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
-
-    __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
-    __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
-
-    a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
-    b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
-}
-///
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
-{
-    v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b));
-    v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b));
-    v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c));
-    v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c));
-
-    v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0b10101010>(c, a));
-    cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg);
-
-    v_uint32x8 ac1ab1 = v256_blend<0b10101010>(ab1, bc1);
-    ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1);
-
-    v_uint32x8 abc001 = v256_blend<0b10101010>(ab0, cazg);
-    v_uint32x8 cabc0 = v256_blend<0b10101010>(cazg, bc0);
-
-    v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1);
-    v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001);
-
-    v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0));
-    v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0));
-    abc21 = v256_swap_halves(abc21);
-    v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1));
-
-    v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21);
-    v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01);
-    v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12);
-
-    v_store(ptr, _Tpvec(abc0.val));
-    v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val));
-    v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val));
-}
-// todo:
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
-{}
-////
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
-{
-    _Tpvec ab0, ab1, cd0, cd1;
-    v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1);
-    v256_zip(ab0, ab1, a, b);
-    v256_zip(cd0, cd1, c, d);
-}
-
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
-{ v256_store_interleave_l8(ptr, a, b, c, d); }
-
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16)
-
-/* **************** **************** */
-//
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
-{
-    const __m256i sep = _mm256_setr_epi8(
-        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
-    );
-
-    _Tpvec ab0, ab1;
-    v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
-
-    __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
-    __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
-
-    a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
-    b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
-}
-
-/// todo
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&)
-{}
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
-{}
-////
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
-{
-    const __m256i sep = _mm256_setr_epi8(
-        0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
-        0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-    );
-
-    _Tpvec abcd0, abcd1, abcd2, abcd3;
-    v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1);
-    v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3);
-
-    __m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep);
-    __m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep);
-    __m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep);
-    __m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep);
-
-    __m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1);
-    __m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3);
-    __m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1);
-    __m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3);
-
-    a.val = _mm256_unpacklo_epi64(ab0, ab1);
-    b.val = _mm256_unpackhi_epi64(ab0, ab1);
-    c.val = _mm256_unpacklo_epi64(cd0, cd1);
-    d.val = _mm256_unpackhi_epi64(cd0, cd1);
-}
-
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
-{ v256_store_interleave_l8(ptr, a, b, c, d); }
-
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32)
+///////////////////// load deinterleave /////////////////////////////
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
+{
+    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+
+    static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+                                               0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
+    __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+    __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
+    __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
+    __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
+    a = v_uint8x32(a0);
+    b = v_uint8x32(b0);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
+{
+    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+
+    static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                               0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
+    __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+    __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
+    __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
+    __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
+    a = v_uint16x16(a0);
+    b = v_uint16x16(b0);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
+{
+    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+
+    const int sh = 0+2*4+1*16+3*64;
+    __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
+    __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
+    __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+    __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
+    __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
+    __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
+    a = v_uint32x8(a0);
+    b = v_uint32x8(b0);
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
+{
+    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+
+    __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
+    __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
+    __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
+    __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
+    a = v_uint64x4(a0);
+    b = v_uint64x4(b0);
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
+
+    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+    static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+                                               0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+                                               -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
+
+    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
+    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
+    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
+
+    static const __m256i
+    sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
+                            0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
+    sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
+                            1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
+    sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
+                            2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+    b0 = _mm256_shuffle_epi8(b0, sh_b);
+    g0 = _mm256_shuffle_epi8(g0, sh_g);
+    r0 = _mm256_shuffle_epi8(r0, sh_r);
+
+    b = v_uint8x32(b0);
+    g = v_uint8x32(g0);
+    r = v_uint8x32(r0);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+
+    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+    static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+                                               0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
+    static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+                                               -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
+    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
+    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
+    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
+    static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+                                                 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
+                                                 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+    static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+                                                 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+    b0 = _mm256_shuffle_epi8(b0, sh_b);
+    g0 = _mm256_shuffle_epi8(g0, sh_g);
+    r0 = _mm256_shuffle_epi8(r0, sh_r);
+
+    b = v_uint16x16(b0);
+    g = v_uint16x16(g0);
+    r = v_uint16x16(r0);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+
+    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+    __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
+    __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
+    __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
+
+    b0 = _mm256_shuffle_epi32(b0, 0x6c);
+    g0 = _mm256_shuffle_epi32(g0, 0xb1);
+    r0 = _mm256_shuffle_epi32(r0, 0xc6);
+
+    b = v_uint32x8(b0);
+    g = v_uint32x8(g0);
+    r = v_uint32x8(r0);
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+
+    __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
+    __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
+    __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
+    __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
+    __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
+    __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
+
+    b = v_uint64x4(b0);
+    g = v_uint64x4(g0);
+    r = v_uint64x4(r0);
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
+    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
+    static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+                                               0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+
+    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
+    __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
+    __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
+
+    __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+    __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+    __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
+
+    b = v_uint8x32(b0);
+    g = v_uint8x32(g0);
+    r = v_uint8x32(r0);
+    a = v_uint8x32(a0);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r, v_uint16x16& a )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
+    static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+                                               0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
+    __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
+    __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
+
+    __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+    __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+    __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
+
+    b = v_uint16x16(b0);
+    g = v_uint16x16(g0);
+    r = v_uint16x16(r0);
+    a = v_uint16x16(a0);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r, v_uint32x8& a )
+{
+    __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+    __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+    __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
+
+    __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+    __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+    __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
+
+    b = v_uint32x8(b0);
+    g = v_uint32x8(g0);
+    r = v_uint32x8(r0);
+    a = v_uint32x8(a0);
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r, v_uint64x4& a )
+{
+    __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+    __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+    __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
+
+    __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
+    __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
+    __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
+    __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
+
+    __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
+    __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
+    __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
+    __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
+
+    b = v_uint64x4(b0);
+    g = v_uint64x4(g0);
+    r = v_uint64x4(r0);
+    a = v_uint64x4(a0);
+}
+
+///////////////////////////// store interleave /////////////////////////////////////
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y )
+{
+    __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
+    __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
+
+    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
+    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, xy0);
+    _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y )
+{
+    __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
+    __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
+
+    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
+    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, xy0);
+    _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y )
+{
+    __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
+    __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
+
+    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
+    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, xy0);
+    _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y )
+{
+    __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
+    __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
+
+    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
+    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, xy0);
+    _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r )
+{
+    static const __m256i sh_b = _mm256_setr_epi8(
+        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
+        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
+    static const __m256i sh_g = _mm256_setr_epi8(
+        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
+        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
+    static const __m256i sh_r = _mm256_setr_epi8(
+        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
+        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
+
+    __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
+    __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
+    __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
+
+    static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+                                               0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+                                               0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+
+    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
+    __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
+    __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
+
+    __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+    __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
+    __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, bgr0);
+    _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
+    _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r )
+{
+    static const __m256i sh_b = _mm256_setr_epi8(
+        0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+        0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    static const __m256i sh_g = _mm256_setr_epi8(
+        10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
+        10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
+    static const __m256i sh_r = _mm256_setr_epi8(
+        4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+        4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+
+    __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
+    __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
+    __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
+
+    static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+                                               0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
+    static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+                                               -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
+
+    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
+    __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
+    __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
+
+    __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
+    //__m256i bgr1 = p1;
+    __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, bgr0);
+    _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
+    _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r )
+{
+    __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
+    __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);
+    __m256i r0 = _mm256_shuffle_epi32(r.val, 0xc6);
+
+    __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
+    __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
+    __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
+
+    __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+    //__m256i bgr1 = p2;
+    __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, bgr0);
+    _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
+    _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r )
+{
+    __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
+    __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);
+    __m256i s20 = _mm256_blend_epi32(r.val, b.val, 0xcc);
+
+    __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
+    __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
+    __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, bgr0);
+    _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
+    _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a )
+{
+    __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val);
+    __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val);
+    __m256i ra0 = _mm256_unpacklo_epi8(r.val, a.val);
+    __m256i ra1 = _mm256_unpackhi_epi8(r.val, a.val);
+
+    __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
+    __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
+    __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
+    __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
+
+    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
+    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
+    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
+    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, bgra0);
+    _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
+    _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
+    _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
+                                const v_uint16x16& r, const v_uint16x16& a )
+{
+    __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val);
+    __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val);
+    __m256i ra0 = _mm256_unpacklo_epi16(r.val, a.val);
+    __m256i ra1 = _mm256_unpackhi_epi16(r.val, a.val);
+
+    __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
+    __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
+    __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
+    __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
+
+    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
+    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
+    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
+    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, bgra0);
+    _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
+    _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
+    _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
+                                const v_uint32x8& r, const v_uint32x8& a )
+{
+    __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val);
+    __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val);
+    __m256i ra0 = _mm256_unpacklo_epi32(r.val, a.val);
+    __m256i ra1 = _mm256_unpackhi_epi32(r.val, a.val);
+
+    __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
+    __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
+    __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
+    __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
+
+    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
+    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
+    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
+    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, bgra0);
+    _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
+    _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
+    _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
+                                const v_uint64x4& r, const v_uint64x4& a )
+{
+    __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val);
+    __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val);
+    __m256i ra0 = _mm256_unpacklo_epi64(r.val, a.val);
+    __m256i ra1 = _mm256_unpackhi_epi64(r.val, a.val);
+
+    __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
+    __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
+    __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
+    __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
+
+    _mm256_storeu_si256((__m256i*)ptr, bgra0);
+    _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
+    _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
+    _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
+}
+
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0 ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
+}
+
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
 
 inline void v256_cleanup() { _mm256_zeroupper(); }
 
 
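Throughout the new AVX2 code above, the _mm256_permute2x128_si256 immediates are spelled `0 + 2*16` and `1 + 3*16`. The low nibble selects the destination's low 128-bit lane and the high nibble the high lane; selectors 0/1 name the halves of the first operand and 2/3 the halves of the second. A minimal illustration (not from the patch):

    __m256i lo = _mm256_permute2x128_si256(x, y, 0 + 2*16); // low(x)  | low(y)
    __m256i hi = _mm256_permute2x128_si256(x, y, 1 + 3*16); // high(x) | high(y)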
@@ -1318,6 +1318,80 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
     vst4q_##suffix(ptr, v); \
 }
 
+#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
+inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
+{ \
+    tp##x1_t a0 = vld1_##suffix(ptr); \
+    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
+    tp##x1_t a1 = vld1_##suffix(ptr + 2); \
+    tp##x1_t b1 = vld1_##suffix(ptr + 3); \
+    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
+    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
+} \
+\
+inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
+                                 v_##tp##x2& b, v_##tp##x2& c ) \
+{ \
+    tp##x1_t a0 = vld1_##suffix(ptr); \
+    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
+    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
+    tp##x1_t a1 = vld1_##suffix(ptr + 3); \
+    tp##x1_t b1 = vld1_##suffix(ptr + 4); \
+    tp##x1_t c1 = vld1_##suffix(ptr + 5); \
+    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
+    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
+    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
+} \
+\
+inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
+                                 v_##tp##x2& c, v_##tp##x2& d ) \
+{ \
+    tp##x1_t a0 = vld1_##suffix(ptr); \
+    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
+    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
+    tp##x1_t d0 = vld1_##suffix(ptr + 3); \
+    tp##x1_t a1 = vld1_##suffix(ptr + 4); \
+    tp##x1_t b1 = vld1_##suffix(ptr + 5); \
+    tp##x1_t c1 = vld1_##suffix(ptr + 6); \
+    tp##x1_t d1 = vld1_##suffix(ptr + 7); \
+    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
+    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
+    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
+    d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
+} \
+\
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \
+{ \
+    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
+    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
+    vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
+    vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
+} \
+\
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
+                                const v_##tp##x2& b, const v_##tp##x2& c ) \
+{ \
+    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
+    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
+    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
+    vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
+    vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
+    vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
+} \
+\
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
+                                const v_##tp##x2& c, const v_##tp##x2& d ) \
+{ \
+    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
+    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
+    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
+    vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
+    vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
+    vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
+    vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
+    vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
+}
+
 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
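For reference, the instantiation OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64) in the next hunk expands the two-channel load to roughly the following; plain vld1 loads and vcombine stand in for the structured vld2 family, which is not universally available for 64-bit lanes:

    inline void v_load_deinterleave( const uint64* ptr, v_uint64x2& a, v_uint64x2& b )
    {
        uint64x1_t a0 = vld1_u64(ptr);      // a[0]
        uint64x1_t b0 = vld1_u64(ptr + 1);  // b[0]
        uint64x1_t a1 = vld1_u64(ptr + 2);  // a[1]
        uint64x1_t b1 = vld1_u64(ptr + 3);  // b[1]
        a = v_uint64x2(vcombine_u64(a0, a1));
        b = v_uint64x2(vcombine_u64(b0, b1));
    }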
@@ -1329,6 +1403,9 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
 #endif
 
+OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
+OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
+
 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 {
     return v_float32x4(vcvtq_f32_s32(a.val));
@@ -58,17 +58,6 @@ namespace cv
 
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 
-struct v_uint8x16;
-struct v_int8x16;
-struct v_uint16x8;
-struct v_int16x8;
-struct v_uint32x4;
-struct v_int32x4;
-struct v_float32x4;
-struct v_uint64x2;
-struct v_int64x2;
-struct v_float64x2;
-
 struct v_uint8x16
 {
     typedef uchar lane_type;
@@ -1660,7 +1649,7 @@ OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_N
 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
 
-// adopted from sse_utils.hpp
+// load deinterleave
 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
 {
     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
@@ -1681,7 +1670,25 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
 
 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
 {
-#if CV_SSSE3
+#if CV_SSE4_1
+    static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+    static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+    __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
+    __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
+    __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
+    static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
+    static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
+    static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+    a0 = _mm_shuffle_epi8(a0, sh_b);
+    b0 = _mm_shuffle_epi8(b0, sh_g);
+    c0 = _mm_shuffle_epi8(c0, sh_r);
+    a.val = a0;
+    b.val = b0;
+    c.val = c0;
+#elif CV_SSSE3
     static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
     static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
    static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
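The SSE4.1 path above leans on _mm_blendv_epi8, which picks each byte from its second source wherever the top bit of the corresponding mask byte is set; the -1 entries in m0/m1 are exactly those selector bytes. In short (illustration only):

    __m128i sel = _mm_blendv_epi8(x, y, m); // per byte: (m & 0x80) ? y : x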
@@ -1753,8 +1760,41 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
     d.val = _mm_unpackhi_epi8(v2, v3);
 }
 
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{
+    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));     // a0 b0 a1 b1 a2 b2 a3 b3
+    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
+
+    __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
+    __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
+    __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
+    __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
+
+    a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
+    b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
+}
+
 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
 {
+#if CV_SSE4_1
+    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
+    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
+    __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
+    __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
+    __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
+    __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
+
+    static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+    static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+    a0 = _mm_shuffle_epi8(a0, sh_a);
+    b0 = _mm_shuffle_epi8(b0, sh_b);
+    c0 = _mm_shuffle_epi8(c0, sh_c);
+
+    a.val = a0;
+    b.val = b0;
+    c.val = c0;
+#else
     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
@ -1770,6 +1810,7 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
|
||||
a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
|
||||
b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
|
||||
c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
|
||||
@ -1795,6 +1836,18 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
|
||||
d.val = _mm_unpackhi_epi16(u2, u3);
|
||||
}
|
||||
|
||||
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3

__m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
__m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3

a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
@ -1812,12 +1865,23 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4&

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
}
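
// Editorial sketch (not part of the upstream diff): the 4-channel case above is just a
// 4x4 transpose -- the four loaded vectors of (a,b,c,d) quadruplets form a row-major
// 4x4 matrix, and transposing it groups each channel into its own vector. A scalar
// model of what v_transpose4x4 computes (identifiers below are illustrative):
static inline void transpose4x4_model(const unsigned in[16], unsigned out[16])
{
    for (int r = 0; r < 4; r++)          // rows: the four loaded vectors
        for (int c = 0; c < 4; c++)      // cols: the four interleaved channels
            out[c * 4 + r] = in[r * 4 + c];  // out = a0..a3 b0..b3 c0..c3 d0..d3
}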

inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3

a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}

inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
@ -1853,77 +1917,43 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
d.val = _mm_unpackhi_ps(t02hi, t13hi);
}

inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
{
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));

a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
}

inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1

t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0

a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
}

inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
{
v_uint64x2 t0, t1, t2;
v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
a = v_reinterpret_as_s64(t0);
b = v_reinterpret_as_s64(t1);
c = v_reinterpret_as_s64(t2);
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
__m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1

a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
}

inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
{
v_uint64x2 t0, t1, t2;
v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
a = v_reinterpret_as_f64(t0);
b = v_reinterpret_as_f64(t1);
c = v_reinterpret_as_f64(t2);
}

// 2-channel
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3

a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}

inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7

__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7

a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
v_int16x8 sa, sb;
v_load_deinterleave((const short*)ptr, sa, sb);
a = v_reinterpret_as_u16(sa);
b = v_reinterpret_as_u16(sb);
}

inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
{
__m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val);
t1 = _mm_unpackhi_epi16(a.val, b.val);
_mm_storeu_si128((__m128i*)(ptr), t0);
_mm_storeu_si128((__m128i*)(ptr + 8), t1);
}
// store interleave

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
{
@ -1937,7 +1967,24 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
const v_uint8x16& c )
{
#if CV_SSSE3
#if CV_SSE4_1
static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);

static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
__m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
__m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
__m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);

_mm_storeu_si128((__m128i*)(ptr), v0);
_mm_storeu_si128((__m128i*)(ptr + 16), v1);
_mm_storeu_si128((__m128i*)(ptr + 32), v2);
#elif CV_SSSE3
static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
@ -2025,10 +2072,35 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
_mm_storeu_si128((__m128i*)(ptr + 48), v3);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b )
{
__m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val);
t1 = _mm_unpackhi_epi16(a.val, b.val);
_mm_storeu_si128((__m128i*)(ptr), t0);
_mm_storeu_si128((__m128i*)(ptr + 8), t1);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
const v_uint16x8& b,
const v_uint16x8& c )
{
#if CV_SSE4_1
static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
static const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);

__m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
__m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
__m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);

_mm_storeu_si128((__m128i*)ptr, v0);
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
_mm_storeu_si128((__m128i*)(ptr + 16), v2);
#else
__m128i z = _mm_setzero_si128();
__m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
__m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
@ -2060,6 +2132,7 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
_mm_storeu_si128((__m128i*)(ptr), v0);
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
_mm_storeu_si128((__m128i*)(ptr + 16), v2);
#endif
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
@ -2085,6 +2158,15 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16
_mm_storeu_si128((__m128i*)(ptr + 24), v3);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b )
{
__m128i t0 = _mm_unpacklo_epi32(a.val, b.val);
__m128i t1 = _mm_unpackhi_epi32(a.val, b.val);

_mm_storeu_si128((__m128i*)ptr, t0);
_mm_storeu_si128((__m128i*)(ptr + 4), t1);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c )
{
@ -2158,6 +2240,15 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
_mm_storeu_ps(ptr + 12, v3);
}

inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b)
{
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
__m128i t1 = _mm_unpackhi_epi64(a.val, b.val);

_mm_storeu_si128((__m128i*)ptr, t0);
_mm_storeu_si128((__m128i*)(ptr + 2), t1);
}

inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
@ -2169,58 +2260,72 @@ inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x
_mm_storeu_si128((__m128i*)(ptr + 4), t2);
}

inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d)
{
v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
__m128i t1 = _mm_unpacklo_epi64(c.val, d.val);
__m128i t2 = _mm_unpackhi_epi64(a.val, b.val);
__m128i t3 = _mm_unpackhi_epi64(c.val, d.val);

_mm_storeu_si128((__m128i*)ptr, t0);
_mm_storeu_si128((__m128i*)(ptr + 2), t1);
_mm_storeu_si128((__m128i*)(ptr + 4), t2);
_mm_storeu_si128((__m128i*)(ptr + 6), t3);
}

inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0 ) \
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
_Tpuvec a1, b1, c1; \
v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
a0 = v_reinterpret_as_##suffix(a1); \
b0 = v_reinterpret_as_##suffix(b1); \
c0 = v_reinterpret_as_##suffix(c1); \
_Tpvec1 a1, b1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
_Tpuvec a1, b1, c1, d1; \
v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
a0 = v_reinterpret_as_##suffix(a1); \
b0 = v_reinterpret_as_##suffix(b1); \
c0 = v_reinterpret_as_##suffix(c1); \
d0 = v_reinterpret_as_##suffix(d1); \
_Tpvec1 a1, b1, c1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
const _Tpvec& b0, const _Tpvec& c0 ) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
_Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
_Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
_Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
_Tpvec1 a1, b1, c1, d1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
const _Tpvec& c0, const _Tpvec& d0 ) \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
{ \
_Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
_Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
_Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
_Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
v_store_interleave((_Tp1*)ptr, a1, b1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
const _Tpvec0& c0, const _Tpvec0& d0 ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
_Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
//OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
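
// Editorial sketch (not part of the upstream diff): with the reworked macro above,
// every signed/float interleave call is a thin wrapper over the unsigned kernel of
// the same element width. Expanded by hand for the s16 case, it is equivalent to:
inline void v_load_deinterleave_s16_sketch(const short* ptr, v_int16x8& a0, v_int16x8& b0)
{
    v_uint16x8 a1, b1;
    v_load_deinterleave((const ushort*)ptr, a1, b1); // the unsigned kernel does the work
    a0 = v_reinterpret_as_s16(a1); // bit-exact reinterpretation, no value conversion
    b0 = v_reinterpret_as_s16(b1);
}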

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{

@ -298,6 +298,8 @@ OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)

/* Expand */
#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \

@ -871,6 +871,13 @@ public:
*/
TermCriteria(int type, int maxCount, double epsilon);

inline bool isValid() const
{
const bool isCount = (type & COUNT) && maxCount > 0;
const bool isEps = (type & EPS) && !cvIsNaN(epsilon);
return isCount || isEps;
}

int type; //!< the type of termination criteria: COUNT, EPS or COUNT + EPS
int maxCount; //!< the maximum number of iterations/elements
double epsilon; //!< the desired accuracy
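
// Editorial sketch (illustrative, not part of the upstream diff) of the new check:
//   cv::TermCriteria ok(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 1e-3);
//   CV_Assert(ok.isValid());    // positive maxCount and a non-NaN epsilon
//   cv::TermCriteria bad(cv::TermCriteria::EPS, 0, std::nan("")); // needs <cmath>
//   CV_Assert(!bad.isValid());  // EPS requested but epsilon is NaN, COUNT unset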

@ -629,7 +629,6 @@ CV_INLINE int cvIplDepth( int type )
#define CV_TYPE_NAME_MATND "opencv-nd-matrix"

#define CV_MAX_DIM 32
#define CV_MAX_DIM_HEAP 1024

/**
@deprecated consider using cv::Mat instead

@ -1725,8 +1725,8 @@ cvPtr1D( const CvArr* arr, int idx, int* _type )
else
{
int i, n = m->dims;
CV_DbgAssert( n <= CV_MAX_DIM_HEAP );
int _idx[CV_MAX_DIM_HEAP];
CV_DbgAssert( n <= CV_MAX_DIM );
int _idx[CV_MAX_DIM];

for( i = n - 1; i >= 0; i-- )
{

@ -8,223 +8,49 @@

namespace cv { namespace hal {

#if CV_NEON
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;

#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
store_func(dst, r); \
} \
}

#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
store_func(dst, r); \
} \
}

#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, const data_type* src3, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
r.val[3] = load_func(src3); \
store_func(dst, r); \
} \
}

MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );

MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );

MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );

#elif CV_SSE2

template <typename T>
struct VMerge2
#if CV_SIMD
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
VMerge2() : support(false) { }
void operator()(const T *, const T *, T *) const { }
int i;
const T* src0 = src[0];
const T* src1 = src[1];

bool support;
};

template <typename T>
struct VMerge3
{
VMerge3() : support(false) { }
void operator()(const T *, const T *, const T *, T *) const { }

bool support;
};

template <typename T>
struct VMerge4
{
VMerge4() : support(false) { }
void operator()(const T *, const T *, const T *, const T *, T *) const { }

bool support;
};

#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge2() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
} \
\
bool support; \
const int VECSZ = VecT::nlanes;
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
v_store_interleave(dst + i*cn, a, b);
}
}
else if( cn == 3 )
{
const T* src2 = src[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
v_store_interleave(dst + i*cn, a, b, c);
}
}
else
{
CV_Assert( cn == 4 );
const T* src2 = src[2];
const T* src3 = src[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
v_store_interleave(dst + i*cn, a, b, c, d);
}
}
vx_cleanup();
}
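
// Editorial note (not part of the upstream diff): the loops above intentionally step
// past the last full vector and clamp with i = std::min(len - VECSZ, i), so the final
// iteration re-reads and re-writes a few already-processed elements instead of taking
// a scalar tail. This assumes len >= VECSZ, which the merge8u/16u/32s/64s wrappers
// below guarantee with their len >= nlanes guard. A stripped-down scalar model of the
// pattern (needs <algorithm>; identifiers are illustrative):
static void copy_with_overlapping_tail(const int* src, int* dst, int len, int vecsz)
{
    for( int i = 0; i < len; i += vecsz )
    {
        i = std::min( len - vecsz, i );  // clamp: last block overlaps the previous one
        for( int k = 0; k < vecsz; k++ ) // stands in for one vector load + store
            dst[i + k] = src[i + k];
    }
}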

#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge3() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
} \
\
bool support; \
}

#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge4() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
const data_type * src2, const data_type * src3, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
} \
\
bool support; \
}

MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);

#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif

MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);

#endif

template<typename T> static void
@ -242,28 +68,6 @@ merge_( const T** src, T* dst, int len, int cn )
{
const T *src0 = src[0], *src1 = src[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;

VMerge2<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#elif CV_SSE2
if(cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;

VMerge2<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
@ -274,28 +78,6 @@ merge_( const T** src, T* dst, int len, int cn )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;

VMerge3<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#elif CV_SSE2
if(cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;

VMerge3<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
@ -307,28 +89,6 @@ merge_( const T** src, T* dst, int len, int cn )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;

VMerge4<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#elif CV_SSE2
if(cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;

VMerge4<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
@ -347,29 +107,48 @@ merge_( const T** src, T* dst, int len, int cn )
}
}


void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}

void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}

void merge32s(const int** src, int* dst, int len, int cn )
{
CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<int, v_int32>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}

void merge64s(const int64** src, int64* dst, int len, int cn )
{
CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<int64, v_int64>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}
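
// Editorial note (not part of the upstream diff): each wrapper above tries the HAL
// hook first, then the universal-intrinsics path only when len >= nlanes (the overlap
// trick in vecmerge_ needs at least one full vector) and 2 <= cn <= 4; anything else
// falls back to the scalar merge_. A typical way to reach this code (illustrative):
//   std::vector<cv::Mat> planes(3, cv::Mat::zeros(480, 640, CV_8UC1));
//   cv::Mat bgr;
//   cv::merge(planes, bgr); // dispatches to cv::hal::merge8u for CV_8U data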

}} // cv::hal::

@ -123,7 +123,6 @@ static char* icvJSONParseKey( CvFileStorage* fs, char* ptr, CvFileNode* map, CvF
CV_PARSE_ERROR( "Key must start with \'\"\'" );

char * beg = ptr + 1;
char * end = beg;

do {
++ptr;
@ -133,7 +132,7 @@ static char* icvJSONParseKey( CvFileStorage* fs, char* ptr, CvFileNode* map, CvF
if( *ptr != '"' )
CV_PARSE_ERROR( "Key must end with \'\"\'" );

end = ptr;
const char * end = ptr;
ptr++;
ptr = icvJSONSkipSpaces( fs, ptr );
if ( ptr == 0 || fs->dummy_eof )
@ -576,12 +575,12 @@ void icvJSONParse( CvFileStorage* fs )
if ( *ptr == '{' )
{
CvFileNode* root_node = (CvFileNode*)cvSeqPush( fs->roots, 0 );
ptr = icvJSONParseMap( fs, ptr, root_node );
icvJSONParseMap( fs, ptr, root_node );
}
else if ( *ptr == '[' )
{
CvFileNode* root_node = (CvFileNode*)cvSeqPush( fs->roots, 0 );
ptr = icvJSONParseSeq( fs, ptr, root_node );
icvJSONParseSeq( fs, ptr, root_node );
}
else
{
@ -668,7 +667,7 @@ void icvJSONWrite( CvFileStorage* fs, const char* key, const char* data )
*ptr++ = '\n';
*ptr++ = '\0';
::icvPuts( fs, fs->buffer_start );
ptr = fs->buffer = fs->buffer_start;
fs->buffer = fs->buffer_start;
}
ptr = icvFSFlush(fs);
}

@ -302,7 +302,7 @@ static void* icvReadSparseMat( CvFileStorage* fs, CvFileNode* node )
CvFileNode* sizes_node;
CvSeqReader reader;
CvSeq* elements;
int sizes[CV_MAX_DIM_HEAP], dims, elem_type, cn;
int sizes[CV_MAX_DIM], dims, elem_type, cn;
int i;

sizes_node = cvGetFileNodeByName( fs, node, "sizes" );
@ -327,7 +327,7 @@ static void* icvReadSparseMat( CvFileStorage* fs, CvFileNode* node )
mat = cvCreateSparseMat( dims, sizes, elem_type );

cn = CV_MAT_CN(elem_type);
int idx[CV_MAX_DIM_HEAP];
int idx[CV_MAX_DIM];
elements = data->data.seq;
cvStartReadRawData( fs, data, &reader );

@ -8,222 +8,57 @@

namespace cv { namespace hal {

#if CV_NEON
template<typename T> struct VSplit2;
template<typename T> struct VSplit3;
template<typename T> struct VSplit4;

#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, \
data_type* dst1) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
} \
}

#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
} \
}

#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2, data_type* dst3) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
store_func(dst3, r.val[3]); \
} \
}

SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );

SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );

SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );

#elif CV_SSE2

template <typename T>
struct VSplit2
#if CV_SIMD
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
VSplit2() : support(false) { }
void operator()(const T *, T *, T *) const { }
int i;
T* dst0 = dst[0];
T* dst1 = dst[1];

bool support;
};

template <typename T>
struct VSplit3
{
VSplit3() : support(false) { }
void operator()(const T *, T *, T *, T *) const { }

bool support;
};

template <typename T>
struct VSplit4
{
VSplit4() : support(false) { }
void operator()(const T *, T *, T *, T *, T *) const { }

bool support;
};

#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit2() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
} \
\
bool support; \
const int VECSZ = VecT::nlanes;
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a, b;
v_load_deinterleave(src + i*cn, a, b);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
}
}
else if( cn == 3 )
{
T* dst2 = dst[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a, b, c;
v_load_deinterleave(src + i*cn, a, b, c);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
}
}
else
{
CV_Assert( cn == 4 );
T* dst2 = dst[2];
T* dst3 = dst[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a, b, c, d;
v_load_deinterleave(src + i*cn, a, b, c, d);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
v_store(dst3 + i, d);
}
}
vx_cleanup();
}

#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit3() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1, data_type * dst2) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
} \
\
bool support; \
}

#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit4() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
data_type * dst2, data_type * dst3) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
} \
\
bool support; \
}

SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);

SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);

SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);

#endif

template<typename T> static void
@ -250,30 +85,6 @@ split_( const T* src, T** dst, int len, int cn )
T *dst0 = dst[0], *dst1 = dst[1];
i = j = 0;

#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;

VSplit2<T> vsplit;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
#elif CV_SSE2
if (cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;

VSplit2<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
@ -285,31 +96,6 @@ split_( const T* src, T** dst, int len, int cn )
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
i = j = 0;

#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;

VSplit3<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
#elif CV_SSE2
if (cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;

VSplit3<T> vsplit;

if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
@ -322,30 +108,6 @@ split_( const T* src, T** dst, int len, int cn )
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
i = j = 0;

#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;

VSplit4<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
#elif CV_SSE2
if (cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;

VSplit4<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
@ -367,25 +129,46 @@ split_( const T* src, T** dst, int len, int cn )
void split8u(const uchar* src, uchar** dst, int len, int cn )
{
CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn)
split_(src, dst, len, cn);

#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}

void split16u(const ushort* src, ushort** dst, int len, int cn )
{
CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}

void split32s(const int* src, int** dst, int len, int cn )
{
CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<int, v_int32>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}

void split64s(const int64* src, int64** dst, int len, int cn )
{
CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<int64, v_int64>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}

}} // cv::hal::

@ -1014,8 +1014,8 @@ protected:
Size mSize(rng.uniform(minMSize, maxMSize), rng.uniform(minMSize, maxMSize));
size_t mvSize = rng.uniform(1, maxMvSize);

int res = cvtest::TS::OK, curRes = res;
curRes = run_case(CV_8U, mvSize, mSize, rng);
int res = cvtest::TS::OK;
int curRes = run_case(CV_8U, mvSize, mSize, rng);
res = curRes != cvtest::TS::OK ? curRes : res;

curRes = run_case(CV_8S, mvSize, mSize, rng);

@ -173,7 +173,7 @@ void Core_RandTest::run( int )
dsz = slice+1 < maxSlice ? (int)(cvtest::randInt(rng) % (SZ - sz) + 1) : SZ - sz;
Mat aslice = arr[k].colRange(sz, sz + dsz);
tested_rng.fill(aslice, dist_type, A, B);
printf("%d - %d\n", sz, sz + dsz);
//printf("%d - %d\n", sz, sz + dsz);
}
}

@ -375,9 +375,11 @@ TEST(Core_Rand, Regression_Stack_Corruption)
int bufsz = 128; //enough for 14 doubles
AutoBuffer<uchar> buffer(bufsz);
size_t offset = 0;
cv::Mat_<cv::Point2d> x(2, 3, (cv::Point2d*)(buffer.data()+offset)); offset += x.total()*x.elemSize();
double& param1 = *(double*)(buffer.data()+offset); offset += sizeof(double);
double& param2 = *(double*)(buffer.data()+offset); offset += sizeof(double);
cv::Mat_<cv::Point2d> x(2, 3, (cv::Point2d*)(buffer.data()+offset));
offset += x.total()*x.elemSize();
double& param1 = *(double*)(buffer.data()+offset);
offset += sizeof(double);
double& param2 = *(double*)(buffer.data()+offset);
param1 = -9; param2 = 2;

cv::theRNG().fill(x, cv::RNG::NORMAL, param1, param2);

@ -120,3 +120,9 @@ if(BUILD_PERF_TESTS)
endif()
endif()
endif()

# Test Intel's Inference Engine models
if(HAVE_INF_ENGINE AND TARGET opencv_test_dnn)
ocv_target_include_directories(opencv_test_dnn PRIVATE ${INF_ENGINE_INCLUDE_DIRS})
ocv_target_link_libraries(opencv_test_dnn LINK_PRIVATE ${INF_ENGINE_LIBRARIES})
endif()

@ -46,9 +46,9 @@
#include <opencv2/core.hpp>

#if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_EXPERIMENTAL_NS
#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_v5 {
#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_v6 {
#define CV__DNN_EXPERIMENTAL_NS_END }
namespace cv { namespace dnn { namespace experimental_dnn_v5 { } using namespace experimental_dnn_v5; }}
namespace cv { namespace dnn { namespace experimental_dnn_v6 { } using namespace experimental_dnn_v6; }}
#else
#define CV__DNN_EXPERIMENTAL_NS_BEGIN
#define CV__DNN_EXPERIMENTAL_NS_END
@ -487,14 +487,19 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
CV_WRAP void setPreferableTarget(int targetId);

/** @brief Sets the new value for the layer output blob
* @param name descriptor of the updating layer output blob.
* @param blob new blob.
/** @brief Sets the new input value for the network
* @param blob A new blob. Should have CV_32F or CV_8U depth.
* @param name A name of input layer.
* @param scalefactor An optional normalization scale.
* @param mean Optional mean subtraction values.
* @see connect(String, String) to know format of the descriptor.
* @note If updating blob is not empty then @p blob must have the same shape,
* because network reshaping is not implemented yet.
*
* If scale or mean values are specified, a final input blob is computed
* as:
* \f[input(n,c,h,w) = scalefactor \times (blob(n,c,h,w) - mean_c)\f]
*/
CV_WRAP void setInput(InputArray blob, const String& name = "");
CV_WRAP void setInput(InputArray blob, const String& name = "",
double scalefactor = 1.0, const Scalar& mean = Scalar());
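
// Editorial sketch (illustrative, not part of the upstream diff): with the extended
// signature, per-network normalization can move out of blobFromImage into setInput;
// the model file names and the "data" layer name below are placeholders:
//   cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "model.caffemodel");
//   cv::Mat blob = cv::dnn::blobFromImage(img, 1.0, cv::Size(224, 224),
//                                         cv::Scalar(), true, false, CV_8U);
//   net.setInput(blob, "data", 1.0 / 255, cv::Scalar(104, 117, 123));
//   // effective input(n,c,h,w) = scalefactor * (blob(n,c,h,w) - mean_c), per the doc above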

/** @brief Sets the new value for the learned param of the layer.
* @param layer name or id of the layer.
@ -805,13 +810,15 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
* @param swapRB flag which indicates that swap first and last channels
* in 3-channel image is necessary.
* @param crop flag which indicates whether image will be cropped after resize or not
* @param ddepth Depth of output blob. Choose CV_32F or CV_8U.
* @details if @p crop is true, input image is resized so one side after resize is equal to corresponding
* dimension in @p size and another one is equal or larger. Then, crop from the center is performed.
* If @p crop is false, direct resize without cropping and preserving aspect ratio is performed.
* @returns 4-dimensional Mat with NCHW dimensions order.
*/
CV_EXPORTS_W Mat blobFromImage(InputArray image, double scalefactor=1.0, const Size& size = Size(),
const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true);
const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true,
int ddepth=CV_32F);
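
// Editorial sketch (illustrative, not part of the upstream diff): requesting a CV_8U
// blob keeps the data 8-bit through preprocessing; pairing it with a trivial
// scalefactor and mean here and doing the normalization later in setInput is the
// intended split (an assumption based on the parameters shown in this diff; the file
// name is a placeholder):
//   cv::Mat img = cv::imread("input.jpg");
//   cv::Mat blob = cv::dnn::blobFromImage(img, /*scalefactor*/ 1.0, cv::Size(300, 300),
//                                         /*mean*/ cv::Scalar(), /*swapRB*/ true,
//                                         /*crop*/ false, /*ddepth*/ CV_8U); // NCHW, 8-bit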

/** @brief Creates 4-dimensional blob from image.
* @details This is an overloaded member function, provided for convenience.
@ -819,7 +826,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
CV_EXPORTS void blobFromImage(InputArray image, OutputArray blob, double scalefactor=1.0,
const Size& size = Size(), const Scalar& mean = Scalar(),
bool swapRB=true, bool crop=true);
bool swapRB=true, bool crop=true, int ddepth=CV_32F);

/** @brief Creates 4-dimensional blob from series of images. Optionally resizes and
@ -833,13 +840,15 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
* @param swapRB flag which indicates that swap first and last channels
* in 3-channel image is necessary.
* @param crop flag which indicates whether image will be cropped after resize or not
* @param ddepth Depth of output blob. Choose CV_32F or CV_8U.
* @details if @p crop is true, input image is resized so one side after resize is equal to corresponding
* dimension in @p size and another one is equal or larger. Then, crop from the center is performed.
* If @p crop is false, direct resize without cropping and preserving aspect ratio is performed.
* @returns 4-dimansional Mat with NCHW dimensions order.
* @returns 4-dimensional Mat with NCHW dimensions order.
*/
CV_EXPORTS_W Mat blobFromImages(InputArrayOfArrays images, double scalefactor=1.0,
Size size = Size(), const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true);
Size size = Size(), const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true,
int ddepth=CV_32F);
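
A sketch of the batched variant under the same assumptions (frame0 and frame1 are assumed to be two already loaded, same-sized BGR images):

    std::vector<cv::Mat> frames = {frame0, frame1};
    cv::Mat batch = cv::dnn::blobFromImages(frames, 1.0, cv::Size(224, 224),
                                            cv::Scalar(), true, false, CV_8U);
    // NCHW layout: batch.size[0] == 2, one plane set per input image.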

/** @brief Creates 4-dimensional blob from series of images.
* @details This is an overloaded member function, provided for convenience.
@ -847,7 +856,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
CV_EXPORTS void blobFromImages(InputArrayOfArrays images, OutputArray blob,
double scalefactor=1.0, Size size = Size(),
const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true);
const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true,
int ddepth=CV_32F);

/** @brief Parse a 4D blob and output the images it contains as 2D arrays through a simpler data structure
* (std::vector<cv::Mat>).

@ -97,35 +97,42 @@ namespace
}

Mat blobFromImage(InputArray image, double scalefactor, const Size& size,
const Scalar& mean, bool swapRB, bool crop)
const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
CV_TRACE_FUNCTION();
Mat blob;
blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop);
blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop, ddepth);
return blob;
}

void blobFromImage(InputArray image, OutputArray blob, double scalefactor,
const Size& size, const Scalar& mean, bool swapRB, bool crop)
const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
CV_TRACE_FUNCTION();
std::vector<Mat> images(1, image.getMat());
blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop);
blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
}

Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size,
const Scalar& mean, bool swapRB, bool crop)
const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
CV_TRACE_FUNCTION();
Mat blob;
blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop);
blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
return blob;
}

void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalefactor,
Size size, const Scalar& mean_, bool swapRB, bool crop)
Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth)
{
CV_TRACE_FUNCTION();
CV_CheckType(ddepth, ddepth == CV_32F || ddepth == CV_8U, "Blob depth should be CV_32F or CV_8U");
if (ddepth == CV_8U)
{
CV_CheckEQ(scalefactor, 1.0, "Scaling is not supported for CV_8U blob depth");
CV_Assert(mean_ == Scalar(), "Mean subtraction is not supported for CV_8U blob depth");
}

std::vector<Mat> images;
images_.getMatVector(images);
CV_Assert(!images.empty());
@ -149,7 +156,7 @@ void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalef
else
resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
}
if(images[i].depth() == CV_8U)
if(images[i].depth() == CV_8U && ddepth == CV_32F)
images[i].convertTo(images[i], CV_32F);
Scalar mean = mean_;
if (swapRB)
@ -167,20 +174,20 @@ void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalef
if (nch == 3 || nch == 4)
{
int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
blob_.create(4, sz, CV_32F);
blob_.create(4, sz, ddepth);
Mat blob = blob_.getMat();
Mat ch[4];

for( i = 0; i < nimages; i++ )
{
image = images[i];
CV_Assert(image.depth() == CV_32F);
CV_Assert(image.depth() == blob_.depth());
nch = image.channels();
CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
CV_Assert(image.size() == image0.size());

for( int j = 0; j < nch; j++ )
ch[j] = Mat(image.rows, image.cols, CV_32F, blob.ptr((int)i, j));
ch[j] = Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, j));
if(swapRB)
std::swap(ch[0], ch[2]);
split(image, ch);
@ -190,18 +197,18 @@ void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalef
{
CV_Assert(nch == 1);
int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
blob_.create(4, sz, CV_32F);
blob_.create(4, sz, ddepth);
Mat blob = blob_.getMat();

for( i = 0; i < nimages; i++ )
{
Mat image = images[i];
CV_Assert(image.depth() == CV_32F);
CV_Assert(image.depth() == blob_.depth());
nch = image.channels();
CV_Assert(image.dims == 2 && (nch == 1));
CV_Assert(image.size() == image0.size());

image.copyTo(Mat(image.rows, image.cols, CV_32F, blob.ptr((int)i, 0)));
image.copyTo(Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, 0)));
}
}
}
@ -408,7 +415,16 @@ struct LayerData
//fake layer containing network input blobs
struct DataLayer : public Layer
{
void finalize(const std::vector<Mat*>&, std::vector<Mat>&) CV_OVERRIDE {}
DataLayer() : Layer()
{
skip = false;
}

virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1;
}

void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) CV_OVERRIDE
{
@ -423,11 +439,36 @@ struct DataLayer : public Layer

void forward(std::vector<Mat*>&, std::vector<Mat>& outputs, std::vector<Mat> &) CV_OVERRIDE
{
// Supported modes:
// | Input type | Output type |
// | fp32 | fp32 |
// | uint8 | fp32 |
for (int i = 0; i < inputsData.size(); ++i)
{
if (inputsData[i].type() == CV_32F && outputs[i].type() == CV_16S)
double scale = scaleFactors[i];
Scalar& mean = means[i];
CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4,
outputs[i].type() == CV_32F);

bool singleMean = true;
for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
{
convertFp16(inputsData[i], outputs[i]);
singleMean = mean[j] == mean[j - 1];
}

if (singleMean)
{
inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
}
else
{
for (int n = 0; n < inputsData[i].size[0]; ++n)
for (int c = 0; c < inputsData[i].size[1]; ++c)
{
Mat inp = getPlane(inputsData[i], n, c);
Mat out = getPlane(outputs[i], n, c);
inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
}
}
}
}
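
The single-mean fast path above relies on convertTo() computing dst = alpha*src + beta, so passing alpha = scale and beta = -mean[0]*scale evaluates scale*(src - mean) in one pass; a standalone sketch of that identity:

    cv::Mat src(1, 4, CV_8U, cv::Scalar(100));
    cv::Mat dst;
    double scale = 0.5, mean = 10.0;
    src.convertTo(dst, CV_32F, scale, -mean * scale);  // each element: 0.5 * (100 - 10) = 45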
@ -435,13 +476,66 @@ struct DataLayer : public Layer
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
if (outputs_.depth() == CV_16S)
// Supported modes:
// | Input type | Output type |
// | fp32 | fp32 |
// | fp32 | fp16 |
// | uint8 | fp32 |
std::vector<UMat> outputs;
outputs_.getUMatVector(outputs);

for (int i = 0; i < inputsData.size(); ++i)
{
std::vector<UMat> outputs;
outputs_.getUMatVector(outputs);
for (int i = 0; i < inputsData.size(); ++i)
double scale = scaleFactors[i];
Scalar& mean = means[i];

CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
bool singleMean = true;
for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
{
convertFp16(inputsData[i], outputs[i]);
singleMean = mean[j] == mean[j - 1];
}

if (outputs_.depth() == CV_16S)
{
if (singleMean)
convertFp16(scale * (inputsData[i] - mean[0]), outputs[i]);
else
{
for (int n = 0; n < inputsData[i].size[0]; ++n)
for (int c = 0; c < inputsData[i].size[1]; ++c)
{
Mat inp = getPlane(inputsData[i], n, c);

std::vector<cv::Range> plane(4, Range::all());
plane[0] = Range(n, n + 1);
plane[1] = Range(c, c + 1);
UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

convertFp16(scale * (inp - mean[c]), out);
}
}
}
else
{
CV_Assert(outputs_.depth() == CV_32F);
if (singleMean)
inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
else
{
for (int n = 0; n < inputsData[i].size[0]; ++n)
for (int c = 0; c < inputsData[i].size[1]; ++c)
{
Mat inp = getPlane(inputsData[i], n, c);

std::vector<cv::Range> plane(4, Range::all());
plane[0] = Range(n, n + 1);
plane[1] = Range(c, c + 1);
UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
}
}
}
}
return true;
@ -469,8 +563,61 @@ struct DataLayer : public Layer
return false;
}

void finalize(const std::vector<Mat*>&, std::vector<Mat>& outputs) CV_OVERRIDE
{
CV_Assert(outputs.size() == scaleFactors.size(), outputs.size() == means.size(),
inputsData.size() == outputs.size());
skip = true;
for (int i = 0; skip && i < inputsData.size(); ++i)
{
if (inputsData[i].data != outputs[i].data || scaleFactors[i] != 1.0 || means[i] != Scalar())
skip = false;
}
}

virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
InferenceEngine::LayerParams lp;
lp.name = name;
lp.type = "ScaleShift";
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::ScaleShiftLayer> ieLayer(new InferenceEngine::ScaleShiftLayer(lp));

CV_Assert(inputsData.size() == 1, inputsData[0].dims == 4);
const size_t numChannels = inputsData[0].size[1];
CV_Assert(numChannels <= 4);

// Scale
auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
{numChannels});
weights->allocate();
weights->set(std::vector<float>(numChannels, scaleFactors[0]));
ieLayer->_weights = weights;

// Mean subtraction
auto biases = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
{numChannels});
biases->allocate();
std::vector<float> biasesVec(numChannels);
for (int i = 0; i < numChannels; ++i)
{
biasesVec[i] = -means[0][i] * scaleFactors[0];
}
biases->set(biasesVec);
ieLayer->_biases = biases;

return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif  // HAVE_INF_ENGINE
return Ptr<BackendNode>();
}

std::vector<String> outNames;
// Preprocessing parameters for each network's input.
std::vector<double> scaleFactors;
std::vector<Scalar> means;
std::vector<Mat> inputsData;
bool skip;
};

struct BlobManager
@ -739,7 +886,7 @@ struct Net::Impl
netInputLayer = Ptr<DataLayer>(new DataLayer());
LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second;
inpl.id = 0;
inpl.name = "_input";
netInputLayer->name = inpl.name = "_input";
inpl.type = "__NetInputLayer__";
inpl.layerInstance = netInputLayer;
layerNameToId.insert(std::make_pair(inpl.name, inpl.id));
@ -930,6 +1077,11 @@ struct Net::Impl
clear();

allocateLayers(blobsToKeep_);

MapIdToLayerData::iterator it = layers.find(0);
CV_Assert(it != layers.end());
it->second.skip = netInputLayer->skip;

initBackend();

if (!netWasAllocated )
@ -1179,6 +1331,29 @@ struct Net::Impl
MapIdToLayerData::iterator it;
Ptr<InfEngineBackendNet> net;

for (it = layers.begin(); it != layers.end(); ++it)
{
LayerData &ld = it->second;
if (ld.id == 0)
{
CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) ||
(netInputLayer->outNames.size() == ld.outputBlobsWrappers.size()));
for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
{
InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
}
}
else
{
for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
{
InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
dataPtr->name = ld.name;
}
}
}

if (skipInfEngineInit)
{
Ptr<BackendNode> node = layers[lastLayerId].backendNodes[preferableBackend];
@ -1190,11 +1365,21 @@ struct Net::Impl
for (it = layers.begin(); it != layers.end(); ++it)
{
LayerData &ld = it->second;

for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
if (ld.id == 0)
{
InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
dataPtr->name = ld.id == 0 ? netInputLayer->outNames[i] : ld.name;
for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
{
InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
dataPtr->name = netInputLayer->outNames[i];
}
}
else
{
for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
{
InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
dataPtr->name = ld.name;
}
}
ieNode->net->addBlobs(ld.inputBlobsWrappers);
ieNode->net->addBlobs(ld.outputBlobsWrappers);
@ -1210,11 +1395,11 @@ struct Net::Impl
// some of layers is not implemented.

// Set of all input and output blobs wrappers for current network.
std::map<int, Ptr<BackendWrapper> > netBlobsWrappers;
std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
for (it = layers.begin(); it != layers.end(); ++it)
{
LayerData &ld = it->second;
if (ld.id == 0)
if (ld.id == 0 && ld.skip)
continue;
bool fused = ld.skip;

@ -1251,20 +1436,17 @@ struct Net::Impl
// So we need to rewrap all the external blobs.
for (int i = 0; i < ld.inputBlobsId.size(); ++i)
{
int lid = ld.inputBlobsId[i].lid;
LayerData &inpLd = layers[lid];
auto it = netBlobsWrappers.find(lid);
LayerPin inPin = ld.inputBlobsId[i];
auto it = netBlobsWrappers.find(inPin);
if (it == netBlobsWrappers.end())
{
ld.inputBlobsWrappers[i] = wrap(*ld.inputBlobs[i]);
auto dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
dataPtr->name = inpLd.name;
netBlobsWrappers[lid] = ld.inputBlobsWrappers[i];
ld.inputBlobsWrappers[i] = InfEngineBackendWrapper::create(ld.inputBlobsWrappers[i]);
netBlobsWrappers[inPin] = ld.inputBlobsWrappers[i];
}
else
ld.inputBlobsWrappers[i] = it->second;
}
netBlobsWrappers[ld.id] = ld.outputBlobsWrappers[0];
netBlobsWrappers[LayerPin(ld.id, 0)] = ld.outputBlobsWrappers[0];

Ptr<BackendNode> node;
if (!net.empty())
@ -2343,7 +2525,7 @@ void Net::setInputsNames(const std::vector<String> &inputBlobNames)
impl->netInputLayer->setNames(inputBlobNames);
}

void Net::setInput(InputArray blob, const String& name)
void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
@ -2360,6 +2542,8 @@ void Net::setInput(InputArray blob, const String& name)
ld.outputBlobs.resize(numInputs);
ld.outputBlobsWrappers.resize(numInputs);
impl->netInputLayer->inputsData.resize(numInputs);
impl->netInputLayer->scaleFactors.resize(numInputs);
impl->netInputLayer->means.resize(numInputs);

MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
Mat blob_ = blob.getMat();
@ -2378,6 +2562,8 @@ void Net::setInput(InputArray blob, const String& name)
{
ld.outputBlobsWrappers[pin.oid]->setHostDirty();
}
impl->netInputLayer->scaleFactors[pin.oid] = scalefactor;
impl->netInputLayer->means[pin.oid] = mean;
impl->netWasAllocated = impl->netWasAllocated && oldShape;
}

@ -560,7 +560,7 @@ public:
int ngroups = ngroups_, batchSize = input_->size[0]*ngroups;
int outW = output_->size[3], outH = output_->size[2], outCn = output_->size[1]/ngroups;
int width = input_->size[3], height = input_->size[2], inpCn = input_->size[1]/ngroups;
int nstripes = nstripes_;
const int nstripes = nstripes_;
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int pad_w = pad_.width, pad_h = pad_.height;
int stride_w = stride_.width, stride_h = stride_.height;
@ -587,7 +587,6 @@ public:
int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1);
r.start *= samplesPerStripe;
r.end *= samplesPerStripe;
nstripes *= samplesPerStripe;
stripeSize = outPlaneSize;
}

@ -866,6 +865,16 @@ public:
for (int i = 0; i < inputs.size(); ++i)
CV_Assert(inputs[i].u != outputs[0].u);

if (umat_blobs.empty())
{
size_t n = blobs.size();
umat_blobs.resize(n);
for (size_t i = 0; i < n; i++)
{
blobs[i].copyTo(umat_blobs[i]);
}
}

if (convolutionOp.empty())
{
OCL4DNNConvConfig config;
@ -1637,14 +1646,6 @@ public:
Ptr<BaseConvolutionLayer> ConvolutionLayer::create(const LayerParams &params)
{
Ptr<ConvolutionLayerImpl> l(new ConvolutionLayerImpl(params));

#ifdef HAVE_OPENCL
size_t n = params.blobs.size();
l->umat_blobs.resize(n);
for (int i = 0; i < n; i++)
l->umat_blobs[i] = params.blobs[i].getUMat(ACCESS_READ);
#endif

return l;
}

@ -187,7 +187,7 @@ public:
int c, j, k, n = nsrcs;
const float* coeffsptr = coeffs && !coeffs->empty() ? &coeffs->at(0) : 0;
float* dstptr0 = dst->ptr<float>();
int blockSize0 = 1 << 12, blockSize = blockSize0;
int blockSize0 = 1 << 12, blockSize;

for( size_t ofs = stripeStart; ofs < stripeEnd; ofs += blockSize )
{

@ -190,6 +190,7 @@ public:

size_t num = total(shape(inp0.size), 0, startAxis);
size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1);
CV_Assert(num * numPlanes != 0);
size_t planeSize = inp0.total() / (num * numPlanes);
for (size_t n = 0; n < num; ++n)
{

@ -189,18 +189,16 @@ public:
else
outTailShape_.assign(1, _numOut);

int _numTimeStamps, _numSamples;
int _numSamples;
if (useTimestampDim)
{
CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp);
_numTimeStamps = inp0[0];
_numSamples = inp0[1];
outResShape.push_back(_numTimeStamps);
outResShape.push_back(inp0[0]);
}
else
{
CV_Assert(inp0.size() >= 2 && total(inp0, 1) == _numInp);
_numTimeStamps = 1;
_numSamples = inp0[0];
}

@ -14,7 +14,7 @@ namespace cv { namespace dnn {
class ResizeLayerImpl : public ResizeLayer
{
public:
ResizeLayerImpl(const LayerParams& params) : scaleWidth(0), scaleHeight(0)
ResizeLayerImpl(const LayerParams& params) : zoomFactorWidth(0), zoomFactorHeight(0), scaleWidth(0), scaleHeight(0)
{
setParamsFrom(params);
outWidth = params.get<float>("width", 0);

@ -563,10 +563,10 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
}

if (use_half_ && bias_half.empty() && !bias.empty())
convertFp16((UMat&)bias, bias_half);
convertFp16(bias, bias_half);

if (use_half_ && weights_half.empty())
convertFp16((UMat&)weight, weights_half);
convertFp16(weight, weights_half);

prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
if (bestKernelConfig.empty())

@ -68,19 +68,32 @@ static InferenceEngine::DataPtr wrapToInfEngineDataNode(const Mat& m, const std:
{
std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
std::reverse(reversedShape.begin(), reversedShape.end());
return InferenceEngine::DataPtr(
new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, estimateLayout(m))
);
if (m.type() == CV_32F)
return InferenceEngine::DataPtr(
new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, estimateLayout(m))
);
else if (m.type() == CV_8U)
return InferenceEngine::DataPtr(
new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::U8, estimateLayout(m))
);
else
CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
}

InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape,
InferenceEngine::Layout layout)
InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape,
InferenceEngine::Layout layout)
{
return InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
layout, shape, (float*)m.data);
if (m.type() == CV_32F)
return InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
layout, shape, (float*)m.data);
else if (m.type() == CV_8U)
return InferenceEngine::make_shared_blob<uint8_t>(InferenceEngine::Precision::U8,
layout, shape, (uint8_t*)m.data);
else
CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
}

InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout)
InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout)
{
std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
std::reverse(reversedShape.begin(), reversedShape.end());
@ -102,6 +115,24 @@ InfEngineBackendWrapper::InfEngineBackendWrapper(int targetId, const cv::Mat& m)
blob = wrapToInfEngineBlob(m, estimateLayout(m));
}

InfEngineBackendWrapper::InfEngineBackendWrapper(Ptr<BackendWrapper> wrapper)
: BackendWrapper(DNN_BACKEND_INFERENCE_ENGINE, wrapper->targetId)
{
Ptr<InfEngineBackendWrapper> ieWrapper = wrapper.dynamicCast<InfEngineBackendWrapper>();
CV_Assert(!ieWrapper.empty());
InferenceEngine::DataPtr srcData = ieWrapper->dataPtr;
dataPtr = InferenceEngine::DataPtr(
new InferenceEngine::Data(srcData->name, srcData->dims, srcData->precision,
srcData->layout)
);
blob = ieWrapper->blob;
}

Ptr<BackendWrapper> InfEngineBackendWrapper::create(Ptr<BackendWrapper> wrapper)
{
return Ptr<BackendWrapper>(new InfEngineBackendWrapper(wrapper));
}

InfEngineBackendWrapper::~InfEngineBackendWrapper()
{

@ -149,10 +180,15 @@ InferenceEngine::Precision InfEngineBackendNet::getPrecision() noexcept
return precision;
}

InferenceEngine::Precision InfEngineBackendNet::getPrecision() const noexcept
{
return precision;
}

// Assume that outputs of network is unconnected blobs.
void InfEngineBackendNet::getOutputsInfo(InferenceEngine::OutputsDataMap &outputs_) noexcept
{
outputs_ = outputs;
const_cast<const InfEngineBackendNet*>(this)->getOutputsInfo(outputs_);
}
void InfEngineBackendNet::getOutputsInfo(InferenceEngine::OutputsDataMap &outputs_) const noexcept
{
@ -162,7 +198,7 @@ void InfEngineBackendNet::getOutputsInfo(InferenceEngine::OutputsDataMap &output
// Returns input references that aren't connected to internal outputs.
void InfEngineBackendNet::getInputsInfo(InferenceEngine::InputsDataMap &inputs_) noexcept
{
inputs_ = inputs;
const_cast<const InfEngineBackendNet*>(this)->getInputsInfo(inputs_);
}

// Returns input references that aren't connected to internal outputs.
@ -173,7 +209,11 @@ void InfEngineBackendNet::getInputsInfo(InferenceEngine::InputsDataMap &inputs_)

InferenceEngine::InputInfo::Ptr InfEngineBackendNet::getInput(const std::string &inputName) noexcept
{
getInputsInfo(inputs);
return const_cast<const InfEngineBackendNet*>(this)->getInput(inputName);
}

InferenceEngine::InputInfo::Ptr InfEngineBackendNet::getInput(const std::string &inputName) const noexcept
{
const auto& it = inputs.find(inputName);
CV_Assert(it != inputs.end());
return it->second;
@ -187,7 +227,17 @@ void InfEngineBackendNet::getName(char*, size_t) const noexcept
{
}

const std::string& InfEngineBackendNet::getName() const noexcept
{
return name;
}

size_t InfEngineBackendNet::layerCount() noexcept
{
return const_cast<const InfEngineBackendNet*>(this)->layerCount();
}

size_t InfEngineBackendNet::layerCount() const noexcept
{
return layers.size();
}
@ -227,6 +277,13 @@ InfEngineBackendNet::addOutput(const std::string &layerName, size_t outputIndex,
InferenceEngine::StatusCode
InfEngineBackendNet::getLayerByName(const char *layerName, InferenceEngine::CNNLayerPtr &out,
InferenceEngine::ResponseDesc *resp) noexcept
{
return const_cast<const InfEngineBackendNet*>(this)->getLayerByName(layerName, out, resp);
}

InferenceEngine::StatusCode InfEngineBackendNet::getLayerByName(const char *layerName,
InferenceEngine::CNNLayerPtr &out,
InferenceEngine::ResponseDesc *resp) const noexcept
{
for (auto& l : layers)
{
@ -254,7 +311,12 @@ InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() noexcept
return targetDevice;
}

InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t size) noexcept
InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() const noexcept
{
return targetDevice;
}

InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t) noexcept
{
CV_Error(Error::StsNotImplemented, "");
return InferenceEngine::StatusCode::OK;
@ -329,6 +391,7 @@ void InfEngineBackendNet::init(int targetId)
{
CV_Assert(allBlobs.find(it.first) != allBlobs.end());
inpBlobs[it.first] = allBlobs[it.first];
it.second->setPrecision(inpBlobs[it.first]->precision());
}

// Set up output blobs.
@ -342,7 +405,9 @@ void InfEngineBackendNet::init(int targetId)
switch (targetId)
{
case DNN_TARGET_CPU: setTargetDevice(InferenceEngine::TargetDevice::eCPU); break;
case DNN_TARGET_OPENCL_FP16: setPrecision(InferenceEngine::Precision::FP16); // Fallback to the next.
case DNN_TARGET_OPENCL_FP16:
setPrecision(InferenceEngine::Precision::FP16);
/* Falls through. */
case DNN_TARGET_OPENCL: setTargetDevice(InferenceEngine::TargetDevice::eGPU); break;
case DNN_TARGET_MYRIAD:
{
@ -363,9 +428,8 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)

try
{
static std::map<std::string, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
std::string deviceName = InferenceEngine::getDeviceName(targetDevice);
auto pluginIt = sharedPlugins.find(deviceName);
static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
auto pluginIt = sharedPlugins.find(targetDevice);
if (pluginIt != sharedPlugins.end())
{
enginePtr = pluginIt->second;
@ -373,7 +437,7 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
else
{
enginePtr = InferenceEngine::PluginDispatcher({""}).getSuitablePlugin(targetDevice);
sharedPlugins[deviceName] = enginePtr;
sharedPlugins[targetDevice] = enginePtr;

if (targetDevice == InferenceEngine::TargetDevice::eCPU)
{
@ -427,7 +491,7 @@ void InfEngineBackendNet::addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs
auto wrappers = infEngineWrappers(ptrs);
for (const auto& wrapper : wrappers)
{
allBlobs[wrapper->dataPtr->name] = wrapper->blob;
allBlobs.insert({wrapper->dataPtr->name, wrapper->blob});
}
}

@ -8,6 +8,8 @@
#ifndef __OPENCV_DNN_OP_INF_ENGINE_HPP__
#define __OPENCV_DNN_OP_INF_ENGINE_HPP__

#include "opencv2/core/cvdef.h"

#ifdef HAVE_INF_ENGINE
#if defined(__GNUC__) && __GNUC__ >= 5
//#pragma GCC diagnostic push
@ -34,7 +36,9 @@ public:

void setPrecision(InferenceEngine::Precision p) noexcept;

virtual InferenceEngine::Precision getPrecision() noexcept CV_OVERRIDE;
virtual InferenceEngine::Precision getPrecision() noexcept;

virtual InferenceEngine::Precision getPrecision() const noexcept;

virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) noexcept /*CV_OVERRIDE*/;

@ -44,13 +48,19 @@ public:

virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) const noexcept /*CV_OVERRIDE*/;

virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) noexcept CV_OVERRIDE;
virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) noexcept;

virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) const noexcept;

virtual void getName(char *pName, size_t len) noexcept;

virtual void getName(char *pName, size_t len) const noexcept;

virtual size_t layerCount() noexcept CV_OVERRIDE;
virtual const std::string& getName() const noexcept;

virtual size_t layerCount() noexcept;

virtual size_t layerCount() const noexcept;

virtual InferenceEngine::DataPtr& getData(const char *dname) noexcept CV_OVERRIDE;

@ -58,15 +68,21 @@ public:

virtual InferenceEngine::StatusCode addOutput(const std::string &layerName,
size_t outputIndex = 0,
InferenceEngine::ResponseDesc *resp = nullptr) noexcept CV_OVERRIDE;
InferenceEngine::ResponseDesc *resp = nullptr) noexcept;

virtual InferenceEngine::StatusCode getLayerByName(const char *layerName,
InferenceEngine::CNNLayerPtr &out,
InferenceEngine::ResponseDesc *resp) noexcept CV_OVERRIDE;
InferenceEngine::ResponseDesc *resp) noexcept;

virtual InferenceEngine::StatusCode getLayerByName(const char *layerName,
InferenceEngine::CNNLayerPtr &out,
InferenceEngine::ResponseDesc *resp) const noexcept;

virtual void setTargetDevice(InferenceEngine::TargetDevice device) noexcept CV_OVERRIDE;

virtual InferenceEngine::TargetDevice getTargetDevice() noexcept CV_OVERRIDE;
virtual InferenceEngine::TargetDevice getTargetDevice() noexcept;

virtual InferenceEngine::TargetDevice getTargetDevice() const noexcept;

virtual InferenceEngine::StatusCode setBatchSize(const size_t size) noexcept CV_OVERRIDE;

@ -94,6 +110,8 @@ private:
InferenceEngine::ExecutableNetwork netExec;
InferenceEngine::InferRequest infRequest;

std::string name;

void initPlugin(InferenceEngine::ICNNNetwork& net);
};

@ -115,19 +133,23 @@ class InfEngineBackendWrapper : public BackendWrapper
public:
InfEngineBackendWrapper(int targetId, const Mat& m);

InfEngineBackendWrapper(Ptr<BackendWrapper> wrapper);

~InfEngineBackendWrapper();

static Ptr<BackendWrapper> create(Ptr<BackendWrapper> wrapper);

virtual void copyToHost() CV_OVERRIDE;

virtual void setHostDirty() CV_OVERRIDE;

InferenceEngine::DataPtr dataPtr;
InferenceEngine::TBlob<float>::Ptr blob;
InferenceEngine::Blob::Ptr blob;
};

InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout = InferenceEngine::Layout::ANY);
InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout = InferenceEngine::Layout::ANY);

InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape, InferenceEngine::Layout layout);
InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape, InferenceEngine::Layout layout);

InferenceEngine::DataPtr infEngineDataNode(const Ptr<BackendWrapper>& ptr);

@ -771,6 +771,13 @@ void TFImporter::populateNet(Net dstNet)
type = layer.op();
}

// For the object detection networks, TensorFlow Object Detection API
// predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax)
// order. We can manage it at DetectionOutput layer parsing predictions
// or shuffle last convolution's weights.
bool locPredTransposed = hasLayerAttr(layer, "loc_pred_transposed") &&
getLayerAttr(layer, "loc_pred_transposed").b();

layerParams.set("bias_term", false);
layerParams.blobs.resize(1);

@ -784,18 +791,32 @@ void TFImporter::populateNet(Net dstNet)
blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]);
ExcludeLayer(net, weights_layer_index, 0, false);
layers_to_ignore.insert(next_layers[0].first);

// Shuffle bias from yxYX to xyXY.
if (locPredTransposed)
{
const int numWeights = layerParams.blobs[1].total();
float* biasData = reinterpret_cast<float*>(layerParams.blobs[1].data);
CV_Assert(numWeights % 4 == 0);
for (int i = 0; i < numWeights; i += 2)
{
std::swap(biasData[i], biasData[i + 1]);
}
}
}

const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id);
kernelFromTensor(kernelTensor, layerParams.blobs[0]);
releaseTensor(const_cast<tensorflow::TensorProto*>(&kernelTensor));
int* kshape = layerParams.blobs[0].size.p;
const int outCh = kshape[0];
const int inCh = kshape[1];
const int height = kshape[2];
const int width = kshape[3];
if (type == "DepthwiseConv2dNative")
{
CV_Assert(!locPredTransposed);
const int chMultiplier = kshape[0];
const int inCh = kshape[1];
const int height = kshape[2];
const int width = kshape[3];

Mat copy = layerParams.blobs[0].clone();
float* src = (float*)copy.data;
@ -814,9 +835,21 @@ void TFImporter::populateNet(Net dstNet)
size_t* kstep = layerParams.blobs[0].step.p;
kstep[0] = kstep[1]; // fix steps too
}
layerParams.set("kernel_h", kshape[2]);
layerParams.set("kernel_w", kshape[3]);
layerParams.set("num_output", kshape[0]);
layerParams.set("kernel_h", height);
layerParams.set("kernel_w", width);
layerParams.set("num_output", outCh);

// Shuffle output channels from yxYX to xyXY.
if (locPredTransposed)
{
const int slice = height * width * inCh;
for (int i = 0; i < outCh; i += 2)
{
cv::Mat src(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i));
cv::Mat dst(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i + 1));
std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
}
}

setStrides(layerParams, layer);
setPadding(layerParams, layer);

@ -107,12 +107,10 @@ TEST_P(Convolution, Accuracy)
if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD)
throw SkipTestException("");

// TODO: unstable test cases
if (backendId == DNN_BACKEND_OPENCV && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) &&
inChannels == 6 && outChannels == 9 && group == 1 && inSize == Size(5, 6) &&
kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1) && dilation == Size(1, 1) &&
hasBias)
throw SkipTestException("");
if (cvtest::skipUnstableTests && backendId == DNN_BACKEND_OPENCV &&
(targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) &&
kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1))
throw SkipTestException("Skip unstable test");

int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width};
Mat weights(4, &sz[0], CV_32F);

modules/dnn/test/test_ie_models.cpp (new file)
@ -0,0 +1,238 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "test_precomp.hpp"

#ifdef HAVE_INF_ENGINE
#include <opencv2/core/utils/filesystem.hpp>

#include <inference_engine.hpp>
#include <ie_icnn_network.hpp>
#include <ie_extension.h>

namespace opencv_test { namespace {

static void initDLDTDataPath()
{
#ifndef WINRT
static bool initialized = false;
if (!initialized)
{
const char* dldtTestDataPath = getenv("INTEL_CVSDK_DIR");
if (dldtTestDataPath)
cvtest::addDataSearchPath(cv::utils::fs::join(dldtTestDataPath, "deployment_tools"));
initialized = true;
}
#endif
}

using namespace cv;
using namespace cv::dnn;
using namespace InferenceEngine;

static inline void genData(const std::vector<size_t>& dims, Mat& m, Blob::Ptr& dataPtr)
{
std::vector<int> reversedDims(dims.begin(), dims.end());
std::reverse(reversedDims.begin(), reversedDims.end());

m.create(reversedDims, CV_32F);
randu(m, -1, 1);

dataPtr = make_shared_blob<float>(Precision::FP32, dims, (float*)m.data);
}

void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
std::map<std::string, cv::Mat>& inputsMap, std::map<std::string, cv::Mat>& outputsMap)
{
CNNNetReader reader;
reader.ReadNetwork(xmlPath);
reader.ReadWeights(binPath);

CNNNetwork net = reader.getNetwork();

InferenceEnginePluginPtr enginePtr;
InferencePlugin plugin;
ExecutableNetwork netExec;
InferRequest infRequest;
TargetDevice targetDevice;
switch (target)
{
case DNN_TARGET_CPU:
targetDevice = TargetDevice::eCPU;
break;
case DNN_TARGET_OPENCL:
case DNN_TARGET_OPENCL_FP16:
targetDevice = TargetDevice::eGPU;
break;
case DNN_TARGET_MYRIAD:
targetDevice = TargetDevice::eMYRIAD;
break;
default:
CV_Error(Error::StsNotImplemented, "Unknown target");
};

try
{
enginePtr = PluginDispatcher({""}).getSuitablePlugin(targetDevice);

if (targetDevice == TargetDevice::eCPU)
{
std::string suffixes[] = {"_avx2", "_sse4", ""};
bool haveFeature[] = {
checkHardwareSupport(CPU_AVX2),
checkHardwareSupport(CPU_SSE4_2),
true
};
for (int i = 0; i < 3; ++i)
{
if (!haveFeature[i])
continue;
#ifdef _WIN32
std::string libName = "cpu_extension" + suffixes[i] + ".dll";
#else
std::string libName = "libcpu_extension" + suffixes[i] + ".so";
#endif  // _WIN32
try
{
IExtensionPtr extension = make_so_pointer<IExtension>(libName);
enginePtr->AddExtension(extension, 0);
break;
}
catch(...) {}
}
// Some of networks can work without a library of extra layers.
}
plugin = InferencePlugin(enginePtr);

netExec = plugin.LoadNetwork(net, {});
infRequest = netExec.CreateInferRequest();
}
catch (const std::exception& ex)
{
CV_Error(Error::StsAssert, format("Failed to initialize Inference Engine backend: %s", ex.what()));
}

// Fill input blobs.
inputsMap.clear();
BlobMap inputBlobs;
for (auto& it : net.getInputsInfo())
{
genData(it.second->getDims(), inputsMap[it.first], inputBlobs[it.first]);
}
infRequest.SetInput(inputBlobs);

// Fill output blobs.
outputsMap.clear();
BlobMap outputBlobs;
for (auto& it : net.getOutputsInfo())
{
genData(it.second->dims, outputsMap[it.first], outputBlobs[it.first]);
}
infRequest.SetOutput(outputBlobs);

infRequest.Infer();
}

std::vector<String> getOutputsNames(const Net& net)
{
std::vector<String> names;
if (names.empty())
{
std::vector<int> outLayers = net.getUnconnectedOutLayers();
std::vector<String> layersNames = net.getLayerNames();
names.resize(outLayers.size());
for (size_t i = 0; i < outLayers.size(); ++i)
names[i] = layersNames[outLayers[i] - 1];
}
return names;
}

void runCV(Target target, const std::string& xmlPath, const std::string& binPath,
const std::map<std::string, cv::Mat>& inputsMap,
std::map<std::string, cv::Mat>& outputsMap)
{
Net net = readNet(xmlPath, binPath);
for (auto& it : inputsMap)
net.setInput(it.second, it.first);
net.setPreferableTarget(target);

std::vector<String> outNames = getOutputsNames(net);
std::vector<Mat> outs;
net.forward(outs, outNames);

outputsMap.clear();
EXPECT_EQ(outs.size(), outNames.size());
for (int i = 0; i < outs.size(); ++i)
{
EXPECT_TRUE(outputsMap.insert({outNames[i], outs[i]}).second);
}
}

typedef TestWithParam<tuple<Target, String> > DNNTestOpenVINO;
TEST_P(DNNTestOpenVINO, models)
{
Target target = (dnn::Target)(int)get<0>(GetParam());
std::string modelName = get<1>(GetParam());

if (modelName == "semantic-segmentation-adas-0001" && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("");

std::string precision = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "FP16" : "FP32";
std::string prefix = utils::fs::join("intel_models",
utils::fs::join(modelName,
utils::fs::join(precision, modelName)));
std::string xmlPath = findDataFile(prefix + ".xml");
std::string binPath = findDataFile(prefix + ".bin");

std::map<std::string, cv::Mat> inputsMap;
std::map<std::string, cv::Mat> ieOutputsMap, cvOutputsMap;
runIE(target, xmlPath, binPath, inputsMap, ieOutputsMap);
runCV(target, xmlPath, binPath, inputsMap, cvOutputsMap);

EXPECT_EQ(ieOutputsMap.size(), cvOutputsMap.size());
for (auto& srcIt : ieOutputsMap)
{
auto dstIt = cvOutputsMap.find(srcIt.first);
CV_Assert(dstIt != cvOutputsMap.end());
double normInf = cvtest::norm(srcIt.second, dstIt->second, cv::NORM_INF);
EXPECT_EQ(normInf, 0);
}
}

static testing::internal::ParamGenerator<String> intelModels()
{
initDLDTDataPath();
std::vector<String> modelsNames;

std::string path;
try
{
path = findDataDirectory("intel_models", false);
}
catch (...)
{
std::cerr << "ERROR: Can't find OpenVINO models. Check INTEL_CVSDK_DIR environment variable (run setup.sh)" << std::endl;
return ValuesIn(modelsNames); // empty list
}

cv::utils::fs::glob_relative(path, "", modelsNames, false, true);

modelsNames.erase(
std::remove_if(modelsNames.begin(), modelsNames.end(),
[&](const String& dir){ return !utils::fs::isDirectory(utils::fs::join(path, dir)); }),
modelsNames.end()
);
CV_Assert(!modelsNames.empty());

return ValuesIn(modelsNames);
}

INSTANTIATE_TEST_CASE_P(/**/, DNNTestOpenVINO, Combine(
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16), intelModels()
));

}}
#endif // HAVE_INF_ENGINE
@ -291,7 +291,7 @@ TEST_P(Test_Caffe_layers, Fused_Concat)
|
||||
|
||||
TEST_P(Test_Caffe_layers, Eltwise)
|
||||
{
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE)
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
|
||||
throw SkipTestException("");
|
||||
testLayerUsingCaffeModels("layer_eltwise");
|
||||
}
|
||||
@ -939,6 +939,25 @@ TEST(Layer_Test_Convolution_DLDT, Accuracy)
|
||||
ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat");
|
||||
}
|
||||
|
||||
TEST(Layer_Test_Convolution_DLDT, setInput_uint8)
|
||||
{
|
||||
Mat inp = blobFromNPY(_tf("blob.npy"));
|
||||
|
||||
Mat inputs[] = {Mat(inp.dims, inp.size, CV_8U), Mat()};
|
||||
randu(inputs[0], 0, 255);
|
||||
inputs[0].convertTo(inputs[1], CV_32F);
|
||||
|
||||
Mat outs[2];
|
||||
for (int i = 0; i < 2; ++i)
|
||||
{
|
||||
Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin"));
|
||||
net.setInput(inputs[i]);
|
||||
outs[i] = net.forward();
|
||||
ASSERT_EQ(outs[i].type(), CV_32F);
|
||||
}
|
||||
normAssert(outs[0], outs[1]);
|
||||
}
|
||||
|
||||
// 1. Create a .prototxt file with the following network:
|
||||
// layer {
|
||||
// type: "Input" name: "data" top: "data"
|
||||
@ -961,22 +980,65 @@ TEST(Layer_Test_Convolution_DLDT, Accuracy)
|
||||
// net.save('/path/to/caffemodel')
|
||||
//
|
||||
// 3. Convert using ModelOptimizer.
|
||||
TEST(Test_DLDT, two_inputs)
|
||||
typedef testing::TestWithParam<tuple<int, int> > Test_DLDT_two_inputs;
|
||||
TEST_P(Test_DLDT_two_inputs, as_IR)
|
||||
{
|
||||
int firstInpType = get<0>(GetParam());
|
||||
int secondInpType = get<1>(GetParam());
|
||||
// TODO: It looks like a bug in Inference Engine.
|
||||
if (secondInpType == CV_8U)
|
||||
throw SkipTestException("");
|
||||
|
||||
Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin"));
|
||||
int inpSize[] = {1, 2, 3};
|
||||
Mat firstInp(3, &inpSize[0], CV_32F);
|
||||
Mat secondInp(3, &inpSize[0], CV_32F);
|
||||
randu(firstInp, -1, 1);
|
||||
randu(secondInp, -1, 1);
|
||||
Mat firstInp(3, &inpSize[0], firstInpType);
|
||||
Mat secondInp(3, &inpSize[0], secondInpType);
|
||||
randu(firstInp, 0, 255);
|
||||
randu(secondInp, 0, 255);
|
||||
|
||||
net.setInput(firstInp, "data");
|
||||
net.setInput(secondInp, "second_input");
|
||||
Mat out = net.forward();
|
||||
|
||||
normAssert(out, firstInp + secondInp);
|
||||
Mat ref;
|
||||
cv::add(firstInp, secondInp, ref, Mat(), CV_32F);
|
||||
normAssert(out, ref);
|
||||
}
|
||||
|
||||
TEST_P(Test_DLDT_two_inputs, as_backend)
|
||||
{
|
||||
static const float kScale = 0.5f;
|
||||
static const float kScaleInv = 1.0f / kScale;
|
||||
|
||||
Net net;
|
||||
LayerParams lp;
|
||||
lp.type = "Eltwise";
|
||||
lp.name = "testLayer";
|
||||
lp.set("operation", "sum");
|
||||
int eltwiseId = net.addLayerToPrev(lp.name, lp.type, lp); // connect to a first input
|
||||
net.connect(0, 1, eltwiseId, 1); // connect to a second input
|
||||
|
||||
int inpSize[] = {1, 2, 3};
|
||||
Mat firstInp(3, &inpSize[0], get<0>(GetParam()));
|
||||
Mat secondInp(3, &inpSize[0], get<1>(GetParam()));
|
||||
randu(firstInp, 0, 255);
|
||||
randu(secondInp, 0, 255);
|
||||
|
||||
net.setInputsNames({"data", "second_input"});
|
||||
net.setInput(firstInp, "data", kScale);
|
||||
net.setInput(secondInp, "second_input", kScaleInv);
|
||||
net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
|
||||
Mat out = net.forward();
|
||||
|
||||
Mat ref;
|
||||
addWeighted(firstInp, kScale, secondInp, kScaleInv, 0, ref, CV_32F);
|
||||
normAssert(out, ref);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_DLDT_two_inputs, Combine(
|
||||
Values(CV_8U, CV_32F), Values(CV_8U, CV_32F)
|
||||
));
|
||||
|
||||
class UnsupportedLayer : public Layer
|
||||
{
|
||||
public:
|
||||
|
@ -138,4 +138,44 @@ TEST(LayerFactory, custom_layers)
|
||||
LayerFactory::unregisterLayer("CustomType");
|
||||
}
|
||||
|
||||
typedef testing::TestWithParam<tuple<float, Vec3f, int, tuple<Backend, Target> > > setInput;
|
||||
TEST_P(setInput, normalization)
|
||||
{
|
||||
const float kScale = get<0>(GetParam());
|
||||
const Scalar kMean = get<1>(GetParam());
|
||||
const int dtype = get<2>(GetParam());
|
||||
const int backend = get<0>(get<3>(GetParam()));
|
||||
const int target = get<1>(get<3>(GetParam()));
|
||||
const bool kSwapRB = true;
|
||||
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && !checkMyriadTarget())
|
||||
throw SkipTestException("Myriad is not available/disabled in OpenCV");
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16 && dtype != CV_32F)
|
||||
throw SkipTestException("");
|
||||
|
||||
Mat inp(5, 5, CV_8UC3);
|
||||
randu(inp, 0, 255);
|
||||
Mat ref = blobFromImage(inp, kScale, Size(), kMean, kSwapRB, /*crop*/false);
|
||||
|
||||
LayerParams lp;
|
||||
Net net;
|
||||
net.addLayerToPrev("testLayer", "Identity", lp);
|
||||
net.setPreferableBackend(backend);
|
||||
net.setPreferableTarget(target);
|
||||
|
||||
Mat blob = blobFromImage(inp, 1.0, Size(), Scalar(), kSwapRB, /*crop*/false, dtype);
|
||||
ASSERT_EQ(blob.type(), dtype);
|
||||
net.setInput(blob, "", kScale, kMean);
|
||||
Mat out = net.forward();
|
||||
ASSERT_EQ(out.type(), CV_32F);
|
||||
normAssert(ref, out, "", 4e-4, 1e-3);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(/**/, setInput, Combine(
|
||||
Values(1.0f, 1.0 / 127.5),
|
||||
Values(Vec3f(), Vec3f(50, 50, 50), Vec3f(10, 50, 140)),
|
||||
Values(CV_32F, CV_8U),
|
||||
dnnBackendsAndTargets()
|
||||
));
|
||||
|
||||
}} // namespace
|
||||
|
@ -309,7 +309,7 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
|
||||
0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527,
|
||||
0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : default_l1;
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.025 : default_lInf;
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.09 : default_lInf;
normAssertDetections(ref, out, "", 0.5, scoreDiff, iouDiff);
}

@ -1236,7 +1236,6 @@ BriskScaleSpace::isMax2D(const int layer, const int x_layer, const int y_layer)
{
// in this case, we have to analyze the situation more carefully:
// the values are gaussian blurred and then we really decide
data = scores.ptr() + y_layer * scorescols + x_layer;
int smoothedcenter = 4 * center + 2 * (s_10 + s10 + s0_1 + s01) + s_1_1 + s1_1 + s_11 + s11;
for (unsigned int i = 0; i < deltasize; i += 2)
{

@ -1312,8 +1311,7 @@ BriskScaleSpace::refine3D(const int layer, const int x_layer, const int y_layer,
int s_2_2 = l.getAgastScore_5_8(x_layer + 1, y_layer + 1, 1);
max_below = std::max(s_2_2, max_below);

max_below_float = subpixel2D(s_0_0, s_0_1, s_0_2, s_1_0, s_1_1, s_1_2, s_2_0, s_2_1, s_2_2, delta_x_below,
delta_y_below);
subpixel2D(s_0_0, s_0_1, s_0_2, s_1_0, s_1_1, s_1_2, s_2_0, s_2_1, s_2_2, delta_x_below, delta_y_below);
max_below_float = (float)max_below;
}
else

@ -373,8 +373,6 @@ void KAZEFeatures::Determinant_Hessian(std::vector<KeyPoint>& kpts)
is_out = true;
}

is_out = false;

if (is_out == false) {
if (is_repeated == false) {
kpts.push_back(kpts_par_[i][j]);

@ -175,7 +175,6 @@ std::map<int, ExifEntry_t > ExifReader::getExif()
CV_THROW (ExifParsingError());
}
m_stream.read( reinterpret_cast<char*>(&m_data[0]), exifSize - offsetToTiffHeader );
count = m_stream.gcount();
exifFound = true;
break;

@ -265,7 +265,7 @@ bool BmpDecoder::readData( Mat& img )
for(;;)
{
int code = m_strm.getWord();
int len = code & 255;
const int len = code & 255;
code >>= 8;
if( len != 0 ) // encoded mode
{

@ -304,16 +304,13 @@ bool BmpDecoder::readData( Mat& img )
else
{
int x_shift3 = (int)(line_end - data);
int y_shift = m_height - y;

if( code == 2 )
{
x_shift3 = m_strm.getByte()*nch;
y_shift = m_strm.getByte();
m_strm.getByte();
}

len = x_shift3 + ((y_shift * width3) & ((code == 0) - 1));

if( color )
data = FillUniColor( data, line_end, step, width3,
y, m_height, x_shift3,

@ -689,7 +689,7 @@ bool PAMEncoder::write( const Mat& img, const std::vector<int>& params )
tmp += sprintf( buffer + tmp, "MAXVAL %d\n", (1 << img.elemSize1()*8) - 1);
if (fmt)
tmp += sprintf( buffer + tmp, "TUPLTYPE %s\n", fmt->name );
tmp += sprintf( buffer + tmp, "ENDHDR\n" );
sprintf( buffer + tmp, "ENDHDR\n" );

strm.putBytes( buffer, (int)strlen(buffer) );
/* write data */

@ -255,22 +255,21 @@ bool TiffDecoder::readHeader()
{
case 8:
m_type = CV_MAKETYPE(CV_8U, photometric > 1 ? wanted_channels : 1);
result = true;
break;
case 16:
m_type = CV_MAKETYPE(CV_16U, photometric > 1 ? wanted_channels : 1);
result = true;
break;

case 32:
m_type = CV_MAKETYPE(CV_32F, photometric > 1 ? 3 : 1);
result = true;
break;
case 64:
m_type = CV_MAKETYPE(CV_64F, photometric > 1 ? 3 : 1);
result = true;
break;

default:
result = false;
}
result = true;
}
}

@ -855,7 +855,6 @@ icvTraceContour_32s( int *ptr, int step, int *stop_ptr, int is_hole )
for( ;; )
{
CV_Assert(i3 != NULL);
s_end = s;
s = std::min(s, MAX_SIZE - 1);

while( s < MAX_SIZE - 1 )

@ -1479,7 +1478,7 @@ icvFindContoursInInterval( const CvArr* src,
cv::Ptr<CvMemStorage> storage01;
CvSeq* first = 0;

int i, j, k, n;
int j, k, n;

uchar* src_data = 0;
int img_step = 0;

@ -1547,7 +1546,6 @@ icvFindContoursInInterval( const CvArr* src,

// First line. None of runs is binded
tmp.pt.y = 0;
i = 0;
CV_WRITE_SEQ_ELEM( tmp, writer );
upper_line = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer );

@ -1580,7 +1578,7 @@ icvFindContoursInInterval( const CvArr* src,
last_elem = tmp_prev;
tmp_prev->next = 0;

for( i = 1; i < img_size.height; i++ )
for( int i = 1; i < img_size.height; i++ )
{
//------// Find runs in next line
src_data += img_step;

@ -338,7 +338,6 @@ LineAA( Mat& img, Point2l pt1, Point2l pt2, const void* color )

if( ax > ay )
{
dx = ax;
dy = (dy ^ j) - j;
pt1.x ^= pt2.x & j;
pt2.x ^= pt1.x & j;

@ -362,7 +361,6 @@ LineAA( Mat& img, Point2l pt1, Point2l pt2, const void* color )
}
else
{
dy = ay;
dx = (dx ^ i) - i;
pt1.x ^= pt2.x & i;
pt2.x ^= pt1.x & i;

@ -677,7 +675,6 @@ Line2( Mat& img, Point2l pt1, Point2l pt2, const void* color)

if( ax > ay )
{
dx = ax;
dy = (dy ^ j) - j;
pt1.x ^= pt2.x & j;
pt2.x ^= pt1.x & j;

@ -692,7 +689,6 @@ Line2( Mat& img, Point2l pt1, Point2l pt2, const void* color)
}
else
{
dy = ay;
dx = (dx ^ i) - i;
pt1.x ^= pt2.x & i;
pt2.x ^= pt1.x & i;

@ -128,8 +128,6 @@ int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, f
for( k = 1; k <= ksize2; k++ )
{
f = _mm_set1_ps(ky[k]);
S = src[k] + i;
S2 = src[-k] + i;
x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
}

@ -144,7 +142,7 @@ int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, f
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
int i = 0, k;
const float *S, *S2;
const float *S2;
const __m128 d4 = _mm_set1_ps(delta);
const __m256 d8 = _mm256_set1_ps(delta);

@ -152,11 +150,10 @@ int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst,
{
__m256 f, s0 = d8, s1 = d8;
__m256 x0;
S = src[0] + i;

for (k = 1; k <= ksize2; k++)
{
S = src[k] + i;
const float *S = src[k] + i;
S2 = src[-k] + i;
f = _mm256_set1_ps(ky[k]);
x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));

@ -4284,7 +4284,7 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
size_t src_step = _src.step(), src_offset = _src.offset();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

if (esz == 0
if (esz == 0 || src_step == 0
|| (src_offset % src_step) % esz != 0
|| (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
|| !(borderType == BORDER_CONSTANT

@ -467,7 +467,7 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
if( rect )
*rect = Rect();

int i, connectivity = flags & 255;
int i;
union {
uchar b[4];
int i[4];

@ -491,9 +491,8 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
CV_Error( CV_StsBadArg, "Number of channels in input image must be 1 or 3" );
}

if( connectivity == 0 )
connectivity = 4;
else if( connectivity != 4 && connectivity != 8 )
const int connectivity = flags & 255;
if( connectivity != 0 && connectivity != 4 && connectivity != 8 )
CV_Error( CV_StsBadFlag, "Connectivity must be 4, 0(=4) or 8" );

bool is_simple = mask.empty() && (flags & FLOODFILL_MASK_ONLY) == 0;

@ -1930,7 +1930,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
Mat planes[2];
NAryMatIterator it(arrays, planes);
double result = 0;
int j, len = (int)it.size;
int j;

CV_Assert( H1.type() == H2.type() && H1.depth() == CV_32F );

@ -1946,7 +1946,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
{
const float* h1 = it.planes[0].ptr<float>();
const float* h2 = it.planes[1].ptr<float>();
len = it.planes[0].rows*it.planes[0].cols*H1.channels();
const int len = it.planes[0].rows*it.planes[0].cols*H1.channels();
j = 0;

if( (method == CV_COMP_CHISQR) || (method == CV_COMP_CHISQR_ALT))

@ -413,7 +413,6 @@ HoughLinesSDiv( InputArray image, OutputArray lines, int type,
// Find peaks in maccum...
for( index = 0; index < sfn; index++ )
{
i = 0;
int pos = (int)(lst.size() - 1);
if( pos < 0 || lst[pos].value < mcaccum[index] )
{

@ -401,7 +401,6 @@ static void findMinimumAreaEnclosingTriangle(const std::vector<cv::Point2f> &pol

a = 1;
b = 2;
c = 0;

// Main algorithm steps

@ -370,6 +370,7 @@ static void cvUndistortPointsInternal( const CvMat* _src, CvMat* _dst, const CvM
const CvMat* _distCoeffs,
const CvMat* matR, const CvMat* matP, cv::TermCriteria criteria)
{
CV_Assert(criteria.isValid());
double A[3][3], RR[3][3], k[14]={0,0,0,0,0,0,0,0,0,0,0,0,0,0};
CvMat matA=cvMat(3, 3, CV_64F, A), _Dk;
CvMat _RR=cvMat(3, 3, CV_64F, RR);

@ -1187,7 +1187,7 @@ public:
prev_dEdw_sign[i] = Mat::zeros(weights[i].size(), CV_8S);
dEdw[i] = Mat::zeros(weights[i].size(), CV_64F);
}

CV_Assert(total > 0);
int dcount0 = max_buf_size/(2*total);
dcount0 = std::max( dcount0, 1 );
dcount0 = std::min( dcount0, count );
@ -5,10 +5,9 @@
#include "test_precomp.hpp"

namespace opencv_test
{
namespace opencv_test { namespace {

String qrcode_images_name[] = {
std::string qrcode_images_name[] = {
"20110817_030.jpg",
"20110817_048.jpg",
"img_20120226_161648.jpg",

@ -25,24 +24,25 @@ String qrcode_images_name[] = {

TEST(Objdetect_QRCode, generate_test_data)
{
String root = cvtest::TS::ptr()->get_data_path() + "qrcode/";
String dataset_config = cvtest::TS::ptr()->get_data_path() + "qrcode/dataset_config.json";
const std::string root = "qrcode/";
const std::string dataset_config = findDataFile(root + "dataset_config.json");
FileStorage file_config(dataset_config, FileStorage::WRITE);

file_config << "test_images" << "[";
size_t images_count = sizeof(qrcode_images_name) / sizeof(String);
size_t images_count = sizeof(qrcode_images_name) / sizeof(qrcode_images_name[0]);
for (size_t i = 0; i < images_count; i++)
{
file_config << "{:" << "image_name" << qrcode_images_name[i];
String image_path = root + qrcode_images_name[i];
std::vector<Point> transform;
std::string image_path = findDataFile(root + qrcode_images_name[i]);
std::vector<Point> corners;
Mat src = imread(image_path, IMREAD_GRAYSCALE);
EXPECT_TRUE(detectQRCode(src, transform));
ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path;
EXPECT_TRUE(detectQRCode(src, corners));
file_config << "x" << "[:";
for (size_t j = 0; j < transform.size(); j++) { file_config << transform[j].x; }
for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].x; }
file_config << "]";
file_config << "y" << "[:";
for (size_t j = 0; j < transform.size(); j++) { file_config << transform[j].y; }
for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].y; }
file_config << "]" << "}";
}
file_config << "]";

@ -51,65 +51,65 @@ TEST(Objdetect_QRCode, generate_test_data)

#else

typedef testing::TestWithParam< String > Objdetect_QRCode;
typedef testing::TestWithParam< std::string > Objdetect_QRCode;
TEST_P(Objdetect_QRCode, regression)
{
String root = cvtest::TS::ptr()->get_data_path() + "qrcode/";
String dataset_config = cvtest::TS::ptr()->get_data_path() + "qrcode/dataset_config.json";
FileStorage file_config(dataset_config, FileStorage::READ);
const std::string name_current_image = GetParam();
const std::string root = "qrcode/";
const int pixels_error = 3;

std::vector<Point> corners;
String image_path = root + String(GetParam());
std::string image_path = findDataFile(root + name_current_image);
Mat src = imread(image_path, IMREAD_GRAYSCALE);
ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path;

std::vector<Point> corners;
EXPECT_TRUE(detectQRCode(src, corners));

if (file_config.isOpened())
const std::string dataset_config = findDataFile(root + "dataset_config.json", false);
FileStorage file_config(dataset_config, FileStorage::READ);
ASSERT_TRUE(file_config.isOpened()) << "Can't read validation data: " << dataset_config;
{
FileNode images_list = file_config["test_images"];
int index = 0, images_count = static_cast<int>(images_list.size());
ASSERT_GT(images_count, 0);
size_t images_count = static_cast<size_t>(images_list.size());
ASSERT_GT(images_count, 0u) << "Can't find validation data entries in 'test_images': " << dataset_config;

bool runTestsFlag = false;
String name_current_image = String(GetParam());
for (; index < images_count; index++)
for (size_t index = 0; index < images_count; index++)
{
String name_test_image = images_list[index]["image_name"];
FileNode config = images_list[(int)index];
std::string name_test_image = config["image_name"];
if (name_test_image == name_current_image)
{
for (int i = 0; i < 4; i++)
{
int x = images_list[index]["x"][i];
int y = images_list[index]["y"][i];
int x = config["x"][i];
int y = config["y"][i];
EXPECT_NEAR(x, corners[i].x, pixels_error);
EXPECT_NEAR(y, corners[i].y, pixels_error);
}
runTestsFlag = true;
return; // done
}
}
if (!runTestsFlag)
{
std::cout << "Not found results for " << name_current_image;
std::cout << " image in dataset_config.json file." << std::endl;
}

file_config.release();
}
else
{
std::cout << " Not found dataset_config.json file." << std::endl;
std::cerr
<< "Not found results for '" << name_current_image
<< "' image in config file:" << dataset_config << std::endl
<< "Re-run tests with enabled UPDATE_QRCODE_TEST_DATA macro to update test data."
<< std::endl;
}
}

INSTANTIATE_TEST_CASE_P(objdetect, Objdetect_QRCode, testing::ValuesIn(qrcode_images_name));
INSTANTIATE_TEST_CASE_P(/**/, Objdetect_QRCode, testing::ValuesIn(qrcode_images_name));

TEST(Objdetect_QRCode, not_found_qrcode)

TEST(Objdetect_QRCode_basic, not_found_qrcode)
{
std::vector<Point> corners;
Mat zero_image = Mat::zeros(256, 256, CV_8UC1);
EXPECT_FALSE(detectQRCode(zero_image, corners));
}

#endif

} // namespace

#endif // UPDATE_QRCODE_TEST_DATA

}} // namespace

@ -1563,8 +1563,6 @@ PyObject* pyopencv_from(const Moments& m)
"nu30", m.nu30, "nu21", m.nu21, "nu12", m.nu12, "nu03", m.nu03);
}

#include "pyopencv_custom_headers.h"

static int OnError(int status, const char *func_name, const char *err_msg, const char *file_name, int line, void *userdata)
{
PyGILState_STATE gstate;

@ -1802,6 +1800,7 @@ static int convert_to_char(PyObject *o, char *dst, const char *name = "no_name")
# pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#endif

#include "pyopencv_custom_headers.h"
#include "pyopencv_generated_types.h"
#include "pyopencv_generated_funcs.h"
25 modules/python/test/test_videoio.py Normal file

@ -0,0 +1,25 @@
#!/usr/bin/env python
from __future__ import print_function

import numpy as np
import cv2 as cv

from tests_common import NewOpenCVTests

class Bindings(NewOpenCVTests):

    def check_name(self, name):
        #print(name)
        self.assertFalse(name == None)
        self.assertFalse(name == "")

    def test_registry(self):
        self.check_name(cv.videoio_registry.getBackendName(cv.CAP_ANY));
        self.check_name(cv.videoio_registry.getBackendName(cv.CAP_FFMPEG))
        self.check_name(cv.videoio_registry.getBackendName(cv.CAP_OPENCV_MJPEG))
        backends = cv.videoio_registry.getBackends()
        for backend in backends:
            self.check_name(cv.videoio_registry.getBackendName(backend))

if __name__ == '__main__':
    NewOpenCVTests.bootstrap()
@ -103,6 +103,7 @@ using std::pair;
using std::make_pair;
using testing::TestWithParam;
using testing::Values;
using testing::ValuesIn;
using testing::Combine;

using cv::Mat;

@ -654,6 +655,11 @@ void addDataSearchSubDirectory(const std::string& subdir);
*/
std::string findDataFile(const std::string& relative_path, bool required = true);

/*! @brief Try to find requested data directory
@sa findDataFile
*/
std::string findDataDirectory(const std::string& relative_path, bool required = true);

#ifndef __CV_TEST_EXEC_ARGS
#if defined(_MSC_VER) && (_MSC_VER <= 1400)

@ -44,13 +44,13 @@ extern int testThreads;

#undef TEST
#define TEST_(test_case_name, test_name, BODY_IMPL) \
class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public ::testing::Test {\
#define TEST_(test_case_name, test_name, parent_class, bodyMethodName, BODY_IMPL) \
class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
public:\
GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
private:\
virtual void TestBody() CV_OVERRIDE;\
virtual void Body();\
virtual void bodyMethodName();\
static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
GTEST_DISALLOW_COPY_AND_ASSIGN_(\
GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\

@ -62,14 +62,14 @@ extern int testThreads;
#test_case_name, #test_name, NULL, NULL, \
::testing::internal::CodeLocation(__FILE__, __LINE__), \
(::testing::internal::GetTestTypeId()), \
::testing::Test::SetUpTestCase, \
::testing::Test::TearDownTestCase, \
parent_class::SetUpTestCase, \
parent_class::TearDownTestCase, \
new ::testing::internal::TestFactoryImpl<\
GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() BODY_IMPL( #test_case_name "_" #test_name ) \
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::Body()
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::bodyMethodName()

#define TEST(test_case_name, test_name) TEST_(test_case_name, test_name, CV__TEST_BODY_IMPL)
#define TEST(test_case_name, test_name) TEST_(test_case_name, test_name, ::testing::Test, Body, CV__TEST_BODY_IMPL)

#define CV__TEST_BIGDATA_BODY_IMPL(name) \
{ \

@ -92,9 +92,9 @@ extern int testThreads;

// Special type of tests which require / use or validate processing of huge amount of data (>= 2Gb)
#if defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__)
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, test_name, CV__TEST_BIGDATA_BODY_IMPL)
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, test_name, ::testing::Test, Body, CV__TEST_BIGDATA_BODY_IMPL)
#else
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, DISABLED_ ## test_name, CV__TEST_BIGDATA_BODY_IMPL)
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, DISABLED_ ## test_name, ::testing::Test, Body, CV__TEST_BIGDATA_BODY_IMPL)
#endif

#undef TEST_F

@ -546,17 +546,7 @@ void PrintTo(const Size& sz, ::std::ostream* os);
// EXPECT_TRUE(foo.StatusIsOK());
// }
#define PERF_TEST(test_case_name, test_name)\
namespace PERF_PROXY_NAMESPACE_NAME_(test_case_name, test_name) {\
class TestBase {/*compile error for this class means that you are trying to use perf::TestBase as a fixture*/};\
class test_case_name : public ::perf::TestBase {\
public:\
test_case_name() {}\
protected:\
virtual void PerfTestBody();\
};\
TEST_F(test_case_name, test_name){ CV__PERF_TEST_BODY_IMPL(#test_case_name "_" #test_name); }\
}\
void PERF_PROXY_NAMESPACE_NAME_(test_case_name, test_name)::test_case_name::PerfTestBody()
TEST_(test_case_name, test_name, ::perf::TestBase, PerfTestBody, CV__PERF_TEST_BODY_IMPL)

// Defines a performance test that uses a test fixture.
//
@ -772,16 +772,24 @@ void addDataSearchSubDirectory(const std::string& subdir)
TS::ptr()->data_search_subdir.push_back(subdir);
}

std::string findDataFile(const std::string& relative_path, bool required)
static std::string findData(const std::string& relative_path, bool required, bool findDirectory)
{
#define TEST_TRY_FILE_WITH_PREFIX(prefix) \
{ \
std::string path = path_join(prefix, relative_path); \
/*printf("Trying %s\n", path.c_str());*/ \
FILE* f = fopen(path.c_str(), "rb"); \
if(f) { \
fclose(f); \
return path; \
if (findDirectory) \
{ \
if (isDirectory(path)) \
return path; \
} \
else \
{ \
FILE* f = fopen(path.c_str(), "rb"); \
if(f) { \
fclose(f); \
return path; \
} \
} \
}

@ -842,11 +850,21 @@ std::string findDataFile(const std::string& relative_path, bool required)
}
#endif
#endif
const char* type = findDirectory ? "directory" : "data file";
if (required)
CV_Error(cv::Error::StsError, cv::format("OpenCV tests: Can't find required data file: %s", relative_path.c_str()));
throw SkipTestException(cv::format("OpenCV tests: Can't find data file: %s", relative_path.c_str()));
CV_Error(cv::Error::StsError, cv::format("OpenCV tests: Can't find required %s: %s", type, relative_path.c_str()));
throw SkipTestException(cv::format("OpenCV tests: Can't find %s: %s", type, relative_path.c_str()));
}

std::string findDataFile(const std::string& relative_path, bool required)
{
return findData(relative_path, required, false);
}

std::string findDataDirectory(const std::string& relative_path, bool required)
{
return findData(relative_path, required, true);
}

} //namespace cvtest

@ -59,6 +59,7 @@
@defgroup videoio_c C API for video I/O
@defgroup videoio_ios iOS glue for video I/O
@defgroup videoio_winrt WinRT glue for video I/O
@defgroup videoio_registry Query I/O API backends registry
@}
*/
44 modules/videoio/include/opencv2/videoio/registry.hpp Normal file

@ -0,0 +1,44 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_VIDEOIO_REGISTRY_HPP
#define OPENCV_VIDEOIO_REGISTRY_HPP

#include <opencv2/videoio.hpp>

namespace cv { namespace videoio_registry {
/** @addtogroup videoio_registry
This section contains API description how to query/configure available Video I/O backends.

Runtime configuration options:
- enable debug mode: `OPENCV_VIDEOIO_DEBUG=1`
- change backend priority: `OPENCV_VIDEOIO_PRIORITY_<backend>=9999`
- disable backend: `OPENCV_VIDEOIO_PRIORITY_<backend>=0`
- specify list of backends with high priority (>100000): `OPENCV_VIDEOIO_PRIORITY_LIST=FFMPEG,GSTREAMER`

@{
*/

/** @brief Returns backend API name or "unknown"
@param api backend ID (#VideoCaptureAPIs)
*/
CV_EXPORTS_W cv::String getBackendName(VideoCaptureAPIs api);

/** @brief Returns list of all builtin backends */
CV_EXPORTS_W std::vector<VideoCaptureAPIs> getBackends();

/** @brief Returns list of available backends which works via `cv::VideoCapture(int index)` */
CV_EXPORTS_W std::vector<VideoCaptureAPIs> getCameraBackends();

/** @brief Returns list of available backends which works via `cv::VideoCapture(filename)` */
CV_EXPORTS_W std::vector<VideoCaptureAPIs> getStreamBackends();

/** @brief Returns list of available backends which works via `cv::VideoWriter()` */
CV_EXPORTS_W std::vector<VideoCaptureAPIs> getWriterBackends();

//! @}
}} // namespace

#endif // OPENCV_VIDEOIO_REGISTRY_HPP
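As a rough usage sketch (not part of this commit), code built against the header above could enumerate the enabled backends; the main() wrapper and the output formatting are illustrative assumptions:

// Minimal sketch: list Video I/O backends via the new registry API.
#include <opencv2/videoio/registry.hpp>
#include <iostream>

int main()
{
    std::vector<cv::VideoCaptureAPIs> apis = cv::videoio_registry::getBackends();
    for (size_t i = 0; i < apis.size(); i++)
        std::cout << cv::videoio_registry::getBackendName(apis[i]) << std::endl;
    return 0;
}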
50 modules/videoio/misc/python/pyopencv_videoio.hpp Normal file

@ -0,0 +1,50 @@
#ifdef HAVE_OPENCV_VIDEOIO
typedef std::vector<VideoCaptureAPIs> vector_VideoCaptureAPIs;

template<>
bool pyopencv_to(PyObject *o, cv::VideoCaptureAPIs &v, const char *name)
{
(void)name;
v = CAP_ANY;
if (!o || o == Py_None)
return false;
else if (PyLong_Check(o))
{
v = VideoCaptureAPIs((int64)PyLong_AsLongLong(o));
return true;
}
else if (PyInt_Check(o))
{
v = VideoCaptureAPIs((int64)PyInt_AS_LONG(o));
return true;
}
else
return false;
}

template<>
PyObject* pyopencv_from(const cv::VideoCaptureAPIs &v)
{
return pyopencv_from((int)(v));
}

template<> struct pyopencvVecConverter<cv::VideoCaptureAPIs>
{
static bool to(PyObject* obj, std::vector<cv::VideoCaptureAPIs>& value, const ArgInfo info)
{
return pyopencv_to_generic_vec(obj, value, info);
}

static PyObject* from(const std::vector<cv::VideoCaptureAPIs>& value)
{
return pyopencv_from_generic_vec(value);
}
};

template<>
bool pyopencv_to(PyObject *o, std::vector<cv::VideoCaptureAPIs>& apis, const char *name)
{
return pyopencvVecConverter<cv::VideoCaptureAPIs>::to(o, apis, ArgInfo(name, false));
}

#endif // HAVE_OPENCV_VIDEOIO
@ -2351,9 +2351,6 @@ AVStream* OutputMediaStream_FFMPEG::addVideoStream(AVFormatContext *oc, CV_CODEC
c->codec_type = AVMEDIA_TYPE_VIDEO;

// put sample parameters
unsigned long long lbit_rate = static_cast<unsigned long long>(bitrate);
lbit_rate += (bitrate / 4);
lbit_rate = std::min(lbit_rate, static_cast<unsigned long long>(std::numeric_limits<int>::max()));
c->bit_rate = bitrate;

// took advice from

@ -158,8 +158,9 @@ public:
data.resize(size);
}

void put(unsigned bits, int len)
inline void put_bits(unsigned bits, int len)
{
CV_Assert(len >=0 && len < 32);
if((m_pos == (data.size() - 1) && len > bits_free) || m_pos == data.size())
{
resize(int(2*data.size()));

@ -182,6 +183,12 @@ public:
}
}

inline void put_val(int val, const unsigned * table)
{
unsigned code = table[(val) + 2];
put_bits(code >> 8, (int)(code & 255));
}

void finish()
{
if(bits_free == 32)

@ -1188,13 +1195,6 @@ public:
void operator()( const cv::Range& range ) const CV_OVERRIDE
{
const int CAT_TAB_SIZE = 4096;
unsigned code = 0;

#define JPUT_BITS(val, bits) output_buffer.put(val, bits)

#define JPUT_HUFF(val, table) \
code = table[(val) + 2]; \
JPUT_BITS(code >> 8, (int)(code & 255))

int x, y;
int i, j;

@ -1300,8 +1300,8 @@ public:
int cat = cat_table[val + CAT_TAB_SIZE];

//CV_Assert( cat <= 11 );
JPUT_HUFF( cat, huff_dc_tab[is_chroma] );
JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
output_buffer.put_val(cat, huff_dc_tab[is_chroma] );
output_buffer.put_bits( val - (val < 0 ? 1 : 0), cat );
}

for( j = 1; j < 64; j++ )

@ -1316,15 +1316,15 @@ public:
{
while( run >= 16 )
{
JPUT_HUFF( 0xF0, htable ); // encode 16 zeros
output_buffer.put_val( 0xF0, htable ); // encode 16 zeros
run -= 16;
}

{
int cat = cat_table[val + CAT_TAB_SIZE];
//CV_Assert( cat <= 10 );
JPUT_HUFF( cat + run*16, htable );
JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
output_buffer.put_val( cat + run*16, htable );
output_buffer.put_bits( val - (val < 0 ? 1 : 0), cat );
}

run = 0;

@ -1333,7 +1333,7 @@ public:

if( run )
{
JPUT_HUFF( 0x00, htable ); // encode EOB
output_buffer.put_val( 0x00, htable ); // encode EOB
}
}
}
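For reference, put_val() depends on the packed layout of the Huffman tables that the replaced JPUT_HUFF macro also assumed: each entry stores the code bits in its upper bits and the bit length in the low byte, looked up at index val + 2. A standalone sketch of that convention (the pack() helper and the sample entries are hypothetical, not values from the encoder):

// Standalone illustration of the packed-table convention consumed by put_val():
// entry = (code_bits << 8) | code_length, indexed at val + 2.
#include <cstdio>

static unsigned pack(unsigned bits, unsigned len) { return (bits << 8) | len; }

int main()
{
    // Hypothetical table with entries for val = 0 and val = 1 (offset by 2).
    const unsigned table[] = { 0, 0, pack(0x6, 3), pack(0x1E, 5) };
    for (int val = 0; val <= 1; val++)
    {
        unsigned code = table[val + 2];
        printf("val=%d -> bits=0x%X, length=%u\n", val, code >> 8, code & 255u);
    }
    return 0;
}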
@ -277,6 +277,7 @@ struct CvCaptureCAM_V4L CV_FINAL : public CvCapture

__u32 palette;
int width, height;
int width_set, height_set;
int bufferSize;
__u32 fps;
bool convert_rgb;

@ -797,6 +798,7 @@ bool CvCaptureCAM_V4L::open(const char* _deviceName)
FirstCapture = 1;
width = DEFAULT_V4L_WIDTH;
height = DEFAULT_V4L_HEIGHT;
width_set = height_set = 0;
bufferSize = DEFAULT_V4L_BUFFERS;
fps = DEFAULT_V4L_FPS;
convert_rgb = true;

@ -1769,7 +1771,6 @@ static bool icvSetControl (CvCaptureCAM_V4L* capture,

static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture,
int property_id, double value ){
static int width = 0, height = 0;
bool retval = false;
bool possible;

@ -1778,6 +1779,9 @@ static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture,

switch (property_id) {
case CV_CAP_PROP_FRAME_WIDTH:
{
int& width = capture->width_set;
int& height = capture->height_set;
width = cvRound(value);
retval = width != 0;
if(width !=0 && height != 0) {

@ -1786,8 +1790,12 @@ static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture,
retval = v4l2_reset(capture);
width = height = 0;
}
break;
}
break;
case CV_CAP_PROP_FRAME_HEIGHT:
{
int& width = capture->width_set;
int& height = capture->height_set;
height = cvRound(value);
retval = height != 0;
if(width !=0 && height != 0) {

@ -1796,7 +1804,8 @@ static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture,
retval = v4l2_reset(capture);
width = height = 0;
}
break;
}
break;
case CV_CAP_PROP_FPS:
capture->fps = value;
retval = v4l2_reset(capture);

@ -12,7 +12,7 @@ namespace cv

// Utility function for safe integer conversions
template <typename D, typename S>
inline D safe_int_cast(S val)
inline D safe_int_cast(S val, const char * msg = 0)
{
typedef std::numeric_limits<S> st;
typedef std::numeric_limits<D> dt;

@ -21,7 +21,10 @@ inline D safe_int_cast(S val)
const bool in_range_l = (double)val >= (double)dt::min();
if (!in_range_r || !in_range_l)
{
CV_Error_(cv::Error::StsOutOfRange, ("Can not convert integer values (%s -> %s), value 0x%llx is out of range", typeid(S).name(), typeid(D).name(), val));
if (!msg)
CV_Error_(Error::StsOutOfRange, ("Can not convert integer values (%s -> %s), value 0x%llx is out of range", typeid(S).name(), typeid(D).name(), val));
else
CV_Error(Error::StsOutOfRange, msg);
}
return static_cast<D>(val);
}
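The contract above is easy to restate in isolation. A minimal self-contained sketch (checked_cast and the sample values are hypothetical stand-ins; the real safe_int_cast lives in OpenCV's private AVI container header and reports failures through CV_Error):

// Hypothetical standalone restatement of the safe_int_cast contract.
#include <cstdint>
#include <iostream>
#include <limits>
#include <stdexcept>

template <typename D, typename S>
inline D checked_cast(S val, const char* msg = 0)
{
    typedef std::numeric_limits<D> dt;
    // Compare in double, as the original does, to handle signed/unsigned mixes.
    const bool in_range_r = (double)val <= (double)dt::max();
    const bool in_range_l = (double)val >= (double)dt::min();
    if (!in_range_r || !in_range_l)
        throw std::out_of_range(msg ? msg : "integer conversion out of range");
    return static_cast<D>(val);
}

int main()
{
    std::cout << checked_cast<int32_t>(uint64_t(42)) << std::endl; // fits: prints 42
    try {
        checked_cast<int32_t>(std::numeric_limits<uint64_t>::max(),
                              "requested chunk size is too large"); // does not fit
    } catch (const std::out_of_range& e) {
        std::cout << e.what() << std::endl;
    }
    return 0;
}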
@ -128,7 +131,7 @@ public:
VideoInputStream();
VideoInputStream(const String& filename);
~VideoInputStream();
VideoInputStream& read(char*, uint64_t);
VideoInputStream& read(char*, uint32_t);
VideoInputStream& seekg(uint64_t);
uint64_t tellg();
bool isOpened() const;

@ -229,11 +232,11 @@ void VideoInputStream::close()
}
}

VideoInputStream& VideoInputStream::read(char* buf, uint64_t count)
VideoInputStream& VideoInputStream::read(char* buf, uint32_t count)
{
if(isOpened())
{
input.read(buf, safe_int_cast<std::streamsize>(count));
input.read(buf, safe_int_cast<std::streamsize>(count, "Failed to read AVI file: requested chunk size is too large"));
m_is_valid = (input.gcount() == (std::streamsize)count);
}

@ -243,7 +246,7 @@ VideoInputStream& VideoInputStream::read(char* buf, uint64_t count)
VideoInputStream& VideoInputStream::seekg(uint64_t pos)
{
input.clear();
input.seekg(safe_int_cast<std::streamoff>(pos));
input.seekg(safe_int_cast<std::streamoff>(pos, "Failed to seek in AVI file: position is out of range"));
m_is_valid = !input.eof();
return *this;
}

@ -322,9 +325,6 @@ bool AVIReadContainer::parseStrl(char stream_id, Codecs codec_)

if(m_file_stream && strh.m_four_cc == STRH_CC)
{
uint64_t next_strl_list = m_file_stream->tellg();
next_strl_list += strh.m_size;

AviStreamHeader strm_hdr;
*m_file_stream >> strm_hdr;

@ -668,7 +668,7 @@ void BitStream::writeBlock()
}

size_t BitStream::getPos() const {
return safe_int_cast<size_t>(m_current - m_start) + m_pos;
return safe_int_cast<size_t>(m_current - m_start, "Failed to determine AVI bufer position: value is out of range") + m_pos;
}

void BitStream::putByte(int val)

@ -737,7 +737,7 @@ void BitStream::patchInt(uint32_t val, size_t pos)
{
if( pos >= m_pos )
{
ptrdiff_t delta = safe_int_cast<ptrdiff_t>(pos - m_pos);
ptrdiff_t delta = safe_int_cast<ptrdiff_t>(pos - m_pos, "Failed to seek in AVI buffer: value is out of range");
CV_Assert( delta < m_current - m_start );
m_start[delta] = (uchar)val;
m_start[delta+1] = (uchar)(val >> 8);

@ -747,7 +747,7 @@ void BitStream::patchInt(uint32_t val, size_t pos)
else
{
std::streamoff fpos = output.tellp();
output.seekp(safe_int_cast<std::streamoff>(pos));
output.seekp(safe_int_cast<std::streamoff>(pos, "Failed to seek in AVI file: value is out of range"));
uchar buf[] = { (uchar)val, (uchar)(val >> 8), (uchar)(val >> 16), (uchar)(val >> 24) };
output.write((char *)buf, 4);
output.seekp(fpos);

@ -960,7 +960,7 @@ void AVIWriteContainer::endWriteChunk()
size_t pospos = AVIChunkSizeIndex.back();
AVIChunkSizeIndex.pop_back();
CV_Assert(currpos >= pospos);
uint32_t chunksz = safe_int_cast<uint32_t>(currpos - pospos);
uint32_t chunksz = safe_int_cast<uint32_t>(currpos - pospos, "Failed to write AVI file: chunk size is out of bounds");
strm->patchInt(chunksz, pospos);
}
}

@ -996,7 +996,7 @@ void AVIWriteContainer::writeIndex(int stream_number, StreamType strm_type)

void AVIWriteContainer::finishWriteAVI()
{
uint32_t nframes = safe_int_cast<uint32_t>(frameOffset.size());
uint32_t nframes = safe_int_cast<uint32_t>(frameOffset.size(), "Failed to write AVI file: number of frames is too large");
// Record frames numbers to AVI Header
while (!frameNumIndexes.empty())
{
@ -6,6 +6,8 @@

#include "videoio_registry.hpp"

#include "opencv2/videoio/registry.hpp"

#include "cap_intelperc.hpp"
#include "cap_librealsense.hpp"
#include "cap_dshow.hpp"

@ -250,6 +252,8 @@ public:
return g_instance;
}

inline std::vector<VideoBackendInfo> getEnabledBackends() const { return enabledBackends; }

inline std::vector<VideoBackendInfo> getAvailableBackends_CaptureByIndex() const
{
std::vector<VideoBackendInfo> result;

@ -305,6 +309,58 @@ std::vector<VideoBackendInfo> getAvailableBackends_Writer()
return result;
}

cv::String getBackendName(VideoCaptureAPIs api)
{
if (api == CAP_ANY)
return "CAP_ANY"; // special case, not a part of backends list
const int N = sizeof(builtin_backends)/sizeof(builtin_backends[0]);
for (size_t i = 0; i < N; i++)
{
const VideoBackendInfo& backend = builtin_backends[i];
if (backend.id == api)
return backend.name;
}
return cv::format("UnknownVideoAPI(%d)", (int)api);
}

std::vector<VideoCaptureAPIs> getBackends()
{
std::vector<VideoBackendInfo> backends = VideoBackendRegistry::getInstance().getEnabledBackends();
std::vector<VideoCaptureAPIs> result;
for (size_t i = 0; i < backends.size(); i++)
result.push_back((VideoCaptureAPIs)backends[i].id);
return result;
}

std::vector<VideoCaptureAPIs> getCameraBackends()
{
const std::vector<VideoBackendInfo> backends = VideoBackendRegistry::getInstance().getAvailableBackends_CaptureByIndex();
std::vector<VideoCaptureAPIs> result;
for (size_t i = 0; i < backends.size(); i++)
result.push_back((VideoCaptureAPIs)backends[i].id);
return result;
}

std::vector<VideoCaptureAPIs> getStreamBackends()
{
const std::vector<VideoBackendInfo> backends = VideoBackendRegistry::getInstance().getAvailableBackends_CaptureByFilename();
std::vector<VideoCaptureAPIs> result;
for (size_t i = 0; i < backends.size(); i++)
result.push_back((VideoCaptureAPIs)backends[i].id);
return result;
}

std::vector<VideoCaptureAPIs> getWriterBackends()
{
const std::vector<VideoBackendInfo> backends = VideoBackendRegistry::getInstance().getAvailableBackends_Writer();
std::vector<VideoCaptureAPIs> result;
for (size_t i = 0; i < backends.size(); i++)
result.push_back((VideoCaptureAPIs)backends[i].id);
return result;
}

} // namespace registry

#define TRY_OPEN(backend_func) \
@ -6,10 +6,26 @@

#include "opencv2/ts.hpp"
#include "opencv2/videoio.hpp"
#include "opencv2/videoio/registry.hpp"
#include "opencv2/imgproc/imgproc_c.h"

#include "opencv2/core/private.hpp"

namespace cv {

inline std::ostream &operator<<(std::ostream &out, const VideoCaptureAPIs& api)
{
out << cv::videoio_registry::getBackendName(api); return out;
}

static inline void PrintTo(const cv::VideoCaptureAPIs& api, std::ostream* os)
{
*os << cv::videoio_registry::getBackendName(api);
}

} // namespace

inline std::string fourccToString(int fourcc)
{
return cv::format("%c%c%c%c", fourcc & 255, (fourcc >> 8) & 255, (fourcc >> 16) & 255, (fourcc >> 24) & 255);

@ -55,4 +71,15 @@ public:
}
};

static inline bool isBackendAvailable(cv::VideoCaptureAPIs api, const std::vector<cv::VideoCaptureAPIs>& api_list)
{
for (size_t i = 0; i < api_list.size(); i++)
{
if (api_list[i] == api)
return true;
}
return false;
}

#endif
@ -46,62 +46,12 @@
namespace opencv_test
{

struct VideoCaptureAPI
{
VideoCaptureAPIs api;

inline const char * toString() const
{
switch (api)
{
case CAP_ANY: return "CAP_ANY";
#ifdef __linux__
case CAP_V4L2: return "CAP_V4L/CAP_V4L2";
#else
case CAP_VFW: return "CAP_VFW";
#endif
case CAP_FIREWIRE: return "CAP_FIREWIRE";
case CAP_QT: return "CAP_QT";
case CAP_UNICAP: return "CAP_UNICAP";
case CAP_DSHOW: return "CAP_DSHOW";
case CAP_PVAPI: return "CAP_PVAPI";
case CAP_OPENNI: return "CAP_OPENNI";
case CAP_OPENNI_ASUS: return "CAP_OPENNI_ASUS";
case CAP_ANDROID: return "CAP_ANDROID";
case CAP_XIAPI: return "CAP_XIAPI";
case CAP_AVFOUNDATION: return "CAP_AVFOUNDATION";
case CAP_GIGANETIX: return "CAP_GIGANETIX";
case CAP_MSMF: return "CAP_MSMF";
case CAP_WINRT: return "CAP_WINRT";
case CAP_INTELPERC: return "CAP_INTELPERC";
case CAP_OPENNI2: return "CAP_OPENNI2";
case CAP_OPENNI2_ASUS: return "CAP_OPENNI2_ASUS";
case CAP_GPHOTO2: return "CAP_GPHOTO2";
case CAP_GSTREAMER: return "CAP_GSTREAMER";
case CAP_FFMPEG: return "CAP_FFMPEG";
case CAP_IMAGES: return "CAP_IMAGES";
case CAP_ARAVIS: return "CAP_ARAVIS";
case CAP_OPENCV_MJPEG: return "CAP_OPENCV_MJPEG";
case CAP_INTEL_MFX: return "CAP_INTEL_MFX";
case CAP_XINE: return "CAP_XINE";
}
return "unknown";
}
VideoCaptureAPI(int api_ = CAP_ANY) : api((VideoCaptureAPIs)api_) {}
operator int() { return api; }
};

inline std::ostream &operator<<(std::ostream &out, const VideoCaptureAPI & api)
{
out << api.toString(); return out;
}

class Videoio_Test_Base
{
protected:
string ext;
string video_file;
VideoCaptureAPI apiPref;
VideoCaptureAPIs apiPref;
protected:
Videoio_Test_Base() {}
virtual ~Videoio_Test_Base() {}

@ -131,6 +81,8 @@ protected:
public:
void doTest()
{
if (!isBackendAvailable(apiPref, cv::videoio_registry::getStreamBackends()))
throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
VideoCapture cap;
ASSERT_NO_THROW(cap.open(video_file, apiPref));
if (!cap.isOpened())

@ -200,7 +152,7 @@ public:
};

//==================================================================================================
typedef tuple<string, VideoCaptureAPI> Backend_Type_Params;
typedef tuple<string, VideoCaptureAPIs> Backend_Type_Params;

class Videoio_Bunny : public Videoio_Test_Base, public testing::TestWithParam<Backend_Type_Params>
{

@ -214,6 +166,8 @@ public:
}
void doFrameCountTest()
{
if (!isBackendAvailable(apiPref, cv::videoio_registry::getStreamBackends()))
throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
VideoCapture cap;
EXPECT_NO_THROW(cap.open(video_file, apiPref));
if (!cap.isOpened())

@ -274,7 +228,7 @@ struct Ext_Fourcc_PSNR
string ext;
string fourcc;
float PSNR;
VideoCaptureAPI api;
VideoCaptureAPIs api;
};
typedef tuple<Size, Ext_Fourcc_PSNR> Size_Ext_Fourcc_PSNR;

@ -348,7 +302,7 @@ public:

//==================================================================================================

static VideoCaptureAPI backend_params[] = {
static const VideoCaptureAPIs backend_params[] = {
#ifdef HAVE_QUICKTIME
CAP_QT,
#endif

@ -383,7 +337,7 @@ static VideoCaptureAPI backend_params[] = {
// CAP_INTEL_MFX
};

static string bunny_params[] = {
static const string bunny_params[] = {
#ifdef HAVE_VIDEO_INPUT
string("wmv"),
string("mov"),
@ -7,6 +7,9 @@
#include <vector>
#include <map>
#include <iostream>
#include <iomanip>
#include <limits>
#include <stdint.h>
#ifdef HAVE_OPENGL
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN 1

@ -36,17 +39,17 @@ static void help()
cout << "\n This program demonstrates how to use MSER to detect extremal regions \n"
"Usage: \n"
" ./detect_mser <image1(without parameter a syntehtic image is used as default)>\n"
"Press esc key when image window is active to change descriptor parameter\n"
"Press esc key when image window is active to change descriptor parameter\n"
"Press 2, 8, 4, 6, +,- or 5 keys in openGL windows to change view or use mouse\n";
}

struct MSERParams
{
MSERParams(int _delta = 5, int _min_area = 60, int _max_area = 14400,
double _max_variation = 0.25, double _min_diversity = .2,
int _max_evolution = 200, double _area_threshold = 1.01,
double _min_margin = 0.003, int _edge_blur_size = 5)
{
double _max_variation = 0.25, double _min_diversity = .2,
int _max_evolution = 200, double _area_threshold = 1.01,
double _min_margin = 0.003, int _edge_blur_size = 5)
{
delta = _delta;
minArea = _min_area;
maxArea = _max_area;

@ -57,7 +60,7 @@ struct MSERParams
minMargin = _min_margin;
edgeBlurSize = _edge_blur_size;
pass2Only = false;
}
}

int delta;
int minArea;

@ -72,30 +75,20 @@ struct MSERParams
int edgeBlurSize;
};

static String Legende(MSERParams &pAct)
static String Legende(const MSERParams &pAct)
{
String s="";
String inf = static_cast<const ostringstream&>(ostringstream() << pAct.minArea).str();
String sup = static_cast<const ostringstream&>(ostringstream() << pAct.maxArea).str();
s = " Area[" + inf + "," + sup + "]";
ostringstream ss;
ss << "Area[" << pAct.minArea << "," << pAct.maxArea << "] ";
ss << "del. [" << pAct.delta << "] ";
ss << "var. [" << pAct.maxVariation << "] ";
ss << "div. [" << (int)pAct.minDiversity << "] ";
ss << "pas. [" << (int)pAct.pass2Only << "] ";
ss << "RGb->evo. [" << pAct.maxEvolution << "] ";
ss << "are. [" << (int)pAct.areaThreshold << "] ";
ss << "mar. [" << (int)pAct.minMargin << "] ";
ss << "siz. [" << pAct.edgeBlurSize << "]";

inf = static_cast<const ostringstream&>(ostringstream() << pAct.delta).str();
s += " del. [" + inf + "]";
inf = static_cast<const ostringstream&>(ostringstream() << pAct.maxVariation).str();
s += " var. [" + inf + "]";
inf = static_cast<const ostringstream&>(ostringstream() << (int)pAct.minDiversity).str();
s += " div. [" + inf + "]";
inf = static_cast<const ostringstream&>(ostringstream() << (int)pAct.pass2Only).str();
s += " pas. [" + inf + "]";
inf = static_cast<const ostringstream&>(ostringstream() << (int)pAct.maxEvolution).str();
s += "RGb-> evo. [" + inf + "]";
inf = static_cast<const ostringstream&>(ostringstream() << (int)pAct.areaThreshold).str();
s += " are. [" + inf + "]";
inf = static_cast<const ostringstream&>(ostringstream() << (int)pAct.minMargin).str();
s += " mar. [" + inf + "]";
inf = static_cast<const ostringstream&>(ostringstream() << (int)pAct.edgeBlurSize).str();
s += " siz. [" + inf + "]";
return s;
return ss.str();
}

@ -109,18 +102,28 @@ bool keyPressed=false;
Vec4f rotAxis(1,0,1,0);
Vec3f zoom(1,0,0);

float obsX = (float)0, obsY = (float)0, obsZ = (float)-10, tx = (float)0, ty = (float)0;
float thetaObs = (float)-1.570, phiObs = (float)1.570, rObs = (float)10;
int prevX=-1,prevY=-1,prevTheta=-1000,prevPhi=-1000;
float obsX = 0.f;
float obsY = 0.f;
float obsZ = -10.f;
float tx = 0.f;
float ty = 0.f;

float thetaObs = -1.570f;
float phiObs = 1.570f;
float rObs = 10.f;

int prevX = -1;
int prevY = -1;
int prevTheta = -1000;
int prevPhi = -1000;

#ifdef HAVE_OPENGL
struct DrawData
{
{
ogl::Arrays arr;
ogl::Texture2D tex;
ogl::Buffer indices;
};
};

static void draw(void* userdata)

@ -167,19 +170,19 @@ static void onMouse(int event, int x, int y, int flags, void*)
{
if (x - prevTheta<0)
{
thetaObs +=(float)0.02;
thetaObs += 0.02f;
}
else if (x - prevTheta>0)
{
thetaObs -= (float)0.02;
thetaObs -= 0.02f;
}
if (y - prevPhi<0)
{
phiObs -= (float)0.02;
phiObs -= 0.02f;
}
else if (y - prevPhi>0)
{
phiObs += (float)0.02;
phiObs += 0.02f;
}
prevTheta = x;
prevPhi = y;

@ -187,9 +190,9 @@ static void onMouse(int event, int x, int y, int flags, void*)
if (event==EVENT_MOUSEWHEEL)
{
if (getMouseWheelDelta(flags)>0)
rObs += (float)0.1;
rObs += 0.1f;
else
rObs -= (float)0.1;
rObs -= 0.1f;
}
float pi = static_cast<float>(CV_PI);
if (thetaObs>pi)

@ -202,11 +205,11 @@ static void onMouse(int event, int x, int y, int flags, void*)
}
if (phiObs>pi / 2)
{
phiObs = pi / 2 - (float)0.0001;
phiObs = pi / 2 - 0.0001f;
}
if (phiObs<-pi / 2)
{
phiObs = -pi / 2 + (float)0.00001;
phiObs = -pi / 2 + 0.00001f;
}
if (rObs<0)
{

@ -224,36 +227,37 @@ static void DrawOpenGLMSER(Mat img, Mat result)
cvtColor(img, imgGray, COLOR_BGR2GRAY);
else
imgGray = img;

namedWindow("OpenGL", WINDOW_OPENGL);
setMouseCallback("OpenGL", onMouse, NULL);

Mat_<Vec3f> vertex(1, img.cols*img.rows);
Mat_<Vec2f> texCoords(1, img.cols*img.rows);
for (int i = 0, nbPix = 0; i<img.rows; i++)
{
{
for (int j = 0; j<img.cols; j++, nbPix++)
{
{
float x = (j) / (float)img.cols;
float y = (i) / (float)img.rows;
vertex.at< Vec3f >(0, nbPix) = Vec3f(float(2 * (x - 0.5)), float(2 * (0.5 - y)), float(imgGray.at<uchar>(i, j) / 512.0));
texCoords.at< Vec2f>(0, nbPix) = Vec2f(x, y);
}
}
}

Mat_<int> indices(1, (img.rows - 1)*(6 * img.cols));
for (int i = 1, nbPix = 0; i<img.rows; i++)
{
{
for (int j = 1; j<img.cols; j++)
{
{
int c = i*img.cols + j;
indices.at<int>(0, nbPix++) = c ;
indices.at<int>(0, nbPix++) = c;
indices.at<int>(0, nbPix++) = c - 1;
indices.at<int>(0, nbPix++) = c- img.cols - 1;
indices.at<int>(0, nbPix++) = c- img.cols - 1;
indices.at<int>(0, nbPix++) = c - img.cols - 1;
indices.at<int>(0, nbPix++) = c - img.cols - 1;
indices.at<int>(0, nbPix++) = c - img.cols;
indices.at<int>(0, nbPix++) = c ;
}
indices.at<int>(0, nbPix++) = c;
}
}

DrawData *data = new DrawData;

@ -279,7 +283,7 @@ static void DrawOpenGLMSER(Mat img, Mat result)
setOpenGlDrawCallback("OpenGL", draw, data);

for (;;)
{
{
updateWindow("OpenGL");
char key = (char)waitKey(40);
if (key == 27)

@ -292,27 +296,28 @@ static void DrawOpenGLMSER(Mat img, Mat result)
case '5':
obsX = 0, obsY = 0, obsZ = -10;
thetaObs = -pi/2, phiObs = pi/2, rObs = 10;
tx=0;ty=0;
tx=0; ty=0;
break;
case '4':
thetaObs += (float)0.1;
thetaObs += 0.1f;
break;
case '6':
thetaObs -= (float)0.1;
thetaObs -= 0.1f;
break;
case '2':
phiObs -= (float).1;
phiObs -= 0.1f;
break;
case '8':
phiObs += (float).1;
phiObs += 0.1f;
break;
case '+':
rObs -= (float).1;
rObs -= 0.1f;
break;
case '-':
rObs += (float).1;
rObs += 0.1f;
break;
}

if (thetaObs>pi)
{
thetaObs = -2 * pi + thetaObs;

@ -320,9 +325,9 @@ static void DrawOpenGLMSER(Mat img, Mat result)
if (thetaObs<-pi)
thetaObs = 2 * pi + thetaObs;
if (phiObs>pi / 2)
phiObs = pi / 2 - (float)0.0001;
phiObs = pi / 2 - 0.0001f;
if (phiObs<-pi / 2)
phiObs = -pi / 2 + (float)0.00001;
phiObs = -pi / 2 + 0.00001f;
if (rObs<0)
rObs = 0;
obsX = rObs*cos(thetaObs)*cos(phiObs);

@ -334,67 +339,59 @@ static void DrawOpenGLMSER(Mat img, Mat result)
}
#endif
||||
static Mat MakeSyntheticImage()
|
||||
{
|
||||
Mat img(800, 800, CV_8UC1);
|
||||
map<int, char> val;
|
||||
int fond = 0;
|
||||
img = Scalar(fond);
|
||||
val[fond] = 1;
|
||||
int width1[] = { 390, 380, 300, 290, 280, 270, 260, 250, 210, 190, 150, 100, 80, 70 };
|
||||
int color1[] = { 80, 180, 160, 140, 120, 100, 90, 110, 170, 150, 140, 100, 220 };
|
||||
Point p0(10, 10);
|
||||
int *width, *color;
|
||||
|
||||
width = width1;
|
||||
color = color1;
|
||||
for (int i = 0; i<13; i++)
|
||||
{
|
||||
// Add nested rectangles of different widths and colors to an image
|
||||
static void addNestedRectangles(Mat &img, Point p0, int* width, int *color, int n) {
|
||||
for (int i = 0; i<n; i++)
|
||||
{
|
||||
rectangle(img, Rect(p0, Size(width[i], width[i])), Scalar(color[i]), 1);
|
||||
p0 += Point((width[i] - width[i + 1]) / 2, (width[i] - width[i + 1]) / 2);
|
||||
floodFill(img, p0, Scalar(color[i]));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
int color2[] = { 81, 181, 161, 141, 121, 101, 91, 111, 171, 151, 141, 101, 221 };
|
||||
color = color2;
|
||||
p0 = Point(200, 600);
|
||||
for (int i = 0; i<13; i++)
|
||||
{
|
||||
circle(img, p0, width[i] / 2, Scalar(color[i]), 1);
|
||||
floodFill(img, p0, Scalar(color[i]));
|
||||
|
||||
}
|
||||
int color3[] = { 175,75,95,115,135,155,165,145,85,105,115,156 };
|
||||
color = color3;
|
||||
p0 = Point(410, 10);
|
||||
for (int i = 0; i<13; i++)
|
||||
{
|
||||
rectangle(img, Rect(p0, Size(width[i], width[i])), Scalar(color[i]), 1);
|
||||
p0 += Point((width[i] - width[i + 1]) / 2, (width[i] - width[i + 1]) / 2);
|
||||
floodFill(img, p0, Scalar(color[i]));
|
||||
|
||||
}
|
||||
int color4[] = { 173,73,93,113,133,153,163,143,83,103,114,154 };
|
||||
color = color4;
|
||||
|
||||
p0 = Point(600, 600);
|
||||
for (int i = 0; i<13; i++)
|
||||
// Add nested circles of different widths and colors to an image
|
||||
static void addNestedCircles(Mat &img, Point p0, int *width, int *color, int n) {
|
||||
for (int i = 0; i<n; i++)
|
||||
{
|
||||
circle(img, p0, width[i] / 2, Scalar(color[i]), 1);
|
||||
floodFill(img, p0, Scalar(color[i]));
|
||||
}
|
||||
}
|
||||
|
||||
static Mat MakeSyntheticImage()
|
||||
{
|
||||
const int fond = 0;
|
||||
|
||||
Mat img(800, 800, CV_8UC1);
|
||||
img = Scalar(fond);
|
||||
|
||||
int width[] = { 390, 380, 300, 290, 280, 270, 260, 250, 210, 190, 150, 100, 80, 70 };
|
||||
|
||||
int color1[] = { 80, 180, 160, 140, 120, 100, 90, 110, 170, 150, 140, 100, 220 };
|
||||
int color2[] = { 81, 181, 161, 141, 121, 101, 91, 111, 171, 151, 141, 101, 221 };
|
||||
int color3[] = { 175, 75, 95, 115, 135, 155, 165, 145, 85, 105, 115, 155, 35 };
|
||||
int color4[] = { 173, 73, 93, 113, 133, 153, 163, 143, 83, 103, 113, 153, 33 };
|
||||
|
||||
addNestedRectangles(img, Point(10, 10), width, color1, 13);
|
||||
addNestedCircles(img, Point(200, 600), width, color2, 13);
|
||||
|
||||
addNestedRectangles(img, Point(410, 10), width, color3, 13);
|
||||
addNestedCircles(img, Point(600, 600), width, color4, 13);
|
||||
|
||||
int histSize = 256;
|
||||
float range[] = { 0, 256 };
|
||||
const float* histRange[] = { range };
|
||||
Mat hist;
|
||||
|
||||
// we compute the histogram
|
||||
calcHist(&img, 1, 0, Mat(), hist, 1, &histSize, histRange, true, false);
|
||||
|
||||
cout << "****************Maximal region************************\n";
|
||||
for (int i = 0; i < hist.rows ; i++)
|
||||
for (int i = 0; i < hist.rows; i++)
|
||||
{
|
||||
if (hist.at<float>(i, 0)!=0)
|
||||
{
|
||||
cout << "h" << i << "=\t" << hist.at<float>(i, 0) << "\n";
|
||||
cout << "h" << setw(3) << left << i << "\t=\t" << hist.at<float>(i, 0) << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
@ -403,68 +400,60 @@ static Mat MakeSyntheticImage()

int main(int argc, char *argv[])
{
vector<String> fileName;
Mat imgOrig,img;
Size blurSize(5,5);
Mat imgOrig, img;
Size blurSize(5, 5);
cv::CommandLineParser parser(argc, argv, "{ help h | | }{ @input | | }");
if (parser.has("help"))
{
help();
return 0;
}

string input = parser.get<string>("@input");
if (!input.empty())
{
fileName.push_back(input);
imgOrig = imread(fileName[0], IMREAD_GRAYSCALE);
imgOrig = imread(input, IMREAD_GRAYSCALE);
blur(imgOrig, img, blurSize);
}
else
{
fileName.push_back("SyntheticImage.bmp");
imgOrig = MakeSyntheticImage();
img=imgOrig;
img = imgOrig;
}

MSERParams pDefaultMSER;
// Descriptor array MSER
vector<String> typeDesc;
// Param array for MSER
vector<MSERParams> pMSER;
vector<MSERParams>::iterator itMSER;

// Color palette
vector<Vec3b> palette;
for (int i = 0; i<65536; i++)
vector<Vec3b> palette;
for (int i = 0; i<=numeric_limits<uint16_t>::max(); i++)
palette.push_back(Vec3b((uchar)rand(), (uchar)rand(), (uchar)rand()));

help();

MSERParams params;

params.delta = 10;
params.minArea = 100;
params.maxArea = 5000;
params.maxVariation = 2;
params.minDiversity = 0;
params.pass2Only = true;

typeDesc.push_back("MSER");
pMSER.push_back(pDefaultMSER);
pMSER.back().delta = 10;
pMSER.back().minArea = 100;
pMSER.back().maxArea = 5000;
pMSER.back().maxVariation = 2;
pMSER.back().minDiversity = 0;
pMSER.back().pass2Only = true;
pMSER.push_back(params);

params.pass2Only = false;
typeDesc.push_back("MSER");
pMSER.push_back(pDefaultMSER);
pMSER.back().delta = 10;
pMSER.back().minArea = 100;
pMSER.back().maxArea = 5000;
pMSER.back().maxVariation = 2;
pMSER.back().minDiversity = 0;
pMSER.back().pass2Only = false;
pMSER.push_back(params);

params.delta = 100;
typeDesc.push_back("MSER");
pMSER.push_back(pDefaultMSER);
pMSER.back().delta = 100;
pMSER.back().minArea = 100;
pMSER.back().maxArea = 5000;
pMSER.back().maxVariation = 2;
pMSER.back().minDiversity = 0;
pMSER.back().pass2Only = false;
itMSER = pMSER.begin();
vector<double> desMethCmp;
pMSER.push_back(params);

vector<MSERParams>::iterator itMSER = pMSER.begin();
Ptr<Feature2D> b;
String label;
// Descriptor loop
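The three parameter sets built above map onto the Python bindings roughly as follows (an assumed sketch; keyword names vary across OpenCV versions, so positional MSER_create arguments are used: delta, min_area, max_area, max_variation, min_diversity):

import cv2 as cv

# configuration 1: delta=10, area in [100, 5000], maxVariation=2, pass2Only on
mser = cv.MSER_create(10, 100, 5000, 2.0, 0.0)
mser.setPass2Only(True)
# configuration 2: same thresholds, pass2Only off
mser.setPass2Only(False)
# configuration 3: delta raised to 100
mser.setDelta(100)
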
@ -473,14 +462,14 @@ int main(int argc, char *argv[])
for (itDesc = typeDesc.begin(); itDesc != typeDesc.end(); ++itDesc)
{
vector<KeyPoint> keyImg1;
if (*itDesc == "MSER"){
if (*itDesc == "MSER")
{
if (img.type() == CV_8UC3)
{
b = MSER::create(itMSER->delta, itMSER->minArea, itMSER->maxArea, itMSER->maxVariation, itMSER->minDiversity, itMSER->maxEvolution,
itMSER->areaThreshold, itMSER->minMargin, itMSER->edgeBlurSize);
label = Legende(*itMSER);
++itMSER;

}
else
{
@ -490,6 +479,7 @@ int main(int argc, char *argv[])
++itMSER;
}
}

if (img.type()==CV_8UC3)
{
img.copyTo(result);
@ -505,36 +495,37 @@ int main(int argc, char *argv[])
try
{
// We can detect regions using detectRegions method
vector<KeyPoint> keyImg;
vector<Rect> zone;
vector<vector <Point> > region;
Mat desc;
vector<KeyPoint> keyImg;
vector<Rect> zone;
vector<vector <Point> > region;
Mat desc;

if (b.dynamicCast<MSER>() != NULL)
{
Ptr<MSER> sbd = b.dynamicCast<MSER>();
sbd->detectRegions(img, region, zone);
int i = 0;
//result = Scalar(0, 0, 0);
int nbPixelInMSER=0;
for (vector<vector <Point> >::iterator itr = region.begin(); itr != region.end(); ++itr, ++i)
for (vector<vector <Point> >::iterator itr = region.begin(); itr != region.end(); ++itr)
{
for (vector <Point>::iterator itp = region[i].begin(); itp != region[i].end(); ++itp)
for (vector <Point>::iterator itp = itr->begin(); itp != itr->end(); ++itp)
{
// all pixels belonging to region become blue
result.at<Vec3b>(itp->y, itp->x) = Vec3b(128, 0, 0);
nbPixelInMSER++;
}
}
cout << "Number of MSER region " << region.size()<<" Number of pixels in all MSER region : "<<nbPixelInMSER<<"\n";
cout << "Number of MSER region: " << region.size() << "; Number of pixels in all MSER region: " << nbPixelInMSER << "\n";
}
namedWindow(*itDesc + label, WINDOW_AUTOSIZE);
imshow(*itDesc + label, result);

const string winName = *itDesc + label;
namedWindow(winName, WINDOW_AUTOSIZE);
imshow(winName, result);
imshow("Original", img);
}
catch (Exception& e)
{
cout << "Feature : " << *itDesc << "\n";
cout << "Feature: " << *itDesc << "\n";
cout << e.msg << endl;
}
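The detectRegions loop above has a compact Python analogue; a minimal sketch (the input file name is hypothetical, and default MSER parameters are assumed):

import cv2 as cv

img = cv.imread("SyntheticImage.bmp", cv.IMREAD_GRAYSCALE)  # hypothetical input
mser = cv.MSER_create()
regions, bboxes = mser.detectRegions(img)   # regions: list of Nx2 (x, y) arrays
result = cv.cvtColor(img, cv.COLOR_GRAY2BGR)
nbPixelInMSER = 0
for pts in regions:
    for x, y in pts:
        result[y, x] = (128, 0, 0)          # pixels belonging to a region turn blue
        nbPixelInMSER += 1
print("Number of MSER region: %d; Number of pixels in all MSER region: %d"
      % (len(regions), nbPixelInMSER))
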
#ifdef HAVE_OPENGL

@ -208,12 +208,18 @@ for label in ['ClassPredictor', 'BoxEncodingPredictor']:
graph_def.node.extend([flatten])
addConcatNode('%s/concat' % label, concatInputs, 'concat/axis_flatten')

idx = 0
for node in graph_def.node:
    if node.name == ('BoxPredictor_%d/BoxEncodingPredictor/Conv2D' % idx):
        text_format.Merge('b: true', node.attr["loc_pred_transposed"])
        idx += 1
assert(idx == args.num_layers)
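The tagging idiom above, parsing a textual proto fragment straight into a node attribute, works on any NodeDef; a self-contained sketch (the node name and attribute value are illustrative):

from tensorflow.core.framework import node_def_pb2
from google.protobuf import text_format

node = node_def_pb2.NodeDef()
node.name = 'BoxPredictor_0/BoxEncodingPredictor/Conv2D'
node.op = 'Conv2D'
text_format.Merge('b: true', node.attr['loc_pred_transposed'])  # attr maps to AttrValue
assert node.attr['loc_pred_transposed'].b
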

# Add layers that generate anchors (bounding boxes proposals).
scales = [args.min_scale + (args.max_scale - args.min_scale) * i / (args.num_layers - 1)
          for i in range(args.num_layers)] + [1.0]

priorBoxes = []
addConstNode('reshape_prior_boxes_to_4d', [1, 2, -1, 1])
for i in range(args.num_layers):
    priorBox = NodeDef()
    priorBox.name = 'PriorBox_%d' % i
@ -240,18 +246,9 @@ for i in range(args.num_layers):
    text_format.Merge(tensorMsg([0.1, 0.1, 0.2, 0.2]), priorBox.attr["variance"])

    graph_def.node.extend([priorBox])
    priorBoxes.append(priorBox.name)

    # Reshape from 1x2xN to 1x2xNx1
    reshape = NodeDef()
    reshape.name = priorBox.name + '/4d'
    reshape.op = 'Reshape'
    reshape.input.append(priorBox.name)
    reshape.input.append('reshape_prior_boxes_to_4d')
    graph_def.node.extend([reshape])

    priorBoxes.append(reshape.name)

addConcatNode('PriorBox/concat', priorBoxes, 'PriorBox/concat/axis')
addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten')

# Sigmoid for classes predictions and DetectionOutput layer
sigmoid = NodeDef()
@ -276,7 +273,6 @@ text_format.Merge('i: 100', detectionOut.attr['top_k'])
text_format.Merge('s: "CENTER_SIZE"', detectionOut.attr['code_type'])
text_format.Merge('i: 100', detectionOut.attr['keep_top_k'])
text_format.Merge('f: 0.01', detectionOut.attr['confidence_threshold'])
text_format.Merge('b: true', detectionOut.attr['loc_pred_transposed'])

graph_def.node.extend([detectionOut])
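For orientation, the per-layer Reshape nodes deleted above turned each 1x2xN prior-box blob into the 1x2xNx1 layout before concatenation; in plain numpy terms (N=12 is an arbitrary example):

import numpy as np

prior = np.zeros((1, 2, 12))            # hypothetical 1x2xN prior-box blob
prior4d = prior.reshape(1, 2, -1, 1)    # the 1x2xNx1 shape the deleted nodes produced
assert prior4d.shape == (1, 2, 12, 1)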