From c46f119e0e865af442a3551d40fa4cdd7ea29123 Mon Sep 17 00:00:00 2001
From: ChipKerchner <ckerchne@linux.vnet.ibm.com>
Date: Wed, 23 Oct 2019 10:47:07 -0500
Subject: [PATCH 1/6] Convert demosaic functions to HAL

---
 modules/imgproc/src/demosaicing.cpp | 591 +++++++++++++++-------------
 1 file changed, 318 insertions(+), 273 deletions(-)
diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp
index a14b6d7905..3062023ea7 100644
--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@@ -86,6 +86,7 @@
 
 
 #include "precomp.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
 #include <limits>
 
@@ -111,7 +112,7 @@ public:
         return 0;
     }
 
-    int bayer2RGBA(const T*, int, T*, int, int) const
+    int bayer2RGBA(const T*, int, T*, int, int, const T) const
     {
         return 0;
     }
@@ -122,279 +123,14 @@ public:
     }
 };
 
-#if CV_SSE2
+#if CV_SIMD128
 class SIMDBayerInterpolator_8u
 {
 public:
-    SIMDBayerInterpolator_8u()
-    {
-        use_simd = checkHardwareSupport(CV_CPU_SSE2);
-    }
-
     int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
                    int width, int bcoeff, int gcoeff, int rcoeff) const
     {
-        if( !use_simd )
-            return 0;
-
-        __m128i _b2y = _mm_set1_epi16((short)(rcoeff*2));
-        __m128i _g2y = _mm_set1_epi16((short)(gcoeff*2));
-        __m128i _r2y = _mm_set1_epi16((short)(bcoeff*2));
-        const uchar* bayer_end = bayer + width;
-
-        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
-        {
-            __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
-            __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
-            __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
-
-            __m128i b1 = _mm_add_epi16(_mm_srli_epi16(_mm_slli_epi16(r0, 8), 7),
-                                       _mm_srli_epi16(_mm_slli_epi16(r2, 8), 7));
-            __m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2));
-            b1 = _mm_slli_epi16(_mm_srli_si128(b1, 2), 1);
-
-            __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 7), _mm_srli_epi16(r2, 7));
-            __m128i g1 = _mm_srli_epi16(_mm_slli_epi16(r1, 8), 7);
-            g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
-            g1 = _mm_slli_epi16(_mm_srli_si128(g1, 2), 2);
-
-            r0 = _mm_srli_epi16(r1, 8);
-            r1 = _mm_slli_epi16(_mm_add_epi16(r0, _mm_srli_si128(r0, 2)), 2);
-            r0 = _mm_slli_epi16(r0, 3);
-
-            g0 = _mm_add_epi16(_mm_mulhi_epi16(b0, _b2y), _mm_mulhi_epi16(g0, _g2y));
-            g1 = _mm_add_epi16(_mm_mulhi_epi16(b1, _b2y), _mm_mulhi_epi16(g1, _g2y));
-            g0 = _mm_add_epi16(g0, _mm_mulhi_epi16(r0, _r2y));
-            g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(r1, _r2y));
-            g0 = _mm_srli_epi16(g0, 2);
-            g1 = _mm_srli_epi16(g1, 2);
-            g0 = _mm_packus_epi16(g0, g0);
-            g1 = _mm_packus_epi16(g1, g1);
-            g0 = _mm_unpacklo_epi8(g0, g1);
-            _mm_storeu_si128((__m128i*)dst, g0);
-        }
-
-        return (int)(bayer - (bayer_end - width));
-    }
-
-    int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
-    {
-        if( !use_simd )
-            return 0;
-        /*
-         B G B G | B G B G | B G B G | B G B G
-         G R G R | G R G R | G R G R | G R G R
-         B G B G | B G B G | B G B G | B G B G
-         */
-
-        __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
-        __m128i mask = _mm_set1_epi16(blue < 0 ? -1 : 0), z = _mm_setzero_si128();
-        __m128i masklo = _mm_set1_epi16(0x00ff);
-        const uchar* bayer_end = bayer + width;
-
-        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
-        {
-            __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
-            __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
-            __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
-
-            __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklo), _mm_and_si128(r2, masklo));
-            __m128i nextb1 = _mm_srli_si128(b1, 2);
-            __m128i b0 = _mm_add_epi16(b1, nextb1);
-            b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
-            b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
-            // b0 b2 ... b14 b1 b3 ... b15
-            b0 = _mm_packus_epi16(b0, b1);
-
-            __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_srli_epi16(r2, 8));
-            __m128i g1 = _mm_and_si128(r1, masklo);
-            g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
-            g1 = _mm_srli_si128(g1, 2);
-            g0 = _mm_srli_epi16(_mm_add_epi16(g0, delta2), 2);
-            // g0 g2 ... g14 g1 g3 ... g15
-            g0 = _mm_packus_epi16(g0, g1);
-
-            r0 = _mm_srli_epi16(r1, 8);
-            r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
-            r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
-            // r0 r2 ... r14 r1 r3 ... r15
-            r0 = _mm_packus_epi16(r0, r1);
-
-            b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
-            b0 = _mm_xor_si128(b0, b1);
-            r0 = _mm_xor_si128(r0, b1);
-
-            // b1 g1 b3 g3 b5 g5...
-            b1 = _mm_unpackhi_epi8(b0, g0);
-            // b0 g0 b2 g2 b4 g4 ....
-            b0 = _mm_unpacklo_epi8(b0, g0);
-
-            // r1 0 r3 0 r5 0 ...
-            r1 = _mm_unpackhi_epi8(r0, z);
-            // r0 0 r2 0 r4 0 ...
-            r0 = _mm_unpacklo_epi8(r0, z);
-
-            // 0 b0 g0 r0 0 b2 g2 r2 ...
-            g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
-            // 0 b8 g8 r8 0 b10 g10 r10 ...
-            g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
-
-            // b1 g1 r1 0 b3 g3 r3 0 ...
-            r0 = _mm_unpacklo_epi16(b1, r1);
-            // b9 g9 r9 0 b11 g11 r11 0 ...
-            r1 = _mm_unpackhi_epi16(b1, r1);
-
-            // 0 b0 g0 r0 b1 g1 r1 0 ...
-            b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
-            // 0 b4 g4 r4 b5 g5 r5 0 ...
-            b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
-
-            _mm_storel_epi64((__m128i*)(dst-1+0), b0);
-            _mm_storel_epi64((__m128i*)(dst-1+6*1), _mm_srli_si128(b0, 8));
-            _mm_storel_epi64((__m128i*)(dst-1+6*2), b1);
-            _mm_storel_epi64((__m128i*)(dst-1+6*3), _mm_srli_si128(b1, 8));
-
-            // 0 b8 g8 r8 b9 g9 r9 0 ...
-            g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
-            // 0 b12 g12 r12 b13 g13 r13 0 ...
-            g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
-
-            _mm_storel_epi64((__m128i*)(dst-1+6*4), g0);
-            _mm_storel_epi64((__m128i*)(dst-1+6*5), _mm_srli_si128(g0, 8));
-
-            _mm_storel_epi64((__m128i*)(dst-1+6*6), g1);
-        }
-
-        return (int)(bayer - (bayer_end - width));
-    }
-
-    int bayer2RGBA(const uchar*, int, uchar*, int, int) const
-    {
-        return 0;
-    }
-
-    int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
-    {
-        if (!use_simd)
-            return 0;
-
-        const uchar* bayer_end = bayer + width;
-        __m128i masklow = _mm_set1_epi16(0x00ff);
-        __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
-        __m128i full = _mm_set1_epi16(-1), z = _mm_setzero_si128();
-        __m128i mask = _mm_set1_epi16(blue > 0 ? -1 : 0);
-
-        for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42)
-        {
-            /*
-             B G B G | B G B G | B G B G | B G B G
-             G R G R | G R G R | G R G R | G R G R
-             B G B G | B G B G | B G B G | B G B G
-             */
-
-            __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
-            __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
-            __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
-
-            __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklow), _mm_and_si128(r2, masklow));
-            __m128i nextb1 = _mm_srli_si128(b1, 2);
-            __m128i b0 = _mm_add_epi16(b1, nextb1);
-            b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
-            b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
-            // b0 b2 ... b14 b1 b3 ... b15
-            b0 = _mm_packus_epi16(b0, b1);
-
-            // vertical sum
-            __m128i r0g = _mm_srli_epi16(r0, 8);
-            __m128i r2g = _mm_srli_epi16(r2, 8);
-            __m128i sumv = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(r0g, r2g), delta1), 1);
-            // gorizontal sum
-            __m128i g1 = _mm_and_si128(masklow, r1);
-            __m128i nextg1 = _mm_srli_si128(g1, 2);
-            __m128i sumg = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(g1, nextg1), delta1), 1);
-
-            // gradients
-            __m128i gradv = _mm_adds_epi16(_mm_subs_epu16(r0g, r2g), _mm_subs_epu16(r2g, r0g));
-            __m128i gradg = _mm_adds_epi16(_mm_subs_epu16(nextg1, g1), _mm_subs_epu16(g1, nextg1));
-            __m128i gmask = _mm_cmpgt_epi16(gradg, gradv);
-
-            __m128i g0 = _mm_add_epi16(_mm_and_si128(gmask, sumv), _mm_and_si128(sumg, _mm_xor_si128(gmask, full)));
-            // g0 g2 ... g14 g1 g3 ...
-            g0 = _mm_packus_epi16(g0, nextg1);
-
-            r0 = _mm_srli_epi16(r1, 8);
-            r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
-            r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
-            // r0 r2 ... r14 r1 r3 ... r15
-            r0 = _mm_packus_epi16(r0, r1);
-
-            b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
-            b0 = _mm_xor_si128(b0, b1);
-            r0 = _mm_xor_si128(r0, b1);
-
-            // b1 g1 b3 g3 b5 g5...
-            b1 = _mm_unpackhi_epi8(b0, g0);
-            // b0 g0 b2 g2 b4 g4 ....
-            b0 = _mm_unpacklo_epi8(b0, g0);
-
-            // r1 0 r3 0 r5 0 ...
-            r1 = _mm_unpackhi_epi8(r0, z);
-            // r0 0 r2 0 r4 0 ...
-            r0 = _mm_unpacklo_epi8(r0, z);
-
-            // 0 b0 g0 r0 0 b2 g2 r2 ...
-            g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
-            // 0 b8 g8 r8 0 b10 g10 r10 ...
-            g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
-
-            // b1 g1 r1 0 b3 g3 r3 0 ...
-            r0 = _mm_unpacklo_epi16(b1, r1);
-            // b9 g9 r9 0 b11 g11 r11 0 ...
-            r1 = _mm_unpackhi_epi16(b1, r1);
-
-            // 0 b0 g0 r0 b1 g1 r1 0 ...
-            b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
-            // 0 b4 g4 r4 b5 g5 r5 0 ...
-            b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
-
-            _mm_storel_epi64((__m128i*)(dst+0), b0);
-            _mm_storel_epi64((__m128i*)(dst+6*1), _mm_srli_si128(b0, 8));
-            _mm_storel_epi64((__m128i*)(dst+6*2), b1);
-            _mm_storel_epi64((__m128i*)(dst+6*3), _mm_srli_si128(b1, 8));
-
-            // 0 b8 g8 r8 b9 g9 r9 0 ...
-            g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
-            // 0 b12 g12 r12 b13 g13 r13 0 ...
-            g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
-
-            _mm_storel_epi64((__m128i*)(dst+6*4), g0);
-            _mm_storel_epi64((__m128i*)(dst+6*5), _mm_srli_si128(g0, 8));
-
-            _mm_storel_epi64((__m128i*)(dst+6*6), g1);
-        }
-
-        return int(bayer - (bayer_end - width));
-    }
-
-    bool use_simd;
-};
-#elif CV_NEON
-class SIMDBayerInterpolator_8u
-{
-public:
-    SIMDBayerInterpolator_8u()
-    {
-    }
-
-    int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
-                   int width, int bcoeff, int gcoeff, int rcoeff) const
-    {
-        /*
-         B G B G | B G B G | B G B G | B G B G
-         G R G R | G R G R | G R G R | G R G R
-         B G B G | B G B G | B G B G | B G B G
-         */
-
+#if CV_NEON
         uint16x8_t masklo = vdupq_n_u16(255);
         const uchar* bayer_end = bayer + width;
 
@@ -440,6 +176,40 @@ public:
             vst1_u8(dst, p.val[0]);
             vst1_u8(dst + 8, p.val[1]);
         }
+#else
+        v_uint16x8 _b2y = v_setall_u16((ushort)(rcoeff*2));
+        v_uint16x8 _g2y = v_setall_u16((ushort)(gcoeff*2));
+        v_uint16x8 _r2y = v_setall_u16((ushort)(bcoeff*2));
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
+        {
+            v_uint16x8 r0 = v_load((ushort*)bayer);
+            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
+            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+
+            v_uint16x8 b1 = ((r0 << 8) >> 7) + ((r2 << 8) >> 7);
+            v_uint16x8 b0 = v_rotate_right<1>(b1) + b1;
+            b1 = v_rotate_right<1>(b1) << 1;
+
+            v_uint16x8 g0 = (r0 >> 7) + (r2 >> 7);
+            v_uint16x8 g1 = (r1 << 8) >> 7;
+            g0 += v_rotate_right<1>(g1) + g1;
+            g1 = v_rotate_right<1>(g1) << 2;
+
+            r0 = r1 >> 8;
+            r1 = (v_rotate_right<1>(r0) + r0) << 2;
+            r0 = r0 << 3;
+
+            g0 = (v_mul_hi(b0, _b2y) + v_mul_hi(g0, _g2y) + v_mul_hi(r0, _r2y)) >> 2;
+            g1 = (v_mul_hi(b1, _b2y) + v_mul_hi(g1, _g2y) + v_mul_hi(r1, _r2y)) >> 2;
+            v_uint8x16 pack_lo, pack_hi;
+            v_zip(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g0)),
+                  v_pack_u(v_reinterpret_as_s16(g1), v_reinterpret_as_s16(g1)),
+                  pack_lo, pack_hi);
+            v_store(dst, pack_lo);
+        }
+#endif
 
         return (int)(bayer - (bayer_end - width));
     }
@@ -451,6 +221,8 @@ public:
          G R G R | G R G R | G R G R | G R G R
          B G B G | B G B G | B G B G | B G B G
          */
+
+#if CV_NEON
         uint16x8_t masklo = vdupq_n_u16(255);
         uint8x16x3_t pix;
         const uchar* bayer_end = bayer + width;
@@ -484,21 +256,109 @@ public:
 
             vst3q_u8(dst-1, pix);
         }
+#else
+        v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2);
+        v_uint16x8 mask = v_setall_u16(blue < 0 ? (ushort)(-1) : 0);
+        v_uint16x8 masklo = v_setall_u16(0x00ff);
+        v_uint8x16 z = v_setzero_u8();
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
+        {
+            v_uint16x8 r0 = v_load((ushort*)bayer);
+            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
+            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+
+            v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
+            v_uint16x8 nextb1 = v_rotate_right<1>(b1);
+            v_uint16x8 b0 = b1 + nextb1;
+            b1 = (nextb1 + delta1) >> 1;
+            b0 = (b0 + delta2) >> 2;
+            // b0 b2 ... b14 b1 b3 ... b15
+            b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
+
+            v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
+            v_uint16x8 g1 = r1 & masklo;
+            g0 += v_rotate_right<1>(g1) + g1;
+            g1 = v_rotate_right<1>(g1);
+            g0 = (g0 + delta2) >> 2;
+            // g0 g2 ... g14 g1 g3 ... g15
+            g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
+
+            r0 = r1 >> 8;
+            r1 = v_rotate_right<1>(r0) + r0;
+            r1 = (r1 + delta1) >> 1;
+            // r0 r2 ... r14 r1 r3 ... r15
+            r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
+
+            b1 = (b0 ^ r0) & mask;
+            b0 = b0 ^ b1;
+            r0 = r0 ^ b1;
+
+            // b1 g1 b3 g3 b5 g5...
+            v_uint8x16 pack_lo, pack_hi;
+            v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi);
+            b1 = v_reinterpret_as_u16(pack_hi);
+            // b0 g0 b2 g2 b4 g4 ....
+            b0 = v_reinterpret_as_u16(pack_lo);
+
+            // r1 0 r3 0 r5 0 ...
+            v_zip(v_reinterpret_as_u8(r0), z, pack_lo, pack_hi);
+            r1 = v_reinterpret_as_u16(pack_hi);
+            // r0 0 r2 0 r4 0 ...
+            r0 = v_reinterpret_as_u16(pack_lo);
+
+            // 0 b0 g0 r0 0 b2 g2 r2 ...
+            v_zip(b0, r0, g0, g1);
+            g0 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g0)));
+            // 0 b8 g8 r8 0 b10 g10 r10 ...
+            g1 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g1)));
+
+            // b1 g1 r1 0 b3 g3 r3 0 ...
+            v_zip(b1, r1, r0, r1);
+            // b9 g9 r9 0 b11 g11 r11 0 ...
+
+            // 0 b0 g0 r0 b1 g1 r1 0 ...
+            v_uint32x4 pack32_lo, pack32_hi;
+            v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi);
+            b0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
+            // 0 b4 g4 r4 b5 g5 r5 0 ...
+            b1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
+
+            v_store_low(dst-1+0, v_reinterpret_as_u8(b0));
+            v_store_high(dst-1+6*1, v_reinterpret_as_u8(b0));
+            v_store_low(dst-1+6*2, v_reinterpret_as_u8(b1));
+            v_store_high(dst-1+6*3, v_reinterpret_as_u8(b1));
+
+            // 0 b8 g8 r8 b9 g9 r9 0 ...
+            v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi);
+            g0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
+            // 0 b12 g12 r12 b13 g13 r13 0 ...
+            g1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
+
+            v_store_low(dst-1+6*4, v_reinterpret_as_u8(g0));
+            v_store_high(dst-1+6*5, v_reinterpret_as_u8(g0));
+
+            v_store_low(dst-1+6*6, v_reinterpret_as_u8(g1));
+        }
+#endif
 
         return (int)(bayer - (bayer_end - width));
     }
 
-    int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
+    int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue, const uchar alpha) const
     {
         /*
          B G B G | B G B G | B G B G | B G B G
          G R G R | G R G R | G R G R | G R G R
          B G B G | B G B G | B G B G | B G B G
          */
+
+#if CV_NEON
         uint16x8_t masklo = vdupq_n_u16(255);
         uint8x16x4_t pix;
         const uchar* bayer_end = bayer + width;
-        pix.val[3] = vdupq_n_u8(255);
+        pix.val[3] = vdupq_n_u8(alpha);
 
         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
         {
@@ -529,13 +389,198 @@ public:
 
             vst4q_u8(dst-1, pix);
         }
+#else // universal-intrinsics path for CV_SIMD128 targets other than NEON
+        v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2); // rounding terms added before >>1 and >>2
+        v_uint16x8 mask = v_setall_u16(blue < 0 ? (ushort)(-1) : 0); // all-ones when blue < 0: the B and R planes get swapped below
+        v_uint16x8 masklo = v_setall_u16(0x00ff); // keeps the low byte of every 16-bit lane
+        v_uint8x16 a = v_setall_u8(alpha); // alpha byte replicated, interleaved after R below
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 ) // 14 pixels x 4 channels are produced per iteration
+        {
+            v_uint16x8 r0 = v_load((ushort*)bayer); // r0, r1, r2: three rows spaced bayer_step apart
+            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
+            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+
+            v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
+            v_uint16x8 nextb1 = v_rotate_right<1>(b1);
+            v_uint16x8 b0 = b1 + nextb1;
+            b1 = (nextb1 + delta1) >> 1;
+            b0 = (b0 + delta2) >> 2;
+            // b0 b2 ... b14 b1 b3 ... b15
+            b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
+
+            v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
+            v_uint16x8 g1 = r1 & masklo;
+            g0 += v_rotate_right<1>(g1) + g1;
+            g1 = v_rotate_right<1>(g1);
+            g0 = (g0 + delta2) >> 2;
+            // g0 g2 ... g14 g1 g3 ... g15
+            g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
+
+            r0 = r1 >> 8;
+            r1 = v_rotate_right<1>(r0) + r0;
+            r1 = (r1 + delta1) >> 1;
+            // r0 r2 ... r14 r1 r3 ... r15
+            r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
+
+            b1 = (b0 ^ r0) & mask; // XOR swap: exchanges b0 and r0 when mask is all-ones, no-op when it is zero
+            b0 = b0 ^ b1;
+            r0 = r0 ^ b1;
+
+            // b1 g1 b3 g3 b5 g5...
+            v_uint8x16 pack_lo, pack_hi;
+            v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi);
+            b1 = v_reinterpret_as_u16(pack_hi);
+            // b0 g0 b2 g2 b4 g4 ....
+            b0 = v_reinterpret_as_u16(pack_lo);
+
+            // r1 a r3 a r5 a ...
+            v_zip(v_reinterpret_as_u8(r0), a, pack_lo, pack_hi);
+            r1 = v_reinterpret_as_u16(pack_hi);
+            // r0 a r2 a r4 a ...
+            r0 = v_reinterpret_as_u16(pack_lo);
+
+            // g0: b0 g0 r0 a b2 g2 r2 a ... (no byte shift here, so alpha trails each pixel)
+            v_zip(b0, r0, g0, g1);
+            // g1: b8 g8 r8 a b10 g10 r10 a ...
+
+            // b1 g1 r1 a b3 g3 r3 a ...
+            v_zip(b1, r1, r0, r1);
+            // b9 g9 r9 a b11 g11 r11 a ...
+
+            // b0 g0 r0 a b1 g1 r1 a b2 g2 r2 a b3 g3 r3 a
+            v_uint32x4 pack32_lo, pack32_hi;
+            v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi);
+            b0 = v_reinterpret_as_u16(pack32_lo);
+            // b4 g4 r4 a b5 g5 r5 a b6 g6 r6 a b7 g7 r7 a
+            b1 = v_reinterpret_as_u16(pack32_hi);
+
+            v_store_low(dst-1+0, v_reinterpret_as_u8(b0)); // each half-register store covers 2 pixels (8 bytes), beginning at dst-1
+            v_store_high(dst-1+8*1, v_reinterpret_as_u8(b0));
+            v_store_low(dst-1+8*2, v_reinterpret_as_u8(b1));
+            v_store_high(dst-1+8*3, v_reinterpret_as_u8(b1));
+
+            // b8 g8 r8 a b9 g9 r9 a b10 g10 r10 a b11 g11 r11 a
+            v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi);
+            g0 = v_reinterpret_as_u16(pack32_lo);
+            // b12 g12 r12 a b13 g13 r13 a b14 g14 r14 a b15 g15 r15 a
+            g1 = v_reinterpret_as_u16(pack32_hi);
+
+            v_store_low(dst-1+8*4, v_reinterpret_as_u8(g0));
+            v_store_high(dst-1+8*5, v_reinterpret_as_u8(g0));
+
+            v_store_low(dst-1+8*6, v_reinterpret_as_u8(g1)); // only the low half is stored: pixels 12-13 complete the 14
+        }
+#endif
 
         return (int)(bayer - (bayer_end - width));
     }
 
-    int bayer2RGB_EA(const uchar*, int, uchar*, int, int) const
+    int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
     {
-        return 0;
+        const uchar* bayer_end = bayer + width;
+        v_uint16x8 masklow = v_setall_u16(0x00ff);
+        v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2);
+        v_uint16x8 full = v_setall_u16((ushort)(-1));
+        v_uint8x16 z = v_setzero_u8();
+        v_uint16x8 mask = v_setall_u16(blue > 0 ? (ushort)(-1) : 0);
+
+        for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42)
+        {
+            /*
+             B G B G | B G B G | B G B G | B G B G
+             G R G R | G R G R | G R G R | G R G R
+             B G B G | B G B G | B G B G | B G B G
+             */
+
+            v_uint16x8 r0 = v_load((ushort*)bayer);
+            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
+            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+
+            v_uint16x8 b1 = (r0 & masklow) + (r2 & masklow);
+            v_uint16x8 nextb1 = v_rotate_right<1>(b1);
+            v_uint16x8 b0 = b1 + nextb1;
+            b1 = (nextb1 + delta1) >> 1;
+            b0 = (b0 + delta2) >> 2;
+            // b0 b2 ... b14 b1 b3 ... b15
+            b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
+
+            // vertical sum
+            v_uint16x8 r0g = r0 >> 8;
+            v_uint16x8 r2g = r2 >> 8;
+            v_uint16x8 sumv = ((r0g + r2g) + delta1) >> 1;
+            // horizontal sum
+            v_uint16x8 g1 = r1 & masklow;
+            v_uint16x8 nextg1 = v_rotate_right<1>(g1);
+            v_uint16x8 sumg = (g1 + nextg1 + delta1) >> 1;
+
+            // gradients
+            v_uint16x8 gradv = (r0g - r2g) + (r2g - r0g);
+            v_uint16x8 gradg = (nextg1 - g1) + (g1 - nextg1);
+            v_uint16x8 gmask = gradg > gradv;
+            v_uint16x8 g0 = (gmask & sumv) + (sumg & (gmask ^ full));
+            // g0 g2 ... g14 g1 g3 ...
+            g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(nextg1)));
+
+            r0 = r1 >> 8;
+            r1 = v_rotate_right<1>(r0) + r0;
+            r1 = (r1 + delta1) >> 1;
+            // r0 r2 ... r14 r1 r3 ... r15
+            r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
+
+            b1 = (b0 ^ r0) & mask;
+            b0 = b0 ^ b1;
+            r0 = r0 ^ b1;
+
+            // b1 g1 b3 g3 b5 g5...
+            v_uint8x16 pack_lo, pack_hi;
+            v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi);
+            b1 = v_reinterpret_as_u16(pack_hi);
+            // b0 g0 b2 g2 b4 g4 ....
+            b0 = v_reinterpret_as_u16(pack_lo);
+
+            // r1 0 r3 0 r5 0 ...
+            v_zip(v_reinterpret_as_u8(r0), z, pack_lo, pack_hi);
+            r1 = v_reinterpret_as_u16(pack_hi);
+            // r0 0 r2 0 r4 0 ...
+            r0 = v_reinterpret_as_u16(pack_lo);
+
+            // 0 b0 g0 r0 0 b2 g2 r2 ...
+            v_zip(b0, r0, g0, g1);
+            g0 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g0)));
+            // 0 b8 g8 r8 0 b10 g10 r10 ...
+            g1 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g1)));
+
+            // b1 g1 r1 0 b3 g3 r3 0 ...
+            v_zip(b1, r1, r0, r1);
+            // b9 g9 r9 0 b11 g11 r11 0 ...
+
+            // 0 b0 g0 r0 b1 g1 r1 0 ...
+            v_uint32x4 pack32_lo, pack32_hi;
+            v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi);
+            b0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
+            // 0 b4 g4 r4 b5 g5 r5 0 ...
+            b1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
+
+            v_store_low(dst+0, v_reinterpret_as_u8(b0));
+            v_store_high(dst+6*1, v_reinterpret_as_u8(b0));
+            v_store_low(dst+6*2, v_reinterpret_as_u8(b1));
+            v_store_high(dst+6*3, v_reinterpret_as_u8(b1));
+
+            // 0 b8 g8 r8 b9 g9 r9 0 ...
+            v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi);
+            g0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
+            // 0 b12 g12 r12 b13 g13 r13 0 ...
+            g1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
+
+            v_store_low(dst+6*4, v_reinterpret_as_u8(g0));
+            v_store_high(dst+6*5, v_reinterpret_as_u8(g0));
+
+            v_store_low(dst+6*6, v_reinterpret_as_u8(g1));
+        }
+
+        return int(bayer - (bayer_end - width));
     }
 };
 #else
@@ -775,7 +820,7 @@ public:
 
             // simd optimization only for dcn == 3
             int delta = dcn == 4 ?
-                vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) :
+                vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue, alpha) :
                 vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
             bayer += delta;
             dst += delta*dcn;

From 17e2bf5717559ccf4fe809dd9f83afa7582f8ef5 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 13 Oct 2019 11:14:41 +0000
Subject: [PATCH 2/6] core(tls): implement releasing of TLS on thread
 termination

- move TLS & instrumentation code out of core/utility.hpp
- (*) TLSData no longer provides the .gather() method (so that thread data can be disposed of on thread termination)
- use TLSDataAccumulator for reliable collection of thread data
- prefer .detachData() + .cleanupDetachedData() over the .gather() method

(*) API break: replace TLSData with TLSDataAccumulator if gather() is required
(object disposal on thread termination is not available in accumulator mode)
---
 modules/core/include/opencv2/core/private.hpp |   4 +
 modules/core/include/opencv2/core/utility.hpp | 168 +------------
 .../opencv2/core/utils/instrumentation.hpp    | 125 +++++++++
 .../core/include/opencv2/core/utils/tls.hpp   | 237 ++++++++++++++++++
 .../opencv2/core/utils/trace.private.hpp      |   4 +-
 modules/core/src/ocl.cpp                      |  25 +-
 modules/core/src/ovx.cpp                      |  12 +-
 modules/core/src/precomp.hpp                  |   2 +-
 modules/core/src/rand.cpp                     |   2 +-
 modules/core/src/system.cpp                   | 170 ++++++++-----
 modules/core/src/umatrix.cpp                  |   2 +
 modules/core/test/test_utils.cpp              |   2 +
 modules/core/test/test_utils_tls.impl.hpp     | 134 ++++++++++
 modules/imgproc/src/histogram.cpp             |   2 +
 14 files changed, 650 insertions(+), 239 deletions(-)
 create mode 100644 modules/core/include/opencv2/core/utils/instrumentation.hpp
 create mode 100644 modules/core/include/opencv2/core/utils/tls.hpp
 create mode 100644 modules/core/test/test_utils_tls.impl.hpp

diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp
index 5e66801b51..24f7fc69b8 100644
--- a/modules/core/include/opencv2/core/private.hpp
+++ b/modules/core/include/opencv2/core/private.hpp
@@ -53,6 +53,10 @@
 
 #include <opencv2/core/utils/trace.hpp>
 
+#ifdef ENABLE_INSTRUMENTATION
+#include "opencv2/core/utils/instrumentation.hpp"
+#endif
+
 #ifdef HAVE_EIGEN
 #  if defined __GNUC__ && defined __APPLE__
 #    pragma GCC diagnostic ignored "-Wshadow"
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index cbec10b41b..e7f169b01a 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -63,30 +63,6 @@
 namespace cv
 {
 
-#ifdef CV_COLLECT_IMPL_DATA
-CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays
-CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays
-// Get stored implementation flags and functions names arrays
-// Each implementation entry correspond to function name entry, so you can find which implementation was executed in which function
-CV_EXPORTS int getImpl(std::vector<int> &impl, std::vector<String> &funName);
-
-CV_EXPORTS bool useCollection(); // return implementation collection state
-CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state
-
-#define CV_IMPL_PLAIN  0x01 // native CPU OpenCV implementation
-#define CV_IMPL_OCL    0x02 // OpenCL implementation
-#define CV_IMPL_IPP    0x04 // IPP implementation
-#define CV_IMPL_MT     0x10 // multithreaded implementation
-
-#define CV_IMPL_ADD(impl)                                                   \
-    if(cv::useCollection())                                                 \
-    {                                                                       \
-        cv::addImpl(impl, CV_Func);                                         \
-    }
-#else
-#define CV_IMPL_ADD(impl)
-#endif
-
 //! @addtogroup core_utils
 //! @{
 
@@ -726,61 +702,6 @@ private:
     AutoLock& operator = (const AutoLock&);
 };
 
-// TLS interface
-class CV_EXPORTS TLSDataContainer
-{
-protected:
-    TLSDataContainer();
-    virtual ~TLSDataContainer();
-
-    void  gatherData(std::vector<void*> &data) const;
-#if OPENCV_ABI_COMPATIBILITY > 300
-    void* getData() const;
-    void  release();
-
-private:
-#else
-    void  release();
-
-public:
-    void* getData() const;
-#endif
-    virtual void* createDataInstance() const = 0;
-    virtual void  deleteDataInstance(void* pData) const = 0;
-
-    int key_;
-
-public:
-    void cleanup(); //! Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid.
-};
-
-// Main TLS data class
-template <typename T>
-class TLSData : protected TLSDataContainer
-{
-public:
-    inline TLSData()        {}
-    inline ~TLSData()       { release();            } // Release key and delete associated data
-    inline T* get() const   { return (T*)getData(); } // Get data associated with key
-    inline T& getRef() const { T* ptr = (T*)getData(); CV_Assert(ptr); return *ptr; } // Get data associated with key
-
-    // Get data from all threads
-    inline void gather(std::vector<T*> &data) const
-    {
-        std::vector<void*> &dataVoid = reinterpret_cast<std::vector<void*>&>(data);
-        gatherData(dataVoid);
-    }
-
-    inline void cleanup() { TLSDataContainer::cleanup(); }
-
-private:
-    virtual void* createDataInstance() const CV_OVERRIDE {return new T;}                // Wrapper to allocate data by template
-    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE {delete (T*)pData;} // Wrapper to release data by template
-
-    // Disable TLS copy operations
-    TLSData(TLSData &) {}
-    TLSData& operator =(const TLSData &) {return *this;}
-};
 
 /** @brief Designed for command line parsing
 
@@ -1199,88 +1120,6 @@ public:
     std::vector<Node<OBJECT>*> m_childs;
 };
 
-// Instrumentation external interface
-namespace instr
-{
-
-#if !defined OPENCV_ABI_CHECK
-
-enum TYPE
-{
-    TYPE_GENERAL = 0,   // OpenCV API function, e.g. exported function
-    TYPE_MARKER,        // Information marker
-    TYPE_WRAPPER,       // Wrapper function for implementation
-    TYPE_FUN,           // Simple function call
-};
-
-enum IMPL
-{
-    IMPL_PLAIN = 0,
-    IMPL_IPP,
-    IMPL_OPENCL,
-};
-
-struct NodeDataTls
-{
-    NodeDataTls()
-    {
-        m_ticksTotal = 0;
-    }
-    uint64      m_ticksTotal;
-};
-
-class CV_EXPORTS NodeData
-{
-public:
-    NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN);
-    NodeData(NodeData &ref);
-    ~NodeData();
-    NodeData& operator=(const NodeData&);
-
-    cv::String          m_funName;
-    cv::instr::TYPE     m_instrType;
-    cv::instr::IMPL     m_implType;
-    const char*         m_fileName;
-    int                 m_lineNum;
-    void*               m_retAddress;
-    bool                m_alwaysExpand;
-    bool                m_funError;
-
-    volatile int         m_counter;
-    volatile uint64      m_ticksTotal;
-    TLSData<NodeDataTls> m_tls;
-    int                  m_threads;
-
-    // No synchronization
-    double getTotalMs()   const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; }
-    double getMeanMs()    const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; }
-};
-bool operator==(const NodeData& lhs, const NodeData& rhs);
-
-typedef Node<NodeData> InstrNode;
-
-CV_EXPORTS InstrNode* getTrace();
-
-#endif // !defined OPENCV_ABI_CHECK
-
-
-CV_EXPORTS bool       useInstrumentation();
-CV_EXPORTS void       setUseInstrumentation(bool flag);
-CV_EXPORTS void       resetTrace();
-
-enum FLAGS
-{
-    FLAGS_NONE              = 0,
-    FLAGS_MAPPING           = 0x01,
-    FLAGS_EXPAND_SAME_NAMES = 0x02,
-};
-
-CV_EXPORTS void       setFlags(FLAGS modeFlags);
-static inline void    setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); }
-CV_EXPORTS FLAGS      getFlags();
-
-} // namespace instr
-
 
 namespace samples {
 
@@ -1355,6 +1194,13 @@ CV_EXPORTS int getThreadID();
 
 } //namespace cv
 
+#ifdef CV_COLLECT_IMPL_DATA
+#include "opencv2/core/utils/instrumentation.hpp"
+#else
+/// Collect implementation data on OpenCV function call. Requires ENABLE_IMPL_COLLECTION build option.
+#define CV_IMPL_ADD(impl)
+#endif
+
 #ifndef DISABLE_OPENCV_24_COMPATIBILITY
 #include "opencv2/core/core_c.h"
 #endif
diff --git a/modules/core/include/opencv2/core/utils/instrumentation.hpp b/modules/core/include/opencv2/core/utils/instrumentation.hpp
new file mode 100644
index 0000000000..3639867080
--- /dev/null
+++ b/modules/core/include/opencv2/core/utils/instrumentation.hpp
@@ -0,0 +1,125 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_INSTR_HPP
+#define OPENCV_UTILS_INSTR_HPP
+
+#include <opencv2/core/utility.hpp>
+#include <opencv2/core/utils/tls.hpp>
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+#ifdef CV_COLLECT_IMPL_DATA
+CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays
+CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays
+// Get stored implementation flags and functions names arrays
+// Each implementation entry correspond to function name entry, so you can find which implementation was executed in which function
+CV_EXPORTS int getImpl(std::vector<int> &impl, std::vector<String> &funName);
+
+CV_EXPORTS bool useCollection(); // return implementation collection state
+CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state
+
+#define CV_IMPL_PLAIN  0x01 // native CPU OpenCV implementation
+#define CV_IMPL_OCL    0x02 // OpenCL implementation
+#define CV_IMPL_IPP    0x04 // IPP implementation
+#define CV_IMPL_MT     0x10 // multithreaded implementation
+
+#undef CV_IMPL_ADD
+#define CV_IMPL_ADD(impl)                                                   \
+    if(cv::useCollection())                                                 \
+    {                                                                       \
+        cv::addImpl(impl, CV_Func);                                         \
+    }
+#endif
+
+// Instrumentation external interface
+namespace instr
+{
+
+#if !defined OPENCV_ABI_CHECK
+
+enum TYPE
+{
+    TYPE_GENERAL = 0,   // OpenCV API function, e.g. exported function
+    TYPE_MARKER,        // Information marker
+    TYPE_WRAPPER,       // Wrapper function for implementation
+    TYPE_FUN,           // Simple function call
+};
+
+enum IMPL
+{
+    IMPL_PLAIN = 0,
+    IMPL_IPP,
+    IMPL_OPENCL,
+};
+
+struct NodeDataTls
+{
+    NodeDataTls()
+    {
+        m_ticksTotal = 0;
+    }
+    uint64      m_ticksTotal;
+};
+
+class CV_EXPORTS NodeData
+{
+public:
+    NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN);
+    NodeData(NodeData &ref);
+    ~NodeData();
+    NodeData& operator=(const NodeData&);
+
+    cv::String          m_funName;
+    cv::instr::TYPE     m_instrType;
+    cv::instr::IMPL     m_implType;
+    const char*         m_fileName;
+    int                 m_lineNum;
+    void*               m_retAddress;
+    bool                m_alwaysExpand;
+    bool                m_funError;
+
+    volatile int         m_counter;
+    volatile uint64      m_ticksTotal;
+    TLSDataAccumulator<NodeDataTls> m_tls;
+    int                  m_threads;
+
+    // No synchronization
+    double getTotalMs()   const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; }
+    double getMeanMs()    const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; }
+};
+bool operator==(const NodeData& lhs, const NodeData& rhs);
+
+typedef Node<NodeData> InstrNode;
+
+CV_EXPORTS InstrNode* getTrace();
+
+#endif // !defined OPENCV_ABI_CHECK
+
+
+CV_EXPORTS bool       useInstrumentation();
+CV_EXPORTS void       setUseInstrumentation(bool flag);
+CV_EXPORTS void       resetTrace();
+
+enum FLAGS
+{
+    FLAGS_NONE              = 0,
+    FLAGS_MAPPING           = 0x01,
+    FLAGS_EXPAND_SAME_NAMES = 0x02,
+};
+
+CV_EXPORTS void       setFlags(FLAGS modeFlags);
+static inline void    setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); }
+CV_EXPORTS FLAGS      getFlags();
+
+} // namespace instr
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_INSTR_HPP
diff --git a/modules/core/include/opencv2/core/utils/tls.hpp b/modules/core/include/opencv2/core/utils/tls.hpp
new file mode 100644
index 0000000000..b5f1138593
--- /dev/null
+++ b/modules/core/include/opencv2/core/utils/tls.hpp
@@ -0,0 +1,237 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_TLS_HPP
+#define OPENCV_UTILS_TLS_HPP
+
+#include <opencv2/core/utility.hpp>
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+namespace details { class TlsStorage; }
+
+/** TLS container base implementation
+ *
+ * Don't use directly.
+ *
+ * @sa TLSData, TLSDataAccumulator templates
+ */
+class CV_EXPORTS TLSDataContainer
+{
+protected:
+    TLSDataContainer();
+    virtual ~TLSDataContainer();
+
+    /// @deprecated use detachData() instead
+    void  gatherData(std::vector<void*> &data) const;
+    /// get TLS data and detach all data from threads (similar to cleanup() call)
+    void  detachData(std::vector<void*>& data);
+
+    void* getData() const;
+    void  release();
+
+protected:
+    virtual void* createDataInstance() const = 0;
+    virtual void  deleteDataInstance(void* pData) const = 0;
+
+#if OPENCV_ABI_COMPATIBILITY > 300
+private:
+#else
+public:
+#endif
+    int key_;
+
+    friend class cv::details::TlsStorage;  // core/src/system.cpp
+
+public:
+    void cleanup(); //!< Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid.
+
+private:
+    // Disable copy/assign (noncopyable pattern)
+    TLSDataContainer(TLSDataContainer &);
+    TLSDataContainer& operator =(const TLSDataContainer &);
+};
+
+
+/** @brief Simple TLS data class
+ *
+ * @sa TLSDataAccumulator
+ */
+template <typename T>
+class TLSData : protected TLSDataContainer
+{
+public:
+    inline TLSData() {}
+    inline ~TLSData() { release(); }
+
+    inline T* get() const   { return (T*)getData(); }  //!< Get data associated with key
+    inline T& getRef() const { T* ptr = (T*)getData(); CV_DbgAssert(ptr); return *ptr; }  //!< Get data associated with key
+
+    /// Release associated thread data
+    inline void cleanup()
+    {
+        TLSDataContainer::cleanup();
+    }
+
+protected:
+    /// Wrapper to allocate data by template
+    virtual void* createDataInstance() const CV_OVERRIDE { return new T; }
+    /// Wrapper to release data by template
+    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE { delete (T*)pData; }
+};
+
+
+/// TLS data accumulator with gathering methods
+template <typename T>
+class TLSDataAccumulator : public TLSData<T>
+{
+    mutable cv::Mutex mutex;
+    mutable std::vector<T*> dataFromTerminatedThreads;
+    std::vector<T*> detachedData;
+    bool cleanupMode;
+public:
+    TLSDataAccumulator() : cleanupMode(false) {}
+    ~TLSDataAccumulator()
+    {
+        release();
+    }
+
+    /** @brief Get data from all threads
+     * @deprecated replaced by detachData()
+     *
+     * Lifetime of vector data is valid until next detachData()/cleanup()/release() calls
+     *
+     * @param[out] data result buffer (should be empty)
+     */
+    void gather(std::vector<T*> &data) const
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        CV_Assert(data.empty());
+        {
+            std::vector<void*> &dataVoid = reinterpret_cast<std::vector<void*>&>(data);
+            TLSDataContainer::gatherData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            data.reserve(data.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<T*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                data.push_back((T*)*i);
+            }
+        }
+    }
+
+    /** @brief Get and detach data from all threads
+     *
+     * Call cleanupDetachedData() when returned vector is not needed anymore.
+     *
+     * @return Vector with associated data. Content is preserved (including lifetime of attached data pointers) until next detachData()/cleanupDetachedData()/cleanup()/release() calls
+     */
+    std::vector<T*>& detachData()
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        std::vector<void*> dataVoid;
+        {
+            TLSDataContainer::detachData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            detachedData.reserve(dataVoid.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<T*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                detachedData.push_back((T*)*i);
+            }
+            dataFromTerminatedThreads.clear();
+            for (typename std::vector<void*>::const_iterator i = dataVoid.begin(); i != dataVoid.end(); ++i)
+            {
+                detachedData.push_back((T*)(void*)*i);
+            }
+        }
+        dataVoid.clear();
+        return detachedData;
+    }
+
+    /// Release associated thread data returned by detachData() call
+    void cleanupDetachedData()
+    {
+        AutoLock lock(mutex);
+        cleanupMode = true;
+        _cleanupDetachedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data
+    void cleanup()
+    {
+        cleanupMode = true;
+        TLSDataContainer::cleanup();
+
+        AutoLock lock(mutex);
+        _cleanupDetachedData();
+        _cleanupTerminatedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data and free TLS key
+    void release()
+    {
+        cleanupMode = true;
+        TLSDataContainer::release();
+        {
+            AutoLock lock(mutex);
+            _cleanupDetachedData();
+            _cleanupTerminatedData();
+        }
+    }
+
+protected:
+    // synchronized: callers must hold 'mutex' (cleanupDetachedData/cleanup/release lock it first)
+    void _cleanupDetachedData()
+    {
+        for (typename std::vector<T*>::iterator i = detachedData.begin(); i != detachedData.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        detachedData.clear();
+    }
+
+    // synchronized: callers must hold 'mutex' (cleanup/release lock it first)
+    void _cleanupTerminatedData()
+    {
+        for (typename std::vector<T*>::iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        dataFromTerminatedThreads.clear();
+    }
+
+protected:
+    virtual void* createDataInstance() const CV_OVERRIDE
+    {
+        // Note: we can collect all allocated data here, but this would require raced mutex locks
+        return new T;
+    }
+    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE
+    {
+        if (cleanupMode)
+        {
+            delete (T*)pData;
+        }
+        else
+        {
+            AutoLock lock(mutex);
+            dataFromTerminatedThreads.push_back((T*)pData);
+        }
+    }
+};
+
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_TLS_HPP
diff --git a/modules/core/include/opencv2/core/utils/trace.private.hpp b/modules/core/include/opencv2/core/utils/trace.private.hpp
index 17981663fe..afc41159f6 100644
--- a/modules/core/include/opencv2/core/utils/trace.private.hpp
+++ b/modules/core/include/opencv2/core/utils/trace.private.hpp
@@ -9,6 +9,8 @@
 
 #include <opencv2/core/utils/logger.hpp>
 
+#include <opencv2/core/utils/tls.hpp>
+
 #include "trace.hpp"
 
 //! @cond IGNORED
@@ -332,7 +334,7 @@ public:
     Mutex mutexCreate;
     Mutex mutexCount;
 
-    TLSData<TraceManagerThreadLocal> tls;
+    TLSDataAccumulator<TraceManagerThreadLocal> tls;
 
     cv::Ptr<TraceStorage> trace_storage;
 private:
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 22408df723..8ebb0064a9 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -909,19 +909,19 @@ bool haveOpenCL()
 
 bool useOpenCL()
 {
-    CoreTLSData* data = getCoreTlsData().get();
-    if( data->useOpenCL < 0 )
+    CoreTLSData& data = getCoreTlsData();
+    if (data.useOpenCL < 0)
     {
         try
         {
-            data->useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
+            data.useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
         }
         catch (...)
         {
-            data->useOpenCL = 0;
+            data.useOpenCL = 0;
         }
     }
-    return data->useOpenCL > 0;
+    return data.useOpenCL > 0;
 }
 
 #ifdef HAVE_OPENCL
@@ -937,14 +937,14 @@ void setUseOpenCL(bool flag)
 {
     CV_TRACE_FUNCTION();
 
-    CoreTLSData* data = getCoreTlsData().get();
+    CoreTLSData& data = getCoreTlsData();
     if (!flag)
     {
-        data->useOpenCL = 0;
+        data.useOpenCL = 0;
     }
     else if( haveOpenCL() )
     {
-        data->useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0;
+        data.useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0;
     }
 }
 
@@ -1655,7 +1655,7 @@ size_t Device::profilingTimerResolution() const
 const Device& Device::getDefault()
 {
     const Context& ctx = Context::getDefault();
-    int idx = getCoreTlsData().get()->device;
+    int idx = getCoreTlsData().device;
     const Device& device = ctx.device(idx);
     return device;
 }
@@ -2562,9 +2562,10 @@ void attachContext(const String& platformName, void* platformID, void* context,
     CV_OCL_CHECK(clRetainContext((cl_context)context));
 
     // clear command queue, if any
-    getCoreTlsData().get()->oclQueue.finish();
+    CoreTLSData& data = getCoreTlsData();
+    data.oclQueue.finish();
     Queue q;
-    getCoreTlsData().get()->oclQueue = q;
+    data.oclQueue = q;
 
     return;
 } // attachContext()
@@ -2752,7 +2753,7 @@ void* Queue::ptr() const
 
 Queue& Queue::getDefault()
 {
-    Queue& q = getCoreTlsData().get()->oclQueue;
+    Queue& q = getCoreTlsData().oclQueue;
     if( !q.p && haveOpenCL() )
         q.create(Context::getDefault());
     return q;
diff --git a/modules/core/src/ovx.cpp b/modules/core/src/ovx.cpp
index d906ead09c..9685cbaed2 100644
--- a/modules/core/src/ovx.cpp
+++ b/modules/core/src/ovx.cpp
@@ -76,13 +76,13 @@ bool haveOpenVX()
 bool useOpenVX()
 {
 #ifdef HAVE_OPENVX
-    CoreTLSData* data = getCoreTlsData().get();
-    if( data->useOpenVX < 0 )
+    CoreTLSData& data = getCoreTlsData();
+    if (data.useOpenVX < 0)
     {
         // enabled (if available) by default
-        data->useOpenVX = haveOpenVX() ? 1 : 0;
+        data.useOpenVX = haveOpenVX() ? 1 : 0;
     }
-    return data->useOpenVX > 0;
+    return data.useOpenVX > 0;
 #else
     return false;
 #endif
@@ -93,8 +93,8 @@ void setUseOpenVX(bool flag)
 #ifdef HAVE_OPENVX
     if( haveOpenVX() )
     {
-        CoreTLSData* data = getCoreTlsData().get();
-        data->useOpenVX = flag ? 1 : 0;
+        CoreTLSData& data = getCoreTlsData();
+        data.useOpenVX = flag ? 1 : 0;
     }
 #else
     CV_Assert(!flag && "OpenVX support isn't enabled at compile time");
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index acaee08eab..0ffde8855a 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -354,7 +354,7 @@ struct CoreTLSData
 #endif
 };
 
-TLSData<CoreTLSData>& getCoreTlsData();
+CoreTLSData& getCoreTlsData();
 
 #if defined(BUILD_SHARED_LIBS)
 #if defined _WIN32 || defined WINCE
diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp
index aa952b2448..539f92aeb1 100644
--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@@ -770,7 +770,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
 
 cv::RNG& cv::theRNG()
 {
-    return getCoreTlsData().get()->rng;
+    return getCoreTlsData().rng;
 }
 
 void cv::setRNGSeed(int seed)
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 1b6777561a..b39173de0d 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -50,6 +50,9 @@
 
 #include <opencv2/core/utils/logger.hpp>
 
+#include <opencv2/core/utils/tls.hpp>
+#include <opencv2/core/utils/instrumentation.hpp>
+
 namespace cv {
 
 static Mutex* __initialization_mutex = NULL;
@@ -1375,6 +1378,8 @@ bool Mutex::trylock() { return impl->trylock(); }
 
 //////////////////////////////// thread-local storage ////////////////////////////////
 
+namespace details {
+
 #ifdef _WIN32
 #ifdef _MSC_VER
 #pragma warning(disable:4505) // unreferenced local function has been removed
@@ -1436,9 +1441,10 @@ void  TlsAbstraction::SetData(void *pData)
 }
 #endif
 #else // _WIN32
+static void opencv_tls_destructor(void* pData);
 TlsAbstraction::TlsAbstraction()
 {
-    CV_Assert(pthread_key_create(&tlsKey, NULL) == 0);
+    CV_Assert(pthread_key_create(&tlsKey, opencv_tls_destructor) == 0);
 }
 TlsAbstraction::~TlsAbstraction()
 {
@@ -1479,42 +1485,46 @@ public:
     }
     ~TlsStorage()
     {
-        for(size_t i = 0; i < threads.size(); i++)
-        {
-            if(threads[i])
-            {
-                /* Current architecture doesn't allow proper global objects release, so this check can cause crashes
-
-                // Check if all slots were properly cleared
-                for(size_t j = 0; j < threads[i]->slots.size(); j++)
-                {
-                    CV_Assert(threads[i]->slots[j] == 0);
-                }
-                */
-                delete threads[i];
-            }
-        }
-        threads.clear();
+        // TlsStorage object should not be released
+        // There is no reliable way to avoid problems caused by static initialization order fiasco
+        CV_LOG_FATAL(NULL, "TlsStorage::~TlsStorage() call is not expected");
     }
 
-    void releaseThread()
+    void releaseThread(void* tlsValue = NULL)
     {
+        ThreadData *pTD = tlsValue == NULL ? (ThreadData*)tls.GetData() : (ThreadData*)tlsValue;
+        if (pTD == NULL)
+            return;  // no OpenCV TLS data for this thread
         AutoLock guard(mtxGlobalAccess);
-        ThreadData *pTD = (ThreadData*)tls.GetData();
-        for(size_t i = 0; i < threads.size(); i++)
+        for (size_t i = 0; i < threads.size(); i++)
         {
-            if(pTD == threads[i])
+            if (pTD == threads[i])
             {
-                threads[i] = 0;
-                break;
+                threads[i] = NULL;
+                if (tlsValue == NULL)
+                    tls.SetData(0);
+                std::vector<void*>& thread_slots = pTD->slots;
+                for (size_t slotIdx = 0; slotIdx < thread_slots.size(); slotIdx++)
+                {
+                    void* pData = thread_slots[slotIdx];
+                    thread_slots[slotIdx] = NULL;
+                    if (!pData)
+                        continue;
+                    TLSDataContainer* container = tlsSlots[slotIdx].container;
+                    if (container)
+                        container->deleteDataInstance(pData);
+                    else
+                        CV_LOG_ERROR(NULL, "TLS: container for slotIdx=" << slotIdx << " is NULL. Can't release thread data");
+                }
+                delete pTD;
+                return;
             }
         }
-        tls.SetData(0);
-        delete pTD;
+        CV_LOG_WARNING(NULL, "TLS: Can't release thread TLS data (unknown pointer or data race): " << (void*)pTD);
     }
 
     // Reserve TLS storage index
-    size_t reserveSlot()
+    size_t reserveSlot(TLSDataContainer* container)
     {
         AutoLock guard(mtxGlobalAccess);
         CV_Assert(tlsSlotsSize == tlsSlots.size());
@@ -1522,15 +1532,15 @@ public:
         // Find unused slots
         for(size_t slot = 0; slot < tlsSlotsSize; slot++)
         {
-            if(!tlsSlots[slot])
+            if (tlsSlots[slot].container == NULL)
             {
-                tlsSlots[slot] = 1;
+                tlsSlots[slot].container = container;
                 return slot;
             }
         }
 
         // Create new slot
-        tlsSlots.push_back(1); tlsSlotsSize++;
+        tlsSlots.push_back(TlsSlotInfo(container)); tlsSlotsSize++;
         return tlsSlotsSize - 1;
     }
 
@@ -1555,7 +1565,9 @@ public:
         }
 
         if (!keepSlot)
-            tlsSlots[slotIdx] = 0;
+        {
+            tlsSlots[slotIdx].container = NULL;  // mark slot as free (see reserveSlot() implementation)
+        }
     }
 
     // Get data by TLS storage index
@@ -1604,8 +1616,26 @@ public:
             tls.SetData((void*)threadData);
             {
                 AutoLock guard(mtxGlobalAccess);
-                threadData->idx = threads.size();
-                threads.push_back(threadData);
+
+                bool found = false;
+                // Find unused slots
+                for(size_t slot = 0; slot < threads.size(); slot++)
+                {
+                    if (threads[slot] == NULL)
+                    {
+                        threadData->idx = (int)slot;
+                        threads[slot] = threadData;
+                        found = true;
+                        break;
+                    }
+                }
+
+                if (!found)
+                {
+                    // Create new slot
+                    threadData->idx = threads.size();
+                    threads.push_back(threadData);
+                }
             }
         }
 
@@ -1622,8 +1652,14 @@ private:
 
     Mutex  mtxGlobalAccess;           // Shared objects operation guard
     size_t tlsSlotsSize;              // equal to tlsSlots.size() in synchronized sections
-                                      // without synchronization this counter doesn't desrease - it is used for slotIdx sanity checks
-    std::vector<int> tlsSlots;        // TLS keys state
+                                      // without synchronization this counter doesn't decrease - it is used for slotIdx sanity checks
+
+    struct TlsSlotInfo
+    {
+        TlsSlotInfo(TLSDataContainer* _container) : container(_container) {}
+        TLSDataContainer* container;  // attached container (to dispose data of terminated threads)
+    };
+    std::vector<struct TlsSlotInfo> tlsSlots;  // TLS keys state
     std::vector<ThreadData*> threads; // Array for all allocated data. Thread data pointers are placed here to allow data cleanup
 };
 
@@ -1633,9 +1669,19 @@ static TlsStorage &getTlsStorage()
     CV_SINGLETON_LAZY_INIT_REF(TlsStorage, new TlsStorage())
 }
 
+#ifndef _WIN32  // pthread key destructor
+static void opencv_tls_destructor(void* pData)
+{
+    getTlsStorage().releaseThread(pData);
+}
+#endif
+
+} // namespace details
+using namespace details;
+
 TLSDataContainer::TLSDataContainer()
 {
-    key_ = (int)getTlsStorage().reserveSlot(); // Reserve key from TLS storage
+    key_ = (int)getTlsStorage().reserveSlot(this); // Reserve key from TLS storage
 }
 
 TLSDataContainer::~TLSDataContainer()
@@ -1648,11 +1694,17 @@ void TLSDataContainer::gatherData(std::vector<void*> &data) const
     getTlsStorage().gather(key_, data);
 }
 
+void TLSDataContainer::detachData(std::vector<void*> &data)
+{
+    getTlsStorage().releaseSlot(key_, data, true);
+}
+
 void TLSDataContainer::release()
 {
-    std::vector<void*> data;
-    data.reserve(32);
-    getTlsStorage().releaseSlot(key_, data); // Release key and get stored data for proper destruction
+    if (key_ == -1)
+        return;  // already released
+    std::vector<void*> data; data.reserve(32);
+    getTlsStorage().releaseSlot(key_, data, false); // Release key and get stored data for proper destruction
     key_ = -1;
     for(size_t i = 0; i < data.size(); i++)  // Delete all associated data
         deleteDataInstance(data[i]);
@@ -1660,8 +1712,7 @@ void TLSDataContainer::release()
 
 void TLSDataContainer::cleanup()
 {
-    std::vector<void*> data;
-    data.reserve(32);
+    std::vector<void*> data; data.reserve(32);
     getTlsStorage().releaseSlot(key_, data, true); // Extract stored data with removal from TLS tables
     for(size_t i = 0; i < data.size(); i++)  // Delete all associated data
         deleteDataInstance(data[i]);
@@ -1680,11 +1731,16 @@ void* TLSDataContainer::getData() const
     return pData;
 }
 
-TLSData<CoreTLSData>& getCoreTlsData()
+static TLSData<CoreTLSData>& getCoreTlsDataTLS()
 {
     CV_SINGLETON_LAZY_INIT_REF(TLSData<CoreTLSData>, new TLSData<CoreTLSData>())
 }
 
+CoreTLSData& getCoreTlsData()
+{
+    return getCoreTlsDataTLS().getRef();
+}
+
 #if defined CVAPI_EXPORTS && defined _WIN32 && !defined WINCE
 #ifdef WINRT
     #pragma warning(disable:4447) // Disable warning 'main' signature found without threading model
@@ -2338,12 +2394,12 @@ String getIppVersion()
 bool useIPP()
 {
 #ifdef HAVE_IPP
-    CoreTLSData* data = getCoreTlsData().get();
-    if(data->useIPP < 0)
+    CoreTLSData& data = getCoreTlsData();
+    if (data.useIPP < 0)
     {
-        data->useIPP = getIPPSingleton().useIPP;
+        data.useIPP = getIPPSingleton().useIPP;
     }
-    return (data->useIPP > 0);
+    return (data.useIPP > 0);
 #else
     return false;
 #endif
@@ -2351,24 +2407,24 @@ bool useIPP()
 
 void setUseIPP(bool flag)
 {
-    CoreTLSData* data = getCoreTlsData().get();
+    CoreTLSData& data = getCoreTlsData();
 #ifdef HAVE_IPP
-    data->useIPP = (getIPPSingleton().useIPP)?flag:false;
+    data.useIPP = (getIPPSingleton().useIPP)?flag:false;
 #else
     CV_UNUSED(flag);
-    data->useIPP = false;
+    data.useIPP = false;
 #endif
 }
 
 bool useIPP_NotExact()
 {
 #ifdef HAVE_IPP
-    CoreTLSData* data = getCoreTlsData().get();
-    if(data->useIPP_NE < 0)
+    CoreTLSData& data = getCoreTlsData();
+    if (data.useIPP_NE < 0)
     {
-        data->useIPP_NE = getIPPSingleton().useIPP_NE;
+        data.useIPP_NE = getIPPSingleton().useIPP_NE;
     }
-    return (data->useIPP_NE > 0);
+    return (data.useIPP_NE > 0);
 #else
     return false;
 #endif
@@ -2376,12 +2432,12 @@ bool useIPP_NotExact()
 
 void setUseIPP_NotExact(bool flag)
 {
-    CoreTLSData* data = getCoreTlsData().get();
+    CoreTLSData& data = getCoreTlsData();
 #ifdef HAVE_IPP
-    data->useIPP_NE = flag;
+    data.useIPP_NE = flag;
 #else
     CV_UNUSED(flag);
-    data->useIPP_NE = false;
+    data.useIPP_NE = false;
 #endif
 }
 
@@ -2407,7 +2463,7 @@ namespace tegra {
 
 bool useTegra()
 {
-    cv::CoreTLSData* data = cv::getCoreTlsData().get();
+    cv::CoreTLSData* data = &cv::getCoreTlsData();  // getCoreTlsData() returns CoreTLSData&; take its address
 
     if (data->useTegra < 0)
     {
@@ -2423,7 +2479,7 @@ bool useTegra()
 
 void setUseTegra(bool flag)
 {
-    cv::CoreTLSData* data = cv::getCoreTlsData().get();
+    cv::CoreTLSData* data = &cv::getCoreTlsData();  // getCoreTlsData() returns CoreTLSData&; take its address
     data->useTegra = flag;
 }
 
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp
index c31d9e1bdd..58f1f909d5 100644
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -43,6 +43,8 @@
 #include "opencl_kernels_core.hpp"
 #include "umatrix.hpp"
 
+#include <opencv2/core/utils/tls.hpp>
+
 ///////////////////////////////// UMat implementation ///////////////////////////////
 
 namespace cv {
diff --git a/modules/core/test/test_utils.cpp b/modules/core/test/test_utils.cpp
index c566762925..2bae77892b 100644
--- a/modules/core/test/test_utils.cpp
+++ b/modules/core/test/test_utils.cpp
@@ -4,6 +4,8 @@
 #include "test_precomp.hpp"
 #include "opencv2/core/utils/logger.hpp"
 
+#include "test_utils_tls.impl.hpp"
+
 namespace opencv_test { namespace {
 
 static const char * const keys =
diff --git a/modules/core/test/test_utils_tls.impl.hpp b/modules/core/test/test_utils_tls.impl.hpp
new file mode 100644
index 0000000000..36b8805422
--- /dev/null
+++ b/modules/core/test/test_utils_tls.impl.hpp
@@ -0,0 +1,134 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This is .hpp file included from test_utils.cpp
+
+#ifdef CV_CXX11
+#include <thread>  // std::thread
+#endif
+
+#include "opencv2/core/utils/tls.hpp"
+
+namespace opencv_test { namespace {
+
+class TLSReporter  // test helper: counts its own constructions and live instances through two static counters
+{
+public:
+    static int g_last_id;  // bumped by 1 in every constructor call, never decremented
+    static int g_allocated;  // +1 in the constructor, -1 in the destructor
+
+    int id;  // assigned from the CV_XADD(&g_last_id, 1) call in the constructor
+
+    TLSReporter()
+    {
+        id = CV_XADD(&g_last_id, 1);
+        CV_XADD(&g_allocated, 1);
+    }
+    ~TLSReporter()
+    {
+        CV_XADD(&g_allocated, -1);
+    }
+};
+
+int TLSReporter::g_last_id = 0;
+int TLSReporter::g_allocated = 0;
+
+#ifdef CV_CXX11
+
+template<typename T>
+static void callNThreadsWithTLS(int N, TLSData<T>& tls)  // starts N threads that each call tls.get() once, then joins all of them
+{
+    std::vector<std::thread> threads(N);
+    for (int i = 0; i < N; i++)
+    {
+        threads[i] = std::thread([&tls]() {
+            T* pData = tls.get();  // use T (not a hard-coded TLSReporter) so the helper matches its template signature
+            (void)pData;
+        });
+    }
+    for (int i = 0; i < N; i++)
+    {
+        threads[i].join();  // all worker threads have finished before this helper returns
+    }
+    threads.clear();
+}
+
+TEST(Core_TLS, HandleThreadTermination)
+{
+    const int init_id = TLSReporter::g_last_id;
+    const int init_allocated = TLSReporter::g_allocated;
+    // baseline counter values: the expectations below are relative to them
+    const int N = 4;
+    TLSData<TLSReporter> tls;
+
+    // use TLS
+    ASSERT_NO_THROW(callNThreadsWithTLS(N, tls));
+
+    EXPECT_EQ(init_id + N, TLSReporter::g_last_id);  // expects exactly N constructions
+    EXPECT_EQ(init_allocated + 0, TLSReporter::g_allocated);  // expects no instance left alive after the threads were joined
+}
+
+
+static void testTLSAccumulator(bool detachFirst)  // detachFirst: true -> detachData() before gather(); false -> gather() before detachData()
+{
+    const int init_id = TLSReporter::g_last_id;
+    const int init_allocated = TLSReporter::g_allocated;
+    // baseline counter values: the expectations below are relative to them
+    const int N = 4;
+    TLSDataAccumulator<TLSReporter> tls;
+
+    {  // empty TLS checks
+        std::vector<TLSReporter*>& data0 = tls.detachData();
+        EXPECT_EQ((size_t)0, data0.size());
+        tls.cleanupDetachedData();
+    }
+
+    // use TLS
+    ASSERT_NO_THROW(callNThreadsWithTLS(N, tls));
+
+    EXPECT_EQ(init_id + N, TLSReporter::g_last_id);  // expects exactly N constructions
+    EXPECT_EQ(init_allocated + N, TLSReporter::g_allocated);  // expects all N instances to still be alive after the threads were joined
+
+    if (detachFirst)
+    {
+        std::vector<TLSReporter*>& data1 = tls.detachData();
+        EXPECT_EQ((size_t)N, data1.size());
+
+        // no data through gather after detachData()
+        std::vector<TLSReporter*> data2;
+        tls.gather(data2);
+        EXPECT_EQ((size_t)0, data2.size());
+
+        tls.cleanupDetachedData();
+        // cleanup is expected to destroy the N instances and to empty the vector that data1 refers to
+        EXPECT_EQ(init_id + N, TLSReporter::g_last_id);
+        EXPECT_EQ(init_allocated + 0, TLSReporter::g_allocated);
+        EXPECT_EQ((size_t)0, data1.size());
+    }
+    else
+    {
+        std::vector<TLSReporter*> data2;
+        tls.gather(data2);
+        EXPECT_EQ((size_t)N, data2.size());
+        // gather() is expected not to remove anything: detachData() below still reports N entries
+        std::vector<TLSReporter*>& data1 = tls.detachData();
+        EXPECT_EQ((size_t)N, data1.size());
+
+        tls.cleanupDetachedData();
+
+        EXPECT_EQ((size_t)0, data1.size());
+        // data2 is not empty, but it has invalid contents
+        EXPECT_EQ((size_t)N, data2.size());
+    }
+
+    EXPECT_EQ(init_id + N, TLSReporter::g_last_id);  // both branches: no further constructions
+    EXPECT_EQ(init_allocated + 0, TLSReporter::g_allocated);  // both branches: every instance has been destroyed
+}
+
+TEST(Core_TLS, AccumulatorHoldData_detachData) { testTLSAccumulator(true); }
+TEST(Core_TLS, AccumulatorHoldData_gather) { testTLSAccumulator(false); }
+
+#endif
+
+}}  // namespace
diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp
index d4ff218f13..64a5c61afe 100644
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -45,6 +45,8 @@
 
 #include "opencv2/core/openvx/ovx_defs.hpp"
 
+#include "opencv2/core/utils/tls.hpp"
+
 namespace cv
 {
 

From 6ec5ae0215390716d7397a9c583bff68f97b4294 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Tue, 22 Oct 2019 16:59:59 +0300
Subject: [PATCH 3/6] core(trace): add ITT control parameter

- OPENCV_TRACE_ITT_ENABLE
---
 modules/core/src/trace.cpp | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/modules/core/src/trace.cpp b/modules/core/src/trace.cpp
index e1c58bb79b..4245ab2080 100644
--- a/modules/core/src/trace.cpp
+++ b/modules/core/src/trace.cpp
@@ -196,14 +196,27 @@ static __itt_domain* domain = NULL;
 
 static bool isITTEnabled()
 {
-    static bool isInitialized = false;
+    static volatile bool isInitialized = false;  // NOTE(review): volatile alone gives no C++ inter-thread ordering guarantee - confirm this is sufficient for the supported compilers
     static bool isEnabled = false;
     if (!isInitialized)
     {
-        isEnabled = !!(__itt_api_version());
-        CV_LOG_ITT("ITT is " << (isEnabled ? "enabled" : "disabled"));
-        domain = __itt_domain_create("OpenCVTrace");
-        isInitialized = true;
+        cv::AutoLock lock(cv::getInitializationMutex());  // hold the initialization mutex for the rest of this scope
+        if (!isInitialized)  // second test, now under the lock
+        {
+            bool param_traceITTEnable = utils::getConfigurationParameterBool("OPENCV_TRACE_ITT_ENABLE", true);  // true is presumably the default when the parameter is unset - verify
+            if (param_traceITTEnable)
+            {
+                isEnabled = !!(__itt_api_version());
+                CV_LOG_ITT("ITT is " << (isEnabled ? "enabled" : "disabled"));
+                domain = __itt_domain_create("OpenCVTrace");  // the domain is only created when the parameter allows ITT
+            }
+            else
+            {
+                CV_LOG_ITT("ITT is disabled through OpenCV parameter");
+                isEnabled = false;
+            }
+            isInitialized = true;  // set last, after isEnabled (and domain, if any) have been assigned
+        }
     }
     return isEnabled;
 }

From 86a8ff61293ca5e8d3b5a28cb623b4ca36aab74a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Lippok?= <lippok@live.com>
Date: Sun, 27 Oct 2019 17:43:31 +0100
Subject: [PATCH 4/6] Fixed typo in assertion

---
 modules/calib3d/src/fisheye.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/calib3d/src/fisheye.cpp b/modules/calib3d/src/fisheye.cpp
index 5fe6b2a08d..56fd82114d 100644
--- a/modules/calib3d/src/fisheye.cpp
+++ b/modules/calib3d/src/fisheye.cpp
@@ -857,8 +857,8 @@ double cv::fisheye::stereoCalibrate(InputArrayOfArrays objectPoints, InputArrayO
 
     CV_Assert(K1.empty() || (K1.size() == Size(3,3)));
     CV_Assert(D1.empty() || (D1.total() == 4));
-    CV_Assert(K2.empty() || (K1.size() == Size(3,3)));
-    CV_Assert(D2.empty() || (D1.total() == 4));
+    CV_Assert(K2.empty() || (K2.size() == Size(3,3)));
+    CV_Assert(D2.empty() || (D2.total() == 4));
 
     CV_Assert((!K1.empty() && !K2.empty() && !D1.empty() && !D2.empty()) || !(flags & CALIB_FIX_INTRINSIC));
 

From 80c4cedd255c4eec6556f3ab041c14f2aeb3313f Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 27 Oct 2019 11:14:17 +0000
Subject: [PATCH 5/6] android: use .getRowStride() in JavaCamera2View

---
 modules/core/misc/java/src/java/core+Mat.java | 12 +++++
 modules/core/misc/java/test/MatTest.java      | 18 ++++++++
 .../org/opencv/android/JavaCamera2View.java   | 46 +++++++++++++------
 modules/java/generator/src/cpp/Mat.cpp        | 28 ++++++++++-
 4 files changed, 90 insertions(+), 14 deletions(-)

diff --git a/modules/core/misc/java/src/java/core+Mat.java b/modules/core/misc/java/src/java/core+Mat.java
index 5ebb5ee145..641d9f8ae8 100644
--- a/modules/core/misc/java/src/java/core+Mat.java
+++ b/modules/core/misc/java/src/java/core+Mat.java
@@ -41,6 +41,15 @@ public class Mat {
         nativeObj = n_Mat(rows, cols, type, data);
     }
 
+    //
+    // C++: Mat::Mat(int rows, int cols, int type, void* data, size_t step)
+    //
+
+    // javadoc: Mat::Mat(rows, cols, type, data, step)
+    public Mat(int rows, int cols, int type, ByteBuffer data, long step) {
+        nativeObj = n_Mat(rows, cols, type, data, step);  // native side resolves data via GetDirectBufferAddress and passes step on as size_t
+    }
+
     //
     // C++: Mat::Mat(Size size, int type)
     //
@@ -1136,6 +1145,9 @@ public class Mat {
     // C++: Mat::Mat(int rows, int cols, int type, void* data)
     private static native long n_Mat(int rows, int cols, int type, ByteBuffer data);
 
+    // C++: Mat::Mat(int rows, int cols, int type, void* data, size_t step)
+    private static native long n_Mat(int rows, int cols, int type, ByteBuffer data, long step);
+
     // C++: Mat::Mat(Size size, int type)
     private static native long n_Mat(double size_width, double size_height, int type);
 
diff --git a/modules/core/misc/java/test/MatTest.java b/modules/core/misc/java/test/MatTest.java
index cdd7950843..039aa39929 100644
--- a/modules/core/misc/java/test/MatTest.java
+++ b/modules/core/misc/java/test/MatTest.java
@@ -1246,4 +1246,22 @@ public class MatTest extends OpenCVTestCase {
         assertEquals(1, bbuf.get(4095));
     }
 
+    public void testMatFromByteBufferWithStep() {
+        ByteBuffer bbuf = ByteBuffer.allocateDirect(80*64);  // 64 rows with a step of 80 bytes each
+        bbuf.putInt(0x01010101);  // bytes 0..3: first four elements of row 0
+        bbuf.putInt(64, 0x02020202);  // bytes 64..67: beyond the 64 columns of row 0, still inside its 80-byte step
+        bbuf.putInt(80, 0x03030303);  // bytes 80..83: first four elements of row 1
+        Mat m = new Mat(64, 64, CvType.CV_8UC1, bbuf, 80);
+        assertEquals(8, Core.countNonZero(m));  // 4 + 4: the bytes written at 64..67 are not part of the 64x64 view
+        Core.add(m, new Scalar(5), m);
+        assertEquals(4096, Core.countNonZero(m));  // 64*64 elements, all non-zero after adding 5
+        m.release();
+        assertEquals(6, bbuf.get(0));  // 1 + 5: the add wrote through to the buffer
+        assertEquals(5, bbuf.get(63));  // 0 + 5
+        assertEquals(2, bbuf.get(64));  // bytes between column 64 and the step keep their value
+        assertEquals(0, bbuf.get(79));  // bytes between column 64 and the step keep their value
+        assertEquals(8, bbuf.get(80));  // 3 + 5
+        assertEquals(5, bbuf.get(63*80 + 63));  // last element of the last row
+    }
+
 }
diff --git a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java
index e4a58539b5..09e01b01fc 100644
--- a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java
+++ b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java
@@ -332,8 +332,10 @@ public class JavaCamera2View extends CameraBridgeViewBase {
             Image.Plane[] planes = mImage.getPlanes();
             int w = mImage.getWidth();
             int h = mImage.getHeight();
+            assert(planes[0].getPixelStride() == 1);
             ByteBuffer y_plane = planes[0].getBuffer();
-            mGray = new Mat(h, w, CvType.CV_8UC1, y_plane);
+            int y_plane_step = planes[0].getRowStride();
+            mGray = new Mat(h, w, CvType.CV_8UC1, y_plane, y_plane_step);
             return mGray;
         }
 
@@ -349,11 +351,14 @@ public class JavaCamera2View extends CameraBridgeViewBase {
                 assert(planes[0].getPixelStride() == 1);
                 assert(planes[2].getPixelStride() == 2);
                 ByteBuffer y_plane = planes[0].getBuffer();
+                int y_plane_step = planes[0].getRowStride();
                 ByteBuffer uv_plane1 = planes[1].getBuffer();
+                int uv_plane1_step = planes[1].getRowStride();
                 ByteBuffer uv_plane2 = planes[2].getBuffer();
-                Mat y_mat = new Mat(h, w, CvType.CV_8UC1, y_plane);
-                Mat uv_mat1 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane1);
-                Mat uv_mat2 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane2);
+                int uv_plane2_step = planes[2].getRowStride();
+                Mat y_mat = new Mat(h, w, CvType.CV_8UC1, y_plane, y_plane_step);
+                Mat uv_mat1 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane1, uv_plane1_step);
+                Mat uv_mat2 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane2, uv_plane2_step);
                 long addr_diff = uv_mat2.dataAddr() - uv_mat1.dataAddr();
                 if (addr_diff > 0) {
                     assert(addr_diff == 1);
@@ -369,30 +374,45 @@ public class JavaCamera2View extends CameraBridgeViewBase {
                 ByteBuffer u_plane = planes[1].getBuffer();
                 ByteBuffer v_plane = planes[2].getBuffer();
 
-                y_plane.get(yuv_bytes, 0, w*h);
+                int yuv_bytes_offset = 0;
+
+                int y_plane_step = planes[0].getRowStride();
+                if (y_plane_step == w) {
+                    y_plane.get(yuv_bytes, 0, w*h);
+                    yuv_bytes_offset = w*h;
+                } else {
+                    int padding = y_plane_step - w;
+                    for (int i = 0; i < h; i++){
+                        y_plane.get(yuv_bytes, yuv_bytes_offset, w);
+                        yuv_bytes_offset += w;
+                        if (i < h - 1) {
+                            y_plane.position(y_plane.position() + padding);
+                        }
+                    }
+                    assert(yuv_bytes_offset == w * h);
+                }
 
                 int chromaRowStride = planes[1].getRowStride();
                 int chromaRowPadding = chromaRowStride - w/2;
 
-                int offset = w*h;
                 if (chromaRowPadding == 0){
                     // When the row stride of the chroma channels equals their width, we can copy
                     // the entire channels in one go
-                    u_plane.get(yuv_bytes, offset, w*h/4);
-                    offset += w*h/4;
-                    v_plane.get(yuv_bytes, offset, w*h/4);
+                    u_plane.get(yuv_bytes, yuv_bytes_offset, w*h/4);
+                    yuv_bytes_offset += w*h/4;
+                    v_plane.get(yuv_bytes, yuv_bytes_offset, w*h/4);
                 } else {
                     // When not equal, we need to copy the channels row by row
                     for (int i = 0; i < h/2; i++){
-                        u_plane.get(yuv_bytes, offset, w/2);
-                        offset += w/2;
+                        u_plane.get(yuv_bytes, yuv_bytes_offset, w/2);
+                        yuv_bytes_offset += w/2;
                         if (i < h/2-1){
                             u_plane.position(u_plane.position() + chromaRowPadding);
                         }
                     }
                     for (int i = 0; i < h/2; i++){
-                        v_plane.get(yuv_bytes, offset, w/2);
-                        offset += w/2;
+                        v_plane.get(yuv_bytes, yuv_bytes_offset, w/2);
+                        yuv_bytes_offset += w/2;
                         if (i < h/2-1){
                             v_plane.position(v_plane.position() + chromaRowPadding);
                         }
diff --git a/modules/java/generator/src/cpp/Mat.cpp b/modules/java/generator/src/cpp/Mat.cpp
index 1ae2aa6e8c..5203413ae4 100644
--- a/modules/java/generator/src/cpp/Mat.cpp
+++ b/modules/java/generator/src/cpp/Mat.cpp
@@ -74,7 +74,7 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer
 JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer_2
   (JNIEnv* env, jclass, jint rows, jint cols, jint type, jobject data)
 {
-    static const char method_name[] = "Mat::n_1Mat__IIILByteBuffer()";
+    static const char method_name[] = "Mat::n_1Mat__IIILjava_nio_ByteBuffer_2()";
     try {
         LOGD("%s", method_name);
         return (jlong) new Mat( rows, cols, type, (void*)env->GetDirectBufferAddress(data) );
@@ -88,6 +88,32 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer
 }
 
 
+/*
+ * Class:     org_opencv_core_Mat
+ * Method:    n_Mat
+ * Signature: (IIILjava/nio/ByteBuffer;J)J
+ *
+ * Mat::Mat(int rows, int cols, int type, void* data, size_t step)
+ */
+JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer_2J
+  (JNIEnv* env, jclass, jint rows, jint cols, jint type, jobject data, jlong step);
+// Creates a Mat from a ByteBuffer plus an explicit step; returns the new Mat pointer as jlong, or 0 after an exception.
+JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer_2J
+  (JNIEnv* env, jclass, jint rows, jint cols, jint type, jobject data, jlong step)
+{
+    static const char method_name[] = "Mat::n_1Mat__IIILjava_nio_ByteBuffer_2J()";  // name used for logging and exception reporting
+    try {
+        LOGD("%s", method_name);
+        return (jlong) new Mat(rows, cols, type, (void*)env->GetDirectBufferAddress(data), (size_t)step);  // NOTE(review): the GetDirectBufferAddress result is not checked before use - confirm callers always pass a direct buffer
+    } catch(const std::exception &e) {
+        throwJavaException(env, &e, method_name);
+    } catch (...) {
+        throwJavaException(env, 0, method_name);
+    }
+
+    return 0;  // reached only when one of the catch blocks above has run
+}
+
 
 //
 //   Mat::Mat(int rows, int cols, int type)

From a71ff501301c05e41526c9a8fe69ed05ac953e1e Mon Sep 17 00:00:00 2001
From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com>
Date: Tue, 29 Oct 2019 06:42:20 -0400
Subject: [PATCH 6/6] Merge pull request #15623 from
 ChipKerchner:optimizeHOGpipeline

* Use circular lut history buffer in computeGradient of HOG

* Initialize prefetch data outside main loop.  Avoid code duplication.
---
 modules/objdetect/src/hog.cpp | 132 +++++++++++++++++++++++++++++-----
 1 file changed, 115 insertions(+), 17 deletions(-)

diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp
index 378bab3087..e98b9c2e23 100644
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -299,6 +299,11 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
     Mat Dy(1, width, CV_32F, dbuf + width);
     Mat Mag(1, width, CV_32F, dbuf + width*2);
     Mat Angle(1, width, CV_32F, dbuf + width*3);
+#if CV_SIMD128
+    int widthP2 = width+2;
+    AutoBuffer<float> _lutBuf(9*widthP2);
+    float* const lutBuf = _lutBuf.data();
+#endif
 
     if (cn == 3)
     {
@@ -317,6 +322,63 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
         xmap += 1;
     }
 
+#if CV_SIMD128
+    typedef const uchar* const T;
+    float *lutPrev, *lutCurr, *lutNext;
+    {
+        y = 0;
+        const uchar* imgPtr  = img.ptr(ymap[y]);
+        const uchar* prevPtr = img.data + img.step*ymap[y-1];
+
+        lutPrev = lutBuf+widthP2*0;
+        lutCurr = lutBuf+widthP2*3;
+
+        {
+            int x0 = xmap[-1], x1 = xmap[0];
+            T p02 = imgPtr + x0, p12 = imgPtr + x1;
+
+            lutPrev[0+widthP2*0] = lut[prevPtr[x0+0]];
+            lutPrev[0+widthP2*1] = lut[prevPtr[x0+1]];
+            lutPrev[0+widthP2*2] = lut[prevPtr[x0+2]];
+            lutCurr[0+widthP2*0] = lut[p02[0]]; lutCurr[1+widthP2*0] = lut[p12[0]];
+            lutCurr[0+widthP2*1] = lut[p02[1]]; lutCurr[1+widthP2*1] = lut[p12[1]];
+            lutCurr[0+widthP2*2] = lut[p02[2]]; lutCurr[1+widthP2*2] = lut[p12[2]];
+        }
+
+        for( x = 0; x <= width - 4; x += 4 )
+        {
+            int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
+            T p02 = imgPtr + xmap[x+1];
+            T p12 = imgPtr + xmap[x+2];
+            T p22 = imgPtr + xmap[x+3];
+            T p32 = imgPtr + xmap[x+4];
+
+            v_float32x4 _dx00 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]);
+            v_float32x4 _dx10 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]);
+            v_float32x4 _dx20 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]);
+
+            v_store(lutCurr+x+widthP2*0+2, _dx00);
+            v_store(lutCurr+x+widthP2*1+2, _dx10);
+            v_store(lutCurr+x+widthP2*2+2, _dx20);
+
+            v_float32x4 _dy00 = v_float32x4(lut[prevPtr[x0+0]], lut[prevPtr[x1+0]], lut[prevPtr[x2+0]], lut[prevPtr[x3+0]]);
+            v_float32x4 _dy10 = v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]);
+            v_float32x4 _dy20 = v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]);
+
+            v_store(lutPrev+x+widthP2*0+1, _dy00);
+            v_store(lutPrev+x+widthP2*1+1, _dy10);
+            v_store(lutPrev+x+widthP2*2+1, _dy20);
+        }
+        {
+            int x0 = xmap[x];
+
+            lutPrev[x+widthP2*0+1] = lut[prevPtr[x0+0]];
+            lutPrev[x+widthP2*1+1] = lut[prevPtr[x0+1]];
+            lutPrev[x+widthP2*2+1] = lut[prevPtr[x0+2]];
+        }
+    }
+#endif
+
     float angleScale = signedGradient ? (float)(nbins/(2.0*CV_PI)) : (float)(nbins/CV_PI);
     for( y = 0; y < gradsize.height; y++ )
     {
@@ -342,28 +404,57 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
         {
             x = 0;
 #if CV_SIMD128
+            int yMod = y%3;
+
+            // Circular lut history buffer
+            if (yMod == 0)
+            {
+                lutPrev = lutBuf+widthP2*0;
+                lutCurr = lutBuf+widthP2*3;
+                lutNext = lutBuf+widthP2*6;
+            }
+            else if (yMod == 1)
+            {
+                lutPrev = lutBuf+widthP2*3;
+                lutCurr = lutBuf+widthP2*6;
+                lutNext = lutBuf+widthP2*0;
+            }
+            else
+            {
+                lutPrev = lutBuf+widthP2*6;
+                lutCurr = lutBuf+widthP2*0;
+                lutNext = lutBuf+widthP2*3;
+            }
+
+            {
+                int x0 = xmap[-1];
+
+                lutNext[0+widthP2*0] = lut[nextPtr[x0+0]];
+                lutNext[0+widthP2*1] = lut[nextPtr[x0+1]];
+                lutNext[0+widthP2*2] = lut[nextPtr[x0+2]];
+            }
             for( ; x <= width - 4; x += 4 )
             {
                 int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
-                typedef const uchar* const T;
-                T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1];
-                T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x];
-                T p22 = imgPtr + xmap[x+3], p20 = p02;
-                T p32 = imgPtr + xmap[x+4], p30 = p12;
 
-                v_float32x4 _dx0 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]) -
-                                   v_float32x4(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]);
-                v_float32x4 _dx1 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]) -
-                                   v_float32x4(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]);
-                v_float32x4 _dx2 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]) -
-                                   v_float32x4(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]);
+                v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0);
+                v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1);
+                v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2);
 
-                v_float32x4 _dy0 = v_float32x4(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]) -
-                                   v_float32x4(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]);
-                v_float32x4 _dy1 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]) -
-                                   v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]);
-                v_float32x4 _dy2 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]) -
-                                   v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]);
+                v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]);
+                v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1);
+
+                v_store(lutNext+x+widthP2*0+1, _dy00);
+
+                v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]);
+                v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1);
+
+                v_store(lutNext+x+widthP2*1+1, _dy10);
+
+                v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]);
+                v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1);
+
+                v_store(lutNext+x+widthP2*2+1, _dy20);
 
                 v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
                 v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
@@ -380,6 +471,13 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
                 v_store(dbuf + x, _dx2);
                 v_store(dbuf + x + width, _dy2);
             }
+            {
+                int x0 = xmap[x];
+
+                lutNext[x+widthP2*0+1] = lut[nextPtr[x0+0]];
+                lutNext[x+widthP2*1+1] = lut[nextPtr[x0+1]];
+                lutNext[x+widthP2*2+1] = lut[nextPtr[x0+2]];
+            }
 #endif
             for( ; x < width; x++ )
             {