From c46f119e0e865af442a3551d40fa4cdd7ea29123 Mon Sep 17 00:00:00 2001 From: ChipKerchner Date: Wed, 23 Oct 2019 10:47:07 -0500 Subject: [PATCH 1/6] Convert demosaic functions to HAL --- modules/imgproc/src/demosaicing.cpp | 591 +++++++++++++++------------- 1 file changed, 318 insertions(+), 273 deletions(-) diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index a14b6d7905..3062023ea7 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -86,6 +86,7 @@ #include "precomp.hpp" +#include "opencv2/core/hal/intrin.hpp" #include @@ -111,7 +112,7 @@ public: return 0; } - int bayer2RGBA(const T*, int, T*, int, int) const + int bayer2RGBA(const T*, int, T*, int, int, const T) const { return 0; } @@ -122,279 +123,14 @@ public: } }; -#if CV_SSE2 +#if CV_SIMD128 class SIMDBayerInterpolator_8u { public: - SIMDBayerInterpolator_8u() - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } - int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst, int width, int bcoeff, int gcoeff, int rcoeff) const { - if( !use_simd ) - return 0; - - __m128i _b2y = _mm_set1_epi16((short)(rcoeff*2)); - __m128i _g2y = _mm_set1_epi16((short)(gcoeff*2)); - __m128i _r2y = _mm_set1_epi16((short)(bcoeff*2)); - const uchar* bayer_end = bayer + width; - - for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 ) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)bayer); - __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step)); - __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2)); - - __m128i b1 = _mm_add_epi16(_mm_srli_epi16(_mm_slli_epi16(r0, 8), 7), - _mm_srli_epi16(_mm_slli_epi16(r2, 8), 7)); - __m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2)); - b1 = _mm_slli_epi16(_mm_srli_si128(b1, 2), 1); - - __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 7), _mm_srli_epi16(r2, 7)); - __m128i g1 = _mm_srli_epi16(_mm_slli_epi16(r1, 8), 7); - g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2))); 
- g1 = _mm_slli_epi16(_mm_srli_si128(g1, 2), 2); - - r0 = _mm_srli_epi16(r1, 8); - r1 = _mm_slli_epi16(_mm_add_epi16(r0, _mm_srli_si128(r0, 2)), 2); - r0 = _mm_slli_epi16(r0, 3); - - g0 = _mm_add_epi16(_mm_mulhi_epi16(b0, _b2y), _mm_mulhi_epi16(g0, _g2y)); - g1 = _mm_add_epi16(_mm_mulhi_epi16(b1, _b2y), _mm_mulhi_epi16(g1, _g2y)); - g0 = _mm_add_epi16(g0, _mm_mulhi_epi16(r0, _r2y)); - g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(r1, _r2y)); - g0 = _mm_srli_epi16(g0, 2); - g1 = _mm_srli_epi16(g1, 2); - g0 = _mm_packus_epi16(g0, g0); - g1 = _mm_packus_epi16(g1, g1); - g0 = _mm_unpacklo_epi8(g0, g1); - _mm_storeu_si128((__m128i*)dst, g0); - } - - return (int)(bayer - (bayer_end - width)); - } - - int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const - { - if( !use_simd ) - return 0; - /* - B G B G | B G B G | B G B G | B G B G - G R G R | G R G R | G R G R | G R G R - B G B G | B G B G | B G B G | B G B G - */ - - __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2); - __m128i mask = _mm_set1_epi16(blue < 0 ? -1 : 0), z = _mm_setzero_si128(); - __m128i masklo = _mm_set1_epi16(0x00ff); - const uchar* bayer_end = bayer + width; - - for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 ) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)bayer); - __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step)); - __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2)); - - __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklo), _mm_and_si128(r2, masklo)); - __m128i nextb1 = _mm_srli_si128(b1, 2); - __m128i b0 = _mm_add_epi16(b1, nextb1); - b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1); - b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2); - // b0 b2 ... b14 b1 b3 ... 
b15 - b0 = _mm_packus_epi16(b0, b1); - - __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_srli_epi16(r2, 8)); - __m128i g1 = _mm_and_si128(r1, masklo); - g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2))); - g1 = _mm_srli_si128(g1, 2); - g0 = _mm_srli_epi16(_mm_add_epi16(g0, delta2), 2); - // g0 g2 ... g14 g1 g3 ... g15 - g0 = _mm_packus_epi16(g0, g1); - - r0 = _mm_srli_epi16(r1, 8); - r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2)); - r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1); - // r0 r2 ... r14 r1 r3 ... r15 - r0 = _mm_packus_epi16(r0, r1); - - b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask); - b0 = _mm_xor_si128(b0, b1); - r0 = _mm_xor_si128(r0, b1); - - // b1 g1 b3 g3 b5 g5... - b1 = _mm_unpackhi_epi8(b0, g0); - // b0 g0 b2 g2 b4 g4 .... - b0 = _mm_unpacklo_epi8(b0, g0); - - // r1 0 r3 0 r5 0 ... - r1 = _mm_unpackhi_epi8(r0, z); - // r0 0 r2 0 r4 0 ... - r0 = _mm_unpacklo_epi8(r0, z); - - // 0 b0 g0 r0 0 b2 g2 r2 ... - g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1); - // 0 b8 g8 r8 0 b10 g10 r10 ... - g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1); - - // b1 g1 r1 0 b3 g3 r3 0 ... - r0 = _mm_unpacklo_epi16(b1, r1); - // b9 g9 r9 0 b11 g11 r11 0 ... - r1 = _mm_unpackhi_epi16(b1, r1); - - // 0 b0 g0 r0 b1 g1 r1 0 ... - b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1); - // 0 b4 g4 r4 b5 g5 r5 0 ... - b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1); - - _mm_storel_epi64((__m128i*)(dst-1+0), b0); - _mm_storel_epi64((__m128i*)(dst-1+6*1), _mm_srli_si128(b0, 8)); - _mm_storel_epi64((__m128i*)(dst-1+6*2), b1); - _mm_storel_epi64((__m128i*)(dst-1+6*3), _mm_srli_si128(b1, 8)); - - // 0 b8 g8 r8 b9 g9 r9 0 ... - g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1); - // 0 b12 g12 r12 b13 g13 r13 0 ... 
- g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1); - - _mm_storel_epi64((__m128i*)(dst-1+6*4), g0); - _mm_storel_epi64((__m128i*)(dst-1+6*5), _mm_srli_si128(g0, 8)); - - _mm_storel_epi64((__m128i*)(dst-1+6*6), g1); - } - - return (int)(bayer - (bayer_end - width)); - } - - int bayer2RGBA(const uchar*, int, uchar*, int, int) const - { - return 0; - } - - int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const - { - if (!use_simd) - return 0; - - const uchar* bayer_end = bayer + width; - __m128i masklow = _mm_set1_epi16(0x00ff); - __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2); - __m128i full = _mm_set1_epi16(-1), z = _mm_setzero_si128(); - __m128i mask = _mm_set1_epi16(blue > 0 ? -1 : 0); - - for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42) - { - /* - B G B G | B G B G | B G B G | B G B G - G R G R | G R G R | G R G R | G R G R - B G B G | B G B G | B G B G | B G B G - */ - - __m128i r0 = _mm_loadu_si128((const __m128i*)bayer); - __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step)); - __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2)); - - __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklow), _mm_and_si128(r2, masklow)); - __m128i nextb1 = _mm_srli_si128(b1, 2); - __m128i b0 = _mm_add_epi16(b1, nextb1); - b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1); - b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2); - // b0 b2 ... b14 b1 b3 ... 
b15 - b0 = _mm_packus_epi16(b0, b1); - - // vertical sum - __m128i r0g = _mm_srli_epi16(r0, 8); - __m128i r2g = _mm_srli_epi16(r2, 8); - __m128i sumv = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(r0g, r2g), delta1), 1); - // gorizontal sum - __m128i g1 = _mm_and_si128(masklow, r1); - __m128i nextg1 = _mm_srli_si128(g1, 2); - __m128i sumg = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(g1, nextg1), delta1), 1); - - // gradients - __m128i gradv = _mm_adds_epi16(_mm_subs_epu16(r0g, r2g), _mm_subs_epu16(r2g, r0g)); - __m128i gradg = _mm_adds_epi16(_mm_subs_epu16(nextg1, g1), _mm_subs_epu16(g1, nextg1)); - __m128i gmask = _mm_cmpgt_epi16(gradg, gradv); - - __m128i g0 = _mm_add_epi16(_mm_and_si128(gmask, sumv), _mm_and_si128(sumg, _mm_xor_si128(gmask, full))); - // g0 g2 ... g14 g1 g3 ... - g0 = _mm_packus_epi16(g0, nextg1); - - r0 = _mm_srli_epi16(r1, 8); - r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2)); - r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1); - // r0 r2 ... r14 r1 r3 ... r15 - r0 = _mm_packus_epi16(r0, r1); - - b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask); - b0 = _mm_xor_si128(b0, b1); - r0 = _mm_xor_si128(r0, b1); - - // b1 g1 b3 g3 b5 g5... - b1 = _mm_unpackhi_epi8(b0, g0); - // b0 g0 b2 g2 b4 g4 .... - b0 = _mm_unpacklo_epi8(b0, g0); - - // r1 0 r3 0 r5 0 ... - r1 = _mm_unpackhi_epi8(r0, z); - // r0 0 r2 0 r4 0 ... - r0 = _mm_unpacklo_epi8(r0, z); - - // 0 b0 g0 r0 0 b2 g2 r2 ... - g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1); - // 0 b8 g8 r8 0 b10 g10 r10 ... - g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1); - - // b1 g1 r1 0 b3 g3 r3 0 ... - r0 = _mm_unpacklo_epi16(b1, r1); - // b9 g9 r9 0 b11 g11 r11 0 ... - r1 = _mm_unpackhi_epi16(b1, r1); - - // 0 b0 g0 r0 b1 g1 r1 0 ... - b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1); - // 0 b4 g4 r4 b5 g5 r5 0 ... 
- b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1); - - _mm_storel_epi64((__m128i*)(dst+0), b0); - _mm_storel_epi64((__m128i*)(dst+6*1), _mm_srli_si128(b0, 8)); - _mm_storel_epi64((__m128i*)(dst+6*2), b1); - _mm_storel_epi64((__m128i*)(dst+6*3), _mm_srli_si128(b1, 8)); - - // 0 b8 g8 r8 b9 g9 r9 0 ... - g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1); - // 0 b12 g12 r12 b13 g13 r13 0 ... - g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1); - - _mm_storel_epi64((__m128i*)(dst+6*4), g0); - _mm_storel_epi64((__m128i*)(dst+6*5), _mm_srli_si128(g0, 8)); - - _mm_storel_epi64((__m128i*)(dst+6*6), g1); - } - - return int(bayer - (bayer_end - width)); - } - - bool use_simd; -}; -#elif CV_NEON -class SIMDBayerInterpolator_8u -{ -public: - SIMDBayerInterpolator_8u() - { - } - - int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst, - int width, int bcoeff, int gcoeff, int rcoeff) const - { - /* - B G B G | B G B G | B G B G | B G B G - G R G R | G R G R | G R G R | G R G R - B G B G | B G B G | B G B G | B G B G - */ - +#if CV_NEON uint16x8_t masklo = vdupq_n_u16(255); const uchar* bayer_end = bayer + width; @@ -440,6 +176,40 @@ public: vst1_u8(dst, p.val[0]); vst1_u8(dst + 8, p.val[1]); } +#else + v_uint16x8 _b2y = v_setall_u16((ushort)(rcoeff*2)); + v_uint16x8 _g2y = v_setall_u16((ushort)(gcoeff*2)); + v_uint16x8 _r2y = v_setall_u16((ushort)(bcoeff*2)); + const uchar* bayer_end = bayer + width; + + for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 ) + { + v_uint16x8 r0 = v_load((ushort*)bayer); + v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step)); + v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2)); + + v_uint16x8 b1 = ((r0 << 8) >> 7) + ((r2 << 8) >> 7); + v_uint16x8 b0 = v_rotate_right<1>(b1) + b1; + b1 = v_rotate_right<1>(b1) << 1; + + v_uint16x8 g0 = (r0 >> 7) + (r2 >> 7); + v_uint16x8 g1 = (r1 << 8) >> 7; + g0 += v_rotate_right<1>(g1) + g1; + g1 = v_rotate_right<1>(g1) << 2; + + r0 = r1 >> 8; + r1 = (v_rotate_right<1>(r0) + r0) << 2; + r0 = r0 
<< 3; + + g0 = (v_mul_hi(b0, _b2y) + v_mul_hi(g0, _g2y) + v_mul_hi(r0, _r2y)) >> 2; + g1 = (v_mul_hi(b1, _b2y) + v_mul_hi(g1, _g2y) + v_mul_hi(r1, _r2y)) >> 2; + v_uint8x16 pack_lo, pack_hi; + v_zip(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g0)), + v_pack_u(v_reinterpret_as_s16(g1), v_reinterpret_as_s16(g1)), + pack_lo, pack_hi); + v_store(dst, pack_lo); + } +#endif return (int)(bayer - (bayer_end - width)); } @@ -451,6 +221,8 @@ public: G R G R | G R G R | G R G R | G R G R B G B G | B G B G | B G B G | B G B G */ + +#if CV_NEON uint16x8_t masklo = vdupq_n_u16(255); uint8x16x3_t pix; const uchar* bayer_end = bayer + width; @@ -484,21 +256,109 @@ public: vst3q_u8(dst-1, pix); } +#else + v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2); + v_uint16x8 mask = v_setall_u16(blue < 0 ? (ushort)(-1) : 0); + v_uint16x8 masklo = v_setall_u16(0x00ff); + v_uint8x16 z = v_setzero_u8(); + const uchar* bayer_end = bayer + width; + + for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 ) + { + v_uint16x8 r0 = v_load((ushort*)bayer); + v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step)); + v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2)); + + v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo); + v_uint16x8 nextb1 = v_rotate_right<1>(b1); + v_uint16x8 b0 = b1 + nextb1; + b1 = (nextb1 + delta1) >> 1; + b0 = (b0 + delta2) >> 2; + // b0 b2 ... b14 b1 b3 ... b15 + b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1))); + + v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8); + v_uint16x8 g1 = r1 & masklo; + g0 += v_rotate_right<1>(g1) + g1; + g1 = v_rotate_right<1>(g1); + g0 = (g0 + delta2) >> 2; + // g0 g2 ... g14 g1 g3 ... g15 + g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1))); + + r0 = r1 >> 8; + r1 = v_rotate_right<1>(r0) + r0; + r1 = (r1 + delta1) >> 1; + // r0 r2 ... r14 r1 r3 ... 
r15 + r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1))); + + b1 = (b0 ^ r0) & mask; + b0 = b0 ^ b1; + r0 = r0 ^ b1; + + // b1 g1 b3 g3 b5 g5... + v_uint8x16 pack_lo, pack_hi; + v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi); + b1 = v_reinterpret_as_u16(pack_hi); + // b0 g0 b2 g2 b4 g4 .... + b0 = v_reinterpret_as_u16(pack_lo); + + // r1 0 r3 0 r5 0 ... + v_zip(v_reinterpret_as_u8(r0), z, pack_lo, pack_hi); + r1 = v_reinterpret_as_u16(pack_hi); + // r0 0 r2 0 r4 0 ... + r0 = v_reinterpret_as_u16(pack_lo); + + // 0 b0 g0 r0 0 b2 g2 r2 ... + v_zip(b0, r0, g0, g1); + g0 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g0))); + // 0 b8 g8 r8 0 b10 g10 r10 ... + g1 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g1))); + + // b1 g1 r1 0 b3 g3 r3 0 ... + v_zip(b1, r1, r0, r1); + // b9 g9 r9 0 b11 g11 r11 0 ... + + // 0 b0 g0 r0 b1 g1 r1 0 ... + v_uint32x4 pack32_lo, pack32_hi; + v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi); + b0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo))); + // 0 b4 g4 r4 b5 g5 r5 0 ... + b1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi))); + + v_store_low(dst-1+0, v_reinterpret_as_u8(b0)); + v_store_high(dst-1+6*1, v_reinterpret_as_u8(b0)); + v_store_low(dst-1+6*2, v_reinterpret_as_u8(b1)); + v_store_high(dst-1+6*3, v_reinterpret_as_u8(b1)); + + // 0 b8 g8 r8 b9 g9 r9 0 ... + v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi); + g0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo))); + // 0 b12 g12 r12 b13 g13 r13 0 ... 
+ g1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi))); + + v_store_low(dst-1+6*4, v_reinterpret_as_u8(g0)); + v_store_high(dst-1+6*5, v_reinterpret_as_u8(g0)); + + v_store_low(dst-1+6*6, v_reinterpret_as_u8(g1)); + } +#endif return (int)(bayer - (bayer_end - width)); } - int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const + int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue, const uchar alpha) const { /* B G B G | B G B G | B G B G | B G B G G R G R | G R G R | G R G R | G R G R B G B G | B G B G | B G B G | B G B G */ + +#if CV_NEON uint16x8_t masklo = vdupq_n_u16(255); uint8x16x4_t pix; const uchar* bayer_end = bayer + width; - pix.val[3] = vdupq_n_u8(255); + pix.val[3] = vdupq_n_u8(alpha); for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 ) { @@ -529,13 +389,198 @@ public: vst4q_u8(dst-1, pix); } +#else + v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2); + v_uint16x8 mask = v_setall_u16(blue < 0 ? (ushort)(-1) : 0); + v_uint16x8 masklo = v_setall_u16(0x00ff); + v_uint8x16 a = v_setall_u8(alpha); + const uchar* bayer_end = bayer + width; + + for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 ) + { + v_uint16x8 r0 = v_load((ushort*)bayer); + v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step)); + v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2)); + + v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo); + v_uint16x8 nextb1 = v_rotate_right<1>(b1); + v_uint16x8 b0 = b1 + nextb1; + b1 = (nextb1 + delta1) >> 1; + b0 = (b0 + delta2) >> 2; + // b0 b2 ... b14 b1 b3 ... b15 + b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1))); + + v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8); + v_uint16x8 g1 = r1 & masklo; + g0 += v_rotate_right<1>(g1) + g1; + g1 = v_rotate_right<1>(g1); + g0 = (g0 + delta2) >> 2; + // g0 g2 ... g14 g1 g3 ... 
g15 + g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1))); + + r0 = r1 >> 8; + r1 = v_rotate_right<1>(r0) + r0; + r1 = (r1 + delta1) >> 1; + // r0 r2 ... r14 r1 r3 ... r15 + r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1))); + + b1 = (b0 ^ r0) & mask; + b0 = b0 ^ b1; + r0 = r0 ^ b1; + + // b1 g1 b3 g3 b5 g5... + v_uint8x16 pack_lo, pack_hi; + v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi); + b1 = v_reinterpret_as_u16(pack_hi); + // b0 g0 b2 g2 b4 g4 .... + b0 = v_reinterpret_as_u16(pack_lo); + + // r1 a r3 a r5 a ... + v_zip(v_reinterpret_as_u8(r0), a, pack_lo, pack_hi); + r1 = v_reinterpret_as_u16(pack_hi); + // r0 a r2 a r4 a ... + r0 = v_reinterpret_as_u16(pack_lo); + + // a b0 g0 r0 a b2 g2 r2 ... + v_zip(b0, r0, g0, g1); + // a b8 g8 r8 a b10 g10 r10 ... + + // b1 g1 r1 a b3 g3 r3 a ... + v_zip(b1, r1, r0, r1); + // b9 g9 r9 a b11 g11 r11 a ... + + // a b0 g0 r0 b1 g1 r1 a ... + v_uint32x4 pack32_lo, pack32_hi; + v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi); + b0 = v_reinterpret_as_u16(pack32_lo); + // a b4 g4 r4 b5 g5 r5 a ... + b1 = v_reinterpret_as_u16(pack32_hi); + + v_store_low(dst-1+0, v_reinterpret_as_u8(b0)); + v_store_high(dst-1+8*1, v_reinterpret_as_u8(b0)); + v_store_low(dst-1+8*2, v_reinterpret_as_u8(b1)); + v_store_high(dst-1+8*3, v_reinterpret_as_u8(b1)); + + // a b8 g8 r8 b9 g9 r9 a ... + v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi); + g0 = v_reinterpret_as_u16(pack32_lo); + // a b12 g12 r12 b13 g13 r13 a ... 
+ g1 = v_reinterpret_as_u16(pack32_hi); + + v_store_low(dst-1+8*4, v_reinterpret_as_u8(g0)); + v_store_high(dst-1+8*5, v_reinterpret_as_u8(g0)); + + v_store_low(dst-1+8*6, v_reinterpret_as_u8(g1)); + } +#endif return (int)(bayer - (bayer_end - width)); } - int bayer2RGB_EA(const uchar*, int, uchar*, int, int) const + int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const { - return 0; + const uchar* bayer_end = bayer + width; + v_uint16x8 masklow = v_setall_u16(0x00ff); + v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2); + v_uint16x8 full = v_setall_u16((ushort)(-1)); + v_uint8x16 z = v_setzero_u8(); + v_uint16x8 mask = v_setall_u16(blue > 0 ? (ushort)(-1) : 0); + + for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42) + { + /* + B G B G | B G B G | B G B G | B G B G + G R G R | G R G R | G R G R | G R G R + B G B G | B G B G | B G B G | B G B G + */ + + v_uint16x8 r0 = v_load((ushort*)bayer); + v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step)); + v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2)); + + v_uint16x8 b1 = (r0 & masklow) + (r2 & masklow); + v_uint16x8 nextb1 = v_rotate_right<1>(b1); + v_uint16x8 b0 = b1 + nextb1; + b1 = (nextb1 + delta1) >> 1; + b0 = (b0 + delta2) >> 2; + // b0 b2 ... b14 b1 b3 ... b15 + b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1))); + + // vertical sum + v_uint16x8 r0g = r0 >> 8; + v_uint16x8 r2g = r2 >> 8; + v_uint16x8 sumv = ((r0g + r2g) + delta1) >> 1; + // horizontal sum + v_uint16x8 g1 = r1 & masklow; + v_uint16x8 nextg1 = v_rotate_right<1>(g1); + v_uint16x8 sumg = (g1 + nextg1 + delta1) >> 1; + + // gradients + v_uint16x8 gradv = (r0g - r2g) + (r2g - r0g); + v_uint16x8 gradg = (nextg1 - g1) + (g1 - nextg1); + v_uint16x8 gmask = gradg > gradv; + v_uint16x8 g0 = (gmask & sumv) + (sumg & (gmask ^ full)); + // g0 g2 ... g14 g1 g3 ... 
+ g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(nextg1))); + + r0 = r1 >> 8; + r1 = v_rotate_right<1>(r0) + r0; + r1 = (r1 + delta1) >> 1; + // r0 r2 ... r14 r1 r3 ... r15 + r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1))); + + b1 = (b0 ^ r0) & mask; + b0 = b0 ^ b1; + r0 = r0 ^ b1; + + // b1 g1 b3 g3 b5 g5... + v_uint8x16 pack_lo, pack_hi; + v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi); + b1 = v_reinterpret_as_u16(pack_hi); + // b0 g0 b2 g2 b4 g4 .... + b0 = v_reinterpret_as_u16(pack_lo); + + // r1 0 r3 0 r5 0 ... + v_zip(v_reinterpret_as_u8(r0), z, pack_lo, pack_hi); + r1 = v_reinterpret_as_u16(pack_hi); + // r0 0 r2 0 r4 0 ... + r0 = v_reinterpret_as_u16(pack_lo); + + // 0 b0 g0 r0 0 b2 g2 r2 ... + v_zip(b0, r0, g0, g1); + g0 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g0))); + // 0 b8 g8 r8 0 b10 g10 r10 ... + g1 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g1))); + + // b1 g1 r1 0 b3 g3 r3 0 ... + v_zip(b1, r1, r0, r1); + // b9 g9 r9 0 b11 g11 r11 0 ... + + // 0 b0 g0 r0 b1 g1 r1 0 ... + v_uint32x4 pack32_lo, pack32_hi; + v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi); + b0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo))); + // 0 b4 g4 r4 b5 g5 r5 0 ... + b1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi))); + + v_store_low(dst+0, v_reinterpret_as_u8(b0)); + v_store_high(dst+6*1, v_reinterpret_as_u8(b0)); + v_store_low(dst+6*2, v_reinterpret_as_u8(b1)); + v_store_high(dst+6*3, v_reinterpret_as_u8(b1)); + + // 0 b8 g8 r8 b9 g9 r9 0 ... + v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi); + g0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo))); + // 0 b12 g12 r12 b13 g13 r13 0 ... 
+ g1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi))); + + v_store_low(dst+6*4, v_reinterpret_as_u8(g0)); + v_store_high(dst+6*5, v_reinterpret_as_u8(g0)); + + v_store_low(dst+6*6, v_reinterpret_as_u8(g1)); + } + + return int(bayer - (bayer_end - width)); } }; #else @@ -775,7 +820,7 @@ public: // simd optimization only for dcn == 3 int delta = dcn == 4 ? - vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) : + vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue, alpha) : vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue); bayer += delta; dst += delta*dcn; From 17e2bf5717559ccf4fe809dd9f83afa7582f8ef5 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 13 Oct 2019 11:14:41 +0000 Subject: [PATCH 2/6] core(tls): implement releasing of TLS on thread termination - move TLS & instrumentation code out of core/utility.hpp - (*) TLSData lost .gather() method (to dispose thread data on thread termination) - use TLSDataAccumulator for reliable collecting of thread data - prefer using of .detachData() + .cleanupDetachedData() instead of .gather() method (*) API is broken: replace TLSData => TLSDataAccumulator if gather required (objects disposal on threads termination is not available in accumulator mode) --- modules/core/include/opencv2/core/private.hpp | 4 + modules/core/include/opencv2/core/utility.hpp | 168 +------------ .../opencv2/core/utils/instrumentation.hpp | 125 +++++++++ .../core/include/opencv2/core/utils/tls.hpp | 237 ++++++++++++++++++ .../opencv2/core/utils/trace.private.hpp | 4 +- modules/core/src/ocl.cpp | 25 +- modules/core/src/ovx.cpp | 12 +- modules/core/src/precomp.hpp | 2 +- modules/core/src/rand.cpp | 2 +- modules/core/src/system.cpp | 170 ++++++++----- modules/core/src/umatrix.cpp | 2 + modules/core/test/test_utils.cpp | 2 + modules/core/test/test_utils_tls.impl.hpp | 134 ++++++++++ modules/imgproc/src/histogram.cpp | 2 + 14 files changed, 650 insertions(+), 239 deletions(-) create mode 100644 
modules/core/include/opencv2/core/utils/instrumentation.hpp create mode 100644 modules/core/include/opencv2/core/utils/tls.hpp create mode 100644 modules/core/test/test_utils_tls.impl.hpp diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp index 5e66801b51..24f7fc69b8 100644 --- a/modules/core/include/opencv2/core/private.hpp +++ b/modules/core/include/opencv2/core/private.hpp @@ -53,6 +53,10 @@ #include +#ifdef ENABLE_INSTRUMENTATION +#include "opencv2/core/utils/instrumentation.hpp" +#endif + #ifdef HAVE_EIGEN # if defined __GNUC__ && defined __APPLE__ # pragma GCC diagnostic ignored "-Wshadow" diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index cbec10b41b..e7f169b01a 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -63,30 +63,6 @@ namespace cv { -#ifdef CV_COLLECT_IMPL_DATA -CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays -CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays -// Get stored implementation flags and functions names arrays -// Each implementation entry correspond to function name entry, so you can find which implementation was executed in which function -CV_EXPORTS int getImpl(std::vector &impl, std::vector &funName); - -CV_EXPORTS bool useCollection(); // return implementation collection state -CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state - -#define CV_IMPL_PLAIN 0x01 // native CPU OpenCV implementation -#define CV_IMPL_OCL 0x02 // OpenCL implementation -#define CV_IMPL_IPP 0x04 // IPP implementation -#define CV_IMPL_MT 0x10 // multithreaded implementation - -#define CV_IMPL_ADD(impl) \ - if(cv::useCollection()) \ - { \ - cv::addImpl(impl, CV_Func); \ - } -#else -#define CV_IMPL_ADD(impl) -#endif - //! @addtogroup core_utils //! 
@{ @@ -726,61 +702,6 @@ private: AutoLock& operator = (const AutoLock&); }; -// TLS interface -class CV_EXPORTS TLSDataContainer -{ -protected: - TLSDataContainer(); - virtual ~TLSDataContainer(); - - void gatherData(std::vector &data) const; -#if OPENCV_ABI_COMPATIBILITY > 300 - void* getData() const; - void release(); - -private: -#else - void release(); - -public: - void* getData() const; -#endif - virtual void* createDataInstance() const = 0; - virtual void deleteDataInstance(void* pData) const = 0; - - int key_; - -public: - void cleanup(); //! Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid. -}; - -// Main TLS data class -template -class TLSData : protected TLSDataContainer -{ -public: - inline TLSData() {} - inline ~TLSData() { release(); } // Release key and delete associated data - inline T* get() const { return (T*)getData(); } // Get data associated with key - inline T& getRef() const { T* ptr = (T*)getData(); CV_Assert(ptr); return *ptr; } // Get data associated with key - - // Get data from all threads - inline void gather(std::vector &data) const - { - std::vector &dataVoid = reinterpret_cast&>(data); - gatherData(dataVoid); - } - - inline void cleanup() { TLSDataContainer::cleanup(); } - -private: - virtual void* createDataInstance() const CV_OVERRIDE {return new T;} // Wrapper to allocate data by template - virtual void deleteDataInstance(void* pData) const CV_OVERRIDE {delete (T*)pData;} // Wrapper to release data by template - - // Disable TLS copy operations - TLSData(TLSData &) {} - TLSData& operator =(const TLSData &) {return *this;} -}; /** @brief Designed for command line parsing @@ -1199,88 +1120,6 @@ public: std::vector*> m_childs; }; -// Instrumentation external interface -namespace instr -{ - -#if !defined OPENCV_ABI_CHECK - -enum TYPE -{ - TYPE_GENERAL = 0, // OpenCV API function, e.g. 
exported function - TYPE_MARKER, // Information marker - TYPE_WRAPPER, // Wrapper function for implementation - TYPE_FUN, // Simple function call -}; - -enum IMPL -{ - IMPL_PLAIN = 0, - IMPL_IPP, - IMPL_OPENCL, -}; - -struct NodeDataTls -{ - NodeDataTls() - { - m_ticksTotal = 0; - } - uint64 m_ticksTotal; -}; - -class CV_EXPORTS NodeData -{ -public: - NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN); - NodeData(NodeData &ref); - ~NodeData(); - NodeData& operator=(const NodeData&); - - cv::String m_funName; - cv::instr::TYPE m_instrType; - cv::instr::IMPL m_implType; - const char* m_fileName; - int m_lineNum; - void* m_retAddress; - bool m_alwaysExpand; - bool m_funError; - - volatile int m_counter; - volatile uint64 m_ticksTotal; - TLSData m_tls; - int m_threads; - - // No synchronization - double getTotalMs() const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; } - double getMeanMs() const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; } -}; -bool operator==(const NodeData& lhs, const NodeData& rhs); - -typedef Node InstrNode; - -CV_EXPORTS InstrNode* getTrace(); - -#endif // !defined OPENCV_ABI_CHECK - - -CV_EXPORTS bool useInstrumentation(); -CV_EXPORTS void setUseInstrumentation(bool flag); -CV_EXPORTS void resetTrace(); - -enum FLAGS -{ - FLAGS_NONE = 0, - FLAGS_MAPPING = 0x01, - FLAGS_EXPAND_SAME_NAMES = 0x02, -}; - -CV_EXPORTS void setFlags(FLAGS modeFlags); -static inline void setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); } -CV_EXPORTS FLAGS getFlags(); - -} // namespace instr - namespace samples { @@ -1355,6 +1194,13 @@ CV_EXPORTS int getThreadID(); } //namespace cv +#ifdef CV_COLLECT_IMPL_DATA +#include "opencv2/core/utils/instrumentation.hpp" +#else +/// Collect implementation data on OpenCV function call. 
Requires ENABLE_IMPL_COLLECTION build option. +#define CV_IMPL_ADD(impl) +#endif + #ifndef DISABLE_OPENCV_24_COMPATIBILITY #include "opencv2/core/core_c.h" #endif diff --git a/modules/core/include/opencv2/core/utils/instrumentation.hpp b/modules/core/include/opencv2/core/utils/instrumentation.hpp new file mode 100644 index 0000000000..3639867080 --- /dev/null +++ b/modules/core/include/opencv2/core/utils/instrumentation.hpp @@ -0,0 +1,125 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_UTILS_INSTR_HPP +#define OPENCV_UTILS_INSTR_HPP + +#include +#include + +namespace cv { + +//! @addtogroup core_utils +//! @{ + +#ifdef CV_COLLECT_IMPL_DATA +CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays +CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays +// Get stored implementation flags and functions names arrays +// Each implementation entry correspond to function name entry, so you can find which implementation was executed in which function +CV_EXPORTS int getImpl(std::vector &impl, std::vector &funName); + +CV_EXPORTS bool useCollection(); // return implementation collection state +CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state + +#define CV_IMPL_PLAIN 0x01 // native CPU OpenCV implementation +#define CV_IMPL_OCL 0x02 // OpenCL implementation +#define CV_IMPL_IPP 0x04 // IPP implementation +#define CV_IMPL_MT 0x10 // multithreaded implementation + +#undef CV_IMPL_ADD +#define CV_IMPL_ADD(impl) \ + if(cv::useCollection()) \ + { \ + cv::addImpl(impl, CV_Func); \ + } +#endif + +// Instrumentation external interface +namespace instr +{ + +#if !defined OPENCV_ABI_CHECK + +enum TYPE +{ + TYPE_GENERAL = 0, // OpenCV API function, e.g. 
exported function + TYPE_MARKER, // Information marker + TYPE_WRAPPER, // Wrapper function for implementation + TYPE_FUN, // Simple function call +}; + +enum IMPL +{ + IMPL_PLAIN = 0, + IMPL_IPP, + IMPL_OPENCL, +}; + +struct NodeDataTls +{ + NodeDataTls() + { + m_ticksTotal = 0; + } + uint64 m_ticksTotal; +}; + +class CV_EXPORTS NodeData +{ +public: + NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN); + NodeData(NodeData &ref); + ~NodeData(); + NodeData& operator=(const NodeData&); + + cv::String m_funName; + cv::instr::TYPE m_instrType; + cv::instr::IMPL m_implType; + const char* m_fileName; + int m_lineNum; + void* m_retAddress; + bool m_alwaysExpand; + bool m_funError; + + volatile int m_counter; + volatile uint64 m_ticksTotal; + TLSDataAccumulator m_tls; + int m_threads; + + // No synchronization + double getTotalMs() const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; } + double getMeanMs() const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; } +}; +bool operator==(const NodeData& lhs, const NodeData& rhs); + +typedef Node InstrNode; + +CV_EXPORTS InstrNode* getTrace(); + +#endif // !defined OPENCV_ABI_CHECK + + +CV_EXPORTS bool useInstrumentation(); +CV_EXPORTS void setUseInstrumentation(bool flag); +CV_EXPORTS void resetTrace(); + +enum FLAGS +{ + FLAGS_NONE = 0, + FLAGS_MAPPING = 0x01, + FLAGS_EXPAND_SAME_NAMES = 0x02, +}; + +CV_EXPORTS void setFlags(FLAGS modeFlags); +static inline void setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); } +CV_EXPORTS FLAGS getFlags(); + +} // namespace instr + +//! 
@} + +} // namespace + +#endif // OPENCV_UTILS_INSTR_HPP diff --git a/modules/core/include/opencv2/core/utils/tls.hpp b/modules/core/include/opencv2/core/utils/tls.hpp new file mode 100644 index 0000000000..b5f1138593 --- /dev/null +++ b/modules/core/include/opencv2/core/utils/tls.hpp @@ -0,0 +1,237 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_UTILS_TLS_HPP +#define OPENCV_UTILS_TLS_HPP + +#include + +namespace cv { + +//! @addtogroup core_utils +//! @{ + +namespace details { class TlsStorage; } + +/** TLS container base implementation + * + * Don't use directly. + * + * @sa TLSData, TLSDataAccumulator templates + */ +class CV_EXPORTS TLSDataContainer +{ +protected: + TLSDataContainer(); + virtual ~TLSDataContainer(); + + /// @deprecated use detachData() instead + void gatherData(std::vector &data) const; + /// get TLS data and detach all data from threads (similar to cleanup() call) + void detachData(std::vector& data); + + void* getData() const; + void release(); + +protected: + virtual void* createDataInstance() const = 0; + virtual void deleteDataInstance(void* pData) const = 0; + +#if OPENCV_ABI_COMPATIBILITY > 300 +private: +#else +public: +#endif + int key_; + + friend class cv::details::TlsStorage; // core/src/system.cpp + +public: + void cleanup(); //!< Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid.
+ +private: + // Disable copy/assign (noncopyable pattern) + TLSDataContainer(TLSDataContainer &); + TLSDataContainer& operator =(const TLSDataContainer &); +}; + + +/** @brief Simple TLS data class + * + * @sa TLSDataAccumulator + */ +template +class TLSData : protected TLSDataContainer +{ +public: + inline TLSData() {} + inline ~TLSData() { release(); } + + inline T* get() const { return (T*)getData(); } //!< Get data associated with key + inline T& getRef() const { T* ptr = (T*)getData(); CV_DbgAssert(ptr); return *ptr; } //!< Get data associated with key + + /// Release associated thread data + inline void cleanup() + { + TLSDataContainer::cleanup(); + } + +protected: + /// Wrapper to allocate data by template + virtual void* createDataInstance() const CV_OVERRIDE { return new T; } + /// Wrapper to release data by template + virtual void deleteDataInstance(void* pData) const CV_OVERRIDE { delete (T*)pData; } +}; + + +/// TLS data accumulator with gathering methods +template +class TLSDataAccumulator : public TLSData +{ + mutable cv::Mutex mutex; + mutable std::vector dataFromTerminatedThreads; + std::vector detachedData; + bool cleanupMode; +public: + TLSDataAccumulator() : cleanupMode(false) {} + ~TLSDataAccumulator() + { + release(); + } + + /** @brief Get data from all threads + * @deprecated replaced by detachData() + * + * Lifetime of vector data is valid until next detachData()/cleanup()/release() calls + * + * @param[out] data result buffer (should be empty) + */ + void gather(std::vector &data) const + { + CV_Assert(cleanupMode == false); // state is not valid + CV_Assert(data.empty()); + { + std::vector &dataVoid = reinterpret_cast&>(data); + TLSDataContainer::gatherData(dataVoid); + } + { + AutoLock lock(mutex); + data.reserve(data.size() + dataFromTerminatedThreads.size()); + for (typename std::vector::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i) + { + data.push_back((T*)*i); + } + } + } + + /** 
@brief Get and detach data from all threads + * + * Call cleanupDetachedData() when returned vector is not needed anymore. + * + * @return Vector with associated data. Content is preserved (including lifetime of attached data pointers) until next detachData()/cleanupDetachedData()/cleanup()/release() calls + */ + std::vector& detachData() + { + CV_Assert(cleanupMode == false); // state is not valid + std::vector dataVoid; + { + TLSDataContainer::detachData(dataVoid); + } + { + AutoLock lock(mutex); + detachedData.reserve(dataVoid.size() + dataFromTerminatedThreads.size()); + for (typename std::vector::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i) + { + detachedData.push_back((T*)*i); + } + dataFromTerminatedThreads.clear(); + for (typename std::vector::const_iterator i = dataVoid.begin(); i != dataVoid.end(); ++i) + { + detachedData.push_back((T*)(void*)*i); + } + } + dataVoid.clear(); + return detachedData; + } + + /// Release associated thread data returned by detachData() call + void cleanupDetachedData() + { + AutoLock lock(mutex); + cleanupMode = true; + _cleanupDetachedData(); + cleanupMode = false; + } + + /// Release associated thread data + void cleanup() + { + cleanupMode = true; + TLSDataContainer::cleanup(); + + AutoLock lock(mutex); + _cleanupDetachedData(); + _cleanupTerminatedData(); + cleanupMode = false; + } + + /// Release associated thread data and free TLS key + void release() + { + cleanupMode = true; + TLSDataContainer::release(); + { + AutoLock lock(mutex); + _cleanupDetachedData(); + _cleanupTerminatedData(); + } + } + +protected: + // synchronized + void _cleanupDetachedData() + { + for (typename std::vector::iterator i = detachedData.begin(); i != detachedData.end(); ++i) + { + deleteDataInstance((T*)*i); + } + detachedData.clear(); + } + + // synchronized + void _cleanupTerminatedData() + { + for (typename std::vector::iterator i = dataFromTerminatedThreads.begin(); i != 
dataFromTerminatedThreads.end(); ++i) + { + deleteDataInstance((T*)*i); + } + dataFromTerminatedThreads.clear(); + } + +protected: + virtual void* createDataInstance() const CV_OVERRIDE + { + // Note: we can collect all allocated data here, but this would require raced mutex locks + return new T; + } + virtual void deleteDataInstance(void* pData) const CV_OVERRIDE + { + if (cleanupMode) + { + delete (T*)pData; + } + else + { + AutoLock lock(mutex); + dataFromTerminatedThreads.push_back((T*)pData); + } + } +}; + + +//! @} + +} // namespace + +#endif // OPENCV_UTILS_TLS_HPP diff --git a/modules/core/include/opencv2/core/utils/trace.private.hpp b/modules/core/include/opencv2/core/utils/trace.private.hpp index 17981663fe..afc41159f6 100644 --- a/modules/core/include/opencv2/core/utils/trace.private.hpp +++ b/modules/core/include/opencv2/core/utils/trace.private.hpp @@ -9,6 +9,8 @@ #include +#include + #include "trace.hpp" //! @cond IGNORED @@ -332,7 +334,7 @@ public: Mutex mutexCreate; Mutex mutexCount; - TLSData tls; + TLSDataAccumulator tls; cv::Ptr trace_storage; private: diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 22408df723..8ebb0064a9 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -909,19 +909,19 @@ bool haveOpenCL() bool useOpenCL() { - CoreTLSData* data = getCoreTlsData().get(); - if( data->useOpenCL < 0 ) + CoreTLSData& data = getCoreTlsData(); + if (data.useOpenCL < 0) { try { - data->useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0; + data.useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0; } catch (...) 
{ - data->useOpenCL = 0; + data.useOpenCL = 0; } } - return data->useOpenCL > 0; + return data.useOpenCL > 0; } #ifdef HAVE_OPENCL @@ -937,14 +937,14 @@ void setUseOpenCL(bool flag) { CV_TRACE_FUNCTION(); - CoreTLSData* data = getCoreTlsData().get(); + CoreTLSData& data = getCoreTlsData(); if (!flag) { - data->useOpenCL = 0; + data.useOpenCL = 0; } else if( haveOpenCL() ) { - data->useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0; + data.useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0; } } @@ -1655,7 +1655,7 @@ size_t Device::profilingTimerResolution() const const Device& Device::getDefault() { const Context& ctx = Context::getDefault(); - int idx = getCoreTlsData().get()->device; + int idx = getCoreTlsData().device; const Device& device = ctx.device(idx); return device; } @@ -2562,9 +2562,10 @@ void attachContext(const String& platformName, void* platformID, void* context, CV_OCL_CHECK(clRetainContext((cl_context)context)); // clear command queue, if any - getCoreTlsData().get()->oclQueue.finish(); + CoreTLSData& data = getCoreTlsData(); + data.oclQueue.finish(); Queue q; - getCoreTlsData().get()->oclQueue = q; + data.oclQueue = q; return; } // attachContext() @@ -2752,7 +2753,7 @@ void* Queue::ptr() const Queue& Queue::getDefault() { - Queue& q = getCoreTlsData().get()->oclQueue; + Queue& q = getCoreTlsData().oclQueue; if( !q.p && haveOpenCL() ) q.create(Context::getDefault()); return q; diff --git a/modules/core/src/ovx.cpp b/modules/core/src/ovx.cpp index d906ead09c..9685cbaed2 100644 --- a/modules/core/src/ovx.cpp +++ b/modules/core/src/ovx.cpp @@ -76,13 +76,13 @@ bool haveOpenVX() bool useOpenVX() { #ifdef HAVE_OPENVX - CoreTLSData* data = getCoreTlsData().get(); - if( data->useOpenVX < 0 ) + CoreTLSData& data = getCoreTlsData(); + if (data.useOpenVX < 0) { // enabled (if available) by default - data->useOpenVX = haveOpenVX() ? 1 : 0; + data.useOpenVX = haveOpenVX() ? 
1 : 0; } - return data->useOpenVX > 0; + return data.useOpenVX > 0; #else return false; #endif @@ -93,8 +93,8 @@ void setUseOpenVX(bool flag) #ifdef HAVE_OPENVX if( haveOpenVX() ) { - CoreTLSData* data = getCoreTlsData().get(); - data->useOpenVX = flag ? 1 : 0; + CoreTLSData& data = getCoreTlsData(); + data.useOpenVX = flag ? 1 : 0; } #else CV_Assert(!flag && "OpenVX support isn't enabled at compile time"); diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index acaee08eab..0ffde8855a 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -354,7 +354,7 @@ struct CoreTLSData #endif }; -TLSData& getCoreTlsData(); +CoreTLSData& getCoreTlsData(); #if defined(BUILD_SHARED_LIBS) #if defined _WIN32 || defined WINCE diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp index aa952b2448..539f92aeb1 100644 --- a/modules/core/src/rand.cpp +++ b/modules/core/src/rand.cpp @@ -770,7 +770,7 @@ void RNG::fill( InputOutputArray _mat, int disttype, cv::RNG& cv::theRNG() { - return getCoreTlsData().get()->rng; + return getCoreTlsData().rng; } void cv::setRNGSeed(int seed) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 1b6777561a..b39173de0d 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -50,6 +50,9 @@ #include +#include +#include + namespace cv { static Mutex* __initialization_mutex = NULL; @@ -1375,6 +1378,8 @@ bool Mutex::trylock() { return impl->trylock(); } //////////////////////////////// thread-local storage //////////////////////////////// +namespace details { + #ifdef _WIN32 #ifdef _MSC_VER #pragma warning(disable:4505) // unreferenced local function has been removed @@ -1436,9 +1441,10 @@ void TlsAbstraction::SetData(void *pData) } #endif #else // _WIN32 +static void opencv_tls_destructor(void* pData); TlsAbstraction::TlsAbstraction() { - CV_Assert(pthread_key_create(&tlsKey, NULL) == 0); + CV_Assert(pthread_key_create(&tlsKey, opencv_tls_destructor) == 
0); } TlsAbstraction::~TlsAbstraction() { @@ -1479,42 +1485,46 @@ public: } ~TlsStorage() { - for(size_t i = 0; i < threads.size(); i++) - { - if(threads[i]) - { - /* Current architecture doesn't allow proper global objects release, so this check can cause crashes - - // Check if all slots were properly cleared - for(size_t j = 0; j < threads[i]->slots.size(); j++) - { - CV_Assert(threads[i]->slots[j] == 0); - } - */ - delete threads[i]; - } - } - threads.clear(); + // TlsStorage object should not be released + // There is no reliable way to avoid problems caused by static initialization order fiasco + CV_LOG_FATAL(NULL, "TlsStorage::~TlsStorage() call is not expected"); } - void releaseThread() + void releaseThread(void* tlsValue = NULL) { + ThreadData *pTD = tlsValue == NULL ? (ThreadData*)tls.GetData() : (ThreadData*)tlsValue; + if (pTD == NULL) + return; // no OpenCV TLS data for this thread AutoLock guard(mtxGlobalAccess); - ThreadData *pTD = (ThreadData*)tls.GetData(); - for(size_t i = 0; i < threads.size(); i++) + for (size_t i = 0; i < threads.size(); i++) { - if(pTD == threads[i]) + if (pTD == threads[i]) { - threads[i] = 0; - break; + threads[i] = NULL; + if (tlsValue == NULL) + tls.SetData(0); + std::vector& thread_slots = pTD->slots; + for (size_t slotIdx = 0; slotIdx < thread_slots.size(); slotIdx++) + { + void* pData = thread_slots[slotIdx]; + thread_slots[slotIdx] = NULL; + if (!pData) + continue; + TLSDataContainer* container = tlsSlots[slotIdx].container; + if (container) + container->deleteDataInstance(pData); + else + CV_LOG_ERROR(NULL, "TLS: container for slotIdx=" << slotIdx << " is NULL. 
Can't release thread data"); + } + delete pTD; + return; } } - tls.SetData(0); - delete pTD; + CV_LOG_WARNING(NULL, "TLS: Can't release thread TLS data (unknown pointer or data race): " << (void*)pTD); } // Reserve TLS storage index - size_t reserveSlot() + size_t reserveSlot(TLSDataContainer* container) { AutoLock guard(mtxGlobalAccess); CV_Assert(tlsSlotsSize == tlsSlots.size()); @@ -1522,15 +1532,15 @@ public: // Find unused slots for(size_t slot = 0; slot < tlsSlotsSize; slot++) { - if(!tlsSlots[slot]) + if (tlsSlots[slot].container == NULL) { - tlsSlots[slot] = 1; + tlsSlots[slot].container = container; return slot; } } // Create new slot - tlsSlots.push_back(1); tlsSlotsSize++; + tlsSlots.push_back(TlsSlotInfo(container)); tlsSlotsSize++; return tlsSlotsSize - 1; } @@ -1555,7 +1565,9 @@ public: } if (!keepSlot) - tlsSlots[slotIdx] = 0; + { + tlsSlots[slotIdx].container = NULL; // mark slot as free (see reserveSlot() implementation) + } } // Get data by TLS storage index @@ -1604,8 +1616,26 @@ public: tls.SetData((void*)threadData); { AutoLock guard(mtxGlobalAccess); - threadData->idx = threads.size(); - threads.push_back(threadData); + + bool found = false; + // Find unused slots + for(size_t slot = 0; slot < threads.size(); slot++) + { + if (threads[slot] == NULL) + { + threadData->idx = (int)slot; + threads[slot] = threadData; + found = true; + break; + } + } + + if (!found) + { + // Create new slot + threadData->idx = threads.size(); + threads.push_back(threadData); + } } } @@ -1622,8 +1652,14 @@ private: Mutex mtxGlobalAccess; // Shared objects operation guard size_t tlsSlotsSize; // equal to tlsSlots.size() in synchronized sections - // without synchronization this counter doesn't desrease - it is used for slotIdx sanity checks - std::vector tlsSlots; // TLS keys state + // without synchronization this counter doesn't decrease - it is used for slotIdx sanity checks + + struct TlsSlotInfo + { + TlsSlotInfo(TLSDataContainer* _container) : 
container(_container) {} + TLSDataContainer* container; // attached container (to dispose data of terminated threads) + }; + std::vector tlsSlots; // TLS keys state std::vector threads; // Array for all allocated data. Thread data pointers are placed here to allow data cleanup }; @@ -1633,9 +1669,19 @@ static TlsStorage &getTlsStorage() CV_SINGLETON_LAZY_INIT_REF(TlsStorage, new TlsStorage()) } +#ifndef _WIN32 // pthread key destructor +static void opencv_tls_destructor(void* pData) +{ + getTlsStorage().releaseThread(pData); +} +#endif + +} // namespace details +using namespace details; + TLSDataContainer::TLSDataContainer() { - key_ = (int)getTlsStorage().reserveSlot(); // Reserve key from TLS storage + key_ = (int)getTlsStorage().reserveSlot(this); // Reserve key from TLS storage } TLSDataContainer::~TLSDataContainer() @@ -1648,11 +1694,17 @@ void TLSDataContainer::gatherData(std::vector &data) const getTlsStorage().gather(key_, data); } +void TLSDataContainer::detachData(std::vector &data) +{ + getTlsStorage().releaseSlot(key_, data, true); +} + void TLSDataContainer::release() { - std::vector data; - data.reserve(32); - getTlsStorage().releaseSlot(key_, data); // Release key and get stored data for proper destruction + if (key_ == -1) + return; // already released + std::vector data; data.reserve(32); + getTlsStorage().releaseSlot(key_, data, false); // Release key and get stored data for proper destruction key_ = -1; for(size_t i = 0; i < data.size(); i++) // Delete all associated data deleteDataInstance(data[i]); @@ -1660,8 +1712,7 @@ void TLSDataContainer::release() void TLSDataContainer::cleanup() { - std::vector data; - data.reserve(32); + std::vector data; data.reserve(32); getTlsStorage().releaseSlot(key_, data, true); // Extract stored data with removal from TLS tables for(size_t i = 0; i < data.size(); i++) // Delete all associated data deleteDataInstance(data[i]); @@ -1680,11 +1731,16 @@ void* TLSDataContainer::getData() const return pData; } 
-TLSData& getCoreTlsData() +static TLSData& getCoreTlsDataTLS() { CV_SINGLETON_LAZY_INIT_REF(TLSData, new TLSData()) } +CoreTLSData& getCoreTlsData() +{ + return getCoreTlsDataTLS().getRef(); +} + #if defined CVAPI_EXPORTS && defined _WIN32 && !defined WINCE #ifdef WINRT #pragma warning(disable:4447) // Disable warning 'main' signature found without threading model @@ -2338,12 +2394,12 @@ String getIppVersion() bool useIPP() { #ifdef HAVE_IPP - CoreTLSData* data = getCoreTlsData().get(); - if(data->useIPP < 0) + CoreTLSData& data = getCoreTlsData(); + if (data.useIPP < 0) { - data->useIPP = getIPPSingleton().useIPP; + data.useIPP = getIPPSingleton().useIPP; } - return (data->useIPP > 0); + return (data.useIPP > 0); #else return false; #endif @@ -2351,24 +2407,24 @@ bool useIPP() void setUseIPP(bool flag) { - CoreTLSData* data = getCoreTlsData().get(); + CoreTLSData& data = getCoreTlsData(); #ifdef HAVE_IPP - data->useIPP = (getIPPSingleton().useIPP)?flag:false; + data.useIPP = (getIPPSingleton().useIPP)?flag:false; #else CV_UNUSED(flag); - data->useIPP = false; + data.useIPP = false; #endif } bool useIPP_NotExact() { #ifdef HAVE_IPP - CoreTLSData* data = getCoreTlsData().get(); - if(data->useIPP_NE < 0) + CoreTLSData& data = getCoreTlsData(); + if (data.useIPP_NE < 0) { - data->useIPP_NE = getIPPSingleton().useIPP_NE; + data.useIPP_NE = getIPPSingleton().useIPP_NE; } - return (data->useIPP_NE > 0); + return (data.useIPP_NE > 0); #else return false; #endif @@ -2376,12 +2432,12 @@ bool useIPP_NotExact() void setUseIPP_NotExact(bool flag) { - CoreTLSData* data = getCoreTlsData().get(); + CoreTLSData& data = getCoreTlsData(); #ifdef HAVE_IPP - data->useIPP_NE = flag; + data.useIPP_NE = flag; #else CV_UNUSED(flag); - data->useIPP_NE = false; + data.useIPP_NE = false; #endif } @@ -2407,7 +2463,7 @@ namespace tegra { bool useTegra() { - cv::CoreTLSData* data = cv::getCoreTlsData().get(); + cv::CoreTLSData* data = &cv::getCoreTlsData(); /* getCoreTlsData() returns a reference now; take its address */ if (data->useTegra < 0) { @@ -2423,7
+2479,7 @@ bool useTegra() void setUseTegra(bool flag) { - cv::CoreTLSData* data = cv::getCoreTlsData().get(); + cv::CoreTLSData* data = &cv::getCoreTlsData(); /* getCoreTlsData() returns a reference now; take its address */ data->useTegra = flag; } diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index c31d9e1bdd..58f1f909d5 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -43,6 +43,8 @@ #include "opencl_kernels_core.hpp" #include "umatrix.hpp" +#include + ///////////////////////////////// UMat implementation /////////////////////////////// namespace cv { diff --git a/modules/core/test/test_utils.cpp b/modules/core/test/test_utils.cpp index c566762925..2bae77892b 100644 --- a/modules/core/test/test_utils.cpp +++ b/modules/core/test/test_utils.cpp @@ -4,6 +4,8 @@ #include "test_precomp.hpp" #include "opencv2/core/utils/logger.hpp" +#include "test_utils_tls.impl.hpp" + namespace opencv_test { namespace { static const char * const keys = diff --git a/modules/core/test/test_utils_tls.impl.hpp b/modules/core/test/test_utils_tls.impl.hpp new file mode 100644 index 0000000000..36b8805422 --- /dev/null +++ b/modules/core/test/test_utils_tls.impl.hpp @@ -0,0 +1,134 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html.
+ +// This is .hpp file included from test_utils.cpp + +#ifdef CV_CXX11 +#include // std::thread +#endif + +#include "opencv2/core/utils/tls.hpp" + +namespace opencv_test { namespace { + +class TLSReporter +{ +public: + static int g_last_id; + static int g_allocated; + + int id; + + TLSReporter() + { + id = CV_XADD(&g_last_id, 1); + CV_XADD(&g_allocated, 1); + } + ~TLSReporter() + { + CV_XADD(&g_allocated, -1); + } +}; + +int TLSReporter::g_last_id = 0; +int TLSReporter::g_allocated = 0; + +#ifdef CV_CXX11 + +template +static void callNThreadsWithTLS(int N, TLSData& tls) +{ + std::vector threads(N); + for (int i = 0; i < N; i++) + { + threads[i] = std::thread([&]() { + TLSReporter* pData = tls.get(); + (void)pData; + }); + } + for (int i = 0; i < N; i++) + { + threads[i].join(); + } + threads.clear(); +} + +TEST(Core_TLS, HandleThreadTermination) +{ + const int init_id = TLSReporter::g_last_id; + const int init_allocated = TLSReporter::g_allocated; + + const int N = 4; + TLSData tls; + + // use TLS + ASSERT_NO_THROW(callNThreadsWithTLS(N, tls)); + + EXPECT_EQ(init_id + N, TLSReporter::g_last_id); + EXPECT_EQ(init_allocated + 0, TLSReporter::g_allocated); +} + + +static void testTLSAccumulator(bool detachFirst) +{ + const int init_id = TLSReporter::g_last_id; + const int init_allocated = TLSReporter::g_allocated; + + const int N = 4; + TLSDataAccumulator tls; + + { // empty TLS checks + std::vector& data0 = tls.detachData(); + EXPECT_EQ((size_t)0, data0.size()); + tls.cleanupDetachedData(); + } + + // use TLS + ASSERT_NO_THROW(callNThreadsWithTLS(N, tls)); + + EXPECT_EQ(init_id + N, TLSReporter::g_last_id); + EXPECT_EQ(init_allocated + N, TLSReporter::g_allocated); + + if (detachFirst) + { + std::vector& data1 = tls.detachData(); + EXPECT_EQ((size_t)N, data1.size()); + + // no data through gather after detachData() + std::vector data2; + tls.gather(data2); + EXPECT_EQ((size_t)0, data2.size()); + + tls.cleanupDetachedData(); + + EXPECT_EQ(init_id + N, 
TLSReporter::g_last_id); + EXPECT_EQ(init_allocated + 0, TLSReporter::g_allocated); + EXPECT_EQ((size_t)0, data1.size()); + } + else + { + std::vector data2; + tls.gather(data2); + EXPECT_EQ((size_t)N, data2.size()); + + std::vector& data1 = tls.detachData(); + EXPECT_EQ((size_t)N, data1.size()); + + tls.cleanupDetachedData(); + + EXPECT_EQ((size_t)0, data1.size()); + // data2 is not empty, but it has invalid contents + EXPECT_EQ((size_t)N, data2.size()); + } + + EXPECT_EQ(init_id + N, TLSReporter::g_last_id); + EXPECT_EQ(init_allocated + 0, TLSReporter::g_allocated); +} + +TEST(Core_TLS, AccumulatorHoldData_detachData) { testTLSAccumulator(true); } +TEST(Core_TLS, AccumulatorHoldData_gather) { testTLSAccumulator(false); } + +#endif + +}} // namespace diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index d4ff218f13..64a5c61afe 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -45,6 +45,8 @@ #include "opencv2/core/openvx/ovx_defs.hpp" +#include "opencv2/core/utils/tls.hpp" + namespace cv { From 6ec5ae0215390716d7397a9c583bff68f97b4294 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Tue, 22 Oct 2019 16:59:59 +0300 Subject: [PATCH 3/6] core(trace): add ITT control parameter - OPENCV_TRACE_ITT_ENABLE --- modules/core/src/trace.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/modules/core/src/trace.cpp b/modules/core/src/trace.cpp index e1c58bb79b..4245ab2080 100644 --- a/modules/core/src/trace.cpp +++ b/modules/core/src/trace.cpp @@ -196,14 +196,27 @@ static __itt_domain* domain = NULL; static bool isITTEnabled() { - static bool isInitialized = false; + static volatile bool isInitialized = false; static bool isEnabled = false; if (!isInitialized) { - isEnabled = !!(__itt_api_version()); - CV_LOG_ITT("ITT is " << (isEnabled ? 
"enabled" : "disabled")); - domain = __itt_domain_create("OpenCVTrace"); - isInitialized = true; + cv::AutoLock lock(cv::getInitializationMutex()); + if (!isInitialized) + { + bool param_traceITTEnable = utils::getConfigurationParameterBool("OPENCV_TRACE_ITT_ENABLE", true); + if (param_traceITTEnable) + { + isEnabled = !!(__itt_api_version()); + CV_LOG_ITT("ITT is " << (isEnabled ? "enabled" : "disabled")); + domain = __itt_domain_create("OpenCVTrace"); + } + else + { + CV_LOG_ITT("ITT is disabled through OpenCV parameter"); + isEnabled = false; + } + isInitialized = true; + } } return isEnabled; } From 86a8ff61293ca5e8d3b5a28cb623b4ca36aab74a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Lippok?= Date: Sun, 27 Oct 2019 17:43:31 +0100 Subject: [PATCH 4/6] Fixed typo in assertion --- modules/calib3d/src/fisheye.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/calib3d/src/fisheye.cpp b/modules/calib3d/src/fisheye.cpp index 5fe6b2a08d..56fd82114d 100644 --- a/modules/calib3d/src/fisheye.cpp +++ b/modules/calib3d/src/fisheye.cpp @@ -857,8 +857,8 @@ double cv::fisheye::stereoCalibrate(InputArrayOfArrays objectPoints, InputArrayO CV_Assert(K1.empty() || (K1.size() == Size(3,3))); CV_Assert(D1.empty() || (D1.total() == 4)); - CV_Assert(K2.empty() || (K1.size() == Size(3,3))); - CV_Assert(D2.empty() || (D1.total() == 4)); + CV_Assert(K2.empty() || (K2.size() == Size(3,3))); + CV_Assert(D2.empty() || (D2.total() == 4)); CV_Assert((!K1.empty() && !K2.empty() && !D1.empty() && !D2.empty()) || !(flags & CALIB_FIX_INTRINSIC)); From 80c4cedd255c4eec6556f3ab041c14f2aeb3313f Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 27 Oct 2019 11:14:17 +0000 Subject: [PATCH 5/6] android: use .getRowStride() in JavaCamera2View --- modules/core/misc/java/src/java/core+Mat.java | 12 +++++ modules/core/misc/java/test/MatTest.java | 18 ++++++++ .../org/opencv/android/JavaCamera2View.java | 46 +++++++++++++------ 
modules/java/generator/src/cpp/Mat.cpp | 28 ++++++++++- 4 files changed, 90 insertions(+), 14 deletions(-) diff --git a/modules/core/misc/java/src/java/core+Mat.java b/modules/core/misc/java/src/java/core+Mat.java index 5ebb5ee145..641d9f8ae8 100644 --- a/modules/core/misc/java/src/java/core+Mat.java +++ b/modules/core/misc/java/src/java/core+Mat.java @@ -41,6 +41,15 @@ public class Mat { nativeObj = n_Mat(rows, cols, type, data); } + // + // C++: Mat::Mat(int rows, int cols, int type, void* data, size_t step) + // + + // javadoc: Mat::Mat(rows, cols, type, data, step) + public Mat(int rows, int cols, int type, ByteBuffer data, long step) { + nativeObj = n_Mat(rows, cols, type, data, step); + } + // // C++: Mat::Mat(Size size, int type) // @@ -1136,6 +1145,9 @@ public class Mat { // C++: Mat::Mat(int rows, int cols, int type, void* data) private static native long n_Mat(int rows, int cols, int type, ByteBuffer data); + // C++: Mat::Mat(int rows, int cols, int type, void* data, size_t step) + private static native long n_Mat(int rows, int cols, int type, ByteBuffer data, long step); + // C++: Mat::Mat(Size size, int type) private static native long n_Mat(double size_width, double size_height, int type); diff --git a/modules/core/misc/java/test/MatTest.java b/modules/core/misc/java/test/MatTest.java index cdd7950843..039aa39929 100644 --- a/modules/core/misc/java/test/MatTest.java +++ b/modules/core/misc/java/test/MatTest.java @@ -1246,4 +1246,22 @@ public class MatTest extends OpenCVTestCase { assertEquals(1, bbuf.get(4095)); } + public void testMatFromByteBufferWithStep() { + ByteBuffer bbuf = ByteBuffer.allocateDirect(80*64); + bbuf.putInt(0x01010101); + bbuf.putInt(64, 0x02020202); + bbuf.putInt(80, 0x03030303); + Mat m = new Mat(64, 64, CvType.CV_8UC1, bbuf, 80); + assertEquals(8, Core.countNonZero(m)); + Core.add(m, new Scalar(5), m); + assertEquals(4096, Core.countNonZero(m)); + m.release(); + assertEquals(6, bbuf.get(0)); + assertEquals(5, bbuf.get(63)); + 
assertEquals(2, bbuf.get(64)); + assertEquals(0, bbuf.get(79)); + assertEquals(8, bbuf.get(80)); + assertEquals(5, bbuf.get(63*80 + 63)); + } + } diff --git a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java index e4a58539b5..09e01b01fc 100644 --- a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java +++ b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java @@ -332,8 +332,10 @@ public class JavaCamera2View extends CameraBridgeViewBase { Image.Plane[] planes = mImage.getPlanes(); int w = mImage.getWidth(); int h = mImage.getHeight(); + assert(planes[0].getPixelStride() == 1); ByteBuffer y_plane = planes[0].getBuffer(); - mGray = new Mat(h, w, CvType.CV_8UC1, y_plane); + int y_plane_step = planes[0].getRowStride(); + mGray = new Mat(h, w, CvType.CV_8UC1, y_plane, y_plane_step); return mGray; } @@ -349,11 +351,14 @@ public class JavaCamera2View extends CameraBridgeViewBase { assert(planes[0].getPixelStride() == 1); assert(planes[2].getPixelStride() == 2); ByteBuffer y_plane = planes[0].getBuffer(); + int y_plane_step = planes[0].getRowStride(); ByteBuffer uv_plane1 = planes[1].getBuffer(); + int uv_plane1_step = planes[1].getRowStride(); ByteBuffer uv_plane2 = planes[2].getBuffer(); - Mat y_mat = new Mat(h, w, CvType.CV_8UC1, y_plane); - Mat uv_mat1 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane1); - Mat uv_mat2 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane2); + int uv_plane2_step = planes[2].getRowStride(); + Mat y_mat = new Mat(h, w, CvType.CV_8UC1, y_plane, y_plane_step); + Mat uv_mat1 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane1, uv_plane1_step); + Mat uv_mat2 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane2, uv_plane2_step); long addr_diff = uv_mat2.dataAddr() - uv_mat1.dataAddr(); if (addr_diff > 0) { assert(addr_diff == 1); @@ -369,30 +374,45 @@ public class JavaCamera2View extends 
CameraBridgeViewBase { ByteBuffer u_plane = planes[1].getBuffer(); ByteBuffer v_plane = planes[2].getBuffer(); - y_plane.get(yuv_bytes, 0, w*h); + int yuv_bytes_offset = 0; + + int y_plane_step = planes[0].getRowStride(); + if (y_plane_step == w) { + y_plane.get(yuv_bytes, 0, w*h); + yuv_bytes_offset = w*h; + } else { + int padding = y_plane_step - w; + for (int i = 0; i < h; i++){ + y_plane.get(yuv_bytes, yuv_bytes_offset, w); + yuv_bytes_offset += w; + if (i < h - 1) { + y_plane.position(y_plane.position() + padding); + } + } + assert(yuv_bytes_offset == w * h); + } int chromaRowStride = planes[1].getRowStride(); int chromaRowPadding = chromaRowStride - w/2; - int offset = w*h; if (chromaRowPadding == 0){ // When the row stride of the chroma channels equals their width, we can copy // the entire channels in one go - u_plane.get(yuv_bytes, offset, w*h/4); - offset += w*h/4; - v_plane.get(yuv_bytes, offset, w*h/4); + u_plane.get(yuv_bytes, yuv_bytes_offset, w*h/4); + yuv_bytes_offset += w*h/4; + v_plane.get(yuv_bytes, yuv_bytes_offset, w*h/4); } else { // When not equal, we need to copy the channels row by row for (int i = 0; i < h/2; i++){ - u_plane.get(yuv_bytes, offset, w/2); - offset += w/2; + u_plane.get(yuv_bytes, yuv_bytes_offset, w/2); + yuv_bytes_offset += w/2; if (i < h/2-1){ u_plane.position(u_plane.position() + chromaRowPadding); } } for (int i = 0; i < h/2; i++){ - v_plane.get(yuv_bytes, offset, w/2); - offset += w/2; + v_plane.get(yuv_bytes, yuv_bytes_offset, w/2); + yuv_bytes_offset += w/2; if (i < h/2-1){ v_plane.position(v_plane.position() + chromaRowPadding); } diff --git a/modules/java/generator/src/cpp/Mat.cpp b/modules/java/generator/src/cpp/Mat.cpp index 1ae2aa6e8c..5203413ae4 100644 --- a/modules/java/generator/src/cpp/Mat.cpp +++ b/modules/java/generator/src/cpp/Mat.cpp @@ -74,7 +74,7 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer JNIEXPORT jlong JNICALL 
Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer_2 (JNIEnv* env, jclass, jint rows, jint cols, jint type, jobject data) { - static const char method_name[] = "Mat::n_1Mat__IIILByteBuffer()"; + static const char method_name[] = "Mat::n_1Mat__IIILjava_nio_ByteBuffer_2()"; try { LOGD("%s", method_name); return (jlong) new Mat( rows, cols, type, (void*)env->GetDirectBufferAddress(data) ); @@ -88,6 +88,32 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer } +/* + * Class: org_opencv_core_Mat + * Method: n_Mat + * Signature: (IIILjava/nio/ByteBuffer;J)J + * + * Mat::Mat(int rows, int cols, int type, void* data, size_t step) + */ +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer_2J + (JNIEnv* env, jclass, jint rows, jint cols, jint type, jobject data, jlong step); + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__IIILjava_nio_ByteBuffer_2J + (JNIEnv* env, jclass, jint rows, jint cols, jint type, jobject data, jlong step) +{ + static const char method_name[] = "Mat::n_1Mat__IIILjava_nio_ByteBuffer_2J()"; + try { + LOGD("%s", method_name); + return (jlong) new Mat(rows, cols, type, (void*)env->GetDirectBufferAddress(data), (size_t)step); + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) { + throwJavaException(env, 0, method_name); + } + + return 0; +} + // // Mat::Mat(int rows, int cols, int type) From a71ff501301c05e41526c9a8fe69ed05ac953e1e Mon Sep 17 00:00:00 2001 From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com> Date: Tue, 29 Oct 2019 06:42:20 -0400 Subject: [PATCH 6/6] Merge pull request #15623 from ChipKerchner:optimizeHOGpipeline * Use circular lut history buffer in computeGradient of HOG * Initialize prefetch data outside main loop. Avoid code duplication.
--- modules/objdetect/src/hog.cpp | 132 +++++++++++++++++++++++++++++----- 1 file changed, 115 insertions(+), 17 deletions(-) diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp index 378bab3087..e98b9c2e23 100644 --- a/modules/objdetect/src/hog.cpp +++ b/modules/objdetect/src/hog.cpp @@ -299,6 +299,11 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, Mat Dy(1, width, CV_32F, dbuf + width); Mat Mag(1, width, CV_32F, dbuf + width*2); Mat Angle(1, width, CV_32F, dbuf + width*3); +#if CV_SIMD128 + int widthP2 = width+2; + AutoBuffer _lutBuf(9*widthP2); + float* const lutBuf = _lutBuf.data(); +#endif if (cn == 3) { @@ -317,6 +322,63 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, xmap += 1; } +#if CV_SIMD128 + typedef const uchar* const T; + float *lutPrev, *lutCurr, *lutNext; + { + y = 0; + const uchar* imgPtr = img.ptr(ymap[y]); + const uchar* prevPtr = img.data + img.step*ymap[y-1]; + + lutPrev = lutBuf+widthP2*0; + lutCurr = lutBuf+widthP2*3; + + { + int x0 = xmap[-1], x1 = xmap[0]; + T p02 = imgPtr + x0, p12 = imgPtr + x1; + + lutPrev[0+widthP2*0] = lut[prevPtr[x0+0]]; + lutPrev[0+widthP2*1] = lut[prevPtr[x0+1]]; + lutPrev[0+widthP2*2] = lut[prevPtr[x0+2]]; + lutCurr[0+widthP2*0] = lut[p02[0]]; lutCurr[1+widthP2*0] = lut[p12[0]]; + lutCurr[0+widthP2*1] = lut[p02[1]]; lutCurr[1+widthP2*1] = lut[p12[1]]; + lutCurr[0+widthP2*2] = lut[p02[2]]; lutCurr[1+widthP2*2] = lut[p12[2]]; + } + + for( x = 0; x <= width - 4; x += 4 ) + { + int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3]; + T p02 = imgPtr + xmap[x+1]; + T p12 = imgPtr + xmap[x+2]; + T p22 = imgPtr + xmap[x+3]; + T p32 = imgPtr + xmap[x+4]; + + v_float32x4 _dx00 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]); + v_float32x4 _dx10 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]); + v_float32x4 _dx20 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]); + + 
v_store(lutCurr+x+widthP2*0+2, _dx00); + v_store(lutCurr+x+widthP2*1+2, _dx10); + v_store(lutCurr+x+widthP2*2+2, _dx20); + + v_float32x4 _dy00 = v_float32x4(lut[prevPtr[x0+0]], lut[prevPtr[x1+0]], lut[prevPtr[x2+0]], lut[prevPtr[x3+0]]); + v_float32x4 _dy10 = v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]); + v_float32x4 _dy20 = v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]); + + v_store(lutPrev+x+widthP2*0+1, _dy00); + v_store(lutPrev+x+widthP2*1+1, _dy10); + v_store(lutPrev+x+widthP2*2+1, _dy20); + } + { + int x0 = xmap[x]; + + lutPrev[x+widthP2*0+1] = lut[prevPtr[x0+0]]; + lutPrev[x+widthP2*1+1] = lut[prevPtr[x0+1]]; + lutPrev[x+widthP2*2+1] = lut[prevPtr[x0+2]]; + } + } +#endif + float angleScale = signedGradient ? (float)(nbins/(2.0*CV_PI)) : (float)(nbins/CV_PI); for( y = 0; y < gradsize.height; y++ ) { @@ -342,28 +404,57 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, { x = 0; #if CV_SIMD128 + int yMod = y%3; + + // Circular lut history buffer + if (yMod == 0) + { + lutPrev = lutBuf+widthP2*0; + lutCurr = lutBuf+widthP2*3; + lutNext = lutBuf+widthP2*6; + } + else if (yMod == 1) + { + lutPrev = lutBuf+widthP2*3; + lutCurr = lutBuf+widthP2*6; + lutNext = lutBuf+widthP2*0; + } + else + { + lutPrev = lutBuf+widthP2*6; + lutCurr = lutBuf+widthP2*0; + lutNext = lutBuf+widthP2*3; + } + + { + int x0 = xmap[-1]; + + lutNext[0+widthP2*0] = lut[nextPtr[x0+0]]; + lutNext[0+widthP2*1] = lut[nextPtr[x0+1]]; + lutNext[0+widthP2*2] = lut[nextPtr[x0+2]]; + } for( ; x <= width - 4; x += 4 ) { int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3]; - typedef const uchar* const T; - T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1]; - T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x]; - T p22 = imgPtr + xmap[x+3], p20 = p02; - T p32 = imgPtr + xmap[x+4], p30 = p12; - v_float32x4 _dx0 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], 
lut[p32[0]]) - - v_float32x4(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]); - v_float32x4 _dx1 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]) - - v_float32x4(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]); - v_float32x4 _dx2 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]) - - v_float32x4(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]); + v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0); + v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1); + v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2); - v_float32x4 _dy0 = v_float32x4(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]) - - v_float32x4(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]); - v_float32x4 _dy1 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]) - - v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]); - v_float32x4 _dy2 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]) - - v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]); + v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]); + v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1); + + v_store(lutNext+x+widthP2*0+1, _dy00); + + v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]); + v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1); + + v_store(lutNext+x+widthP2*1+1, _dy10); + + v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]); + v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1); + + v_store(lutNext+x+widthP2*2+1, _dy20); v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0); v_float32x4 _mag1 = (_dx1 * _dx1) + 
(_dy1 * _dy1); @@ -380,6 +471,13 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, v_store(dbuf + x, _dx2); v_store(dbuf + x + width, _dy2); } + { + int x0 = xmap[x]; + + lutNext[x+widthP2*0+1] = lut[nextPtr[x0+0]]; + lutNext[x+widthP2*1+1] = lut[nextPtr[x0+1]]; + lutNext[x+widthP2*2+1] = lut[nextPtr[x0+2]]; + } #endif for( ; x < width; x++ ) {