diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 30377048ae..af4efa238c 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1133,6 +1133,41 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, return v_float32x8(_mm256_hadd_ps(ab, cd)); } +inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b) +{ + return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(a.val, b.val)); +} +inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) +{ + __m256i half = _mm256_set1_epi8(0x7f); + return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half))); +} +inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_add_wrap(a - b, b - a), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) +{ + return v_reduce_sum(v_max(a, b) - v_min(a, b)); +} +inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 m = a < b; + return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m)); +} +inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) +{ + return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); +} + /** Popcount **/ #define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \ inline v_uint32x8 v_popcount(const _Tpvec& a) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 1cfb14ae06..65a01f31d7 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ 
b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -686,10 +686,10 @@ OPENCV_HAL_IMPL_CMP_OP(!=) template<int n> inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a) { -typedef typename V_TypeTraits<float>::int_type itype; -v_reg<float, n> c; -for (int i = 0; i < n; i++) - c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i])); + typedef typename V_TypeTraits<float>::int_type itype; + v_reg<float, n> c; + for (int i = 0; i < n; i++) + c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i])); return c; } template<int n> @@ -1063,6 +1063,21 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, return r; } +/** @brief Sum absolute differences of values + +Scheme: +@code +{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{ABS(A1-B1),abs(A2-B2),abs(A3-B3),...} +@endcode +For all types except 64-bit types.*/ +template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]); + for (int i = 1; i < n; i++) + c += _absdiff(a.s[i], b.s[i]); + return c; +} + /** @brief Get negative values mask Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes. 
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 2de4e45283..1b35896009 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -999,6 +999,49 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, return v_float32x4(vaddq_f32(v0, v1)); } +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val))); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val)))); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val)); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val))); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + uint32x4_t t0 = vabdq_u32(a.val, b.val); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val)); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline float v_reduce_sad(const v_float32x4& a, 
const v_float32x4& b) +{ + float32x4_t t0 = vabdq_f32(a.val, b.val); + float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0)); + return vget_lane_f32(vpadd_f32(t1, t1), 0); +} + #define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \ inline v_uint32x4 v_popcount(const _Tpvec& a) \ { \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 283c5158d7..24a34a3921 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1477,6 +1477,41 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(a.val, b.val)); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + __m128i half = _mm_set1_epi8(0x7f); + return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(_mm_add_epi8(a.val, half), + _mm_add_epi8(b.val, half))); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 l, h; + v_expand(v_absdiff(a, b), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + v_uint32x4 l, h; + v_expand(v_absdiff(a, b), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} + #define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \ inline v_uint32x4 v_popcount(const _Tpvec& a) \ { \ @@ -1930,13 +1965,11 @@ inline void v_load_deinterleave(const unsigned* 
ptr, v_uint32x4& a, v_uint32x4& inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) { - const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); - __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1 __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3 - a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3 - b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3 + a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3 + b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 ab b3 } inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c) diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index fe4a5db5df..efea72c281 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -739,6 +739,50 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, return v_float32x4(vec_mergeh(ac, bd)); } +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + const vec_uint4 zero4 = vec_uint4_z; + vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4); + return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + const vec_int4 zero4 = vec_int4_z; + vec_char16 ad = vec_abss(vec_subs(a.val, b.val)); + vec_int4 sum4 = vec_sum4s(ad, zero4); + return (unsigned)vec_extract(vec_sums(sum4, zero4), 3); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + vec_ushort8 ad = vec_absd(a.val, b.val); + VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)), vec_int4_c(vec_unpacklu(ad))); + return (unsigned)vec_extract(sum, 3); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + const vec_int4 zero4 = vec_int4_z; + vec_short8 ad = 
vec_abss(vec_subs(a.val, b.val)); + vec_int4 sum4 = vec_sum4s(ad, zero4); + return (unsigned)vec_extract(vec_sums(sum4, zero4), 3); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + const vec_uint4 ad = vec_absd(a.val, b.val); + const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8)); + return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + vec_int4 ad = vec_abss(vec_sub(a.val, b.val)); + return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + const vec_float4 ad = vec_abs(vec_sub(a.val, b.val)); + const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8)); + return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0); +} + /** Popcount **/ template<typename _Tpvec> inline v_uint32x4 v_popcount(const _Tpvec& a) diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/core/include/opencv2/core/sse_utils.hpp index 69efffe41d..0906583ea4 100644 --- a/modules/core/include/opencv2/core/sse_utils.hpp +++ b/modules/core/include/opencv2/core/sse_utils.hpp @@ -567,7 +567,7 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1) { - const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) }; __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); @@ -588,7 +588,7 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) { - const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) }; 
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); @@ -615,7 +615,7 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) { - const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) }; __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); diff --git a/modules/core/perf/perf_norm.cpp b/modules/core/perf/perf_norm.cpp index 4e0673652b..07f989f21c 100644 --- a/modules/core/perf/perf_norm.cpp +++ b/modules/core/perf/perf_norm.cpp @@ -253,4 +253,53 @@ PERF_TEST_P( Size_MatType, normalize_minmax, TYPICAL_MATS ) SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); } +typedef TestBaseWithParam< int > test_len; +PERF_TEST_P(test_len, hal_normL1_u8, + testing::Values(300000, 2000000) + ) +{ + int len = GetParam(); + + Mat src1(1, len, CV_8UC1); + Mat src2(1, len, CV_8UC1); + + declare.in(src1, src2, WARMUP_RNG); + double n; + TEST_CYCLE() n = hal::normL1_(src1.ptr<uchar>(0), src2.ptr<uchar>(0), len); + CV_UNUSED(n); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(test_len, hal_normL1_f32, + testing::Values(300000, 2000000) + ) +{ + int len = GetParam(); + + Mat src1(1, len, CV_32FC1); + Mat src2(1, len, CV_32FC1); + + declare.in(src1, src2, WARMUP_RNG); + double n; + TEST_CYCLE() n = hal::normL1_(src1.ptr<float>(0), src2.ptr<float>(0), len); + CV_UNUSED(n); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(test_len, hal_normL2Sqr, + testing::Values(300000, 2000000) + ) +{ + int len = GetParam(); + + Mat src1(1, len, CV_32FC1); + Mat src2(1, len, CV_32FC1); + + declare.in(src1, src2, WARMUP_RNG); + double n; + TEST_CYCLE() n = hal::normL2Sqr_(src1.ptr<float>(0), src2.ptr<float>(0), len); + CV_UNUSED(n); + SANITY_CHECK_NOTHING(); +} + } 
// namespace diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp index e0642ea3d9..d3eec98e38 100644 --- a/modules/core/src/norm.cpp +++ b/modules/core/src/norm.cpp @@ -98,43 +98,15 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize) float normL2Sqr_(const float* a, const float* b, int n) { int j = 0; float d = 0.f; -#if CV_AVX2 - float CV_DECL_ALIGNED(32) buf[8]; - __m256 d0 = _mm256_setzero_ps(); - - for( ; j <= n - 8; j += 8 ) +#if CV_SIMD + v_float32 v_d = vx_setzero_f32(); + for (; j <= n - v_float32::nlanes; j += v_float32::nlanes) { - __m256 t0 = _mm256_sub_ps(_mm256_loadu_ps(a + j), _mm256_loadu_ps(b + j)); -#if CV_FMA3 - d0 = _mm256_fmadd_ps(t0, t0, d0); -#else - d0 = _mm256_add_ps(d0, _mm256_mul_ps(t0, t0)); + v_float32 t = vx_load(a + j) - vx_load(b + j); + v_d = v_muladd(t, t, v_d); + } + d = v_reduce_sum(v_d); #endif - } - _mm256_store_ps(buf, d0); - d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]; -#elif CV_SSE - float CV_DECL_ALIGNED(16) buf[4]; - __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); - - for( ; j <= n - 8; j += 8 ) - { - __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); - __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); - d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0)); - d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1)); - } - _mm_store_ps(buf, _mm_add_ps(d0, d1)); - d = buf[0] + buf[1] + buf[2] + buf[3]; -#endif - { - for( ; j <= n - 4; j += 4 ) - { - float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3]; - d += t0*t0 + t1*t1 + t2*t2 + t3*t3; - } - } - for( ; j < n; j++ ) { float t = a[j] - b[j]; @@ -147,38 +119,12 @@ float normL2Sqr_(const float* a, const float* b, int n) float normL1_(const float* a, const float* b, int n) { int j = 0; float d = 0.f; -#if CV_SSE - float CV_DECL_ALIGNED(16) buf[4]; - static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; - __m128 d0 = 
_mm_setzero_ps(), d1 = _mm_setzero_ps(); - __m128 absmask = _mm_load_ps((const float*)absbuf); - - for( ; j <= n - 8; j += 8 ) - { - __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); - __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); - d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask)); - d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask)); - } - _mm_store_ps(buf, _mm_add_ps(d0, d1)); - d = buf[0] + buf[1] + buf[2] + buf[3]; -#elif CV_NEON - float32x4_t v_sum = vdupq_n_f32(0.0f); - for ( ; j <= n - 4; j += 4) - v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); - - float CV_DECL_ALIGNED(16) buf[4]; - vst1q_f32(buf, v_sum); - d = buf[0] + buf[1] + buf[2] + buf[3]; +#if CV_SIMD + v_float32 v_d = vx_setzero_f32(); + for (; j <= n - v_float32::nlanes; j += v_float32::nlanes) + v_d += v_absdiff(vx_load(a + j), vx_load(b + j)); + d = v_reduce_sum(v_d); #endif - { - for( ; j <= n - 4; j += 4 ) - { - d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + - std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); - } - } - for( ; j < n; j++ ) d += std::abs(a[j] - b[j]); return d; @@ -187,46 +133,10 @@ float normL1_(const float* a, const float* b, int n) int normL1_(const uchar* a, const uchar* b, int n) { int j = 0, d = 0; -#if CV_SSE - __m128i d0 = _mm_setzero_si128(); - - for( ; j <= n - 16; j += 16 ) - { - __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j)); - __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j)); - - d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); - } - - for( ; j <= n - 4; j += 4 ) - { - __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j)); - __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j)); - - d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); - } - d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); -#elif CV_NEON - uint32x4_t v_sum = vdupq_n_u32(0.0f); - for ( ; j <= n - 16; j += 16) - { - uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); - uint16x8_t v_low = 
vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); - v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); - v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); - } - - uint CV_DECL_ALIGNED(16) buf[4]; - vst1q_u32(buf, v_sum); - d = buf[0] + buf[1] + buf[2] + buf[3]; +#if CV_SIMD + for (; j <= n - v_uint8::nlanes; j += v_uint8::nlanes) + d += v_reduce_sad(vx_load(a + j), vx_load(b + j)); #endif - { - for( ; j <= n - 4; j += 4 ) - { - d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + - std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); - } - } for( ; j < n; j++ ) d += std::abs(a[j] - b[j]); return d; diff --git a/modules/core/src/utils/filesystem.cpp b/modules/core/src/utils/filesystem.cpp index de18fb81fb..43503f1d72 100644 --- a/modules/core/src/utils/filesystem.cpp +++ b/modules/core/src/utils/filesystem.cpp @@ -563,6 +563,7 @@ cv::String getCacheDirectory(const char* sub_directory_name, const char* configu cv::String canonical(const cv::String& /*path*/) { NOT_IMPLEMENTED } bool exists(const cv::String& /*path*/) { NOT_IMPLEMENTED } void remove_all(const cv::String& /*path*/) { NOT_IMPLEMENTED } +cv::String getcwd() { NOT_IMPLEMENTED } bool createDirectory(const cv::String& /*path*/) { NOT_IMPLEMENTED } bool createDirectories(const cv::String& /*path*/) { NOT_IMPLEMENTED } cv::String getCacheDirectory(const char* /*sub_directory_name*/, const char* /*configuration_name = NULL*/) { NOT_IMPLEMENTED } diff --git a/modules/core/src/va_intel.cpp b/modules/core/src/va_intel.cpp index ac74f0c533..c571b90b5f 100644 --- a/modules/core/src/va_intel.cpp +++ b/modules/core/src/va_intel.cpp @@ -340,8 +340,8 @@ static void copy_convert_yv12_to_bgr(const VAImage& image, const unsigned char* 1.5959997177f }; - CV_CheckEQ(image.format.fourcc, VA_FOURCC_YV12, "Unexpected image format"); - CV_CheckEQ(image.num_planes, 3, ""); + CV_CheckEQ((size_t)image.format.fourcc, (size_t)VA_FOURCC_YV12, 
"Unexpected image format"); + CV_CheckEQ((size_t)image.num_planes, (size_t)3, ""); const size_t srcOffsetY = image.offsets[0]; const size_t srcOffsetV = image.offsets[1]; @@ -417,8 +417,8 @@ static void copy_convert_bgr_to_yv12(const VAImage& image, const Mat& bgr, unsig -0.2909994125f, 0.438999176f, -0.3679990768f, -0.0709991455f }; - CV_CheckEQ(image.format.fourcc, VA_FOURCC_YV12, "Unexpected image format"); - CV_CheckEQ(image.num_planes, 3, ""); + CV_CheckEQ((size_t)image.format.fourcc, (size_t)VA_FOURCC_YV12, "Unexpected image format"); + CV_CheckEQ((size_t)image.num_planes, (size_t)3, ""); const size_t dstOffsetY = image.offsets[0]; const size_t dstOffsetV = image.offsets[1]; diff --git a/modules/imgproc/perf/perf_contours.cpp b/modules/imgproc/perf/perf_contours.cpp index 7606605cce..d3a70cfdd7 100644 --- a/modules/imgproc/perf/perf_contours.cpp +++ b/modules/imgproc/perf/perf_contours.cpp @@ -3,7 +3,7 @@ // of this distribution and at http://opencv.org/license.html. #include "perf_precomp.hpp" -namespace opencv_test { +namespace opencv_test { namespace { CV_ENUM(RetrMode, RETR_EXTERNAL, RETR_LIST, RETR_CCOMP, RETR_TREE) CV_ENUM(ApproxMode, CHAIN_APPROX_NONE, CHAIN_APPROX_SIMPLE, CHAIN_APPROX_TC89_L1, CHAIN_APPROX_TC89_KCOS) @@ -84,4 +84,4 @@ PERF_TEST_P(TestFindContoursFF, findContours, SANITY_CHECK_NOTHING(); } -} // namespace +} } // namespace diff --git a/modules/imgproc/src/grabcut.cpp b/modules/imgproc/src/grabcut.cpp index d59117f745..c7fd91c25a 100644 --- a/modules/imgproc/src/grabcut.cpp +++ b/modules/imgproc/src/grabcut.cpp @@ -175,7 +175,6 @@ void GMM::addSample( int ci, const Vec3d color ) void GMM::endLearning() { - CV_Assert(totalSampleCount > 0); for( int ci = 0; ci < componentsCount; ci++ ) { int n = sampleCounts[ci]; @@ -183,6 +182,7 @@ void GMM::endLearning() coefs[ci] = 0; else { + CV_Assert(totalSampleCount > 0); double inv_n = 1.0 / n; coefs[ci] = (double)n/totalSampleCount; diff --git a/modules/imgproc/src/median_blur.cpp 
b/modules/imgproc/src/median_blur.cpp index 24002d3b83..9f5a9ba7d5 100644 --- a/modules/imgproc/src/median_blur.cpp +++ b/modules/imgproc/src/median_blur.cpp @@ -74,30 +74,27 @@ namespace cv { -namespace -{ - -typedef ushort HT; - -/** - * This structure represents a two-tier histogram. The first tier (known as the - * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level) - * is 8 bit wide. Pixels inserted in the fine level also get inserted into the - * coarse bucket designated by the 4 MSBs of the fine bucket value. - * - * The structure is aligned on 16 bits, which is a prerequisite for SIMD - * instructions. Each bucket is 16 bit wide, which means that extra care must be - * taken to prevent overflow. - */ -typedef struct -{ - HT coarse[16]; - HT fine[16][16]; -} Histogram; - static void medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) { + typedef ushort HT; + + /** + * This structure represents a two-tier histogram. The first tier (known as the + * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level) + * is 8 bit wide. Pixels inserted in the fine level also get inserted into the + * coarse bucket designated by the 4 MSBs of the fine bucket value. + * + * The structure is aligned on 16 bits, which is a prerequisite for SIMD + * instructions. Each bucket is 16 bit wide, which means that extra care must be + * taken to prevent overflow. + */ + typedef struct + { + HT coarse[16]; + HT fine[16][16]; + } Histogram; + /** * HOP is short for Histogram OPeration. This macro makes an operation \a op on * histogram \a h for pixel value \a x. It takes care of handling both levels. 
@@ -136,7 +133,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) for( c = 0; c < cn; c++ ) { for( j = 0; j < n; j++ ) - COP( c, j, src[cn*j+c], += (cv::HT)(r+2) ); + COP( c, j, src[cn*j+c], += (HT)(r+2) ); for( i = 1; i < r; i++ ) { @@ -172,7 +169,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_store(H[c].fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k] + 8)); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] += (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]; + H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]); #endif } @@ -245,7 +242,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) memset(&H[c].fine[k], 0, 16 * sizeof(HT)); #endif px = h_fine + 16 * (n*(16 * c + k) + j - r); - for (luc[c][k] = cv::HT(j - r); luc[c][k] < MIN(j + r + 1, n); ++luc[c][k], px += 16) + for (luc[c][k] = HT(j - r); luc[c][k] < MIN(j + r + 1, n); ++luc[c][k], px += 16) { #if CV_SIMD256 v_fine += v256_load(px); @@ -268,7 +265,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] += (j + r + 1 - n) * px[ind]; + H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (j + r + 1 - n) * px[ind]); #endif luc[c][k] = (HT)(j+r+1); } @@ -479,6 +476,8 @@ medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m ) } +namespace { + struct MinMax8u { typedef uchar value_type; diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 123ad4fd72..f182b77d0c 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -3782,9 +3782,9 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, Size ssize = _src.size(); CV_Assert( !ssize.empty() ); - CV_Assert( !dsize.empty() || (inv_scale_x > 0 && inv_scale_y > 0) ); if( dsize.empty() ) { + 
CV_Assert(inv_scale_x > 0); CV_Assert(inv_scale_y > 0); dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x), saturate_cast<int>(ssize.height*inv_scale_y)); CV_Assert( !dsize.empty() ); @@ -3793,6 +3793,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, { inv_scale_x = (double)dsize.width/ssize.width; inv_scale_y = (double)dsize.height/ssize.height; + CV_Assert(inv_scale_x > 0); CV_Assert(inv_scale_y > 0); } if (interpolation == INTER_LINEAR_EXACT && (_src.depth() == CV_32F || _src.depth() == CV_64F)) diff --git a/modules/imgproc/test/test_resize_bitexact.cpp b/modules/imgproc/test/test_resize_bitexact.cpp index 2dafd50216..f76eb6f9d2 100644 --- a/modules/imgproc/test/test_resize_bitexact.cpp +++ b/modules/imgproc/test/test_resize_bitexact.cpp @@ -45,9 +45,9 @@ TEST(Resize_Bitexact, Linear8U) { CV_8UC4, Size( 4, 3) }, { CV_8UC1, Size( 342, 384) }, // 1/3 1/2 { CV_8UC1, Size( 342, 256) }, // 1/3 1/3 - { CV_8UC1, Size( 342, 256) }, - { CV_8UC1, Size( 342, 256) }, - { CV_8UC1, Size( 342, 256) }, + { CV_8UC2, Size( 342, 256) }, + { CV_8UC3, Size( 342, 256) }, + { CV_8UC4, Size( 342, 256) }, { CV_8UC1, Size( 512, 256) }, // 1/2 1/3 { CV_8UC1, Size( 146, 110) }, // 1/7 1/7 { CV_8UC3, Size( 146, 110) }, @@ -83,13 +83,13 @@ TEST(Resize_Bitexact, Linear8U) softdouble scale_y = softdouble::one() / softdouble(inv_scale_y); Mat src(rows, cols, type), refdst(drows, dcols, type), dst; + RNG rnd(0x123456789abcdefULL); for (int j = 0; j < rows; j++) { uint8_t* line = src.ptr(j); for (int i = 0; i < cols; i++) for (int c = 0; c < cn; c++) { - RNG rnd(0x123456789abcdefULL); double val = j < rows / 2 ? ( i < cols / 2 ? ((sin((i + 1)*CV_PI / 256.)*sin((j + 1)*CV_PI / 256.)*sin((cn + 4)*CV_PI / 8.) + 1.)*128.) : (((i / 128 + j / 128) % 2) * 250 + (j / 128) % 2) ) : ( i < cols / 2 ? 
((i / 128) * (85 - j / 256 * 40) * ((j / 128) % 2) + (7 - i / 128) * (85 - j / 256 * 40) * ((j / 128 + 1) % 2)) : diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp index bb639942e0..9a719e44f2 100644 --- a/modules/objdetect/src/qrcode.cpp +++ b/modules/objdetect/src/qrcode.cpp @@ -959,8 +959,7 @@ bool QRDecode::samplingForVersion() const int delta_rows = cvRound((postIntermediate.rows * 1.0) / version_size); const int delta_cols = cvRound((postIntermediate.cols * 1.0) / version_size); - vector<double> listFrequencyElem(version_size * version_size, 0); - int k = 0; + vector<double> listFrequencyElem; for (int r = 0; r < postIntermediate.rows; r += delta_rows) { for (int c = 0; c < postIntermediate.cols; c += delta_cols) @@ -969,7 +968,7 @@ bool QRDecode::samplingForVersion() Range(r, min(r + delta_rows, postIntermediate.rows)), Range(c, min(c + delta_cols, postIntermediate.cols))); const double frequencyElem = (countNonZero(tile) * 1.0) / tile.total(); - listFrequencyElem[k] = frequencyElem; k++; + listFrequencyElem.push_back(frequencyElem); } } diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp index df77447c83..b9d6b74ffc 100644 --- a/modules/ts/include/opencv2/ts.hpp +++ b/modules/ts/include/opencv2/ts.hpp @@ -17,6 +17,8 @@ #include "opencv2/core/utils/trace.hpp" +#include "opencv2/core/hal/hal.hpp" + #include <stdarg.h> // for va_list #include "cvconfig.h" diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp index 620a42e27b..930abb1bbf 100644 --- a/modules/videoio/src/cap_v4l.cpp +++ b/modules/videoio/src/cap_v4l.cpp @@ -347,8 +347,17 @@ struct CvCaptureCAM_V4L CV_FINAL : public CvCapture /*********************** Implementations ***************************************/ -CvCaptureCAM_V4L::CvCaptureCAM_V4L() : deviceHandle(-1), bufferIndex(-1) +CvCaptureCAM_V4L::CvCaptureCAM_V4L() : + deviceHandle(-1), bufferIndex(-1), + FirstCapture(true), + palette(0), + width(0), height(0), width_set(0), 
height_set(0), + bufferSize(DEFAULT_V4L_BUFFERS), + fps(0), convert_rgb(0), frame_allocated(false), returnFrame(false), + channelNumber(-1), normalizePropRange(false), + type(V4L2_BUF_TYPE_VIDEO_CAPTURE) { + frame = cvIplImage(); memset(&timestamp, 0, sizeof(timestamp)); } diff --git a/platforms/ios/build_framework.py b/platforms/ios/build_framework.py index f4970633e5..5a2ea41991 100755 --- a/platforms/ios/build_framework.py +++ b/platforms/ios/build_framework.py @@ -285,12 +285,15 @@ if __name__ == "__main__": parser.add_argument('--disable-bitcode', default=False, dest='bitcodedisabled', action='store_true', help='disable bitcode (enabled by default)') parser.add_argument('--iphoneos_deployment_target', default=os.environ.get('IPHONEOS_DEPLOYMENT_TARGET', IPHONEOS_DEPLOYMENT_TARGET), help='specify IPHONEOS_DEPLOYMENT_TARGET') parser.add_argument('--iphoneos_archs', default='armv7,armv7s,arm64', help='select iPhoneOS target ARCHS') + parser.add_argument('--iphonesimulator_archs', default='i386,x86_64', help='select iPhoneSimulator target ARCHS') args = parser.parse_args() os.environ['IPHONEOS_DEPLOYMENT_TARGET'] = args.iphoneos_deployment_target print('Using IPHONEOS_DEPLOYMENT_TARGET=' + os.environ['IPHONEOS_DEPLOYMENT_TARGET']) iphoneos_archs = args.iphoneos_archs.split(',') print('Using iPhoneOS ARCHS=' + str(iphoneos_archs)) + iphonesimulator_archs = args.iphonesimulator_archs.split(',') + print('Using iPhoneSimulator ARCHS=' + str(iphonesimulator_archs)) b = iOSBuilder(args.opencv, args.contrib, args.dynamic, args.bitcodedisabled, args.without, [ @@ -298,6 +301,6 @@ if __name__ == "__main__": ] if os.environ.get('BUILD_PRECOMMIT', None) else [ (iphoneos_archs, "iPhoneOS"), - (["i386", "x86_64"], "iPhoneSimulator"), + (iphonesimulator_archs, "iPhoneSimulator"), ]) b.build(args.out)