From e932160a8d5409afad94a7ac8d2bd8fc77067822 Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Fri, 22 Sep 2017 23:43:05 +0900 Subject: [PATCH] replace raw SSE2/NEON implementation with universal intrinsic --- modules/imgproc/src/smooth.cpp | 698 +++++++++------------------------ 1 file changed, 177 insertions(+), 521 deletions(-) diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 889ce2e022..f5d0f8cbd5 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -299,11 +299,9 @@ struct ColumnSum : bool haveScale = scale != 1; double _scale = scale; - #if CV_SSE2 - bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - #elif CV_NEON - bool haveNEON = checkHardwareSupport(CV_CPU_NEON); - #endif +#if CV_SIMD128 + bool haveSIMD128 = hasSIMD128(); +#endif if( width != (int)sum.size() ) { @@ -319,23 +317,15 @@ struct ColumnSum : { const int* Sp = (const int*)src[0]; int i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { - for( ; i <= width-4; i+=4 ) + for (; i <= width - 4; i += 4) { - __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); - __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); - _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); } } - #elif CV_NEON - if(haveNEON) - { - for( ; i <= width - 4; i+=4 ) - vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); - } - #endif +#endif for( ; i < width; i++ ) SUM[i] += Sp[i]; } @@ -354,51 +344,27 @@ struct ColumnSum : if( haveScale ) { int i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { - const __m128 scale4 = _mm_set1_ps((float)_scale); + + v_float32x4 v_scale = v_setall_f32((float)_scale); for( ; i <= width-8; i+=8 ) { - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); - __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)), - _mm_loadu_si128((const __m128i*)(Sp+i+4))); + v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); + v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); - __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0))); - __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01))); + v_uint16x8 v_dst = v_pack(v_s0d, v_s01d); + v_pack_store(D + i, v_dst); - _s0T = _mm_packs_epi32(_s0T, _s0T1); - - _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T)); - - _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); - _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1)); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); } } - #elif CV_NEON - if(haveNEON) - { - float32x4_t v_scale = vdupq_n_f32((float)_scale); - for( ; i <= width-8; i+=8 ) - { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); - - uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); - uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)); - vst1_u8(D + i, vqmovn_u16(v_dst)); - - vst1q_s32(SUM + i, 
vsubq_s32(v_s0, vld1q_s32(Sm + i))); - vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); - } - } - #endif +#endif for( ; i < width; i++ ) { int s0 = SUM[i] + Sp[i]; @@ -409,43 +375,22 @@ struct ColumnSum : else { int i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { for( ; i <= width-8; i+=8 ) { - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); - __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)), - _mm_loadu_si128((const __m128i*)(Sp+i+4))); + v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); + v_pack_store(D + i, v_dst); - __m128i _s0T = _mm_packs_epi32(_s0, _s01); - - _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T)); - - _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); - _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1)); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); } } - #elif CV_NEON - if(haveNEON) - { - for( ; i <= width-8; i+=8 ) - { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); - - uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)); - vst1_u8(D + i, vqmovn_u16(v_dst)); - - vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); - vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); - } - } - #endif +#endif for( ; i < width; i++ ) { @@ -502,10 +447,8 @@ public BaseColumnFilter ushort* SUM; const bool haveScale = scale != 1; -#if CV_SSE2 - bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); -#elif CV_NEON - bool haveNEON = checkHardwareSupport(CV_CPU_NEON); +#if CV_SIMD128 + bool haveSIMD128 = hasSIMD128(); #endif if( width != (int)sum.size() ) @@ -522,22 +465,14 @@ public BaseColumnFilter { const ushort* Sp = (const ushort*)src[0]; int i = 0; -#if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { - for( ; i <= width-8; i+=8 ) + for( ; i <= width - 8; i += 8 ) { - __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); - __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); - _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi16(_sum, _sp)); + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); } } -#elif CV_NEON - if(haveNEON) - { - for( ; i <= width - 8; i+=8 ) - vst1q_u16(SUM + i, vaddq_u16(vld1q_u16(SUM + i), vld1q_u16(Sp + i))); - } #endif for( ; i < width; i++ ) SUM[i] += Sp[i]; @@ -641,11 +576,9 @@ struct ColumnSum : bool haveScale = scale != 1; double _scale = scale; - #if CV_SSE2 - bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - #elif CV_NEON - bool haveNEON = checkHardwareSupport(CV_CPU_NEON); - #endif +#if CV_SIMD128 + bool haveSIMD128 = hasSIMD128(); +#endif if( width != (int)sum.size() ) { @@ -661,21 +594,13 @@ struct ColumnSum : { const int* Sp = (const int*)src[0]; i = 0; - #if CV_SSE2 - if(haveSSE2) - { - for( ; i <= width-4; i+=4 ) - { - __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); - __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); - _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); - } - } - #elif CV_NEON - if(haveNEON) +#if CV_SIMD128 + if( haveSIMD128 ) { for( ; i <= width - 4; i+=4 ) - vst1q_s32(SUM + i, 
vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } } #endif for( ; i < width; i++ ) @@ -696,47 +621,24 @@ struct ColumnSum : if( haveScale ) { i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { - const __m128 scale4 = _mm_set1_ps((float)_scale); + v_float32x4 v_scale = v_setall_f32((float)_scale); for( ; i <= width-8; i+=8 ) { - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); - __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)), - _mm_loadu_si128((const __m128i*)(Sp+i+4))); + v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); + v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale); + v_store(D + i, v_pack(v_s0d, v_s01d)); - __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0))); - __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01))); - - _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1)); - - _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm)); - _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1)); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); } } - #elif CV_NEON - if(haveNEON) - { - float32x4_t v_scale = vdupq_n_f32((float)_scale); - for( ; i <= width-8; i+=8 ) - { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); - - int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); - int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); - vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d))); - - vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); - vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); - } - } - #endif +#endif for( ; i < width; i++ ) { int s0 = SUM[i] + Sp[i]; @@ -747,41 +649,21 @@ struct ColumnSum : else { i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { for( ; i <= width-8; i+=8 ) { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); + v_store(D + i, v_pack(v_s0, v_s01)); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); - __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)), - _mm_loadu_si128((const __m128i*)(Sp+i+4))); - - _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01)); - - _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); - _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1)); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); } } - #elif CV_NEON - if(haveNEON) - { - for( ; i <= width-8; i+=8 ) - { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); - - vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01))); - - vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); - vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, 
vld1q_s32(Sm + i + 4))); - } - } - #endif +#endif for( ; i < width; i++ ) { @@ -821,11 +703,9 @@ struct ColumnSum : bool haveScale = scale != 1; double _scale = scale; - #if CV_SSE2 - bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - #elif CV_NEON - bool haveNEON = checkHardwareSupport(CV_CPU_NEON); - #endif +#if CV_SIMD128 + bool haveSIMD128 = hasSIMD128(); +#endif if( width != (int)sum.size() ) { @@ -841,23 +721,15 @@ struct ColumnSum : { const int* Sp = (const int*)src[0]; int i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { - for( ; i <= width-4; i+=4 ) + for (; i <= width - 4; i += 4) { - __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); - __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); - _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); } } - #elif CV_NEON - if(haveNEON) - { - for( ; i <= width - 4; i+=4 ) - vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); - } - #endif +#endif for( ; i < width; i++ ) SUM[i] += Sp[i]; } @@ -876,46 +748,24 @@ struct ColumnSum : if( haveScale ) { int i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { - const __m128 scale4 = _mm_set1_ps((float)_scale); - const __m128i delta0 = _mm_set1_epi32(0x8000); - const __m128i delta1 = _mm_set1_epi32(0x80008000); - - for( ; i < width-4; i+=4) - { - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); - - __m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0))); - - _res = _mm_sub_epi32(_res, delta0); - _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1); - - _mm_storel_epi64((__m128i*)(D+i), _res); - _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); - } - } - #elif CV_NEON - if(haveNEON) - { - float32x4_t v_scale = vdupq_n_f32((float)_scale); + v_float32x4 v_scale = v_setall_f32((float)_scale); for( ; i <= width-8; i+=8 ) { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); - uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); - vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d))); + v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); + v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); + v_store(D + i, v_pack(v_s0d, v_s01d)); - vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); - vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); } } - #endif +#endif for( ; i < width; i++ ) { int s0 = SUM[i] + Sp[i]; @@ -926,41 +776,21 @@ struct ColumnSum : else { int i = 0; - #if CV_SSE2 - if(haveSSE2) - { - const __m128i delta0 = _mm_set1_epi32(0x8000); - const __m128i delta1 = _mm_set1_epi32(0x80008000); - - for( ; i < width-4; i+=4 ) - { - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); - - __m128i _res = _mm_sub_epi32(_s0, delta0); - _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1); - - 
_mm_storel_epi64((__m128i*)(D+i), _res); - _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); - } - } - #elif CV_NEON - if(haveNEON) +#if CV_SIMD128 + if( haveSIMD128 ) { for( ; i <= width-8; i+=8 ) { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01))); + v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); - vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); - vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); } } - #endif - +#endif for( ; i < width; i++ ) { int s0 = SUM[i] + Sp[i]; @@ -998,11 +828,9 @@ struct ColumnSum : bool haveScale = scale != 1; double _scale = scale; - #if CV_SSE2 - bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - #elif CV_NEON - bool haveNEON = checkHardwareSupport(CV_CPU_NEON); - #endif +#if CV_SIMD128 + bool haveSIMD128 = hasSIMD128(); +#endif if( width != (int)sum.size() ) { @@ -1018,23 +846,15 @@ struct ColumnSum : { const int* Sp = (const int*)src[0]; int i = 0; - #if CV_SSE2 - if(haveSSE2) - { - for( ; i <= width-4; i+=4 ) - { - __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); - __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); - _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); - } - } - #elif CV_NEON - if(haveNEON) +#if CV_SIMD128 + if( haveSIMD128 ) { for( ; i <= width - 4; i+=4 ) - vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } } - #endif +#endif for( ; i < width; i++ ) SUM[i] += Sp[i]; } @@ -1053,38 +873,20 @@ struct ColumnSum : if( haveScale ) { int i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { - const __m128 scale4 = _mm_set1_ps((float)_scale); + v_float32x4 v_scale = v_setall_f32((float)_scale); for( ; i <= width-4; i+=4 ) { - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); - - __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0))); - - _mm_storeu_si128((__m128i*)(D+i), _s0T); - _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm)); + v_store(D + i, v_s0d); + v_store(SUM + i, v_s0 - v_load(Sm + i)); } } - #elif CV_NEON - if(haveNEON) - { - float32x4_t v_scale = vdupq_n_f32((float)_scale); - for( ; i <= width-4; i+=4 ) - { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - - int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); - vst1q_s32(D + i, v_s0d); - - vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); - } - } - #endif +#endif for( ; i < width; i++ ) { int s0 = SUM[i] + Sp[i]; @@ -1095,32 +897,18 @@ struct ColumnSum : else { int i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { for( ; i <= width-4; i+=4 ) { - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - 
_mm_storeu_si128((__m128i*)(D+i), _s0); - _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); + v_store(D + i, v_s0); + v_store(SUM + i, v_s0 - v_load(Sm + i)); } } - #elif CV_NEON - if(haveNEON) - { - for( ; i <= width-4; i+=4 ) - { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - - vst1q_s32(D + i, v_s0); - vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); - } - } - #endif - +#endif for( ; i < width; i++ ) { int s0 = SUM[i] + Sp[i]; @@ -1159,11 +947,9 @@ struct ColumnSum : bool haveScale = scale != 1; double _scale = scale; - #if CV_SSE2 - bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - #elif CV_NEON - bool haveNEON = checkHardwareSupport(CV_CPU_NEON); - #endif +#if CV_SIMD128 + bool haveSIMD128 = hasSIMD128(); +#endif if( width != (int)sum.size() ) { @@ -1179,23 +965,15 @@ struct ColumnSum : { const int* Sp = (const int*)src[0]; int i = 0; - #if CV_SSE2 - if(haveSSE2) - { - for( ; i <= width-4; i+=4 ) - { - __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); - __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); - _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); - } - } - #elif CV_NEON - if(haveNEON) +#if CV_SIMD128 + if( haveSIMD128 ) { for( ; i <= width - 4; i+=4 ) - vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } } - #endif +#endif for( ; i < width; i++ ) SUM[i] += Sp[i]; @@ -1216,39 +994,23 @@ struct ColumnSum : { int i = 0; - #if CV_SSE2 - if(haveSSE2) +#if CV_SIMD128 + if( haveSIMD128 ) { - const __m128 scale4 = _mm_set1_ps((float)_scale); - - for( ; i < width-4; i+=4) + v_float32x4 v_scale = v_setall_f32((float)_scale); + for (; i <= width - 8; i += 8) { - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0))); - _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); + v_store(D + i, v_cvt_f32(v_s0) * v_scale); + v_store(D + i + 4, v_cvt_f32(v_s01) * v_scale); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); } } - #elif CV_NEON - if(haveNEON) - { - float32x4_t v_scale = vdupq_n_f32((float)_scale); - for( ; i <= width-8; i+=8 ) - { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); - - vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); - vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); - - vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); - vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); - } - } - #endif - +#endif for( ; i < width; i++ ) { int s0 = SUM[i] + Sp[i]; @@ -1260,36 +1022,22 @@ struct ColumnSum : { int i = 0; - #if CV_SSE2 - if(haveSSE2) - { - for( ; i < width-4; i+=4) - { - __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); - - _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0)); - _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); - } - } - #elif CV_NEON - if(haveNEON) +#if CV_SIMD128 + if( haveSIMD128 ) { for( ; i <= width-8; i+=8 ) { - int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); - int32x4_t v_s01 = 
vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - vst1q_f32(D + i, vcvtq_f32_s32(v_s0)); - vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01)); + v_store(D + i, v_cvt_f32(v_s0)); + v_store(D + i + 4, v_cvt_f32(v_s01)); - vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); - vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); } } - #endif - +#endif for( ; i < width; i++ ) { int s0 = SUM[i] + Sp[i]; @@ -2395,46 +2143,20 @@ typedef struct } Histogram; -#if CV_SSE2 -#define MEDIAN_HAVE_SIMD 1 +#if CV_SIMD128 static inline void histogram_add_simd( const HT x[16], HT y[16] ) { - const __m128i* rx = (const __m128i*)x; - __m128i* ry = (__m128i*)y; - __m128i r0 = _mm_add_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0)); - __m128i r1 = _mm_add_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1)); - _mm_store_si128(ry+0, r0); - _mm_store_si128(ry+1, r1); + v_store(y, v_load(x) + v_load(y)); + v_store(y + 8, v_load(x + 8) + v_load(y + 8)); } static inline void histogram_sub_simd( const HT x[16], HT y[16] ) { - const __m128i* rx = (const __m128i*)x; - __m128i* ry = (__m128i*)y; - __m128i r0 = _mm_sub_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0)); - __m128i r1 = _mm_sub_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1)); - _mm_store_si128(ry+0, r0); - _mm_store_si128(ry+1, r1); + v_store(y, v_load(y) - v_load(x)); + v_store(y + 8, v_load(y + 8) - v_load(x + 8)); } -#elif CV_NEON -#define MEDIAN_HAVE_SIMD 1 - -static inline void histogram_add_simd( const HT x[16], HT y[16] ) -{ - vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y))); - vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8))); -} - -static inline void histogram_sub_simd( const HT x[16], HT y[16] ) -{ - vst1q_u16(y, vsubq_u16(vld1q_u16(y), vld1q_u16(x))); - vst1q_u16(y + 8, vsubq_u16(vld1q_u16(y + 8), vld1q_u16(x + 8))); -} - -#else -#define MEDIAN_HAVE_SIMD 0 #endif @@ -2486,8 +2208,8 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) std::vector _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16); HT* h_coarse = alignPtr(&_h_coarse[0], 16); HT* h_fine = alignPtr(&_h_fine[0], 16); -#if MEDIAN_HAVE_SIMD - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); +#if CV_SIMD128 + volatile bool useSIMD = hasSIMD128(); #endif for( int x = 0; x < _dst.cols; x += STRIPE_SIZE ) @@ -2533,7 +2255,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) for( k = 0; k < 16; ++k ) histogram_muladd( 2*r+1, &h_fine[16*n*(16*c+k)], &H[c].fine[k][0] ); - #if MEDIAN_HAVE_SIMD +#if CV_SIMD128 if( useSIMD ) { for( j = 0; j < 2*r; ++j ) @@ -2597,7 +2319,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) } } else - #endif +#endif { for( j = 0; j < 2*r; ++j ) histogram_add( &h_coarse[16*(n*c+j)], H[c].coarse ); @@ -2871,20 +2593,20 @@ struct MinMax32f } }; -#if CV_SSE2 +#if CV_SIMD128 struct MinMaxVec8u { typedef uchar value_type; - typedef __m128i arg_type; + typedef v_uint8x16 arg_type; enum { SIZE = 16 }; - arg_type load(const uchar* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } - void store(uchar* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); } + arg_type load(const uchar* ptr) { return v_load(ptr); } + void store(uchar* ptr, const arg_type &val) { v_store(ptr, val); } void operator()(arg_type& a, arg_type& b) const { arg_type t = a; - a = 
_mm_min_epu8(a, b); - b = _mm_max_epu8(b, t); + a = v_min(a, b); + b = v_max(b, t); } }; @@ -2892,15 +2614,15 @@ struct MinMaxVec8u struct MinMaxVec16u { typedef ushort value_type; - typedef __m128i arg_type; + typedef v_uint16x8 arg_type; enum { SIZE = 8 }; - arg_type load(const ushort* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } - void store(ushort* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); } + arg_type load(const ushort* ptr) { return v_load(ptr); } + void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); } void operator()(arg_type& a, arg_type& b) const { - arg_type t = _mm_subs_epu16(a, b); - a = _mm_subs_epu16(a, t); - b = _mm_adds_epu16(b, t); + arg_type t = a; + a = v_min(a, b); + b = v_max(b, t); } }; @@ -2908,15 +2630,15 @@ struct MinMaxVec16u struct MinMaxVec16s { typedef short value_type; - typedef __m128i arg_type; + typedef v_int16x8 arg_type; enum { SIZE = 8 }; - arg_type load(const short* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } - void store(short* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); } + arg_type load(const short* ptr) { return v_load(ptr); } + void store(short* ptr, const arg_type &val) { v_store(ptr, val); } void operator()(arg_type& a, arg_type& b) const { arg_type t = a; - a = _mm_min_epi16(a, b); - b = _mm_max_epi16(b, t); + a = v_min(a, b); + b = v_max(b, t); } }; @@ -2924,84 +2646,18 @@ struct MinMaxVec16s struct MinMaxVec32f { typedef float value_type; - typedef __m128 arg_type; + typedef v_float32x4 arg_type; enum { SIZE = 4 }; - arg_type load(const float* ptr) { return _mm_loadu_ps(ptr); } - void store(float* ptr, arg_type val) { _mm_storeu_ps(ptr, val); } + arg_type load(const float* ptr) { return v_load(ptr); } + void store(float* ptr, const arg_type &val) { v_store(ptr, val); } void operator()(arg_type& a, arg_type& b) const { arg_type t = a; - a = _mm_min_ps(a, b); - b = _mm_max_ps(b, t); + a = v_min(a, b); + b = v_max(b, t); } }; -#elif CV_NEON - -struct MinMaxVec8u -{ - typedef uchar value_type; - typedef uint8x16_t arg_type; - enum { SIZE = 16 }; - arg_type load(const uchar* ptr) { return vld1q_u8(ptr); } - void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = vminq_u8(a, b); - b = vmaxq_u8(b, t); - } -}; - - -struct MinMaxVec16u -{ - typedef ushort value_type; - typedef uint16x8_t arg_type; - enum { SIZE = 8 }; - arg_type load(const ushort* ptr) { return vld1q_u16(ptr); } - void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = vminq_u16(a, b); - b = vmaxq_u16(b, t); - } -}; - - -struct MinMaxVec16s -{ - typedef short value_type; - typedef int16x8_t arg_type; - enum { SIZE = 8 }; - arg_type load(const short* ptr) { return vld1q_s16(ptr); } - void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = vminq_s16(a, b); - b = vmaxq_s16(b, t); - } -}; - - -struct MinMaxVec32f -{ - typedef float value_type; - typedef float32x4_t arg_type; - enum { SIZE = 4 }; - arg_type load(const float* ptr) { return vld1q_f32(ptr); } - void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = vminq_f32(a, b); - b = vmaxq_f32(b, t); - } -}; - - #else typedef MinMax8u MinMaxVec8u; @@ -3027,7 +2683,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) int i, j, k, cn = 
_src.channels(); Op op; VecOp vop; - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + volatile bool useSIMD = hasSIMD128(); if( m == 3 ) { @@ -3478,7 +3134,7 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize ) #endif bool useSortNet = ksize == 3 || (ksize == 5 -#if !(CV_SSE2 || CV_NEON) +#if !(CV_SIMD128) && ( src0.depth() > CV_8U || src0.channels() == 2 || src0.channels() > 4 ) #endif ); @@ -3513,7 +3169,7 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize ) double img_size_mp = (double)(src0.total())/(1 << 20); if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)* - (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 1 : 3)) + (CV_SIMD128 && hasSIMD128() ? 1 : 3)) medianBlur_8u_Om( src, dst, ksize ); else medianBlur_8u_O1( src, dst, ksize );
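
For reference, here is a minimal, self-contained sketch of the pattern this patch converts ColumnSum to: load two 4-lane int32 partial sums, scale in float, round back to int32, and narrow to uchar through saturating packs. The function scaleSumToU8 and its parameters are illustrative only (they are not part of the patch or of OpenCV's public API); the intrinsics themselves (v_load/v_store, v_cvt_f32, v_round, v_pack, v_pack_store, hasSIMD128) are the ones the diff uses.

    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    // One column pass of a box filter with normalization:
    //   D[i]   = saturate_cast<uchar>((SUM[i] + Sp[i]) * scale)
    //   SUM[i] = SUM[i] + Sp[i] - Sm[i]   (add incoming row, drop outgoing row)
    static void scaleSumToU8(int* SUM, const int* Sp, const int* Sm,
                             uchar* D, int width, float scale)
    {
        int i = 0;
    #if CV_SIMD128
        if (hasSIMD128())
        {
            v_float32x4 v_scale = v_setall_f32(scale);
            for (; i <= width - 8; i += 8)
            {
                // two 4-lane partial sums -> 8 output pixels per iteration
                v_int32x4 v_s0  = v_load(SUM + i)     + v_load(Sp + i);
                v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4);

                // scale in float, round to nearest int32; reinterpreting to
                // u32 is safe here because box-filter sums are non-negative
                v_uint32x4 d0 = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0)  * v_scale));
                v_uint32x4 d1 = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));

                // u32 -> u16 -> u8; both packing steps saturate
                v_pack_store(D + i, v_pack(d0, d1));

                // subtract the outgoing row to keep the running sums current
                v_store(SUM + i,     v_s0  - v_load(Sm + i));
                v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4));
            }
        }
    #endif
        for (; i < width; i++)   // scalar tail, identical result
        {
            int s0 = SUM[i] + Sp[i];
            D[i] = saturate_cast<uchar>(s0 * scale);
            SUM[i] = s0 - Sm[i];
        }
    }

The same source lowers to the SSE2 instructions the removed code issued by hand on x86 and to NEON on ARM builds, which is the point of the conversion: one universal-intrinsic body replaces each pair of raw CV_SSE2/CV_NEON branches, guarded by a single CV_SIMD128 check instead of per-ISA checkHardwareSupport() calls.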