From 1ef740fa2c4bb5f2abbbb9e5d97f6686fec60026 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima
Date: Tue, 11 Oct 2016 20:22:23 +0900
Subject: [PATCH] use universal intrinsic implementation for calcSharrDeriv

---
 .../include/opencv2/core/hal/intrin_sse.hpp |   9 ++
 modules/video/src/lkpyramid.cpp             | 103 +++++------------
 2 files changed, 38 insertions(+), 74 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 9f4ec66ae3..85c694a24c 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1425,6 +1425,15 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
     b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
 }
 
+inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
+{
+    __m128i t0, t1;
+    t0 = _mm_unpacklo_epi16(a.val, b.val);
+    t1 = _mm_unpackhi_epi16(a.val, b.val);
+    _mm_storeu_si128((__m128i*)(ptr), t0);
+    _mm_storeu_si128((__m128i*)(ptr + 8), t1);
+}
+
 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                 const v_uint8x16& c )
 {
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index b96a43c5f9..24e39e0347 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -44,6 +44,7 @@
 #include <stdio.h>
 #include "lkpyramid.hpp"
 #include "opencl_kernels_video.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
 #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
@@ -66,16 +67,9 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
     AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
     deriv_type *trow0 = alignPtr(_tempBuf + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);
 
-#if CV_SSE2
-    __m128i z = _mm_setzero_si128(), c3 = _mm_set1_epi16(3), c10 = _mm_set1_epi16(10);
-#endif
-
-#if CV_NEON
-    const uint16x8_t q8 = vdupq_n_u16(3);
-    const uint8x8_t d18 = vdup_n_u8(10);
-
-    const int16x8_t q8i = vdupq_n_s16(3);
-    const int16x8_t q9 = vdupq_n_s16(10);
+#if CV_SIMD128
+    v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10);
+    bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
 #endif
 
     for( y = 0; y < rows; y++ )
@@ -87,33 +81,21 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
 
         // do vertical convolution
         x = 0;
-#if CV_SSE2
-        for( ; x <= colsn - 8; x += 8 )
+#if CV_SIMD128
+        if(haveSIMD)
         {
-            __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z);
-            __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z);
-            __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z);
-            __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s0, s2), c3), _mm_mullo_epi16(s1, c10));
-            __m128i t1 = _mm_sub_epi16(s2, s0);
-            _mm_store_si128((__m128i*)(trow0 + x), t0);
-            _mm_store_si128((__m128i*)(trow1 + x), t1);
-        }
-#endif
+            for( ; x <= colsn - 8; x += 8 )
+            {
+                v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x));
+                v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
+                v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
 
-#if CV_NEON
-        for( ; x <= colsn - 8; x += 8)
-        {
-            uint8x8_t d0 = vld1_u8((const uint8_t*)&srow0[x]);
-            uint8x8_t d1 = vld1_u8((const uint8_t*)&srow1[x]);
-            uint8x8_t d2 = vld1_u8((const uint8_t*)&srow2[x]);
-            uint16x8_t q4 = vaddl_u8(d0, d2);
-            uint16x8_t q11 = vsubl_u8(d2, d0);
-            uint16x8_t q5 = vmulq_u16(q4, q8);
-            uint16x8_t q6 = vmull_u8(d1, d18);
-            uint16x8_t q10 = vaddq_u16(q6, q5);
-            vst1q_u16((uint16_t*)&trow0[x], q10);
-            vst1q_u16((uint16_t*)&trow1[x], q11);
+                v_int16x8 t1 = s2 - s0;
+                v_int16x8 t0 = (s0 + s2) * c3 + s1 * c10;
+                v_store(trow0 + x, t0);
+                v_store(trow1 + x, t1);
+            }
         }
 #endif
@@ -135,49 +117,22 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
 
         // do horizontal convolution, interleave the results and store them to dst
         x = 0;
-#if CV_SSE2
-        for( ; x <= colsn - 8; x += 8 )
+#if CV_SIMD128
+        if(haveSIMD)
         {
-            __m128i s0 = _mm_loadu_si128((const __m128i*)(trow0 + x - cn));
-            __m128i s1 = _mm_loadu_si128((const __m128i*)(trow0 + x + cn));
-            __m128i s2 = _mm_loadu_si128((const __m128i*)(trow1 + x - cn));
-            __m128i s3 = _mm_load_si128((const __m128i*)(trow1 + x));
-            __m128i s4 = _mm_loadu_si128((const __m128i*)(trow1 + x + cn));
+            for( ; x <= colsn - 8; x += 8 )
+            {
+                v_int16x8 s0 = v_load(trow0 + x - cn);
+                v_int16x8 s1 = v_load(trow0 + x + cn);
+                v_int16x8 s2 = v_load(trow1 + x - cn);
+                v_int16x8 s3 = v_load(trow1 + x);
+                v_int16x8 s4 = v_load(trow1 + x + cn);
 
-            __m128i t0 = _mm_sub_epi16(s1, s0);
-            __m128i t1 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s2, s4), c3), _mm_mullo_epi16(s3, c10));
-            __m128i t2 = _mm_unpacklo_epi16(t0, t1);
-            t0 = _mm_unpackhi_epi16(t0, t1);
-            // this can probably be replaced with aligned stores if we aligned dst properly.
-            _mm_storeu_si128((__m128i*)(drow + x*2), t2);
-            _mm_storeu_si128((__m128i*)(drow + x*2 + 8), t0);
-        }
-#endif
-
-#if CV_NEON
-        for( ; x <= colsn - 8; x += 8 )
-        {
-
-            int16x8_t q0 = vld1q_s16((const int16_t*)&trow0[x+cn]);
-            int16x8_t q1 = vld1q_s16((const int16_t*)&trow0[x-cn]);
-            int16x8_t q2 = vld1q_s16((const int16_t*)&trow1[x+cn]);
-            int16x8_t q3 = vld1q_s16((const int16_t*)&trow1[x-cn]);
-            int16x8_t q5 = vsubq_s16(q0, q1);
-            int16x8_t q6 = vaddq_s16(q2, q3);
-            int16x8_t q4 = vld1q_s16((const int16_t*)&trow1[x]);
-            int16x8_t q7 = vmulq_s16(q6, q8i);
-            int16x8_t q10 = vmulq_s16(q4, q9);
-            int16x8_t q11 = vaddq_s16(q7, q10);
-            int16x4_t d22 = vget_low_s16(q11);
-            int16x4_t d23 = vget_high_s16(q11);
-            int16x4_t d11 = vget_high_s16(q5);
-            int16x4_t d10 = vget_low_s16(q5);
-            int16x4x2_t q5x2, q11x2;
-            q5x2.val[0] = d10; q5x2.val[1] = d22;
-            q11x2.val[0] = d11; q11x2.val[1] = d23;
-            vst2_s16((int16_t*)&drow[x*2], q5x2);
-            vst2_s16((int16_t*)&drow[(x*2)+8], q11x2);
+                v_int16x8 t0 = s1 - s0;
+                v_int16x8 t1 = ((s2 + s4) * c3) + (s3 * c10);
+                v_store_interleave((drow + x*2), t0, t1);
+            }
         }
 #endif
         for( ; x < colsn; x++ )
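
Note for reviewers: below is a minimal standalone sketch, not part of the patch, that exercises the new two-channel v_store_interleave overload and prints its result. It assumes an OpenCV build where "opencv2/core/hal/intrin.hpp" is on the include path and CV_SIMD128 is enabled (SSE2 or NEON); the array values are made-up stand-ins for the x/y derivatives that calcSharrDeriv interleaves into each dst row.

// Hypothetical check, not part of the patch: the overload added in
// intrin_sse.hpp should store lanes as a0 b0 a1 b1 ... a7 b7, the
// interleaved (dx, dy) layout calcSharrDeriv writes into dst.
#include <cstdio>
#include "opencv2/core/hal/intrin.hpp"

int main()
{
    short a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };          // stand-in dx values
    short b[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };  // stand-in dy values
    short out[16];

    cv::v_int16x8 va = cv::v_load(a);
    cv::v_int16x8 vb = cv::v_load(b);
    cv::v_store_interleave(out, va, vb);              // expect 0 10 1 11 ... 7 17

    for( int i = 0; i < 16; i++ )
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}

Interleaving through v_store_interleave replaces both the _mm_unpacklo/hi_epi16 pair on SSE2 and the vst2_s16 sequence on NEON, so one portable source line now maps to the matching instructions on either target.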