diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 3436c741a3..3e81f3be58 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -239,13 +239,12 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
         acctype iA11 = 0, iA12 = 0, iA22 = 0;
         float A11, A12, A22;
 
-#if CV_SSE2
-        __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
-        __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
-        __m128i z = _mm_setzero_si128();
-        __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
-        __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
-        __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
+#if CV_SIMD128 && !CV_NEON
+        v_int16x8 qw0((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
+        v_int16x8 qw1((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
+        v_int32x4 qdelta_d = v_setall_s32(1 << (W_BITS1-1));
+        v_int32x4 qdelta = v_setall_s32(1 << (W_BITS1-5-1));
+        v_float32x4 qA11 = v_setzero_f32(), qA12 = v_setzero_f32(), qA22 = v_setzero_f32();
 #endif
 
 #if CV_NEON
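
Note: the SSE2 code packed the two 16-bit interpolation weights into every 32-bit lane with _mm_set1_epi32(iw00 + (iw01 << 16)) so that _mm_madd_epi16 would compute p0*iw00 + p1*iw01 per lane; the universal-intrinsics version spells out the same (iw00, iw01, iw00, iw01, ...) interleave explicitly in a v_int16x8, and v_dotprod takes over the multiply-add (its three-argument form, used below, also folds in the rounding constant, so the separate _mm_add_epi32 disappears). The zero register z is no longer needed either, because v_load_expand widens u8 to u16 directly. A minimal scalar model of one output lane of the bilinear filter, assuming W_BITS1 == 14 as defined in lkpyramid.cpp; bilinear_lane is a hypothetical name for illustration, not part of the patch:

    #include <cstdint>
    #include <cstdio>

    // Fixed-point weight precision (assumed: W_BITS1 is 14 in lkpyramid.cpp).
    enum { W_BITS1 = 14 };

    // Scalar model of one 32-bit lane of
    //   (v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1)) >> (W_BITS1-5):
    // t00/t10 hold the zipped pixel pairs and qw0/qw1 the interleaved weights,
    // so each lane is exactly the 2x2 bilinear tap below.
    static short bilinear_lane(uint8_t p00, uint8_t p01, uint8_t p10, uint8_t p11,
                               int iw00, int iw01, int iw10, int iw11)
    {
        int32_t acc = p00*iw00 + p01*iw01      // lane of v_dotprod(t00, qw0)
                    + p10*iw10 + p11*iw11      // lane of v_dotprod(t10, qw1)
                    + (1 << (W_BITS1-5-1));    // qdelta: round to nearest
        return (short)(acc >> (W_BITS1-5));    // keep 5 fractional bits for Iptr
    }

    int main()
    {
        // Weights for an interpolation point at fractional offset (0.25, 0.5);
        // they sum to 1 << W_BITS.
        int iw00 = 6144, iw01 = 2048, iw10 = 6144, iw11 = 2048;
        // Prints 3552 = 111 (interpolated pixel) * 32 (5 fractional bits).
        printf("%d\n", bilinear_lane(100, 104, 120, 124, iw00, iw01, iw10, iw11));
        return 0;
    }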
@@ -275,44 +274,75 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 
             x = 0;
 
-#if CV_SSE2
-            for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
+#if CV_SIMD128 && !CV_NEON
+            for( ; x <= winSize.width*cn - 8; x += 8, dsrc += 8*2, dIptr += 8*2 )
             {
-                __m128i v00, v01, v10, v11, t0, t1;
+                v_int32x4 t0, t1;
+                v_int16x8 v00, v01, v10, v11, t00, t01, t10, t11;
 
-                v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
-                v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
-                v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI)), z);
-                v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI + cn)), z);
+                v00 = v_reinterpret_as_s16(v_load_expand(src + x));
+                v01 = v_reinterpret_as_s16(v_load_expand(src + x + cn));
+                v10 = v_reinterpret_as_s16(v_load_expand(src + x + stepI));
+                v11 = v_reinterpret_as_s16(v_load_expand(src + x + stepI + cn));
 
-                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
-                _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
 
-                v00 = _mm_loadu_si128((const __m128i*)(dsrc));
-                v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
-                v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep));
-                v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2));
+                t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
+                t0 = t0 >> (W_BITS1-5);
+                t1 = t1 >> (W_BITS1-5);
+                v_store(Iptr + x, v_pack(t0, t1));
 
-                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-                t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
-                                   _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
-                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
-                t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
-                v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+                v00 = v_reinterpret_as_s16(v_load(dsrc));
+                v01 = v_reinterpret_as_s16(v_load(dsrc + cn2));
+                v10 = v_reinterpret_as_s16(v_load(dsrc + dstep));
+                v11 = v_reinterpret_as_s16(v_load(dsrc + dstep + cn2));
 
-                _mm_storeu_si128((__m128i*)dIptr, v00);
-                t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
-                t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
 
-                __m128 fy = _mm_cvtepi32_ps(t0);
-                __m128 fx = _mm_cvtepi32_ps(t1);
+                t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
+                t0 = t0 >> W_BITS1;
+                t1 = t1 >> W_BITS1;
+                v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+                v_store(dIptr, v00);
 
-                qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
-                qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
-                qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
+                v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
+                v_expand(v00, t1, t0);
+
+                v_float32x4 fy = v_cvt_f32(t0);
+                v_float32x4 fx = v_cvt_f32(t1);
+
+                qA22 = v_muladd(fy, fy, qA22);
+                qA12 = v_muladd(fx, fy, qA12);
+                qA11 = v_muladd(fx, fx, qA11);
+
+                v00 = v_reinterpret_as_s16(v_load(dsrc + 4*2));
+                v01 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + cn2));
+                v10 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + dstep));
+                v11 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + dstep + cn2));
+
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
+
+                t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
+                t0 = t0 >> W_BITS1;
+                t1 = t1 >> W_BITS1;
+                v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+                v_store(dIptr + 4*2, v00);
+
+                v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
+                v_expand(v00, t1, t0);
+
+                fy = v_cvt_f32(t0);
+                fx = v_cvt_f32(t1);
+
+                qA22 = v_muladd(fy, fy, qA22);
+                qA12 = v_muladd(fx, fy, qA12);
+                qA11 = v_muladd(fx, fx, qA11);
             }
 #endif
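
Note: after v_pack, v00 holds the interleaved gradients Ix0 Iy0 Ix1 Iy1 .... The SSE2 code separated them with an arithmetic/logical shift pair; the universal version instead applies v_interleave_pairs twice, once on 16-bit lanes and once on the register reinterpreted as 32-bit lanes, which regroups it as Ix0..Ix3 Iy0..Iy3 so a single v_expand can widen both halves. That is also why the outputs are swapped in v_expand(v00, t1, t0): the low (Ix) half lands in t1 and the high (Iy) half in t0, matching fx = v_cvt_f32(t1) and fy = v_cvt_f32(t0). A plain-C++ sketch of the permutation, illustrative only:

    #include <cstdio>

    // Scalar model of the lane shuffle done by v_interleave_pairs on 16-bit
    // lanes, then again on the same data reinterpreted as 32-bit lanes.
    int main()
    {
        // packed gradient register: Ix0 Iy0 Ix1 Iy1 Ix2 Iy2 Ix3 Iy3
        const char* v[8] = { "Ix0","Iy0","Ix1","Iy1","Ix2","Iy2","Ix3","Iy3" };
        const char* a[8];
        const char* b[8];

        // 16-bit pass: within every group of 4 lanes, swap the middle two
        for (int i = 0; i < 8; i += 4)
        {
            a[i] = v[i]; a[i+1] = v[i+2]; a[i+2] = v[i+1]; a[i+3] = v[i+3];
        }

        // 32-bit pass: same swap, but each "lane" is now a pair of elements
        b[0] = a[0]; b[1] = a[1]; b[2] = a[4]; b[3] = a[5];
        b[4] = a[2]; b[5] = a[3]; b[6] = a[6]; b[7] = a[7];

        for (int i = 0; i < 8; i++)
            printf("%s ", b[i]);   // prints: Ix0 Ix1 Ix2 Ix3 Iy0 Iy1 Iy2 Iy3
        printf("\n");
        return 0;
    }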
@@ -419,14 +449,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
             }
         }
 
-#if CV_SSE2
-        float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
-        _mm_store_ps(A11buf, qA11);
-        _mm_store_ps(A12buf, qA12);
-        _mm_store_ps(A22buf, qA22);
-        iA11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
-        iA12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
-        iA22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
+#if CV_SIMD128 && !CV_NEON
+        iA11 += v_reduce_sum(qA11);
+        iA12 += v_reduce_sum(qA12);
+        iA22 += v_reduce_sum(qA22);
 #endif
 
 #if CV_NEON
@@ -479,10 +505,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
             iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
             acctype ib1 = 0, ib2 = 0;
             float b1, b2;
-#if CV_SSE2
-            qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
-            qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
-            __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
+#if CV_SIMD128 && !CV_NEON
+            qw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
+            qw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
+            v_float32x4 qb0 = v_setzero_f32(), qb1 = v_setzero_f32();
 #endif
 
 #if CV_NEON
@@ -503,34 +529,32 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 
             x = 0;
 
-#if CV_SSE2
+#if CV_SIMD128 && !CV_NEON
             for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
             {
-                __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
-                __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
-                __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
-                __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ)), z);
-                __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ + cn)), z);
+                v_int16x8 diff0 = v_reinterpret_as_s16(v_load(Iptr + x)), diff1, diff2;
+                v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
+                v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
+                v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr + x + stepJ));
+                v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr + x + stepJ + cn));
 
-                __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                           _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-                __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
-                                           _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
-                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
-                t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
-                diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
-                diff1 = _mm_unpackhi_epi16(diff0, diff0);
-                diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ...
-                v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
-                v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
-                v10 = _mm_unpacklo_epi16(v00, v01);
-                v11 = _mm_unpackhi_epi16(v00, v01);
-                v00 = _mm_unpacklo_epi16(diff0, diff1);
-                v01 = _mm_unpackhi_epi16(diff0, diff1);
-                v00 = _mm_madd_epi16(v00, v10);
-                v11 = _mm_madd_epi16(v01, v11);
-                qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
-                qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v11));
+                v_int32x4 t0, t1;
+                v_int16x8 t00, t01, t10, t11;
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
+
+                t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
+                t0 = t0 >> (W_BITS1-5);
+                t1 = t1 >> (W_BITS1-5);
+                diff0 = v_pack(t0, t1) - diff0;
+                v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
+                v00 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
+                v01 = v_reinterpret_as_s16(v_load(dIptr + 8));
+                v_zip(v00, v01, v10, v11);
+                v_zip(diff2, diff1, v00, v01);
+                qb0 += v_cvt_f32(v_dotprod(v00, v10));
+                qb1 += v_cvt_f32(v_dotprod(v01, v11));
             }
 #endif
@@ -616,11 +640,11 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                 }
             }
 
-#if CV_SSE2
-            float CV_DECL_ALIGNED(16) bbuf[4];
-            _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
-            ib1 += bbuf[0] + bbuf[2];
-            ib2 += bbuf[1] + bbuf[3];
+#if CV_SIMD128 && !CV_NEON
+            v_float32x4 qf0, qf1;
+            v_recombine(v_interleave_pairs(qb0 + qb1), v_setzero_f32(), qf0, qf1);
+            ib1 += v_reduce_sum(qf0);
+            ib2 += v_reduce_sum(qf1);
 #endif
 
 #if CV_NEON
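
Note: in the mismatch accumulation, each lane pair of qb0 + qb1 alternates x- and y-partial sums, which is why the old scalar tail summed bbuf[0] + bbuf[2] into ib1 and bbuf[1] + bbuf[3] into ib2. The replacement reaches the same two sums without a round trip through memory: v_interleave_pairs regroups the lanes from (bx, by, bx, by) to (bx, bx, by, by), v_recombine against a zero register splits the two halves into qf0 and qf1, and v_reduce_sum collapses each. A scalar model with made-up lane values, not from the patch:

    #include <cstdio>

    // Scalar model of the final qb0/qb1 reduction.
    int main()
    {
        // lanes of qb0 + qb1 after the loop: { bx, by, bx, by }
        float q[4] = { 1.f, 10.f, 2.f, 20.f };

        // v_interleave_pairs regroups the lanes to { bx, bx, by, by }
        float p[4] = { q[0], q[2], q[1], q[3] };

        // v_recombine with a zero register splits the halves:
        // qf0 = { bx, bx, 0, 0 }, qf1 = { by, by, 0, 0 }
        float qf0[4] = { p[0], p[1], 0.f, 0.f };
        float qf1[4] = { p[2], p[3], 0.f, 0.f };

        // v_reduce_sum collapses each half, matching the old scalar tail
        // ib1 += bbuf[0] + bbuf[2];  ib2 += bbuf[1] + bbuf[3];
        float b1 = qf0[0] + qf0[1] + qf0[2] + qf0[3];   // 3
        float b2 = qf1[0] + qf1[1] + qf1[2] + qf1[3];   // 30
        printf("b1=%g b2=%g\n", b1, b2);
        return 0;
    }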