Merge pull request #15274 from ChipKerchner:lkpyramidToHal
* Convert lkpyramid from SSE SIMD to HAL (universal intrinsics) - 90% faster on Power (VSX).
* Replace stores with reduce_sum. Rework to handle endianness correctly.
* Fix compiler warnings by casting values explicitly to shorts.
* Switch to the CV_SIMD128 compiler definition. Unroll the loop to 8 elements since the data is already loaded.
parent ca7640e10f
commit 30a60d396b
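Before the diff, a quick illustration of the "replace stores with reduce_sum" item: the old SSE path spilled each vector accumulator to an aligned buffer and summed the four lanes by hand, whereas the universal-intrinsics path collapses that into a single horizontal add. A minimal, stand-alone sketch of the idiom (toy values, not code from the patch):

    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
    #if CV_SIMD128
        float data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

        cv::v_float32x4 acc = cv::v_setzero_f32();
        for (int i = 0; i < 8; i += 4)
        {
            cv::v_float32x4 v = cv::v_load(data + i);
            acc = cv::v_muladd(v, v, acc);            // accumulate v*v per lane
        }
        // one call replaces the store-to-buffer plus the 4-term scalar sum
        float sumsq = cv::v_reduce_sum(acc);
        std::printf("sum of squares = %g\n", sumsq);  // prints 204
    #endif
        return 0;
    }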
@@ -239,13 +239,12 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
         acctype iA11 = 0, iA12 = 0, iA22 = 0;
         float A11, A12, A22;
 
-#if CV_SSE2
-        __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
-        __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
-        __m128i z = _mm_setzero_si128();
-        __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
-        __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
-        __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
+#if CV_SIMD128 && !CV_NEON
+        v_int16x8 qw0((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
+        v_int16x8 qw1((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
+        v_int32x4 qdelta_d = v_setall_s32(1 << (W_BITS1-1));
+        v_int32x4 qdelta = v_setall_s32(1 << (W_BITS1-5-1));
+        v_float32x4 qA11 = v_setzero_f32(), qA12 = v_setzero_f32(), qA22 = v_setzero_f32();
 #endif
 
 #if CV_NEON
@@ -275,44 +274,75 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 
         x = 0;
 
-#if CV_SSE2
-        for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
+#if CV_SIMD128 && !CV_NEON
+        for( ; x <= winSize.width*cn - 8; x += 8, dsrc += 8*2, dIptr += 8*2 )
         {
-            __m128i v00, v01, v10, v11, t0, t1;
+            v_int32x4 t0, t1;
+            v_int16x8 v00, v01, v10, v11, t00, t01, t10, t11;
 
-            v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
-            v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
-            v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI)), z);
-            v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI + cn)), z);
+            v00 = v_reinterpret_as_s16(v_load_expand(src + x));
+            v01 = v_reinterpret_as_s16(v_load_expand(src + x + cn));
+            v10 = v_reinterpret_as_s16(v_load_expand(src + x + stepI));
+            v11 = v_reinterpret_as_s16(v_load_expand(src + x + stepI + cn));
 
-            t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                               _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-            t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
-            _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));
+            v_zip(v00, v01, t00, t01);
+            v_zip(v10, v11, t10, t11);
 
-            v00 = _mm_loadu_si128((const __m128i*)(dsrc));
-            v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
-            v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep));
-            v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2));
+            t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
+            t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
+            t0 = t0 >> (W_BITS1-5);
+            t1 = t1 >> (W_BITS1-5);
+            v_store(Iptr + x, v_pack(t0, t1));
 
-            t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                               _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-            t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
-                               _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
-            t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
-            t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
-            v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+            v00 = v_reinterpret_as_s16(v_load(dsrc));
+            v01 = v_reinterpret_as_s16(v_load(dsrc + cn2));
+            v10 = v_reinterpret_as_s16(v_load(dsrc + dstep));
+            v11 = v_reinterpret_as_s16(v_load(dsrc + dstep + cn2));
 
-            _mm_storeu_si128((__m128i*)dIptr, v00);
-            t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
-            t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3
+            v_zip(v00, v01, t00, t01);
+            v_zip(v10, v11, t10, t11);
 
-            __m128 fy = _mm_cvtepi32_ps(t0);
-            __m128 fx = _mm_cvtepi32_ps(t1);
+            t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
+            t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
+            t0 = t0 >> W_BITS1;
+            t1 = t1 >> W_BITS1;
+            v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+            v_store(dIptr, v00);
 
-            qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
-            qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
-            qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
+            v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
+            v_expand(v00, t1, t0);
+
+            v_float32x4 fy = v_cvt_f32(t0);
+            v_float32x4 fx = v_cvt_f32(t1);
+
+            qA22 = v_muladd(fy, fy, qA22);
+            qA12 = v_muladd(fx, fy, qA12);
+            qA11 = v_muladd(fx, fx, qA11);
+
+            v00 = v_reinterpret_as_s16(v_load(dsrc + 4*2));
+            v01 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + cn2));
+            v10 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + dstep));
+            v11 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + dstep + cn2));
+
+            v_zip(v00, v01, t00, t01);
+            v_zip(v10, v11, t10, t11);
+
+            t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
+            t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
+            t0 = t0 >> W_BITS1;
+            t1 = t1 >> W_BITS1;
+            v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+            v_store(dIptr + 4*2, v00);
+
+            v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
+            v_expand(v00, t1, t0);
+
+            fy = v_cvt_f32(t0);
+            fx = v_cvt_f32(t1);
+
+            qA22 = v_muladd(fy, fy, qA22);
+            qA12 = v_muladd(fx, fy, qA12);
+            qA11 = v_muladd(fx, fx, qA11);
         }
 #endif
 
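The core pattern of the hunk above: widen 8 source pixels to 16 bits, zip each pixel with its bilinear neighbour so every 32-bit lane holds a pair, and let the three-argument v_dotprod fold the multiply, the pairwise add and the rounding delta into one call, replacing the SSE _mm_madd_epi16 / _mm_add_epi32 / _mm_srai_epi32 chain. A stand-alone sketch of that idiom (the pixel values and the interpolation weights are made up; only the structure mirrors the patch):

    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
    #if CV_SIMD128
        using namespace cv;
        const int W_BITS1 = 14;                         // fixed-point weight scale (illustrative)
        // 8 source pixels and their right-hand neighbours, already widened to 16 bits
        short a[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
        short b[8] = { 12, 22, 32, 42, 52, 62, 72, 82 };
        v_int16x8 v00 = v_load(a), v01 = v_load(b);

        // interleave so each 32-bit lane holds a (pixel, neighbour) pair
        v_int16x8 t00, t01;
        v_zip(v00, v01, t00, t01);

        // per-pair weights; together they sum to 1 << W_BITS1
        short w[8] = { 12000, 4384, 12000, 4384, 12000, 4384, 12000, 4384 };
        v_int16x8 qw0 = v_load(w);
        v_int32x4 qdelta = v_setall_s32(1 << (W_BITS1 - 1));

        // multiply pairs, add within each lane, add the rounding delta, then descale
        v_int32x4 t0 = v_dotprod(t00, qw0, qdelta) >> W_BITS1;
        v_int32x4 t1 = v_dotprod(t01, qw0, qdelta) >> W_BITS1;

        short out[8];
        v_store(out, v_pack(t0, t1));                   // interpolated pixels
        for (int i = 0; i < 8; i++) std::printf("%d ", out[i]);
        std::printf("\n");
    #endif
        return 0;
    }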
@@ -419,14 +449,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
             }
         }
 
-#if CV_SSE2
-        float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
-        _mm_store_ps(A11buf, qA11);
-        _mm_store_ps(A12buf, qA12);
-        _mm_store_ps(A22buf, qA22);
-        iA11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
-        iA12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
-        iA22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
+#if CV_SIMD128 && !CV_NEON
+        iA11 += v_reduce_sum(qA11);
+        iA12 += v_reduce_sum(qA12);
+        iA22 += v_reduce_sum(qA22);
 #endif
 
 #if CV_NEON
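Aside on the double v_interleave_pairs shuffle used in the main loop above: it splits the interleaved Ix0 Iy0 Ix1 Iy1 ... gradient vector into an Ix half and an Iy half with lane shuffles instead of the 16-bit shifts on reinterpreted 32-bit lanes that the SSE code used, which is presumably the endianness-sensitive behaviour the commit message calls out for VSX. A small stand-alone sketch of the regrouping, with made-up values:

    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
    #if CV_SIMD128
        using namespace cv;
        // interleaved Ix/Iy pairs, as stored by the loop: x0 y0 x1 y1 x2 y2 x3 y3
        short buf[8] = { 10, -1, 20, -2, 30, -3, 40, -4 };
        v_int16x8 v = v_load(buf);
        // two pair-interleaves (16-bit view, then 32-bit view) regroup the lanes
        // to x0 x1 x2 x3 y0 y1 y2 y3 without any endian-dependent shifting
        v = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v))));
        v_int32x4 vx, vy;
        v_expand(v, vx, vy);                 // low half -> Ix values, high half -> Iy values
        v_float32x4 fx = v_cvt_f32(vx), fy = v_cvt_f32(vy);
        float outx[4], outy[4];
        v_store(outx, fx);
        v_store(outy, fy);
        std::printf("Ix: %g %g %g %g\n", outx[0], outx[1], outx[2], outx[3]);
        std::printf("Iy: %g %g %g %g\n", outy[0], outy[1], outy[2], outy[3]);
    #endif
        return 0;
    }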
@@ -479,10 +505,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
         iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
         acctype ib1 = 0, ib2 = 0;
         float b1, b2;
-#if CV_SSE2
-        qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
-        qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
-        __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
+#if CV_SIMD128 && !CV_NEON
+        qw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
+        qw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
+        v_float32x4 qb0 = v_setzero_f32(), qb1 = v_setzero_f32();
 #endif
 
 #if CV_NEON
@@ -503,34 +529,32 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 
         x = 0;
 
-#if CV_SSE2
+#if CV_SIMD128 && !CV_NEON
        for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
        {
-            __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
-            __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
-            __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
-            __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ)), z);
-            __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ + cn)), z);
+            v_int16x8 diff0 = v_reinterpret_as_s16(v_load(Iptr + x)), diff1, diff2;
+            v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
+            v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
+            v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr + x + stepJ));
+            v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr + x + stepJ + cn));
 
-            __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                       _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-            __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
-                                       _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
-            t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
-            t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
-            diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
-            diff1 = _mm_unpackhi_epi16(diff0, diff0);
-            diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ...
-            v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
-            v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
-            v10 = _mm_unpacklo_epi16(v00, v01);
-            v11 = _mm_unpackhi_epi16(v00, v01);
-            v00 = _mm_unpacklo_epi16(diff0, diff1);
-            v01 = _mm_unpackhi_epi16(diff0, diff1);
-            v00 = _mm_madd_epi16(v00, v10);
-            v11 = _mm_madd_epi16(v01, v11);
-            qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
-            qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v11));
+            v_int32x4 t0, t1;
+            v_int16x8 t00, t01, t10, t11;
+            v_zip(v00, v01, t00, t01);
+            v_zip(v10, v11, t10, t11);
+
+            t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
+            t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
+            t0 = t0 >> (W_BITS1-5);
+            t1 = t1 >> (W_BITS1-5);
+            diff0 = v_pack(t0, t1) - diff0;
+            v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
+            v00 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
+            v01 = v_reinterpret_as_s16(v_load(dIptr + 8));
+            v_zip(v00, v01, v10, v11);
+            v_zip(diff2, diff1, v00, v01);
+            qb0 += v_cvt_f32(v_dotprod(v00, v10));
+            qb1 += v_cvt_f32(v_dotprod(v01, v11));
        }
 #endif
 
@@ -616,11 +640,11 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
             }
         }
 
-#if CV_SSE2
-        float CV_DECL_ALIGNED(16) bbuf[4];
-        _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
-        ib1 += bbuf[0] + bbuf[2];
-        ib2 += bbuf[1] + bbuf[3];
+#if CV_SIMD128 && !CV_NEON
+        v_float32x4 qf0, qf1;
+        v_recombine(v_interleave_pairs(qb0 + qb1), v_setzero_f32(), qf0, qf1);
+        ib1 += v_reduce_sum(qf0);
+        ib2 += v_reduce_sum(qf1);
 #endif
 
 #if CV_NEON
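The final b1/b2 reduction above follows the same idea as the A-matrix reduction, except the partial sums for b1 and b2 sit in alternating lanes of qb0 + qb1: v_interleave_pairs groups the even lanes next to each other and the odd lanes next to each other, v_recombine splits the result into two half-filled vectors (padded with zeros), and v_reduce_sum collapses each half. A sketch with toy numbers (not code from the patch):

    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
    #if CV_SIMD128
        using namespace cv;
        // even lanes carry partial sums for b1, odd lanes for b2
        float buf[4] = { 1.0f, 10.0f, 2.0f, 20.0f };
        v_float32x4 qb = v_load(buf);
        // regroup to {b1_0, b1_1, b2_0, b2_1}, split into two half-filled vectors,
        // and let v_reduce_sum collapse each half (the zero halves add nothing)
        v_float32x4 qf0, qf1;
        v_recombine(v_interleave_pairs(qb), v_setzero_f32(), qf0, qf1);
        std::printf("b1 = %g, b2 = %g\n", v_reduce_sum(qf0), v_reduce_sum(qf1)); // 3, 30
    #endif
        return 0;
    }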