Merge pull request #6983 from K-Shinotsuka:issue8

This commit is contained in:
Vadim Pisarevsky 2016-08-10 10:31:12 +00:00
commit df665e2386

View File

@@ -227,16 +227,15 @@ struct MomentsInTile_SIMD<uchar, int, int>
if( useSIMD )
{
__m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
__m128i dx = _mm_set1_epi16(8);
__m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;
__m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
for( ; x <= len - 8; x += 8 )
{
__m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
__m128i sx = _mm_mullo_epi16(qx, qx);
qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
qx0 = _mm_add_epi16(qx0, p);
qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx));
@@ -244,14 +243,21 @@ struct MomentsInTile_SIMD<uchar, int, int>
qx = _mm_add_epi16(qx, dx);
}
_mm_store_si128((__m128i*)buf, qx0);
x0 = buf[0] + buf[1] + buf[2] + buf[3];
_mm_store_si128((__m128i*)buf, qx1);
x1 = buf[0] + buf[1] + buf[2] + buf[3];
_mm_store_si128((__m128i*)buf, qx2);
x2 = buf[0] + buf[1] + buf[2] + buf[3];
_mm_store_si128((__m128i*)buf, qx3);
x3 = buf[0] + buf[1] + buf[2] + buf[3];
__m128i qx01_lo = _mm_unpacklo_epi32(qx0, qx1);
__m128i qx23_lo = _mm_unpacklo_epi32(qx2, qx3);
__m128i qx01_hi = _mm_unpackhi_epi32(qx0, qx1);
__m128i qx23_hi = _mm_unpackhi_epi32(qx2, qx3);
qx01_lo = _mm_add_epi32(qx01_lo, qx01_hi);
qx23_lo = _mm_add_epi32(qx23_lo, qx23_hi);
__m128i qx0123_lo = _mm_unpacklo_epi64(qx01_lo, qx23_lo);
__m128i qx0123_hi = _mm_unpackhi_epi64(qx01_lo, qx23_lo);
qx0123_lo = _mm_add_epi32(qx0123_lo, qx0123_hi);
_mm_store_si128((__m128i*)buf, qx0123_lo);
x0 = (buf[0] & 0xffff) + (buf[0] >> 16);
x1 = buf[1];
x2 = buf[2];
x3 = buf[3];
}
return x;