From f57136fd7966b3252ad7de223e54c3bae77aebf0 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 30 Dec 2014 00:34:09 +0300 Subject: [PATCH] SSE2 cv::Mat::dot --- modules/core/src/matmul.cpp | 47 ++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index fec2ecb8b8..b2f36b3292 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -3012,7 +3012,52 @@ static double dotProd_8s(const schar* src1, const schar* src2, int len) int i = 0; double r = 0.0; -#if CV_NEON +#if CV_SSE2 + if( USE_SSE2 ) + { + int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize; + __m128i z = _mm_setzero_si128(); + CV_DECL_ALIGNED(16) int buf[4]; + + while( i < len0 ) + { + blockSize = std::min(len0 - i, blockSize0); + __m128i s = z; + j = 0; + for( ; j <= blockSize - 16; j += 16 ) + { + __m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j)); + __m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j)); + __m128i s0, s1, s2, s3; + s0 = _mm_srai_epi16(_mm_unpacklo_epi8(b0, b0), 8); + s2 = _mm_srai_epi16(_mm_unpackhi_epi8(b0, b0), 8); + s1 = _mm_srai_epi16(_mm_unpacklo_epi8(b1, b1), 8); + s3 = _mm_srai_epi16(_mm_unpackhi_epi8(b1, b1), 8); + s0 = _mm_madd_epi16(s0, s1); + s2 = _mm_madd_epi16(s2, s3); + s = _mm_add_epi32(s, s0); + s = _mm_add_epi32(s, s2); + } + + for( ; j < blockSize; j += 4 ) + { + __m128i s0 = _mm_cvtsi32_si128(*(const int*)(src1 + j)); + __m128i s1 = _mm_cvtsi32_si128(*(const int*)(src2 + j)); + s0 = _mm_srai_epi16(_mm_unpacklo_epi8(s0, s0), 8); + s1 = _mm_srai_epi16(_mm_unpacklo_epi8(s1, s1), 8); + s0 = _mm_madd_epi16(s0, s1); + s = _mm_add_epi32(s, s0); + } + + _mm_store_si128((__m128i*)buf, s); + r += buf[0] + buf[1] + buf[2] + buf[3]; + + src1 += blockSize; + src2 += blockSize; + i += blockSize; + } + } +#elif CV_NEON int len0 = len & -8, blockSize0 = (1 << 14), blockSize; int32x4_t v_zero = vdupq_n_s32(0); CV_DECL_ALIGNED(16) int buf[4];