From ba73249dc07588ce235ac3c62663b2332f8446b3 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima
Date: Wed, 10 Aug 2016 18:36:00 +0900
Subject: [PATCH] let the test of AccSqr_SIMD pass

  * The difference becomes too large when the multiplication is done in int16
  * To reproduce the test failure, IPP has to be switched off
---
 modules/imgproc/src/accum.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp
index e1f1ce71e8..083192bdbd 100644
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@@ -964,13 +964,15 @@ struct AccSqr_SIMD
             for ( ; x <= len - 8; x += 8)
             {
                 __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
-                __m128i v_src0 = _mm_unpacklo_epi16(v_src, v_0);
-                __m128i v_src1 = _mm_unpackhi_epi16(v_src, v_0);
-                v_src0 = _mm_mullo_epi16(v_src0, v_src0);
-                v_src1 = _mm_mullo_epi16(v_src1, v_src1);
+                __m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
+                __m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
+                __m128 v_src0 = _mm_cvtepi32_ps(v_int0);
+                __m128 v_src1 = _mm_cvtepi32_ps(v_int1);
+                v_src0 = _mm_mul_ps(v_src0, v_src0);
+                v_src1 = _mm_mul_ps(v_src1, v_src1);
 
-                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(v_src0)));
-                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(v_src1)));
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
             }
         }
 