diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp
index c7cbc40f37..e1f1ce71e8 100644
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@@ -83,6 +83,1370 @@ struct AccW_SIMD
     }
 };
 
+#if CV_AVX
+template <>
+struct Acc_SIMD<float, float>
+{
+    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8 ; x += 8)
+            {
+                __m256 v_src = _mm256_loadu_ps(src + x);
+                __m256 v_dst = _mm256_loadu_ps(dst + x);
+                v_dst = _mm256_add_ps(v_src, v_dst);
+                _mm256_storeu_ps(dst + x, v_dst);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct Acc_SIMD<float, double>
+{
+    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8 ; x += 8)
+            {
+                __m256 v_src = _mm256_loadu_ps(src + x);
+                __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
+                __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
+                __m256d v_dst0 = _mm256_loadu_pd(dst + x);
+                __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
+                v_dst0 = _mm256_add_pd(v_src0, v_dst0);
+                v_dst1 = _mm256_add_pd(v_src1, v_dst1);
+                _mm256_storeu_pd(dst + x, v_dst0);
+                _mm256_storeu_pd(dst + x + 4, v_dst1);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct Acc_SIMD<double, double>
+{
+    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m256d v_src = _mm256_loadu_pd(src + x);
+                __m256d v_dst = _mm256_loadu_pd(dst + x);
+
+                v_dst = _mm256_add_pd(v_dst, v_src);
+                _mm256_storeu_pd(dst + x, v_dst);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<float, float>
+{
+    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8 ; x += 8)
+            {
+                __m256 v_src = _mm256_loadu_ps(src + x);
+                __m256 v_dst = _mm256_loadu_ps(dst + x);
+
+                v_src = _mm256_mul_ps(v_src, v_src);
+                v_dst = _mm256_add_ps(v_src, v_dst);
+                _mm256_storeu_ps(dst + x, v_dst);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<float, double>
+{
+    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8 ; x += 8)
+            {
+                __m256 v_src = _mm256_loadu_ps(src + x);
+                __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
+                __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
+                __m256d v_dst0 = _mm256_loadu_pd(dst + x);
+                __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
+
+                v_src0 = _mm256_mul_pd(v_src0, v_src0);
+                v_src1 = _mm256_mul_pd(v_src1, v_src1);
+                v_dst0 = _mm256_add_pd(v_src0, v_dst0);
+                v_dst1 = _mm256_add_pd(v_src1, v_dst1);
+                _mm256_storeu_pd(dst + x, v_dst0);
+                _mm256_storeu_pd(dst + x + 4, v_dst1);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<double, double>
+{
+    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m256d v_src = _mm256_loadu_pd(src + x);
+                __m256d v_dst = _mm256_loadu_pd(dst + x);
+
+                v_src = _mm256_mul_pd(v_src, v_src);
+                v_dst = _mm256_add_pd(v_dst, v_src);
+                _mm256_storeu_pd(dst + x, v_dst);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<float, float>
+{
+    int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m256 v_src0 = _mm256_loadu_ps(src1 + x);
+                __m256 v_src1 = _mm256_loadu_ps(src2 + x);
+                __m256 v_dst = _mm256_loadu_ps(dst + x);
+                __m256 v_src = _mm256_mul_ps(v_src0, v_src1);
+
+                v_dst = _mm256_add_ps(v_src, v_dst);
+                _mm256_storeu_ps(dst + x, v_dst);
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<float, double>
+{
+    int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m256 v_1src = _mm256_loadu_ps(src1 + x);
+                __m256 v_2src = _mm256_loadu_ps(src2 + x);
+                __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0));
+                __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1));
+                __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0));
+                __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1));
+                __m256d v_dst0 = _mm256_loadu_pd(dst + x);
+                __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
+
+                __m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);
+                __m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);
+                v_dst0 = _mm256_add_pd(v_src0, v_dst0);
+                v_dst1 = _mm256_add_pd(v_src1, v_dst1);
+                _mm256_storeu_pd(dst + x, v_dst0);
+                _mm256_storeu_pd(dst + x + 4, v_dst1);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<double, double>
+{
+    int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m256d v_src0 = _mm256_loadu_pd(src1 + x);
+                __m256d v_src1 = _mm256_loadu_pd(src2 + x);
+                __m256d v_dst = _mm256_loadu_pd(dst + x);
+
+                v_src0 = _mm256_mul_pd(v_src0, v_src1);
+                v_dst = _mm256_add_pd(v_dst, v_src0);
+                _mm256_storeu_pd(dst + x, v_dst);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<float, float>
+{
+    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
+    {
+        int x = 0;
+        __m256 v_alpha = _mm256_set1_ps(alpha);
+        __m256 v_beta = _mm256_set1_ps(1.0f - alpha);
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 16; x += 16)
+            {
+                _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
+                _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<float, double>
+{
+    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
+    {
+        int x = 0;
+        __m256d v_alpha = _mm256_set1_pd(alpha);
+        __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 16; x += 16)
+            {
+                __m256 v_src0 = _mm256_loadu_ps(src + x);
+                __m256 v_src1 = _mm256_loadu_ps(src + x + 8);
+                __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,0));
+                __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,1));
+                __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,0));
+                __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,1));
+
+                _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src00, v_alpha)));
+                _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src01, v_alpha)));
+                _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha)));
+                _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha)));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<double, double>
+{
+    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
+    {
+        int x = 0;
+        __m256d v_alpha = _mm256_set1_pd(alpha);
+        __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m256d v_src0 = _mm256_loadu_pd(src + x);
+                __m256d v_src1 = _mm256_loadu_pd(src + x + 4);
+
+                _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha)));
+                _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha)));
+            }
+        }
+
+        return x;
+    }
+};
+#elif CV_SSE2
+template <>
+struct Acc_SIMD<float, float>
+{
+    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_loadu_ps(src + x)));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_loadu_ps(src + x + 4)));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Acc_SIMD<float, double>
+{
+    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m128 v_src = _mm_loadu_ps(src + x);
+                __m128d v_src0 = _mm_cvtps_pd(v_src);
+                __m128d v_src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_src, v_src, _MM_SHUFFLE(1, 0, 3, 2)));
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct Acc_SIMD<double, double>
+{
+    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m128d v_src0 = _mm_loadu_pd(src + x);
+                __m128d v_src1 = _mm_loadu_pd(src + x + 2);
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<float, float>
+{
+    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128 v_src0 = _mm_loadu_ps(src + x);
+                __m128 v_src1 = _mm_loadu_ps(src + x + 4);
+                v_src0 = _mm_mul_ps(v_src0, v_src0);
+                v_src1 = _mm_mul_ps(v_src1, v_src1);
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<float, double>
+{
+    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m128 v_src = _mm_loadu_ps(src + x);
+                __m128d v_src0 = _mm_cvtps_pd(v_src);
+                __m128d v_src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_src, v_src, _MM_SHUFFLE(1, 0, 3, 2)));
+                v_src0 = _mm_mul_pd(v_src0, v_src0);
+                v_src1 = _mm_mul_pd(v_src1, v_src1);
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<double, double>
+{
+    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m128d v_src0 = _mm_loadu_pd(src + x);
+                __m128d v_src1 = _mm_loadu_pd(src + x + 2);
+                v_src0 = _mm_mul_pd(v_src0, v_src0);
+                v_src1 = _mm_mul_pd(v_src1, v_src1);
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<float, float>
+{
+    int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_mul_ps(_mm_loadu_ps(src1 + x), _mm_loadu_ps(src2 + x))));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_mul_ps(_mm_loadu_ps(src1 + x + 4), _mm_loadu_ps(src2 + x + 4))));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<float, double>
+{
+    int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m128 v_1src = _mm_loadu_ps(src1 + x);
+                __m128 v_2src = _mm_loadu_ps(src2 + x);
+
+                __m128d v_1src0 = _mm_cvtps_pd(v_1src);
+                __m128d v_1src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_1src, v_1src, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_2src0 = _mm_cvtps_pd(v_2src);
+                __m128d v_2src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_2src, v_2src, _MM_SHUFFLE(0, 0, 3, 2)));
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+
+                v_dst0 = _mm_add_pd(v_dst0, _mm_mul_pd(v_1src0, v_2src0));
+                v_dst1 = _mm_add_pd(v_dst1, _mm_mul_pd(v_1src1, v_2src1));
+
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<double, double>
+{
+    int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m128d v_src00 = _mm_loadu_pd(src1 + x);
+                __m128d v_src01 = _mm_loadu_pd(src1 + x + 2);
+                __m128d v_src10 = _mm_loadu_pd(src2 + x);
+                __m128d v_src11 = _mm_loadu_pd(src2 + x + 2);
+                __m128d v_src0 = _mm_mul_pd(v_src00, v_src10);
+                __m128d v_src1 = _mm_mul_pd(v_src01, v_src11);
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<float, float>
+{
+    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
+    {
+        int x = 0;
+        __m128 v_alpha = _mm_set1_ps(alpha);
+        __m128 v_beta = _mm_set1_ps(1.0f - alpha);
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x), v_beta), _mm_mul_ps(_mm_loadu_ps(src + x), v_alpha)));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x + 4), v_beta), _mm_mul_ps(_mm_loadu_ps(src + x + 4), v_alpha)));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<float, double>
+{
+    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
+    {
+        int x = 0;
+        __m128d v_alpha = _mm_set1_pd(alpha);
+        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128 v_src0 = _mm_loadu_ps(src + x);
+                __m128 v_src1 = _mm_loadu_ps(src + x + 4);
+                __m128d v_src00 = _mm_cvtps_pd(v_src0);
+                __m128d v_src01 = _mm_cvtps_pd(_mm_shuffle_ps(v_src0, v_src0, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_src10 = _mm_cvtps_pd(v_src1);
+                __m128d v_src11 = _mm_cvtps_pd(_mm_shuffle_ps(v_src1, v_src1, _MM_SHUFFLE(0, 0, 3, 2)));
+
+                _mm_storeu_pd(dst + x, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x), v_beta), _mm_mul_pd(v_src00, v_alpha)));
+                _mm_storeu_pd(dst + x + 2, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 2), v_beta), _mm_mul_pd(v_src01, v_alpha)));
+                _mm_storeu_pd(dst + x + 4, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 4), v_beta), _mm_mul_pd(v_src10, v_alpha)));
+                _mm_storeu_pd(dst + x + 6, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 6), v_beta), _mm_mul_pd(v_src11, v_alpha)));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<double, double>
+{
+    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
+    {
+        int x = 0;
+        __m128d v_alpha = _mm_set1_pd(alpha);
+        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 4; x += 4)
+            {
+                __m128d v_src0 = _mm_loadu_pd(src + x);
+                __m128d v_src1 = _mm_loadu_pd(src + x + 2);
+
+                _mm_storeu_pd(dst + x, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x), v_beta), _mm_mul_pd(v_src0, v_alpha)));
+                _mm_storeu_pd(dst + x + 2, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 2), v_beta), _mm_mul_pd(v_src1, v_alpha)));
+            }
+        }
+
+        return x;
+    }
+};
+#endif
+
+#if CV_SSE2
+template <>
+struct Acc_SIMD<uchar, float>
+{
+    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+        __m128i v_0 = _mm_setzero_si128();
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 16; x += 16)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
+                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
+                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
+                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
+            }
+        }
+        else if (cn == 1)
+        {
+            __m128i v_255 = _mm_set1_epi8(-1);
+
+            for ( ; x <= len - 16; x += 16)
+            {
+                __m128i v_src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src + x)), _mm_xor_si128(v_255, _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(mask + x)), v_0)));
+                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
+                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
+                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
+                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Acc_SIMD<ushort, float>
+{
+    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+        __m128i v_0 = _mm_setzero_si128();
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128i v_src0 = _mm_unpacklo_epi16(v_src, v_0);
+                __m128i v_src1 = _mm_unpackhi_epi16(v_src, v_0);
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(v_src0)));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(v_src1)));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Acc_SIMD<uchar, double>
+{
+    int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            __m128i v_0 = _mm_setzero_si128();
+            len *= cn;
+            for ( ; x <= len - 16; x += 16)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128i v_int0 = _mm_unpacklo_epi8(v_src, v_0);
+                __m128i v_int1 = _mm_unpackhi_epi8(v_src, v_0);
+                __m128i v_int00 = _mm_unpacklo_epi16(v_int0, v_0);
+                __m128i v_int01 = _mm_unpackhi_epi16(v_int0, v_0);
+                __m128i v_int10 = _mm_unpacklo_epi16(v_int1, v_0);
+                __m128i v_int11 = _mm_unpackhi_epi16(v_int1, v_0);
+                __m128d v_src0 = _mm_cvtepi32_pd(v_int00);
+                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int00, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_src2 = _mm_cvtepi32_pd(v_int01);
+                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int01, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_src4 = _mm_cvtepi32_pd(v_int10);
+                __m128d v_src5 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int10, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_src6 = _mm_cvtepi32_pd(v_int11);
+                __m128d v_src7 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int11, _MM_SHUFFLE(0, 0, 3, 2)));
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
+                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
+                __m128d v_dst4 = _mm_loadu_pd(dst + x + 8);
+                __m128d v_dst5 = _mm_loadu_pd(dst + x + 10);
+                __m128d v_dst6 = _mm_loadu_pd(dst + x + 12);
+                __m128d v_dst7 = _mm_loadu_pd(dst + x + 14);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                v_dst2 = _mm_add_pd(v_dst2, v_src2);
+                v_dst3 = _mm_add_pd(v_dst3, v_src3);
+                v_dst4 = _mm_add_pd(v_dst4, v_src4);
+                v_dst5 = _mm_add_pd(v_dst5, v_src5);
+                v_dst6 = _mm_add_pd(v_dst6, v_src6);
+                v_dst7 = _mm_add_pd(v_dst7, v_src7);
+
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+                _mm_storeu_pd(dst + x + 4, v_dst2);
+                _mm_storeu_pd(dst + x + 6, v_dst3);
+                _mm_storeu_pd(dst + x + 8, v_dst4);
+                _mm_storeu_pd(dst + x + 10, v_dst5);
+                _mm_storeu_pd(dst + x + 12, v_dst6);
+                _mm_storeu_pd(dst + x + 14, v_dst7);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct Acc_SIMD<ushort, double>
+{
+    int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            __m128i v_0 = _mm_setzero_si128();
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
+                __m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
+                __m128d v_src0 = _mm_cvtepi32_pd(v_int0);
+                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_src2 = _mm_cvtepi32_pd(v_int1);
+                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
+                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                v_dst2 = _mm_add_pd(v_dst2, v_src2);
+                v_dst3 = _mm_add_pd(v_dst3, v_src3);
+
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+                _mm_storeu_pd(dst + x + 4, v_dst2);
+                _mm_storeu_pd(dst + x + 6, v_dst3);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<uchar, float>
+{
+    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+        __m128i v_0 = _mm_setzero_si128();
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 16; x += 16)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
+                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
+                v_src0 = _mm_mullo_epi16(v_src0, v_src0);
+                v_src1 = _mm_mullo_epi16(v_src1, v_src1);
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
+                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
+                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
+            }
+        }
+        else if (cn == 1)
+        {
+
+            __m128i v_255 = _mm_set1_epi8(-1);
+            for ( ; x <= len - 16; x += 16)
+            {
+
+                __m128i v_src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src + x)), _mm_xor_si128(v_255, _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(mask + x)), v_0)));
+                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
+                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
+                v_src0 = _mm_mullo_epi16(v_src0, v_src0);
+                v_src1 = _mm_mullo_epi16(v_src1, v_src1);
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
+                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
+                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<ushort, float>
+{
+    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+        __m128i v_0 = _mm_setzero_si128();
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128i v_src0 = _mm_unpacklo_epi16(v_src, v_0);
+                __m128i v_src1 = _mm_unpackhi_epi16(v_src, v_0);
+                v_src0 = _mm_mullo_epi16(v_src0, v_src0);
+                v_src1 = _mm_mullo_epi16(v_src1, v_src1);
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(v_src0)));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(v_src1)));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<uchar, double>
+{
+    int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            __m128i v_0 = _mm_setzero_si128();
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_src = _mm_loadl_epi64((const __m128i*)(src + x));
+                __m128i v_int = _mm_unpacklo_epi8(v_src, v_0);
+                __m128i v_int0 = _mm_unpacklo_epi16(v_int, v_0);
+                __m128i v_int1 = _mm_unpackhi_epi16(v_int, v_0);
+                __m128d v_src0 = _mm_cvtepi32_pd(v_int0);
+                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_src2 = _mm_cvtepi32_pd(v_int1);
+                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
+                v_src0 = _mm_mul_pd(v_src0, v_src0);
+                v_src1 = _mm_mul_pd(v_src1, v_src1);
+                v_src2 = _mm_mul_pd(v_src2, v_src2);
+                v_src3 = _mm_mul_pd(v_src3, v_src3);
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
+                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                v_dst2 = _mm_add_pd(v_dst2, v_src2);
+                v_dst3 = _mm_add_pd(v_dst3, v_src3);
+
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+                _mm_storeu_pd(dst + x + 4, v_dst2);
+                _mm_storeu_pd(dst + x + 6, v_dst3);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccSqr_SIMD<ushort, double>
+{
+    int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            __m128i v_0 = _mm_setzero_si128();
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
+                __m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
+                __m128d v_src0 = _mm_cvtepi32_pd(v_int0);
+                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_src2 = _mm_cvtepi32_pd(v_int1);
+                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
+                v_src0 = _mm_mul_pd(v_src0, v_src0);
+                v_src1 = _mm_mul_pd(v_src1, v_src1);
+                v_src2 = _mm_mul_pd(v_src2, v_src2);
+                v_src3 = _mm_mul_pd(v_src3, v_src3);
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
+                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                v_dst2 = _mm_add_pd(v_dst2, v_src2);
+                v_dst3 = _mm_add_pd(v_dst3, v_src3);
+
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+                _mm_storeu_pd(dst + x + 4, v_dst2);
+                _mm_storeu_pd(dst + x + 6, v_dst3);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<uchar, float>
+{
+    int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        __m128i v_0 = _mm_setzero_si128();
+        len *= cn;
+        if (!mask)
+        {
+            for ( ; x <= len - 16; x += 16)
+            {
+                __m128i v_1src = _mm_loadu_si128((const __m128i*)(src1 + x));
+                __m128i v_2src = _mm_loadu_si128((const __m128i*)(src2 + x));
+
+                __m128i v_1src0 = _mm_unpacklo_epi8(v_1src, v_0);
+                __m128i v_1src1 = _mm_unpackhi_epi8(v_1src, v_0);
+                __m128i v_2src0 = _mm_unpacklo_epi8(v_2src, v_0);
+                __m128i v_2src1 = _mm_unpackhi_epi8(v_2src, v_0);
+                __m128i v_src0 = _mm_mullo_epi16(v_1src0, v_2src0);
+                __m128i v_src1 = _mm_mullo_epi16(v_1src1, v_2src1);
+                __m128 v_src00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0));
+                __m128 v_src01 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0));
+                __m128 v_src10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0));
+                __m128 v_src11 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0));
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src00));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src01));
+                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), v_src10));
+                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), v_src11));
+            }
+        }
+        else if (cn == 1)
+        {
+            __m128i v_255 = _mm_set1_epi8(-1);
+
+            for ( ; x <= len - 16; x += 16)
+            {
+                __m128i v_mask = _mm_loadu_si128((const __m128i*)(mask + x));
+                v_mask = _mm_xor_si128(v_255, _mm_cmpeq_epi8(v_mask, v_0));
+                __m128i v_1src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src1 + x)), v_mask);
+                __m128i v_2src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src2 + x)), v_mask);
+
+                __m128i v_1src0 = _mm_unpacklo_epi8(v_1src, v_0);
+                __m128i v_1src1 = _mm_unpackhi_epi8(v_1src, v_0);
+                __m128i v_2src0 = _mm_unpacklo_epi8(v_2src, v_0);
+                __m128i v_2src1 = _mm_unpackhi_epi8(v_2src, v_0);
+                __m128i v_src0 = _mm_mullo_epi16(v_1src0, v_2src0);
+                __m128i v_src1 = _mm_mullo_epi16(v_1src1, v_2src1);
+                __m128 v_src00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0));
+                __m128 v_src01 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0));
+                __m128 v_src10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0));
+                __m128 v_src11 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0));
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src00));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src01));
+                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), v_src10));
+                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), v_src11));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<ushort, float>
+{
+    int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+        __m128i v_0 = _mm_setzero_si128();
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_1src = _mm_loadu_si128((const __m128i*)(src1 + x));
+                __m128i v_2src = _mm_loadu_si128((const __m128i*)(src2 + x));
+                __m128 v_1src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_1src, v_0));
+                __m128 v_1src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_1src, v_0));
+                __m128 v_2src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_2src, v_0));
+                __m128 v_2src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_2src, v_0));
+                __m128 v_src0 = _mm_mul_ps(v_1src0, v_2src0);
+                __m128 v_src1 = _mm_mul_ps(v_1src1, v_2src1);
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
+            }
+        }
+        else if (cn == 1)
+        {
+            __m128i v_65535 = _mm_set1_epi16(-1);
+
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_mask = _mm_loadl_epi64((const __m128i*)(mask + x));
+                __m128i v_mask0 = _mm_unpacklo_epi8(v_mask, v_0);
+                v_mask0 = _mm_xor_si128(v_65535, _mm_cmpeq_epi16(v_mask0, v_0));
+
+                __m128i v_1src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src1 + x)), v_mask0);
+                __m128i v_2src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src2 + x)), v_mask0);
+                __m128 v_1src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_1src, v_0));
+                __m128 v_1src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_1src, v_0));
+                __m128 v_2src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_2src, v_0));
+                __m128 v_2src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_2src, v_0));
+                __m128 v_src0 = _mm_mul_ps(v_1src0, v_2src0);
+                __m128 v_src1 = _mm_mul_ps(v_1src1, v_2src1);
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<uchar, double>
+{
+    int operator() (const uchar * src1, const uchar * src2, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            __m128i v_0 = _mm_setzero_si128();
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_1src = _mm_loadl_epi64((const __m128i*)(src1 + x));
+                __m128i v_2src = _mm_loadl_epi64((const __m128i*)(src2 + x));
+
+                __m128i v_1int = _mm_unpacklo_epi8(v_1src, v_0);
+                __m128i v_2int = _mm_unpacklo_epi8(v_2src, v_0);
+                __m128i v_1int0 = _mm_unpacklo_epi16(v_1int, v_0);
+                __m128i v_1int1 = _mm_unpackhi_epi16(v_1int, v_0);
+                __m128i v_2int0 = _mm_unpacklo_epi16(v_2int, v_0);
+                __m128i v_2int1 = _mm_unpackhi_epi16(v_2int, v_0);
+                __m128d v_src0 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int0),_mm_cvtepi32_pd(v_2int0));
+                __m128d v_src1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int0, _MM_SHUFFLE(0, 0, 3, 2))),_mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int0, _MM_SHUFFLE(0, 0, 3, 2))));
+                __m128d v_src2 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int1),_mm_cvtepi32_pd(v_2int1));
+                __m128d v_src3 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int1, _MM_SHUFFLE(0, 0, 3, 2))),_mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int1, _MM_SHUFFLE(0, 0, 3, 2))));
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
+                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                v_dst2 = _mm_add_pd(v_dst2, v_src2);
+                v_dst3 = _mm_add_pd(v_dst3, v_src3);
+
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+                _mm_storeu_pd(dst + x + 4, v_dst2);
+                _mm_storeu_pd(dst + x + 6, v_dst3);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccProd_SIMD<ushort, double>
+{
+    int operator() (const ushort * src1, const ushort * src2, double * dst, const uchar * mask, int len, int cn) const
+    {
+        int x = 0;
+
+        if (!mask)
+        {
+            __m128i v_0 = _mm_setzero_si128();
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_1src = _mm_loadu_si128((const __m128i*)(src1 + x));
+                __m128i v_2src = _mm_loadu_si128((const __m128i*)(src2 + x));
+                __m128i v_1int0 = _mm_unpacklo_epi16(v_1src, v_0);
+                __m128i v_1int1 = _mm_unpackhi_epi16(v_1src, v_0);
+                __m128i v_2int0 = _mm_unpacklo_epi16(v_2src, v_0);
+                __m128i v_2int1 = _mm_unpackhi_epi16(v_2src, v_0);
+                __m128d v_src0 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int0),_mm_cvtepi32_pd(v_2int0));
+                __m128d v_src1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int0, _MM_SHUFFLE(0, 0, 3, 2))),_mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int0, _MM_SHUFFLE(0, 0, 3, 2))));
+                __m128d v_src2 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int1),_mm_cvtepi32_pd(v_2int1));
+                __m128d v_src3 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int1, _MM_SHUFFLE(0, 0, 3, 2))),_mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int1, _MM_SHUFFLE(0, 0, 3, 2))));
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
+                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
+
+                v_dst0 = _mm_add_pd(v_dst0, v_src0);
+                v_dst1 = _mm_add_pd(v_dst1, v_src1);
+                v_dst2 = _mm_add_pd(v_dst2, v_src2);
+                v_dst3 = _mm_add_pd(v_dst3, v_src3);
+
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+                _mm_storeu_pd(dst + x + 4, v_dst2);
+                _mm_storeu_pd(dst + x + 6, v_dst3);
+            }
+        }
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<uchar, float>
+{
+    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
+    {
+        int x = 0;
+        __m128 v_alpha = _mm_set1_ps(alpha);
+        __m128 v_beta = _mm_set1_ps(1.0f - alpha);
+        __m128i v_0 = _mm_setzero_si128();
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 16; x += 16)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
+                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
+                __m128 v_src00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0));
+                __m128 v_src01 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0));
+                __m128 v_src10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0));
+                __m128 v_src11 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0));
+
+                __m128 v_dst00 = _mm_loadu_ps(dst + x);
+                __m128 v_dst01 = _mm_loadu_ps(dst + x + 4);
+                __m128 v_dst10 = _mm_loadu_ps(dst + x + 8);
+                __m128 v_dst11 = _mm_loadu_ps(dst + x + 12);
+
+                v_dst00 = _mm_add_ps(_mm_mul_ps(v_dst00, v_beta), _mm_mul_ps(v_src00, v_alpha));
+                v_dst01 = _mm_add_ps(_mm_mul_ps(v_dst01, v_beta), _mm_mul_ps(v_src01, v_alpha));
+                v_dst10 = _mm_add_ps(_mm_mul_ps(v_dst10, v_beta), _mm_mul_ps(v_src10, v_alpha));
+                v_dst11 = _mm_add_ps(_mm_mul_ps(v_dst11, v_beta), _mm_mul_ps(v_src11, v_alpha));
+
+                _mm_storeu_ps(dst + x, v_dst00);
+                _mm_storeu_ps(dst + x + 4, v_dst01);
+                _mm_storeu_ps(dst + x + 8, v_dst10);
+                _mm_storeu_ps(dst + x + 12, v_dst11);
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<ushort, float>
+{
+    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
+    {
+        int x = 0;
+        __m128 v_alpha = _mm_set1_ps(alpha);
+        __m128 v_beta = _mm_set1_ps(1.0f - alpha);
+        __m128i v_0 = _mm_setzero_si128();
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128 v_src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_0));
+                __m128 v_src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_0));
+                v_src0 = _mm_mul_ps(v_src0, v_alpha);
+                v_src1 = _mm_mul_ps(v_src1, v_alpha);
+
+                __m128 v_dst0 = _mm_mul_ps(_mm_loadu_ps(dst + x), v_beta);
+                __m128 v_dst1 = _mm_mul_ps(_mm_loadu_ps(dst + x + 4), v_beta);
+
+                _mm_storeu_ps(dst + x, _mm_add_ps(v_dst0, v_src0));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(v_dst1, v_src1));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<uchar, double>
+{
+    int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
+    {
+        int x = 0;
+        __m128d v_alpha = _mm_set1_pd(alpha);
+        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
+        __m128i v_0 = _mm_setzero_si128();
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_src = _mm_loadl_epi64((const __m128i*)(src + x));
+                __m128i v_int = _mm_unpacklo_epi8(v_src, v_0);
+                __m128i v_int0 = _mm_unpacklo_epi16(v_int, v_0);
+                __m128i v_int1 = _mm_unpackhi_epi16(v_int, v_0);
+                __m128d v_src0 = _mm_cvtepi32_pd(v_int0);
+                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_src2 = _mm_cvtepi32_pd(v_int1);
+                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
+
+                __m128d v_dst0 = _mm_loadu_pd(dst + x);
+                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
+                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
+                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
+
+                v_dst0 = _mm_add_pd(_mm_mul_pd(v_dst0, v_beta), _mm_mul_pd(v_src0, v_alpha));
+                v_dst1 = _mm_add_pd(_mm_mul_pd(v_dst1, v_beta), _mm_mul_pd(v_src1, v_alpha));
+                v_dst2 = _mm_add_pd(_mm_mul_pd(v_dst2, v_beta), _mm_mul_pd(v_src2, v_alpha));
+                v_dst3 = _mm_add_pd(_mm_mul_pd(v_dst3, v_beta), _mm_mul_pd(v_src3, v_alpha));
+
+                _mm_storeu_pd(dst + x, v_dst0);
+                _mm_storeu_pd(dst + x + 2, v_dst1);
+                _mm_storeu_pd(dst + x + 4, v_dst2);
+                _mm_storeu_pd(dst + x + 6, v_dst3);
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct AccW_SIMD<ushort, double>
+{
+    int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
+    {
+        int x = 0;
+        __m128d v_alpha = _mm_set1_pd(alpha);
+        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
+        __m128i v_0 = _mm_setzero_si128();
+
+        if (!mask)
+        {
+            len *= cn;
+            for ( ; x <= len - 8; x += 8)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
+                __m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
+                __m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
+                __m128d v_src00 = _mm_cvtepi32_pd(v_int0);
+                __m128d v_src01 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
+                __m128d v_src10 = _mm_cvtepi32_pd(v_int1);
+                __m128d v_src11 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
+
+                __m128d v_dst00 = _mm_loadu_pd(dst + x);
+                __m128d v_dst01 = _mm_loadu_pd(dst + x + 2);
+                __m128d v_dst10 = _mm_loadu_pd(dst + x + 4);
+                __m128d v_dst11 = _mm_loadu_pd(dst + x + 6);
+
+                v_dst00 = _mm_add_pd(_mm_mul_pd(v_dst00, v_beta), _mm_mul_pd(v_src00, v_alpha));
+                v_dst01 = _mm_add_pd(_mm_mul_pd(v_dst01, v_beta), _mm_mul_pd(v_src01, v_alpha));
+                v_dst10 = _mm_add_pd(_mm_mul_pd(v_dst10, v_beta), _mm_mul_pd(v_src10, v_alpha));
+                v_dst11 = _mm_add_pd(_mm_mul_pd(v_dst11, v_beta), _mm_mul_pd(v_src11, v_alpha));
+
+                _mm_storeu_pd(dst + x, v_dst00);
+                _mm_storeu_pd(dst + x + 2, v_dst01);
+                _mm_storeu_pd(dst + x + 4, v_dst10);
+                _mm_storeu_pd(dst + x + 6, v_dst11);
+            }
+        }
+
+        return x;
+    }
+};
+#endif //CV_SSE2
+
 #if CV_NEON
 template <>
diff --git a/modules/video/test/test_accum.cpp b/modules/video/test/test_accum.cpp
index 6895bb4eae..fe045c6ea6 100644
--- a/modules/video/test/test_accum.cpp
+++ b/modules/video/test/test_accum.cpp
@@ -72,11 +72,11 @@ void CV_AccumBaseTest::get_test_array_types_and_sizes( int test_case_idx,
                                         vector<vector<Size> >& sizes, vector<vector<int> >& types )
 {
     RNG& rng = ts->get_rng();
-    int depth = cvtest::randInt(rng) % 3, cn = cvtest::randInt(rng) & 1 ? 3 : 1;
-    int accdepth = std::max((int)(cvtest::randInt(rng) % 2 + 1), depth);
+    int depth = cvtest::randInt(rng) % 4, cn = cvtest::randInt(rng) & 1 ? 3 : 1;
+    int accdepth = (int)(cvtest::randInt(rng) % 2 + 1);
     int i, input_count = (int)test_array[INPUT].size();
     cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
-    depth = depth == 0 ? CV_8U : depth == 1 ? CV_32F : CV_64F;
+    depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_32F : CV_64F;
+    accdepth = accdepth == 1 ? CV_32F : CV_64F;
+    accdepth = MAX(accdepth, depth);