opencv/modules/imgproc/src/sumpixels.simd.hpp

1148 lines
61 KiB
C++
Raw Normal View History

/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2020 Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/hal/intrin.hpp"
2020-01-17 21:54:08 +08:00
#if CV_AVX512_SKX
#include "sumpixels.avx512_skx.hpp"
#endif
namespace cv { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
// forward declarations
// Declaration of the SIMD integral-image entry point; the definition is not
// part of this section of the file.
//
// Arguments, in order: three depth codes (source, sum, squared sum), then a
// base pointer + step pair for each of the source, sum, squared-sum and tilted
// buffers, then the image width, height and channel count.
// NOTE(review): the steps are presumably byte strides (the specializations
// below apply them to uchar* pointers) and the bool result presumably reports
// whether a SIMD implementation handled the request -- confirm against the
// definition.
bool integral_SIMD(
int depth, int sdepth, int sqdepth,
const uchar* src, size_t srcstep,
uchar* sum, size_t sumstep,
uchar* sqsum, size_t sqsumstep,
uchar* tilted, size_t tstep,
int width, int height, int cn);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
namespace {
2014-12-29 20:38:49 +08:00
// Generic fallback for type combinations that have no vectorized
// implementation. The call operator ignores every argument and returns false,
// i.e. it performs no work and reports that nothing was computed.
//
// Template parameters:
//   T  - source element type
//   ST - element type of the sum and tilted-sum buffers
//   QT - element type of the squared-sum buffer
template <typename T, typename ST, typename QT>
struct Integral_SIMD
{
    // Parameters mirror the specializations: (src, step), (sum, step),
    // (sqsum, step), (tilted, step), width, height, cn. All are unused here.
    bool operator()(const T *, size_t,
                    ST *, size_t,
                    QT *, size_t,
                    ST *, size_t,
                    int, int, int) const
    {
        return false;
    }
};
#if CV_SIMD && CV_SIMD_WIDTH <= 64
2014-12-29 20:38:49 +08:00
template <>
struct Integral_SIMD<uchar, int, double>
{
Integral_SIMD() {}
2014-12-29 20:38:49 +08:00
bool operator()(const uchar * src, size_t _srcstep,
int * sum, size_t _sumstep,
double * sqsum, size_t,
int * tilted, size_t,
2016-08-31 23:43:43 +08:00
int width, int height, int cn) const
2014-12-29 20:38:49 +08:00
{
if (sqsum || tilted || cn > 4)
return false;
#if !CV_SSE4_1 && CV_SSE2
// 3 channel code is slower for SSE2 & SSE3
if (cn == 3)
2014-12-29 20:38:49 +08:00
return false;
#endif
width *= cn;
2014-12-29 20:38:49 +08:00
// the first iteration
memset(sum, 0, (width + cn) * sizeof(int));
2014-12-29 20:38:49 +08:00
if (cn == 1)
2014-12-29 20:38:49 +08:00
{
// the others
for (int i = 0; i < height; ++i)
{
const uchar * src_row = src + _srcstep * i;
int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1;
int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1;
2014-12-29 20:38:49 +08:00
sum_row[-1] = 0;
v_int32 prev = vx_setzero_s32();
int j = 0;
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_int32 el4l, el4h;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
__m256i shmask = _mm256_set1_epi32(7);
el4l.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum)), prev.val);
el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask));
prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask);
#else
el8 += v_rotate_left<1>(el8);
el8 += v_rotate_left<2>(el8);
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
#endif
#endif
v_expand(el8, el4l, el4h);
el4l += prev;
el4h += el4l;
prev = v_broadcast_element<v_int32::nlanes - 1>(el4h);
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
}
2014-12-29 20:38:49 +08:00
for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
}
}
else if (cn == 2)
{
// the others
v_int16 mask = vx_setall_s16((short)0xff);
for (int i = 0; i < height; ++i)
2014-12-29 20:38:49 +08:00
{
const uchar * src_row = src + _srcstep * i;
int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn;
int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn;
sum_row[-1] = sum_row[-2] = 0;
v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32();
int j = 0;
for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
{
v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
v_int16 el8_1 = v_src_row & mask;
v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
v_int32 el4l_1, el4h_1, el4l_2, el4h_2;
Merge pull request #14210 from terfendail:wui_512 AVX512 wide universal intrinsics (#14210) * Added implementation of 512-bit wide universal intrinsics(WIP) * Added implementation of 512-bit wide universal intrinsics: implemented WUI vector types(WIP) * Added implementation of 512-bit wide universal intrinsics(WIP): implemented load/store * Added implementation of 512-bit wide universal intrinsics(WIP): implemented fp16 load/store * Added implementation of 512-bit wide universal intrinsics(WIP): implemented recombine and zip, implemented non-saturating and saturating arithmetics * Added implementation of 512-bit wide universal intrinsics(WIP): implemented bit operations * Added implementation of 512-bit wide universal intrinsics(WIP): implemented comparisons * Added implementation of 512-bit wide universal intrinsics(WIP): implemented lane shifts and reduction * Added implementation of 512-bit wide universal intrinsics(WIP): implemented absolute values * Added implementation of 512-bit wide universal intrinsics(WIP): implemented rounding and cast to float * Added implementation of 512-bit wide universal intrinsics(WIP): implemented LUT * Added implementation of 512-bit wide universal intrinsics(WIP): implemented type extension/narrowing and matrix operations * Added implementation of 512-bit wide universal intrinsics(WIP): implemented load_deinterleave for 2 and 3 channels images * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented load_deinterleave for 2- and implemented for 4-channel images * Added implementation of 512-bit wide universal intrinsics(WIP): implemented store_interleave * Added implementation of 512-bit wide universal intrinsics(WIP): implemented signmask and checks * Added implementation of 512-bit wide universal intrinsics(WIP): build fixes * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented popcount in case AVX512_BITALG is unavailable * Added implementation of 512-bit wide universal 
intrinsics(WIP): reimplemented zip * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented rotate for s8 and s16 * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented interleave/deinterleave for s8 and s16 * Added implementation of 512-bit wide universal intrinsics(WIP): updated v512_set macros * Added implementation of 512-bit wide universal intrinsics(WIP): fix for GCC wrong _mm512_abs_pd definition * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_zip to avoid AVX512_VBMI intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_invsqrt to avoid AVX512_ER intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_rotate, v_popcount and interleave/deinterleave for U8 to avoid AVX512_VBMI intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): fixed integral image SIMD part * Added implementation of 512-bit wide universal intrinsics(WIP): fixed warnings * Added implementation of 512-bit wide universal intrinsics(WIP): fixed load_deinterleave for u8 and u16 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed v_invsqrt accuracy for f64 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave/deinterleave for u32 and u64 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave_pairs, interleave_quads and pack_triplets * Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left * Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left/right, part 2 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed 512-wide universal intrinsics based resize * Added implementation of 512-bit wide universal intrinsics(WIP): fixed findContours by avoiding use of uint64 dependent 512-wide v_signmask() * Added implementation of 512-bit wide universal intrinsics(WIP): fixed trailing whitespaces * 
Added implementation of 512-bit wide universal intrinsics(WIP): reworked specific intrinsic sets dependent parts to check availability of intrinsics based on CPU feature group defines * Added implementation of 512-bit wide universal intrinsics(WIP):Updated AVX512 implementation of v_popcount to avoid AVX512VPOPCNTDQ intrinsics if unavailable. * Added implementation of 512-bit wide universal intrinsics(WIP): Fixed universal intrinsics data initialisation, v_mul_wrap, v_floor, v_ceil and v_signmask. * Added implementation of 512-bit wide universal intrinsics(WIP): Removed hasSIMD512() * Added implementation of 512-bit wide universal intrinsics(WIP): Fixes for gcc build * Added implementation of 512-bit wide universal intrinsics(WIP): Reworked v_signmask, v_check_any() and v_check_all() implementation.
2019-06-03 23:05:35 +08:00
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
__m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2));
vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4));
vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4));
vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8));
vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8));
__m256i shmask = _mm256_set1_epi32(7);
el4l_1.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_1)), prev_1.val);
el4l_2.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_2)), prev_2.val);
el4h_1.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_1)), _mm256_permutevar8x32_epi32(el4l_1.val, shmask));
el4h_2.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_2)), _mm256_permutevar8x32_epi32(el4l_2.val, shmask));
prev_1.val = _mm256_permutevar8x32_epi32(el4h_1.val, shmask);
prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
Merge pull request #14210 from terfendail:wui_512 AVX512 wide universal intrinsics (#14210) * Added implementation of 512-bit wide universal intrinsics(WIP) * Added implementation of 512-bit wide universal intrinsics: implemented WUI vector types(WIP) * Added implementation of 512-bit wide universal intrinsics(WIP): implemented load/store * Added implementation of 512-bit wide universal intrinsics(WIP): implemented fp16 load/store * Added implementation of 512-bit wide universal intrinsics(WIP): implemented recombine and zip, implemented non-saturating and saturating arithmetics * Added implementation of 512-bit wide universal intrinsics(WIP): implemented bit operations * Added implementation of 512-bit wide universal intrinsics(WIP): implemented comparisons * Added implementation of 512-bit wide universal intrinsics(WIP): implemented lane shifts and reduction * Added implementation of 512-bit wide universal intrinsics(WIP): implemented absolute values * Added implementation of 512-bit wide universal intrinsics(WIP): implemented rounding and cast to float * Added implementation of 512-bit wide universal intrinsics(WIP): implemented LUT * Added implementation of 512-bit wide universal intrinsics(WIP): implemented type extension/narrowing and matrix operations * Added implementation of 512-bit wide universal intrinsics(WIP): implemented load_deinterleave for 2 and 3 channels images * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented load_deinterleave for 2- and implemented for 4-channel images * Added implementation of 512-bit wide universal intrinsics(WIP): implemented store_interleave * Added implementation of 512-bit wide universal intrinsics(WIP): implemented signmask and checks * Added implementation of 512-bit wide universal intrinsics(WIP): build fixes * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented popcount in case AVX512_BITALG is unavailable * Added implementation of 512-bit wide universal 
intrinsics(WIP): reimplemented zip * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented rotate for s8 and s16 * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented interleave/deinterleave for s8 and s16 * Added implementation of 512-bit wide universal intrinsics(WIP): updated v512_set macros * Added implementation of 512-bit wide universal intrinsics(WIP): fix for GCC wrong _mm512_abs_pd definition * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_zip to avoid AVX512_VBMI intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_invsqrt to avoid AVX512_ER intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_rotate, v_popcount and interleave/deinterleave for U8 to avoid AVX512_VBMI intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): fixed integral image SIMD part * Added implementation of 512-bit wide universal intrinsics(WIP): fixed warnings * Added implementation of 512-bit wide universal intrinsics(WIP): fixed load_deinterleave for u8 and u16 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed v_invsqrt accuracy for f64 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave/deinterleave for u32 and u64 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave_pairs, interleave_quads and pack_triplets * Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left * Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left/right, part 2 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed 512-wide universal intrinsics based resize * Added implementation of 512-bit wide universal intrinsics(WIP): fixed findContours by avoiding use of uint64 dependent 512-wide v_signmask() * Added implementation of 512-bit wide universal intrinsics(WIP): fixed trailing whitespaces * 
Added implementation of 512-bit wide universal intrinsics(WIP): reworked specific intrinsic sets dependent parts to check availability of intrinsics based on CPU feature group defines * Added implementation of 512-bit wide universal intrinsics(WIP):Updated AVX512 implementation of v_popcount to avoid AVX512VPOPCNTDQ intrinsics if unavailable. * Added implementation of 512-bit wide universal intrinsics(WIP): Fixed universal intrinsics data initialisation, v_mul_wrap, v_floor, v_ceil and v_signmask. * Added implementation of 512-bit wide universal intrinsics(WIP): Removed hasSIMD512() * Added implementation of 512-bit wide universal intrinsics(WIP): Fixes for gcc build * Added implementation of 512-bit wide universal intrinsics(WIP): Reworked v_signmask, v_check_any() and v_check_all() implementation.
2019-06-03 23:05:35 +08:00
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
#endif
#endif
v_expand(el8_1, el4l_1, el4h_1);
v_expand(el8_2, el4l_2, el4h_2);
el4l_1 += prev_1;
el4l_2 += prev_2;
el4h_1 += el4l_1;
el4h_2 += el4l_2;
prev_1 = v_broadcast_element<v_int32::nlanes - 1>(el4h_1);
prev_2 = v_broadcast_element<v_int32::nlanes - 1>(el4h_2);
#endif
v_int32 el4_1, el4_2, el4_3, el4_4;
v_zip(el4l_1, el4l_2, el4_1, el4_2);
v_zip(el4h_1, el4h_2, el4_3, el4_4);
v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_int32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_int32::nlanes ));
v_store(sum_row + j + v_int32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2));
v_store(sum_row + j + v_int32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_int32::nlanes * 3));
}
for (int v2 = sum_row[j - 1] - prev_sum_row[j - 1],
v1 = sum_row[j - 2] - prev_sum_row[j - 2]; j < width; j += 2)
{
sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
}
}
}
#if CV_SSE4_1 || !CV_SSE2
else if (cn == 3)
{
// the others
for (int i = 0; i < height; ++i)
{
const uchar * src_row = src + _srcstep * i;
int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn;
int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn;
int row_cache[v_int32::nlanes * 6];
sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32(),
prev_3 = vx_setzero_s32();
int j = 0;
for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
{
v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
v_int16 el8_1 = v_reinterpret_as_s16(v_expand_low(v_src_row_1));
v_int16 el8_2 = v_reinterpret_as_s16(v_expand_low(v_src_row_2));
v_int16 el8_3 = v_reinterpret_as_s16(v_expand_low(v_src_row_3));
v_int32 el4l_1, el4h_1, el4l_2, el4h_2, el4l_3, el4h_3;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
__m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2));
__m256i vsum_3 = _mm256_add_epi16(el8_3.val, _mm256_slli_si256(el8_3.val, 2));
vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4));
vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4));
vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 4));
vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8));
vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8));
vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 8));
__m256i shmask = _mm256_set1_epi32(7);
el4l_1.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_1)), prev_1.val);
el4l_2.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_2)), prev_2.val);
el4l_3.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_3)), prev_3.val);
el4h_1.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_1)), _mm256_permutevar8x32_epi32(el4l_1.val, shmask));
el4h_2.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_2)), _mm256_permutevar8x32_epi32(el4l_2.val, shmask));
el4h_3.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_3)), _mm256_permutevar8x32_epi32(el4l_3.val, shmask));
prev_1.val = _mm256_permutevar8x32_epi32(el4h_1.val, shmask);
prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask);
prev_3.val = _mm256_permutevar8x32_epi32(el4h_3.val, shmask);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_3 += v_rotate_left<1>(el8_3);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
el8_3 += v_rotate_left<2>(el8_3);
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
el8_3 += v_rotate_left<4>(el8_3);
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
el8_3 += v_rotate_left<8>(el8_3);
#endif
#endif
v_expand(el8_1, el4l_1, el4h_1);
v_expand(el8_2, el4l_2, el4h_2);
v_expand(el8_3, el4l_3, el4h_3);
el4l_1 += prev_1;
el4l_2 += prev_2;
el4l_3 += prev_3;
el4h_1 += el4l_1;
el4h_2 += el4l_2;
el4h_3 += el4l_3;
prev_1 = v_broadcast_element<v_int32::nlanes - 1>(el4h_1);
prev_2 = v_broadcast_element<v_int32::nlanes - 1>(el4h_2);
prev_3 = v_broadcast_element<v_int32::nlanes - 1>(el4h_3);
#endif
v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3);
v_store_interleave(row_cache + v_int32::nlanes * 3, el4h_1, el4h_2, el4h_3);
el4l_1 = vx_load(row_cache );
el4l_2 = vx_load(row_cache + v_int32::nlanes );
el4l_3 = vx_load(row_cache + v_int32::nlanes * 2);
el4h_1 = vx_load(row_cache + v_int32::nlanes * 3);
el4h_2 = vx_load(row_cache + v_int32::nlanes * 4);
el4h_3 = vx_load(row_cache + v_int32::nlanes * 5);
v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_int32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_int32::nlanes ));
v_store(sum_row + j + v_int32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2));
v_store(sum_row + j + v_int32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_int32::nlanes * 3));
v_store(sum_row + j + v_int32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_int32::nlanes * 4));
v_store(sum_row + j + v_int32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 5));
}
for (int v3 = sum_row[j - 1] - prev_sum_row[j - 1],
v2 = sum_row[j - 2] - prev_sum_row[j - 2],
v1 = sum_row[j - 3] - prev_sum_row[j - 3]; j < width; j += 3)
{
sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
}
}
}
#endif
else if (cn == 4)
{
// the others
for (int i = 0; i < height; ++i)
{
const uchar * src_row = src + _srcstep * i;
int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn;
int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn;
2014-12-29 20:38:49 +08:00
sum_row[-1] = sum_row[-2] = sum_row[-3] = sum_row[-4] = 0;
v_int32 prev = vx_setzero_s32();
int j = 0;
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_int32 el4l, el4h;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 8));
el4l.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum)), prev.val);
el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permute2x128_si256(el4l.val, el4l.val, 0x31));
prev.val = _mm256_permute2x128_si256(el4h.val, el4h.val, 0x31);
#else
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
#endif
#endif
v_expand(el8, el4l, el4h);
el4l += prev;
el4h += el4l;
#if CV_SIMD_WIDTH == 16
prev = el4h;
#elif CV_SIMD_WIDTH == 32
prev = v_combine_high(el4h, el4h);
#else
v_int32 t = v_rotate_right<12>(el4h);
t |= v_rotate_left<4>(t);
prev = v_combine_low(t, t);
#endif
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
}
for (int v4 = sum_row[j - 1] - prev_sum_row[j - 1],
v3 = sum_row[j - 2] - prev_sum_row[j - 2],
v2 = sum_row[j - 3] - prev_sum_row[j - 3],
v1 = sum_row[j - 4] - prev_sum_row[j - 4]; j < width; j += 4)
{
sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
sum_row[j + 3] = (v4 += src_row[j + 3]) + prev_sum_row[j + 3];
}
}
}
else
{
return false;
}
vx_cleanup();
return true;
}
};
2014-12-29 20:38:49 +08:00
template <>
struct Integral_SIMD<uchar, float, double>
{
Integral_SIMD() {}
2014-12-29 20:38:49 +08:00
bool operator()(const uchar * src, size_t _srcstep,
float * sum, size_t _sumstep,
double * sqsum, size_t,
float * tilted, size_t,
int width, int height, int cn) const
{
if (sqsum || tilted || cn > 4)
return false;
2014-12-29 20:38:49 +08:00
width *= cn;
// the first iteration
memset(sum, 0, (width + cn) * sizeof(float));
2014-12-29 20:38:49 +08:00
if (cn == 1)
{
// the others
for (int i = 0; i < height; ++i)
{
const uchar * src_row = src + _srcstep * i;
float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + 1;
float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + 1;
sum_row[-1] = 0;
2014-12-29 20:38:49 +08:00
v_float32 prev = vx_setzero_f32();
int j = 0;
for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_float32 el4l, el4h;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
__m256i shmask = _mm256_set1_epi32(7);
el4l.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum))), prev.val);
el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask));
prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask);
#else
el8 += v_rotate_left<1>(el8);
el8 += v_rotate_left<2>(el8);
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
#endif
#endif
v_int32 el4li, el4hi;
v_expand(el8, el4li, el4hi);
el4l = v_cvt_f32(el4li) + prev;
el4h = v_cvt_f32(el4hi) + el4l;
prev = v_broadcast_element<v_float32::nlanes - 1>(el4h);
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
}
2014-12-29 20:38:49 +08:00
for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
}
}
else if (cn == 2)
{
// the others
v_int16 mask = vx_setall_s16((short)0xff);
for (int i = 0; i < height; ++i)
{
const uchar * src_row = src + _srcstep * i;
float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn;
float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn;
sum_row[-1] = sum_row[-2] = 0;
v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32();
int j = 0;
for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
{
v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
v_int16 el8_1 = v_src_row & mask;
v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
v_float32 el4l_1, el4h_1, el4l_2, el4h_2;
Merge pull request #14210 from terfendail:wui_512 AVX512 wide universal intrinsics (#14210) * Added implementation of 512-bit wide universal intrinsics(WIP) * Added implementation of 512-bit wide universal intrinsics: implemented WUI vector types(WIP) * Added implementation of 512-bit wide universal intrinsics(WIP): implemented load/store * Added implementation of 512-bit wide universal intrinsics(WIP): implemented fp16 load/store * Added implementation of 512-bit wide universal intrinsics(WIP): implemented recombine and zip, implemented non-saturating and saturating arithmetics * Added implementation of 512-bit wide universal intrinsics(WIP): implemented bit operations * Added implementation of 512-bit wide universal intrinsics(WIP): implemented comparisons * Added implementation of 512-bit wide universal intrinsics(WIP): implemented lane shifts and reduction * Added implementation of 512-bit wide universal intrinsics(WIP): implemented absolute values * Added implementation of 512-bit wide universal intrinsics(WIP): implemented rounding and cast to float * Added implementation of 512-bit wide universal intrinsics(WIP): implemented LUT * Added implementation of 512-bit wide universal intrinsics(WIP): implemented type extension/narrowing and matrix operations * Added implementation of 512-bit wide universal intrinsics(WIP): implemented load_deinterleave for 2 and 3 channels images * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented load_deinterleave for 2- and implemented for 4-channel images * Added implementation of 512-bit wide universal intrinsics(WIP): implemented store_interleave * Added implementation of 512-bit wide universal intrinsics(WIP): implemented signmask and checks * Added implementation of 512-bit wide universal intrinsics(WIP): build fixes * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented popcount in case AVX512_BITALG is unavailable * Added implementation of 512-bit wide universal 
intrinsics(WIP): reimplemented zip * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented rotate for s8 and s16 * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented interleave/deinterleave for s8 and s16 * Added implementation of 512-bit wide universal intrinsics(WIP): updated v512_set macros * Added implementation of 512-bit wide universal intrinsics(WIP): fix for GCC wrong _mm512_abs_pd definition * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_zip to avoid AVX512_VBMI intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_invsqrt to avoid AVX512_ER intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_rotate, v_popcount and interleave/deinterleave for U8 to avoid AVX512_VBMI intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): fixed integral image SIMD part * Added implementation of 512-bit wide universal intrinsics(WIP): fixed warnings * Added implementation of 512-bit wide universal intrinsics(WIP): fixed load_deinterleave for u8 and u16 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed v_invsqrt accuracy for f64 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave/deinterleave for u32 and u64 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave_pairs, interleave_quads and pack_triplets * Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left * Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left/right, part 2 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed 512-wide universal intrinsics based resize * Added implementation of 512-bit wide universal intrinsics(WIP): fixed findContours by avoiding use of uint64 dependent 512-wide v_signmask() * Added implementation of 512-bit wide universal intrinsics(WIP): fixed trailing whitespaces * 
Added implementation of 512-bit wide universal intrinsics(WIP): reworked specific intrinsic sets dependent parts to check availability of intrinsics based on CPU feature group defines * Added implementation of 512-bit wide universal intrinsics(WIP):Updated AVX512 implementation of v_popcount to avoid AVX512VPOPCNTDQ intrinsics if unavailable. * Added implementation of 512-bit wide universal intrinsics(WIP): Fixed universal intrinsics data initialisation, v_mul_wrap, v_floor, v_ceil and v_signmask. * Added implementation of 512-bit wide universal intrinsics(WIP): Removed hasSIMD512() * Added implementation of 512-bit wide universal intrinsics(WIP): Fixes for gcc build * Added implementation of 512-bit wide universal intrinsics(WIP): Reworked v_signmask, v_check_any() and v_check_all() implementation.
2019-06-03 23:05:35 +08:00
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
__m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2));
vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4));
vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4));
vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8));
vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8));
__m256i shmask = _mm256_set1_epi32(7);
el4l_1.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_1))), prev_1.val);
el4l_2.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_2))), prev_2.val);
el4h_1.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_1))), _mm256_permutevar8x32_ps(el4l_1.val, shmask));
el4h_2.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_2))), _mm256_permutevar8x32_ps(el4l_2.val, shmask));
prev_1.val = _mm256_permutevar8x32_ps(el4h_1.val, shmask);
prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
Merge pull request #14210 from terfendail:wui_512 AVX512 wide universal intrinsics (#14210) * Added implementation of 512-bit wide universal intrinsics(WIP) * Added implementation of 512-bit wide universal intrinsics: implemented WUI vector types(WIP) * Added implementation of 512-bit wide universal intrinsics(WIP): implemented load/store * Added implementation of 512-bit wide universal intrinsics(WIP): implemented fp16 load/store * Added implementation of 512-bit wide universal intrinsics(WIP): implemented recombine and zip, implemented non-saturating and saturating arithmetics * Added implementation of 512-bit wide universal intrinsics(WIP): implemented bit operations * Added implementation of 512-bit wide universal intrinsics(WIP): implemented comparisons * Added implementation of 512-bit wide universal intrinsics(WIP): implemented lane shifts and reduction * Added implementation of 512-bit wide universal intrinsics(WIP): implemented absolute values * Added implementation of 512-bit wide universal intrinsics(WIP): implemented rounding and cast to float * Added implementation of 512-bit wide universal intrinsics(WIP): implemented LUT * Added implementation of 512-bit wide universal intrinsics(WIP): implemented type extension/narrowing and matrix operations * Added implementation of 512-bit wide universal intrinsics(WIP): implemented load_deinterleave for 2 and 3 channels images * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented load_deinterleave for 2- and implemented for 4-channel images * Added implementation of 512-bit wide universal intrinsics(WIP): implemented store_interleave * Added implementation of 512-bit wide universal intrinsics(WIP): implemented signmask and checks * Added implementation of 512-bit wide universal intrinsics(WIP): build fixes * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented popcount in case AVX512_BITALG is unavailable * Added implementation of 512-bit wide universal 
intrinsics(WIP): reimplemented zip * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented rotate for s8 and s16 * Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented interleave/deinterleave for s8 and s16 * Added implementation of 512-bit wide universal intrinsics(WIP): updated v512_set macros * Added implementation of 512-bit wide universal intrinsics(WIP): fix for GCC wrong _mm512_abs_pd definition * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_zip to avoid AVX512_VBMI intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_invsqrt to avoid AVX512_ER intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_rotate, v_popcount and interleave/deinterleave for U8 to avoid AVX512_VBMI intrinsics * Added implementation of 512-bit wide universal intrinsics(WIP): fixed integral image SIMD part * Added implementation of 512-bit wide universal intrinsics(WIP): fixed warnings * Added implementation of 512-bit wide universal intrinsics(WIP): fixed load_deinterleave for u8 and u16 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed v_invsqrt accuracy for f64 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave/deinterleave for u32 and u64 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave_pairs, interleave_quads and pack_triplets * Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left * Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left/right, part 2 * Added implementation of 512-bit wide universal intrinsics(WIP): fixed 512-wide universal intrinsics based resize * Added implementation of 512-bit wide universal intrinsics(WIP): fixed findContours by avoiding use of uint64 dependent 512-wide v_signmask() * Added implementation of 512-bit wide universal intrinsics(WIP): fixed trailing whitespaces * 
Added implementation of 512-bit wide universal intrinsics(WIP): reworked specific intrinsic sets dependent parts to check availability of intrinsics based on CPU feature group defines * Added implementation of 512-bit wide universal intrinsics(WIP):Updated AVX512 implementation of v_popcount to avoid AVX512VPOPCNTDQ intrinsics if unavailable. * Added implementation of 512-bit wide universal intrinsics(WIP): Fixed universal intrinsics data initialisation, v_mul_wrap, v_floor, v_ceil and v_signmask. * Added implementation of 512-bit wide universal intrinsics(WIP): Removed hasSIMD512() * Added implementation of 512-bit wide universal intrinsics(WIP): Fixes for gcc build * Added implementation of 512-bit wide universal intrinsics(WIP): Reworked v_signmask, v_check_any() and v_check_all() implementation.
2019-06-03 23:05:35 +08:00
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
#endif
#endif
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
v_expand(el8_1, el4li_1, el4hi_1);
v_expand(el8_2, el4li_2, el4hi_2);
el4l_1 = v_cvt_f32(el4li_1) + prev_1;
el4l_2 = v_cvt_f32(el4li_2) + prev_2;
el4h_1 = v_cvt_f32(el4hi_1) + el4l_1;
el4h_2 = v_cvt_f32(el4hi_2) + el4l_2;
prev_1 = v_broadcast_element<v_float32::nlanes - 1>(el4h_1);
prev_2 = v_broadcast_element<v_float32::nlanes - 1>(el4h_2);
#endif
v_float32 el4_1, el4_2, el4_3, el4_4;
v_zip(el4l_1, el4l_2, el4_1, el4_2);
v_zip(el4h_1, el4h_2, el4_3, el4_4);
v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float32::nlanes ));
v_store(sum_row + j + v_float32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2));
v_store(sum_row + j + v_float32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float32::nlanes * 3));
}
for (float v2 = sum_row[j - 1] - prev_sum_row[j - 1],
v1 = sum_row[j - 2] - prev_sum_row[j - 2]; j < width; j += 2)
{
sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
}
}
}
else if (cn == 3)
{
// the others
for (int i = 0; i < height; ++i)
{
const uchar * src_row = src + _srcstep * i;
float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn;
float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn;
float row_cache[v_float32::nlanes * 6];
sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32(),
prev_3 = vx_setzero_f32();
int j = 0;
for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
{
v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
v_int16 el8_1 = v_reinterpret_as_s16(v_expand_low(v_src_row_1));
v_int16 el8_2 = v_reinterpret_as_s16(v_expand_low(v_src_row_2));
v_int16 el8_3 = v_reinterpret_as_s16(v_expand_low(v_src_row_3));
v_float32 el4l_1, el4h_1, el4l_2, el4h_2, el4l_3, el4h_3;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
__m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2));
__m256i vsum_3 = _mm256_add_epi16(el8_3.val, _mm256_slli_si256(el8_3.val, 2));
vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4));
vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4));
vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 4));
vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8));
vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8));
vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 8));
__m256i shmask = _mm256_set1_epi32(7);
el4l_1.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_1))), prev_1.val);
el4l_2.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_2))), prev_2.val);
el4l_3.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_3))), prev_3.val);
el4h_1.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_1))), _mm256_permutevar8x32_ps(el4l_1.val, shmask));
el4h_2.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_2))), _mm256_permutevar8x32_ps(el4l_2.val, shmask));
el4h_3.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_3))), _mm256_permutevar8x32_ps(el4l_3.val, shmask));
prev_1.val = _mm256_permutevar8x32_ps(el4h_1.val, shmask);
prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask);
prev_3.val = _mm256_permutevar8x32_ps(el4h_3.val, shmask);
#else
el8_1 += v_rotate_left<1>(el8_1);
el8_2 += v_rotate_left<1>(el8_2);
el8_3 += v_rotate_left<1>(el8_3);
el8_1 += v_rotate_left<2>(el8_1);
el8_2 += v_rotate_left<2>(el8_2);
el8_3 += v_rotate_left<2>(el8_3);
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
el8_3 += v_rotate_left<4>(el8_3);
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
el8_3 += v_rotate_left<8>(el8_3);
#endif
#endif
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3;
v_expand(el8_1, el4li_1, el4hi_1);
v_expand(el8_2, el4li_2, el4hi_2);
v_expand(el8_3, el4li_3, el4hi_3);
el4l_1 = v_cvt_f32(el4li_1) + prev_1;
el4l_2 = v_cvt_f32(el4li_2) + prev_2;
el4l_3 = v_cvt_f32(el4li_3) + prev_3;
el4h_1 = v_cvt_f32(el4hi_1) + el4l_1;
el4h_2 = v_cvt_f32(el4hi_2) + el4l_2;
el4h_3 = v_cvt_f32(el4hi_3) + el4l_3;
prev_1 = v_broadcast_element<v_float32::nlanes - 1>(el4h_1);
prev_2 = v_broadcast_element<v_float32::nlanes - 1>(el4h_2);
prev_3 = v_broadcast_element<v_float32::nlanes - 1>(el4h_3);
#endif
v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3);
v_store_interleave(row_cache + v_float32::nlanes * 3, el4h_1, el4h_2, el4h_3);
el4l_1 = vx_load(row_cache );
el4l_2 = vx_load(row_cache + v_float32::nlanes );
el4l_3 = vx_load(row_cache + v_float32::nlanes * 2);
el4h_1 = vx_load(row_cache + v_float32::nlanes * 3);
el4h_2 = vx_load(row_cache + v_float32::nlanes * 4);
el4h_3 = vx_load(row_cache + v_float32::nlanes * 5);
v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_float32::nlanes ));
v_store(sum_row + j + v_float32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2));
v_store(sum_row + j + v_float32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_float32::nlanes * 3));
v_store(sum_row + j + v_float32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_float32::nlanes * 4));
v_store(sum_row + j + v_float32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 5));
}
for (float v3 = sum_row[j - 1] - prev_sum_row[j - 1],
v2 = sum_row[j - 2] - prev_sum_row[j - 2],
v1 = sum_row[j - 3] - prev_sum_row[j - 3]; j < width; j += 3)
{
sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
}
2014-12-29 20:38:49 +08:00
}
}
else if (cn == 4)
{
// the others
for (int i = 0; i < height; ++i)
{
const uchar * src_row = src + _srcstep * i;
float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn;
float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn;
sum_row[-1] = sum_row[-2] = sum_row[-3] = sum_row[-4] = 0;
2014-12-29 20:38:49 +08:00
v_float32 prev = vx_setzero_f32();
int j = 0;
for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_float32 el4l, el4h;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 8));
el4l.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum))), prev.val);
el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permute2f128_ps(el4l.val, el4l.val, 0x31));
prev.val = _mm256_permute2f128_ps(el4h.val, el4h.val, 0x31);
#else
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
#endif
#endif
v_int32 el4li, el4hi;
v_expand(el8, el4li, el4hi);
el4l = v_cvt_f32(el4li) + prev;
el4h = v_cvt_f32(el4hi) + el4l;
#if CV_SIMD_WIDTH == 16
prev = el4h;
#elif CV_SIMD_WIDTH == 32
prev = v_combine_high(el4h, el4h);
#else
v_float32 t = v_rotate_right<12>(el4h);
t |= v_rotate_left<4>(t);
prev = v_combine_low(t, t);
#endif
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
}
for (float v4 = sum_row[j - 1] - prev_sum_row[j - 1],
v3 = sum_row[j - 2] - prev_sum_row[j - 2],
v2 = sum_row[j - 3] - prev_sum_row[j - 3],
v1 = sum_row[j - 4] - prev_sum_row[j - 4]; j < width; j += 4)
{
sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
sum_row[j + 3] = (v4 += src_row[j + 3]) + prev_sum_row[j + 3];
}
}
}
else
{
return false;
2014-12-29 20:38:49 +08:00
}
vx_cleanup();
2014-12-29 20:38:49 +08:00
return true;
}
};
#if CV_SIMD128_64F
/**
 * Integral_SIMD specialization: 8-bit source, double-precision sum.
 *
 * operator() fills `sum` with the integral image of `src` for 1..4 interleaved
 * channels and returns true when it produced the result. It returns false when
 * the request is not handled here (squared sum or tilted sum requested, or
 * cn > 4 on the non-AVX512 path); the caller is presumably expected to fall
 * back to another implementation in that case.
 *
 * Parameters:
 *   src, _srcstep      source pixels and the byte stride between source rows
 *   sum, _sumstep      output buffer and its byte stride; it receives
 *                      (height + 1) rows of (width + 1) * cn doubles
 *   sqsum, _sqsumstep  optional squared-sum output (AVX512 path only)
 *   tilted             optional tilted-sum output (never handled here)
 *   width, height      source size in pixels
 *   cn                 number of interleaved channels
 */
template <>
struct Integral_SIMD<uchar, double, double>
{
    Integral_SIMD() {}

    bool operator()(const uchar * src, size_t _srcstep,
                    double * sum, size_t _sumstep,
                    double * sqsum, size_t _sqsumstep,
                    double * tilted, size_t,
                    int width, int height, int cn) const
    {
#if CV_AVX512_SKX
        // The dedicated AVX512 kernel takes every non-tilted request of up to
        // four channels except the plain single-channel sum, which is left to
        // the code below.
        if (!tilted && cn <= 4 && (cn > 1 || sqsum))
        {
            calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn);
            return true;
        }
#else
        CV_UNUSED(_sqsumstep);
#endif
        if (sqsum || tilted || cn > 4)
            return false;

        // From here on `width` counts elements (pixels * channels), not pixels.
        width *= cn;

        // Row 0 of the integral image is all zeros.
        memset(sum, 0, (width + cn) * sizeof(double));

        // Scheme shared by all channel counts below:
        //  * widen source bytes to 16-bit lanes and build per-channel running
        //    sums inside the vector with shift-and-add steps (at most
        //    v_uint16::nlanes 8-bit addends per lane, so 16 bits suffice);
        //  * convert to double and add `prev*`, the running row sum carried
        //    over from the previous vector iteration;
        //  * add the integral row above (prev_sum_row) and store.
        // In the AVX2 branches _mm256_slli_si256 shifts inside each 128-bit
        // half only, so the upper half is completed by broadcasting the last
        // element of the lower half. In the generic branches the rotate steps
        // leave the upper half as windowed sums which are completed by adding
        // the finished lower half (assuming v_rotate_left<n> moves lanes up by
        // n and fills with zeros).
        if (cn == 1)
        {
            for (int i = 0; i < height; ++i)
            {
                const uchar * src_row = src + _srcstep * i;
                // Both row pointers skip the leading zero column.
                double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + 1;
                double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + 1;

                sum_row[-1] = 0;

                v_float64 prev = vx_setzero_f64();
                int j = 0;
                for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
                {
                    v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                    v_float64 el4ll, el4lh, el4hl, el4hh;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
                    __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
                    vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
                    vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
                    __m256i el4l_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum));
                    __m256i el4h_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum));
                    el4ll.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l_32)), prev.val);
                    el4lh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l_32)), prev.val);
                    // Broadcast the last running sum of the lower half.
                    __m256d el4d = _mm256_permute4x64_pd(el4lh.val, 0xff);
                    el4hl.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h_32)), el4d);
                    el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4d);
                    prev.val = _mm256_permute4x64_pd(el4hh.val, 0xff);
#else
                    el8 += v_rotate_left<1>(el8);
                    el8 += v_rotate_left<2>(el8);
#if CV_SIMD_WIDTH >= 32
                    el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
                    el8 += v_rotate_left<8>(el8);
#endif
#endif
                    v_int32 el4li, el4hi;
                    v_expand(el8, el4li, el4hi);
                    el4ll = v_cvt_f64(el4li) + prev;
                    el4lh = v_cvt_f64_high(el4li) + prev;
                    el4hl = v_cvt_f64(el4hi) + el4ll;
                    el4hh = v_cvt_f64_high(el4hi) + el4lh;
                    // Carry the last running sum into the next iteration.
                    prev = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh));
//                    prev = v_broadcast_element<v_float64::nlanes - 1>(el4hh);
#endif
                    v_store(sum_row + j                          , el4ll + vx_load(prev_sum_row + j                          ));
                    v_store(sum_row + j + v_float64::nlanes      , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes      ));
                    v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
                    v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
                }

                // Scalar tail: the running row sum is recovered from the last
                // element already written (zero when j == 0).
                for (double v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
                    sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
            }
        }
        else if (cn == 2)
        {
            // Selects the low byte of every 16-bit lane.
            v_int16 mask = vx_setall_s16((short)0xff);
            for (int i = 0; i < height; ++i)
            {
                const uchar * src_row = src + _srcstep * i;
                double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn;
                double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn;

                sum_row[-1] = sum_row[-2] = 0;

                v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64();
                int j = 0;
                for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
                {
                    // Each 16-bit lane holds one two-channel pixel: the low
                    // byte goes to el8_1 and the high byte to el8_2.
                    v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
                    v_int16 el8_1 = v_src_row & mask;
                    v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
                    v_float64 el4ll_1, el4lh_1, el4hl_1, el4hh_1, el4ll_2, el4lh_2, el4hl_2, el4hh_2;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
                    __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
                    __m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2));
                    vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4));
                    vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4));
                    vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8));
                    vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8));
                    __m256i el4l1_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_1));
                    __m256i el4l2_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_2));
                    __m256i el4h1_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_1));
                    __m256i el4h2_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_2));
                    el4ll_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l1_32)), prev_1.val);
                    el4ll_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l2_32)), prev_2.val);
                    el4lh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l1_32)), prev_1.val);
                    el4lh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l2_32)), prev_2.val);
                    __m256d el4d_1 = _mm256_permute4x64_pd(el4lh_1.val, 0xff);
                    __m256d el4d_2 = _mm256_permute4x64_pd(el4lh_2.val, 0xff);
                    el4hl_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h1_32)), el4d_1);
                    el4hl_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h2_32)), el4d_2);
                    el4hh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h1_32)), el4d_1);
                    el4hh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h2_32)), el4d_2);
                    prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff);
                    prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff);
#else
                    el8_1 += v_rotate_left<1>(el8_1);
                    el8_2 += v_rotate_left<1>(el8_2);
                    el8_1 += v_rotate_left<2>(el8_1);
                    el8_2 += v_rotate_left<2>(el8_2);
#if CV_SIMD_WIDTH >= 32
                    el8_1 += v_rotate_left<4>(el8_1);
                    el8_2 += v_rotate_left<4>(el8_2);
#if CV_SIMD_WIDTH == 64
                    el8_1 += v_rotate_left<8>(el8_1);
                    el8_2 += v_rotate_left<8>(el8_2);
#endif
#endif
                    v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
                    v_expand(el8_1, el4li_1, el4hi_1);
                    v_expand(el8_2, el4li_2, el4hi_2);
                    el4ll_1 = v_cvt_f64(el4li_1) + prev_1;
                    el4ll_2 = v_cvt_f64(el4li_2) + prev_2;
                    el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1;
                    el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2;
                    el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1;
                    el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2;
                    el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1;
                    el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2;
                    prev_1 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_1));
                    prev_2 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_2));
//                    prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
//                    prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
#endif
                    // Re-interleave the two channels before storing.
                    v_float64 el4_1, el4_2, el4_3, el4_4, el4_5, el4_6, el4_7, el4_8;
                    v_zip(el4ll_1, el4ll_2, el4_1, el4_2);
                    v_zip(el4lh_1, el4lh_2, el4_3, el4_4);
                    v_zip(el4hl_1, el4hl_2, el4_5, el4_6);
                    v_zip(el4hh_1, el4hh_2, el4_7, el4_8);
                    v_store(sum_row + j                          , el4_1 + vx_load(prev_sum_row + j                          ));
                    v_store(sum_row + j + v_float64::nlanes      , el4_2 + vx_load(prev_sum_row + j + v_float64::nlanes      ));
                    v_store(sum_row + j + v_float64::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
                    v_store(sum_row + j + v_float64::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
                    v_store(sum_row + j + v_float64::nlanes * 4, el4_5 + vx_load(prev_sum_row + j + v_float64::nlanes * 4));
                    v_store(sum_row + j + v_float64::nlanes * 5, el4_6 + vx_load(prev_sum_row + j + v_float64::nlanes * 5));
                    v_store(sum_row + j + v_float64::nlanes * 6, el4_7 + vx_load(prev_sum_row + j + v_float64::nlanes * 6));
                    v_store(sum_row + j + v_float64::nlanes * 7, el4_8 + vx_load(prev_sum_row + j + v_float64::nlanes * 7));
                }

                // Scalar tail, one running sum per channel.
                for (double v2 = sum_row[j - 1] - prev_sum_row[j - 1],
                            v1 = sum_row[j - 2] - prev_sum_row[j - 2]; j < width; j += 2)
                {
                    sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
                    sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
                }
            }
        }
        else if (cn == 3)
        {
            for (int i = 0; i < height; ++i)
            {
                const uchar * src_row = src + _srcstep * i;
                double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn;
                double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn;
                // Scratch buffer used to re-interleave the three channels.
                double row_cache[v_float64::nlanes * 12];

                sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;

                v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64(),
                          prev_3 = vx_setzero_f64();
                int j = 0;
                // v_load_deinterleave fills three full v_uint8 vectors, i.e. it
                // reads v_uint8::nlanes * cn bytes, although only the low half
                // of each channel (v_uint16::nlanes pixels) is consumed per
                // iteration. The guard therefore checks the bytes actually
                // loaded so the load never runs past the end of the row; the
                // remaining elements are handled by the scalar tail below.
                for (; j + v_uint8::nlanes * cn <= width; j += v_uint16::nlanes * cn)
                {
                    v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
                    v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
                    v_int16 el8_1 = v_reinterpret_as_s16(v_expand_low(v_src_row_1));
                    v_int16 el8_2 = v_reinterpret_as_s16(v_expand_low(v_src_row_2));
                    v_int16 el8_3 = v_reinterpret_as_s16(v_expand_low(v_src_row_3));
                    v_float64 el4ll_1, el4lh_1, el4hl_1, el4hh_1, el4ll_2, el4lh_2, el4hl_2, el4hh_2, el4ll_3, el4lh_3, el4hl_3, el4hh_3;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
                    __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
                    __m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2));
                    __m256i vsum_3 = _mm256_add_epi16(el8_3.val, _mm256_slli_si256(el8_3.val, 2));
                    vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4));
                    vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4));
                    vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 4));
                    vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8));
                    vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8));
                    vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 8));
                    __m256i el4l1_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_1));
                    __m256i el4l2_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_2));
                    __m256i el4l3_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_3));
                    __m256i el4h1_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_1));
                    __m256i el4h2_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_2));
                    __m256i el4h3_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_3));
                    el4ll_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l1_32)), prev_1.val);
                    el4ll_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l2_32)), prev_2.val);
                    el4ll_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l3_32)), prev_3.val);
                    el4lh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l1_32)), prev_1.val);
                    el4lh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l2_32)), prev_2.val);
                    el4lh_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l3_32)), prev_3.val);
                    __m256d el4d_1 = _mm256_permute4x64_pd(el4lh_1.val, 0xff);
                    __m256d el4d_2 = _mm256_permute4x64_pd(el4lh_2.val, 0xff);
                    __m256d el4d_3 = _mm256_permute4x64_pd(el4lh_3.val, 0xff);
                    el4hl_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h1_32)), el4d_1);
                    el4hl_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h2_32)), el4d_2);
                    el4hl_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h3_32)), el4d_3);
                    el4hh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h1_32)), el4d_1);
                    el4hh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h2_32)), el4d_2);
                    el4hh_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h3_32)), el4d_3);
                    prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff);
                    prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff);
                    prev_3.val = _mm256_permute4x64_pd(el4hh_3.val, 0xff);
#else
                    el8_1 += v_rotate_left<1>(el8_1);
                    el8_2 += v_rotate_left<1>(el8_2);
                    el8_3 += v_rotate_left<1>(el8_3);
                    el8_1 += v_rotate_left<2>(el8_1);
                    el8_2 += v_rotate_left<2>(el8_2);
                    el8_3 += v_rotate_left<2>(el8_3);
#if CV_SIMD_WIDTH >= 32
                    el8_1 += v_rotate_left<4>(el8_1);
                    el8_2 += v_rotate_left<4>(el8_2);
                    el8_3 += v_rotate_left<4>(el8_3);
#if CV_SIMD_WIDTH == 64
                    el8_1 += v_rotate_left<8>(el8_1);
                    el8_2 += v_rotate_left<8>(el8_2);
                    el8_3 += v_rotate_left<8>(el8_3);
#endif
#endif
                    v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3;
                    v_expand(el8_1, el4li_1, el4hi_1);
                    v_expand(el8_2, el4li_2, el4hi_2);
                    v_expand(el8_3, el4li_3, el4hi_3);
                    el4ll_1 = v_cvt_f64(el4li_1) + prev_1;
                    el4ll_2 = v_cvt_f64(el4li_2) + prev_2;
                    el4ll_3 = v_cvt_f64(el4li_3) + prev_3;
                    el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1;
                    el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2;
                    el4lh_3 = v_cvt_f64_high(el4li_3) + prev_3;
                    el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1;
                    el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2;
                    el4hl_3 = v_cvt_f64(el4hi_3) + el4ll_3;
                    el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1;
                    el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2;
                    el4hh_3 = v_cvt_f64_high(el4hi_3) + el4lh_3;
                    prev_1 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_1));
                    prev_2 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_2));
                    prev_3 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_3));
//                    prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
//                    prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
//                    prev_3 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_3);
#endif
                    // Interleave the channels through row_cache, then reload
                    // them as contiguous vectors in memory order.
                    v_store_interleave(row_cache                          , el4ll_1, el4ll_2, el4ll_3);
                    v_store_interleave(row_cache + v_float64::nlanes * 3, el4lh_1, el4lh_2, el4lh_3);
                    v_store_interleave(row_cache + v_float64::nlanes * 6, el4hl_1, el4hl_2, el4hl_3);
                    v_store_interleave(row_cache + v_float64::nlanes * 9, el4hh_1, el4hh_2, el4hh_3);
                    el4ll_1 = vx_load(row_cache                           );
                    el4ll_2 = vx_load(row_cache + v_float64::nlanes       );
                    el4ll_3 = vx_load(row_cache + v_float64::nlanes * 2 );
                    el4lh_1 = vx_load(row_cache + v_float64::nlanes * 3 );
                    el4lh_2 = vx_load(row_cache + v_float64::nlanes * 4 );
                    el4lh_3 = vx_load(row_cache + v_float64::nlanes * 5 );
                    el4hl_1 = vx_load(row_cache + v_float64::nlanes * 6 );
                    el4hl_2 = vx_load(row_cache + v_float64::nlanes * 7 );
                    el4hl_3 = vx_load(row_cache + v_float64::nlanes * 8 );
                    el4hh_1 = vx_load(row_cache + v_float64::nlanes * 9 );
                    el4hh_2 = vx_load(row_cache + v_float64::nlanes * 10);
                    el4hh_3 = vx_load(row_cache + v_float64::nlanes * 11);
                    v_store(sum_row + j                           , el4ll_1 + vx_load(prev_sum_row + j                           ));
                    v_store(sum_row + j + v_float64::nlanes       , el4ll_2 + vx_load(prev_sum_row + j + v_float64::nlanes       ));
                    v_store(sum_row + j + v_float64::nlanes * 2 , el4ll_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2 ));
                    v_store(sum_row + j + v_float64::nlanes * 3 , el4lh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 3 ));
                    v_store(sum_row + j + v_float64::nlanes * 4 , el4lh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 4 ));
                    v_store(sum_row + j + v_float64::nlanes * 5 , el4lh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 5 ));
                    v_store(sum_row + j + v_float64::nlanes * 6 , el4hl_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 6 ));
                    v_store(sum_row + j + v_float64::nlanes * 7 , el4hl_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 7 ));
                    v_store(sum_row + j + v_float64::nlanes * 8 , el4hl_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 8 ));
                    v_store(sum_row + j + v_float64::nlanes * 9 , el4hh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 9 ));
                    v_store(sum_row + j + v_float64::nlanes * 10, el4hh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 10));
                    v_store(sum_row + j + v_float64::nlanes * 11, el4hh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 11));
                }

                // Scalar tail, one running sum per channel.
                for (double v3 = sum_row[j - 1] - prev_sum_row[j - 1],
                            v2 = sum_row[j - 2] - prev_sum_row[j - 2],
                            v1 = sum_row[j - 3] - prev_sum_row[j - 3]; j < width; j += 3)
                {
                    sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
                    sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
                    sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
                }
            }
        }
        else if (cn == 4)
        {
            for (int i = 0; i < height; ++i)
            {
                const uchar * src_row = src + _srcstep * i;
                double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn;
                double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn;

                sum_row[-1] = sum_row[-2] = sum_row[-3] = sum_row[-4] = 0;

                // Running sums carried between iterations; which channels each
                // one holds depends on the vector width (see below).
                v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64();
                int j = 0;
                for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
                {
                    // Four 16-bit lanes form one pixel, so the shift-and-add
                    // steps here move by whole pixels (4 lanes / 8 bytes).
                    v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                    v_float64 el4ll, el4lh, el4hl, el4hh;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
                    __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 8));
                    __m256i el4l_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum));
                    __m256i el4h_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum));
                    el4ll.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l_32)), prev_1.val);
                    el4lh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l_32)), prev_2.val);
                    el4hl.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h_32)), el4lh.val);
                    el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4lh.val);
                    prev_1.val = prev_2.val = el4hh.val;
#else
#if CV_SIMD_WIDTH >= 32
                    el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
                    el8 += v_rotate_left<8>(el8);
#endif
#endif
                    v_int32 el4li, el4hi;
                    v_expand(el8, el4li, el4hi);
                    el4ll = v_cvt_f64(el4li) + prev_1;
                    el4lh = v_cvt_f64_high(el4li) + prev_2;
                    el4hl = v_cvt_f64(el4hi) + el4ll;
                    el4hh = v_cvt_f64_high(el4hi) + el4lh;
#if CV_SIMD_WIDTH == 16
                    // A double vector covers half a pixel: carry both halves.
                    prev_1 = el4hl;
                    prev_2 = el4hh;
#elif CV_SIMD_WIDTH == 32
                    // A double vector covers exactly one pixel.
                    prev_1 = prev_2 = el4hh;
#else
                    // A double vector covers two pixels: keep the last one.
                    prev_1 = prev_2 = v_combine_high(el4hh, el4hh);
#endif
#endif
                    v_store(sum_row + j                          , el4ll + vx_load(prev_sum_row + j                          ));
                    v_store(sum_row + j + v_float64::nlanes      , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes      ));
                    v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
                    v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
                }

                // Scalar tail, one running sum per channel.
                for (double v4 = sum_row[j - 1] - prev_sum_row[j - 1],
                            v3 = sum_row[j - 2] - prev_sum_row[j - 2],
                            v2 = sum_row[j - 3] - prev_sum_row[j - 3],
                            v1 = sum_row[j - 4] - prev_sum_row[j - 4]; j < width; j += 4)
                {
                    sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
                    sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
                    sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
                    sum_row[j + 3] = (v4 += src_row[j + 3]) + prev_sum_row[j + 3];
                }
            }
        }
        else
        {
            return false;
        }
        vx_cleanup();

        return true;
    }
};
#endif
2014-12-29 20:38:49 +08:00
#endif
2020-01-17 21:54:08 +08:00
} // namespace anon
2014-12-29 20:38:49 +08:00
2020-01-17 21:54:08 +08:00
bool integral_SIMD(
int depth, int sdepth, int sqdepth,
const uchar* src, size_t srcstep,
uchar* sum, size_t sumstep,
uchar* sqsum, size_t sqsumstep,
uchar* tilted, size_t tstep,
int width, int height, int cn)
2013-12-05 23:43:50 +08:00
{
2020-01-17 21:54:08 +08:00
CV_INSTRUMENT_REGION();
2016-08-31 23:43:43 +08:00
2020-01-17 21:54:08 +08:00
#define ONE_CALL(T, ST, QT) \
return Integral_SIMD<T, ST, QT>()((const T*)src, srcstep, (ST*)sum, sumstep, (QT*)sqsum, sqsumstep, (ST*)tilted, tstep, width, height, cn)
2016-08-31 23:43:43 +08:00
if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F )
ONE_CALL(uchar, int, double);
else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32F )
ONE_CALL(uchar, int, float);
else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32S )
ONE_CALL(uchar, int, int);
else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_64F )
ONE_CALL(uchar, float, double);
else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_32F )
ONE_CALL(uchar, float, float);
else if( depth == CV_8U && sdepth == CV_64F && sqdepth == CV_64F )
ONE_CALL(uchar, double, double);
else if( depth == CV_16U && sdepth == CV_64F && sqdepth == CV_64F )
ONE_CALL(ushort, double, double);
else if( depth == CV_16S && sdepth == CV_64F && sqdepth == CV_64F )
ONE_CALL(short, double, double);
else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_64F )
ONE_CALL(float, float, double);
else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_32F )
ONE_CALL(float, float, float);
else if( depth == CV_32F && sdepth == CV_64F && sqdepth == CV_64F )
ONE_CALL(float, double, double);
else if( depth == CV_64F && sdepth == CV_64F && sqdepth == CV_64F )
ONE_CALL(double, double, double);
else
2020-01-17 21:54:08 +08:00
return false;
2016-08-31 23:43:43 +08:00
#undef ONE_CALL
}
2020-01-17 21:54:08 +08:00
#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
2016-08-31 23:43:43 +08:00
}} // cv::hal::