Mirror of https://github.com/opencv/opencv.git
Merge pull request #14210 from terfendail:wui_512
AVX512 wide universal intrinsics (#14210)

Squashed commits, each originally prefixed "Added implementation of 512-bit wide universal intrinsics (WIP)":

* Initial implementation of 512-bit wide universal intrinsics
* Implemented the WUI vector types
* Implemented load/store
* Implemented fp16 load/store
* Implemented recombine and zip; implemented non-saturating and saturating arithmetic
* Implemented bit operations
* Implemented comparisons
* Implemented lane shifts and reduction
* Implemented absolute values
* Implemented rounding and cast to float
* Implemented LUT
* Implemented type extension/narrowing and matrix operations
* Implemented load_deinterleave for 2- and 3-channel images
* Reimplemented load_deinterleave for 2-channel images and implemented it for 4-channel images
* Implemented store_interleave
* Implemented signmask and checks
* Build fixes
* Reimplemented popcount for the case when AVX512_BITALG is unavailable
* Reimplemented zip
* Reimplemented rotate for s8 and s16
* Reimplemented interleave/deinterleave for s8 and s16
* Updated the v512_set macros
* Fix for GCC's wrong _mm512_abs_pd definition
* Reworked v_zip to avoid AVX512_VBMI intrinsics
* Reworked v_invsqrt to avoid AVX512_ER intrinsics
* Reworked v_rotate, v_popcount and interleave/deinterleave for u8 to avoid AVX512_VBMI intrinsics
* Fixed the SIMD part of the integral image computation
* Fixed warnings
* Fixed load_deinterleave for u8 and u16
* Fixed v_invsqrt accuracy for f64
* Fixed interleave/deinterleave for u32 and u64
* Fixed interleave_pairs, interleave_quads and pack_triplets
* Fixed rotate_left
* Fixed rotate_left/right, part 2
* Fixed the resize implementation based on 512-bit wide universal intrinsics
* Fixed findContours by avoiding the uint64-dependent 512-wide v_signmask()
* Fixed trailing whitespace
* Reworked the parts that depend on specific intrinsic sets to check availability via CPU feature group defines
* Updated the AVX512 implementation of v_popcount to avoid AVX512VPOPCNTDQ intrinsics when they are unavailable
* Fixed universal intrinsics data initialisation, v_mul_wrap, v_floor, v_ceil and v_signmask
* Removed hasSIMD512()
* Fixes for the GCC build
* Reworked the v_signmask, v_check_any() and v_check_all() implementation

A short usage sketch of the resulting wide universal intrinsics follows the commit metadata below.
Parent: 3289a0aff9
Commit: 3b015dfc7d
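Code written once against the width-agnostic wide universal intrinsics automatically runs with 512-bit registers when the dispatched file is built with AVX512_SKX. A minimal sketch of such a kernel, assuming an OpenCV build that includes this patch; the function name add_saturate_u8 and the buffers are purely illustrative:

    #include <opencv2/core.hpp>
    #include "opencv2/core/hal/intrin.hpp"

    // Saturating per-byte addition written against the width-agnostic types:
    // v_uint8 has 16, 32 or 64 lanes depending on the compiled SIMD width,
    // and vx_load()/vx_setall_u8() map to v_load/v256_load/v512_load accordingly.
    void add_saturate_u8(const uchar* a, const uchar* b, uchar* dst, int n)
    {
        int i = 0;
    #if CV_SIMD
        for (; i <= n - cv::v_uint8::nlanes; i += cv::v_uint8::nlanes)
        {
            cv::v_uint8 va = cv::vx_load(a + i);
            cv::v_uint8 vb = cv::vx_load(b + i);
            cv::v_store(dst + i, va + vb);   // operator+ saturates for u8 lanes
        }
        cv::vx_cleanup();
    #endif
        for (; i < n; ++i)                   // scalar tail
            dst[i] = cv::saturate_cast<uchar>(a[i] + b[i]);
    }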
@@ -13,8 +13,9 @@ ocv_add_dispatched_file(split SSE2 AVX2)
ocv_add_dispatched_file(sum SSE2 AVX2)

# dispatching for accuracy tests
-ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)
-ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2)
+ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX)

ocv_add_module(core
        OPTIONAL opencv_cudev
@@ -180,6 +180,18 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif

+// AVX512 can be used together with SSE2 and AVX2, so
+// we define those sets of intrinsics at once.
+// Some of the AVX512 intrinsics get a v512_ prefix instead of v_, e.g. v512_load() vs v_load().
+// Wide intrinsics are mapped to their v512_ counterparts in this case (e.g. vx_load() => v512_load()).
+#if CV_AVX512_SKX
+#define CV__SIMD_FORWARD 512
+#include "opencv2/core/hal/intrin_forward.hpp"
+#include "opencv2/core/hal/intrin_avx512.hpp"
+#endif
+
//! @cond IGNORED

namespace cv {
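The comment block added in this hunk describes the naming scheme the patch introduces. A brief sketch of the difference between the fixed-width and width-agnostic spellings, assuming a build where AVX512_SKX intrinsics are enabled; the function name and buffers are illustrative and must hold at least v_float32::nlanes elements:

    #include "opencv2/core/hal/intrin.hpp"

    void naming_example(const float* src, float* dst)
    {
    #if CV_SIMD512
        // Explicit 512-bit type and v512_-prefixed load: always 16 float lanes.
        cv::v_float32x16 a = cv::v512_load(src);
        cv::v_store(dst, a);
    #endif
    #if CV_SIMD
        // Width-agnostic spelling: v_float32 / vx_load() resolve to the widest
        // enabled implementation, i.e. to v_float32x16 / v512_load() under AVX512_SKX.
        cv::v_float32 b = cv::vx_load(src);
        cv::v_store(dst, b);
    #endif
    }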
@@ -321,13 +333,41 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#endif

+#if CV_SIMD512
+CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
+CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
+CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
+CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
+CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
+CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
+CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
+CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
+CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
+CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
+#endif
+
+#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
+#define CV__SIMD_NAMESPACE simd512
+namespace CV__SIMD_NAMESPACE {
+#define CV_SIMD 1
+#define CV_SIMD_64F CV_SIMD512_64F
+#define CV_SIMD_FP16 CV_SIMD512_FP16
+#define CV_SIMD_WIDTH 64
+// TODO typedef v_uint8 / v_int32 / etc types here
+typedef v_uint8x64 v_uint8;
+typedef v_int8x64 v_int8;
+typedef v_uint16x32 v_uint16;
+typedef v_int16x32 v_int16;
+typedef v_uint32x16 v_uint32;
+typedef v_int32x16 v_int32;
+typedef v_uint64x8 v_uint64;
+typedef v_int64x8 v_int64;
+typedef v_float32x16 v_float32;
+CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v512)
+#if CV_SIMD512_64F
+typedef v_float64x8 v_float64;
+CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v512, load)
+#endif
+inline void vx_cleanup() { v512_cleanup(); }
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
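The CV__SIMD_FORCE_WIDTH check in the condition above is what the new test_intrin512.simd.hpp (further down) relies on to pin a translation unit to one specific width. A minimal sketch of that mechanism, assuming the file is compiled with AVX512_SKX optimizations enabled (otherwise the #error fires):

    // Force the 512-bit implementation regardless of what other widths are
    // available in this build, then verify that the width-agnostic types
    // really resolved to the 64-byte registers.
    #define CV__SIMD_FORCE_WIDTH 512
    #include "opencv2/core/hal/intrin.hpp"
    #undef CV__SIMD_FORCE_WIDTH

    #if CV_SIMD_WIDTH != 64
    #error "512-bit universal intrinsics were not selected"
    #endif

    #include <cstdio>

    int main()
    {
        // v_uint8 is v_uint8x64 here, so nlanes is 64 and vx_setall_u8()
        // resolves to v512_setall_u8().
        cv::v_uint8 v = cv::vx_setall_u8(7);
        (void)v;
        std::printf("lanes: %d\n", cv::v_uint8::nlanes);
        return 0;
    }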
modules/core/include/opencv2/core/hal/intrin_avx512.hpp (new file, 2743 lines)
File diff suppressed because it is too large.
@@ -14,9 +14,32 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

/** Types **/
-#if CV__SIMD_FORWARD == 512
-// [todo] 512
-#error "AVX512 Not implemented yet"
+#if CV__SIMD_FORWARD == 1024
+// [todo] 1024
+#error "1024-long ops not implemented yet"
+#elif CV__SIMD_FORWARD == 512
+// 512
+#define __CV_VX(fun) v512_##fun
+#define __CV_V_UINT8 v_uint8x64
+#define __CV_V_INT8 v_int8x64
+#define __CV_V_UINT16 v_uint16x32
+#define __CV_V_INT16 v_int16x32
+#define __CV_V_UINT32 v_uint32x16
+#define __CV_V_INT32 v_int32x16
+#define __CV_V_UINT64 v_uint64x8
+#define __CV_V_INT64 v_int64x8
+#define __CV_V_FLOAT32 v_float32x16
+#define __CV_V_FLOAT64 v_float64x8
+struct v_uint8x64;
+struct v_int8x64;
+struct v_uint16x32;
+struct v_int16x32;
+struct v_uint32x16;
+struct v_int32x16;
+struct v_uint64x8;
+struct v_int64x8;
+struct v_float32x16;
+struct v_float64x8;
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun
@@ -7,11 +7,15 @@
#include "test_intrin128.simd_declarations.hpp"

#undef CV_CPU_DISPATCH_MODES_ALL

#include "opencv2/core/cv_cpu_dispatch.h"
#include "test_intrin256.simd.hpp"
#include "test_intrin256.simd_declarations.hpp"

#undef CV_CPU_DISPATCH_MODES_ALL
+#include "opencv2/core/cv_cpu_dispatch.h"
+#include "test_intrin512.simd.hpp"
+#include "test_intrin512.simd_declarations.hpp"
+
#ifdef _MSC_VER
# pragma warning(disable:4702) // unreachable code
#endif
@@ -30,6 +34,11 @@ namespace opencv_test { namespace hal {
    throw SkipTestException("SIMD256 (" #cpu_opt ") is not available or disabled"); \
} while(0)

+#define DISPATCH_SIMD512(fn, cpu_opt) do { \
+    CV_CPU_CALL_ ## cpu_opt ## _(fn, ()); \
+    throw SkipTestException("SIMD512 (" #cpu_opt ") is not available or disabled"); \
+} while(0)
+
#define DEFINE_SIMD_TESTS(simd_size, cpu_opt) \
TEST(hal_intrin ## simd_size, uint8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint8, cpu_opt); } \
TEST(hal_intrin ## simd_size, int8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_int8, cpu_opt); } \
@@ -67,6 +76,9 @@ DEFINE_SIMD_TESTS(128, AVX)
#if defined CV_CPU_DISPATCH_COMPILE_AVX2 || defined CV_CPU_BASELINE_COMPILE_AVX2
DEFINE_SIMD_TESTS(128, AVX2)
#endif
+#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX
+DEFINE_SIMD_TESTS(128, AVX512_SKX)
+#endif

TEST(hal_intrin128, float16x8_FP16)
{
@@ -91,6 +103,10 @@ namespace intrin256 {
DEFINE_SIMD_TESTS(256, AVX2)
#endif

+#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX
+DEFINE_SIMD_TESTS(256, AVX512_SKX)
+#endif
+
TEST(hal_intrin256, float16x16_FP16)
{
    //CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
@@ -101,4 +117,19 @@ TEST(hal_intrin256, float16x16_FP16)

} // namespace intrin256

+namespace intrin512 {
+
+#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX
+DEFINE_SIMD_TESTS(512, AVX512_SKX)
+#endif
+
+TEST(hal_intrin512, float16x32_FP16)
+{
+    CV_CPU_CALL_AVX512_SKX_(test_hal_intrin_float16, ());
+    throw SkipTestException("Unsupported hardware: FP16 is not available");
+}
+
+
+} // namespace intrin512
+
}} // namespace
modules/core/test/test_intrin512.simd.hpp (new file, 23 lines)
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#if !defined CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY && \
+    !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS // TODO? C++ fallback implementation for SIMD512
+
+#define CV__SIMD_FORCE_WIDTH 512
+#include "opencv2/core/hal/intrin.hpp"
+#undef CV__SIMD_FORCE_WIDTH
+
+#if CV_SIMD_WIDTH != 64
+#error "Invalid build configuration"
+#endif
+
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+namespace opencv_test { namespace hal { namespace intrin512 {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+#include "test_intrin_utils.hpp"
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}}} //namespace
@@ -811,7 +811,9 @@ template<typename R> struct TheTest
        R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;

        EXPECT_EQ(2, v_signmask(a));
+#if CV_SIMD_WIDTH <= 32
        EXPECT_EQ(2 | (1 << (R::nlanes / 2)) | (1 << (R::nlanes - 1)), v_signmask(b));
+#endif

        EXPECT_EQ(false, v_check_all(a));
        EXPECT_EQ(false, v_check_all(b));
@@ -1061,10 +1061,16 @@ cvFindNextContour( CvContourScanner scanner )
        }
        else
        {
-            v_uint8 v_prev = vx_setall_u8((uchar)prev);
-            for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
+#if CV_SIMD_WIDTH > 16
+            v_uint8 vx_prev = vx_setall_u8((uchar)prev);
+            while (x <= width - v_uint8::nlanes &&
+                   v_check_all(vx_load((uchar*)(img + x)) == vx_prev))
+                x += v_uint8::nlanes;
+#endif
+            v_uint8x16 v_prev = v_setall_u8((uchar)prev);
+            for (; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes)
            {
-                unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(img + x)) != v_prev);
+                unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(img + x)) != v_prev);
                if (mask)
                {
                    p = img[(x += cv::trailingZeros32(mask))];
@@ -1328,10 +1334,16 @@ CvLinkedRunPoint;
inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j)
{
#if CV_SIMD
-    v_uint8 v_zero = vx_setzero_u8();
-    for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
+#if CV_SIMD_WIDTH > 16
+    v_uint8 vx_zero = vx_setzero_u8();
+    while (j <= img_size.width - v_uint8::nlanes &&
+           v_check_all(vx_load((uchar*)(src_data + j)) == vx_zero))
+        j += v_uint8::nlanes;
+#endif
+    v_uint8x16 v_zero = v_setzero_u8();
+    for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes)
    {
-        unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) != v_zero);
+        unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) != v_zero);
        if (mask)
        {
            j += cv::trailingZeros32(mask);
@@ -1353,10 +1365,16 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j)
    }
    else
    {
-        v_uint8 v_zero = vx_setzero_u8();
+#if CV_SIMD_WIDTH > 16
+        v_uint8 vx_zero = vx_setzero_u8();
+        while (j <= img_size.width - v_uint8::nlanes &&
+               v_check_all(vx_load((uchar*)(src_data + j)) != vx_zero))
+            j += v_uint8::nlanes;
+#endif
+        v_uint8x16 v_zero = v_setzero_u8();
        for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
        {
-            unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) == v_zero);
+            unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) == v_zero);
            if (mask)
            {
                j += cv::trailingZeros32(mask);
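The three contour hunks above all follow the same two-stage pattern: the widest available registers are used only with v_check_all() to skip long uniform runs, and the exact position is then located with a 16-lane v_signmask(), whose result always fits in an int (a 512-bit u8 signmask would need 64 bits, which is the uint64-dependent v_signmask the commit message mentions). A self-contained sketch of that pattern on a plain byte buffer, assuming an OpenCV build with the universal intrinsics available; the function name is illustrative:

    #include <opencv2/core.hpp>
    #include "opencv2/core/hal/intrin.hpp"

    // Return the index of the first byte in [j, width) that differs from `value`,
    // or `width` if there is none.
    int find_first_not_equal(const uchar* row, int width, int j, uchar value)
    {
    #if CV_SIMD
    #if CV_SIMD_WIDTH > 16
        // Stage 1: skip whole wide registers while every lane still equals `value`.
        // v_check_all() works for any register width, so this part scales to 512 bit.
        cv::v_uint8 vx_value = cv::vx_setall_u8(value);
        while (j <= width - cv::v_uint8::nlanes &&
               cv::v_check_all(cv::vx_load(row + j) == vx_value))
            j += cv::v_uint8::nlanes;
    #endif
        // Stage 2: 128-bit registers only; a 16-lane v_signmask() fits in an int,
        // so the exact offset can be read with trailingZeros32().
        cv::v_uint8x16 v_value = cv::v_setall_u8(value);
        for (; j <= width - cv::v_uint8x16::nlanes; j += cv::v_uint8x16::nlanes)
        {
            unsigned mask = (unsigned)cv::v_signmask(cv::v_load(row + j) != v_value);
            if (mask)
                return j + cv::trailingZeros32(mask);
        }
    #endif
        for (; j < width; ++j)          // scalar tail
            if (row[j] != value)
                return j;
        return width;
    }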
@@ -2148,6 +2148,7 @@ public:
            v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
            bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#elif CV_SIMD_WIDTH == 64
            v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
            v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
            bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#endif
@@ -2167,6 +2168,7 @@ public:
            v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
            bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#elif CV_SIMD_WIDTH == 64
            v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
            v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
            bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#endif
@@ -127,7 +127,7 @@ struct Integral_SIMD<uchar, int, double>
            {
                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                v_int32 el4l, el4h;
-#if CV_AVX2
+#if CV_AVX2 && CV_SIMD_WIDTH == 32
                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
@@ -138,7 +138,7 @@ struct Integral_SIMD<uchar, int, double>
#else
                el8 += v_rotate_left<1>(el8);
                el8 += v_rotate_left<2>(el8);
-#if CV_SIMD_WIDTH == 32
+#if CV_SIMD_WIDTH >= 32
                el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
                el8 += v_rotate_left<8>(el8);
@@ -194,7 +194,7 @@ struct Integral_SIMD<uchar, float, double>
            {
                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                v_float32 el4l, el4h;
-#if CV_AVX2
+#if CV_AVX2 && CV_SIMD_WIDTH == 32
                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
@@ -205,7 +205,7 @@ struct Integral_SIMD<uchar, float, double>
#else
                el8 += v_rotate_left<1>(el8);
                el8 += v_rotate_left<2>(el8);
-#if CV_SIMD_WIDTH == 32
+#if CV_SIMD_WIDTH >= 32
                el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
                el8 += v_rotate_left<8>(el8);
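The rotate-and-add lines in these integral-image hunks accumulate a running sum across the 16-bit lanes by repeated doubling: each el8 += v_rotate_left<k>(el8) step doubles the number of neighbouring lanes folded into each lane, so wider registers simply need more steps, which is why the width test is relaxed to >= 32 and the <8> step applies to 64-byte registers. A small standalone sketch of the doubling idea on a plain array (not the OpenCV code itself):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Inclusive prefix sum by doubling: the k-th pass adds the vector shifted
    // left by k lanes (zero-filled), mirroring el8 += v_rotate_left<k>(el8)
    // for k = 1, 2, 4, 8, ...
    template <size_t N>
    void prefix_sum_by_doubling(std::array<int16_t, N>& v)
    {
        for (size_t k = 1; k < N; k *= 2)
        {
            std::array<int16_t, N> shifted{};        // lanes below k become zero
            for (size_t i = k; i < N; ++i)
                shifted[i] = v[i - k];
            for (size_t i = 0; i < N; ++i)
                v[i] = static_cast<int16_t>(v[i] + shifted[i]);
        }
    }

    int main()
    {
        std::array<int16_t, 8> lanes = {1, 2, 3, 4, 5, 6, 7, 8};
        prefix_sum_by_doubling(lanes);               // 1 3 6 10 15 21 28 36
        for (int16_t x : lanes)
            std::printf("%d ", x);
        std::printf("\n");
        return 0;
    }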