Mirror of https://github.com/opencv/opencv.git (synced 2025-06-07 09:25:45 +08:00)

Commit 65573784c4: Merge remote-tracking branch 'upstream/3.4' into merge-3.4
@@ -477,6 +477,7 @@ OCV_OPTION(CV_DISABLE_OPTIMIZATION "Disable explicit optimized code (dispatch
OCV_OPTION(CV_TRACE "Enable OpenCV code trace" ON)
OCV_OPTION(OPENCV_GENERATE_SETUPVARS "Generate setup_vars* scripts" ON IF (NOT ANDROID AND NOT APPLE_FRAMEWORK) )
OCV_OPTION(ENABLE_CONFIG_VERIFICATION "Fail build if actual configuration doesn't match requested (WITH_XXX != HAVE_XXX)" OFF)
OCV_OPTION(OPENCV_ENABLE_MEMALIGN "Enable posix_memalign or memalign usage" ON)
OCV_OPTION(ENABLE_PYLINT "Add target with Pylint checks" (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
OCV_OPTION(ENABLE_FLAKE8 "Add target with Python flake8 checker" (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
@@ -625,10 +626,15 @@ if(UNIX)
  set(HAVE_PTHREAD 1)
endif()

CHECK_SYMBOL_EXISTS(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
CHECK_INCLUDE_FILE(malloc.h HAVE_MALLOC_H)
if(HAVE_MALLOC_H)
  CHECK_SYMBOL_EXISTS(memalign malloc.h HAVE_MEMALIGN)
if(OPENCV_ENABLE_MEMALIGN)
  CHECK_SYMBOL_EXISTS(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
  CHECK_INCLUDE_FILE(malloc.h HAVE_MALLOC_H)
  if(HAVE_MALLOC_H)
    CHECK_SYMBOL_EXISTS(memalign malloc.h HAVE_MEMALIGN)
  endif()
  # TODO:
  # - _aligned_malloc() on Win32
  # - std::aligned_alloc() C++17 / C11
endif()
endif()
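The new OPENCV_ENABLE_MEMALIGN option gates the posix_memalign/memalign probes, and the TODO lists _aligned_malloc (Win32) and std::aligned_alloc (C11/C++17) as future candidates. As a rough illustration of how the resulting HAVE_* macros are typically consumed on the C++ side (a hypothetical sketch, not OpenCV's actual allocator; the fallback order is an assumption):

#include <cstddef>
#include <cstdlib>
#ifdef HAVE_MALLOC_H
#include <malloc.h>   // for memalign() on platforms that still provide it
#endif

// Hypothetical helper: use whichever aligned-allocation primitive the build detected.
static void* alignedAlloc(std::size_t size, std::size_t alignment)
{
#if defined(HAVE_POSIX_MEMALIGN)
    void* ptr = NULL;
    return posix_memalign(&ptr, alignment, size) == 0 ? ptr : NULL;
#elif defined(HAVE_MEMALIGN)
    return memalign(alignment, size);
#else
    (void)alignment;
    return malloc(size);  // last resort: no alignment guarantee beyond malloc's default
#endif
}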
@@ -6,5 +6,6 @@ void test()
{
    int data[8] = {0,0,0,0, 0,0,0,0};
    __m256i a = _mm256_loadu_si256((const __m256i *)data);
    __m256i b = _mm256_bslli_epi128(a, 1); // available in GCC 4.9.3+
}
int main() { return 0; }
@@ -6,7 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
ocv_add_dispatched_file(convert_scale SSE2 AVX2)
ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
ocv_add_dispatched_file(matmul SSE2 AVX2)
ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX)
ocv_add_dispatched_file(mean SSE2 AVX2)
ocv_add_dispatched_file(merge SSE2 AVX2)
ocv_add_dispatched_file(split SSE2 AVX2)
@@ -342,4 +342,4 @@ struct VZeroUpperGuard {

#ifndef CV_WASM_SIMD
# define CV_WASM_SIMD 0
#endif
#endif
@@ -1431,6 +1431,28 @@ inline v_float64x4 v_cvt_f64(const v_float32x8& a)
inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }

// from (Mysticial and wim) https://stackoverflow.com/q/41144668
inline v_float64x4 v_cvt_f64(const v_int64x4& v)
{
    // constants encoded as floating-point
    __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); // 2^52
    __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
    __m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);

    // Blend the 32 lowest significant bits of v with magic_int_lo
    __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
    // Extract the 32 most significant bits of v
    __m256i v_hi = _mm256_srli_epi64(v.val, 32);
    // Flip the msb of v_hi and blend with 0x45300000
    v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);
    // Compute in double precision
    __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
    // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
    __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
    return v_float64x4(result);
}
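The magic constants above implement the int64-to-double conversion without AVX-512: the low 32 bits of each lane are dropped into the mantissa of 2^52 and the sign-flipped high 32 bits into the mantissa of 2^84, so one exact subtraction and one final addition reassemble the value. A scalar model of the same bit trick, shown only as an illustration (the helper names are not OpenCV's; an IEEE-754 double representation is assumed):

#include <cstdint>
#include <cstring>

static double bits_to_double(uint64_t bits)
{
    double d;
    std::memcpy(&d, &bits, sizeof(d));  // well-defined bit cast
    return d;
}

static double int64_to_double_magic(int64_t x)
{
    uint64_t u = (uint64_t)x;
    // low 32 bits in the mantissa of 2^52  ->  2^52 + lo
    double d_lo = bits_to_double(0x4330000000000000ULL | (u & 0xFFFFFFFFULL));
    // high 32 bits, sign bit flipped, in the mantissa of 2^84  ->  2^84 + 2^63 + (int32)hi * 2^32
    double d_hi = bits_to_double(0x4530000000000000ULL | ((u >> 32) ^ 0x80000000ULL));
    // (d_hi - (2^84 + 2^63 + 2^52)) is exact; only the final addition rounds
    return (d_hi - bits_to_double(0x4530000080100000ULL)) + d_lo;
}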
////////////// Lookup table access ////////////////////

inline v_int8x32 v256_lut(const schar* tab, const int* idx)
@ -1638,12 +1660,165 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
|
||||
|
||||
////////// Matrix operations /////////
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
|
||||
{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
|
||||
|
||||
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
|
||||
{
|
||||
__m256i even = _mm256_mul_epi32(a.val, b.val);
|
||||
__m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
|
||||
return v_int64x4(_mm256_add_epi64(even, odd));
|
||||
}
|
||||
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
|
||||
{
|
||||
__m256i even_m = _mm256_set1_epi32(0xFF00FF00);
|
||||
__m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m);
|
||||
__m256i odd_a = _mm256_srli_epi16(a.val, 8);
|
||||
|
||||
__m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m);
|
||||
__m256i odd_b = _mm256_srli_epi16(b.val, 8);
|
||||
|
||||
__m256i prod0 = _mm256_madd_epi16(even_a, even_b);
|
||||
__m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
|
||||
return v_uint32x8(_mm256_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
__m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
|
||||
__m256i odd_a = _mm256_srai_epi16(a.val, 8);
|
||||
|
||||
__m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
|
||||
__m256i odd_b = _mm256_srai_epi16(b.val, 8);
|
||||
|
||||
__m256i prod0 = _mm256_madd_epi16(even_a, even_b);
|
||||
__m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
|
||||
return v_int32x8(_mm256_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
__m256i mullo = _mm256_mullo_epi16(a.val, b.val);
|
||||
__m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
|
||||
__m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
|
||||
__m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
|
||||
|
||||
__m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
|
||||
__m256i p13 = _mm256_srli_epi64(mul0, 32);
|
||||
__m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
|
||||
__m256i p57 = _mm256_srli_epi64(mul1, 32);
|
||||
|
||||
__m256i p15_ = _mm256_add_epi64(p02, p13);
|
||||
__m256i p9d_ = _mm256_add_epi64(p46, p57);
|
||||
|
||||
return v_uint64x4(_mm256_add_epi64(
|
||||
_mm256_unpacklo_epi64(p15_, p9d_),
|
||||
_mm256_unpackhi_epi64(p15_, p9d_)
|
||||
));
|
||||
}
|
||||
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
__m256i prod = _mm256_madd_epi16(a.val, b.val);
|
||||
__m256i sign = _mm256_srai_epi32(prod, 31);
|
||||
|
||||
__m256i lo = _mm256_unpacklo_epi32(prod, sign);
|
||||
__m256i hi = _mm256_unpackhi_epi32(prod, sign);
|
||||
|
||||
return v_int64x4(_mm256_add_epi64(
|
||||
_mm256_unpacklo_epi64(lo, hi),
|
||||
_mm256_unpackhi_epi64(lo, hi)
|
||||
));
|
||||
}
|
||||
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
__m256i mullo = _mm256_mullo_epi16(a.val, b.val);
|
||||
__m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
|
||||
__m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
|
||||
__m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
|
||||
|
||||
__m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
|
||||
__m256i p13 = _mm256_srli_epi64(mul0, 32);
|
||||
__m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
|
||||
__m256i p57 = _mm256_srli_epi64(mul1, 32);
|
||||
|
||||
__m256i p15_ = _mm256_add_epi64(p02, p13);
|
||||
__m256i p9d_ = _mm256_add_epi64(p46, p57);
|
||||
|
||||
return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
|
||||
}
|
||||
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
__m256i prod = _mm256_madd_epi16(a.val, b.val);
|
||||
__m256i sign = _mm256_srai_epi32(prod, 31);
|
||||
__m256i lo = _mm256_unpacklo_epi32(prod, sign);
|
||||
__m256i hi = _mm256_unpackhi_epi32(prod, sign);
|
||||
return v_int64x4(_mm256_add_epi64(lo, hi));
|
||||
}
|
||||
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
|
||||
v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
|
||||
|
||||
|
@ -1473,6 +1473,32 @@ inline v_float64x8 v_cvt_f64(const v_float32x16& a)
|
||||
inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
|
||||
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }
|
||||
|
||||
// from (Mysticial and wim) https://stackoverflow.com/q/41144668
|
||||
inline v_float64x8 v_cvt_f64(const v_int64x8& v)
|
||||
{
|
||||
#if CV_AVX_512DQ
|
||||
return v_float64x8(_mm512_cvtepi64_pd(v.val));
|
||||
#else
|
||||
// constants encoded as floating-point
|
||||
__m512i magic_i_lo = _mm512_set1_epi64x(0x4330000000000000); // 2^52
|
||||
__m512i magic_i_hi32 = _mm512_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
|
||||
__m512i magic_i_all = _mm512_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
|
||||
__m512d magic_d_all = _mm512_castsi512_pd(magic_i_all);
|
||||
|
||||
// Blend the 32 lowest significant bits of v with magic_int_lo
|
||||
__m512i v_lo = _mm512_blend_epi32(magic_i_lo, v.val, 0x55);
|
||||
// Extract the 32 most significant bits of v
|
||||
__m512i v_hi = _mm512_srli_epi64(v.val, 32);
|
||||
// Flip the msb of v_hi and blend with 0x45300000
|
||||
v_hi = _mm512_xor_si512(v_hi, magic_i_hi32);
|
||||
// Compute in double precision
|
||||
__m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
|
||||
// (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition
|
||||
__m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
|
||||
return v_float64x8(result);
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
||||
inline v_int8x64 v512_lut(const schar* tab, const int* idx)
|
||||
@ -1672,12 +1698,152 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
|
||||
|
||||
////////// Matrix operations /////////
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
|
||||
|
||||
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
|
||||
{
|
||||
__m512i even = _mm512_mul_epi32(a.val, b.val);
|
||||
__m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
|
||||
return v_int64x8(_mm512_add_epi64(even, odd));
|
||||
}
|
||||
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
|
||||
{
|
||||
__m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
|
||||
__m512i odd_a = _mm512_srli_epi16(a.val, 8);
|
||||
|
||||
__m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
|
||||
__m512i odd_b = _mm512_srli_epi16(b.val, 8);
|
||||
|
||||
__m512i prod0 = _mm512_madd_epi16(even_a, even_b);
|
||||
__m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
|
||||
return v_uint32x16(_mm512_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
|
||||
{
|
||||
__m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
|
||||
__m512i odd_a = _mm512_srai_epi16(a.val, 8);
|
||||
|
||||
__m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
|
||||
__m512i odd_b = _mm512_srai_epi16(b.val, 8);
|
||||
|
||||
__m512i prod0 = _mm512_madd_epi16(even_a, even_b);
|
||||
__m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
|
||||
return v_int32x16(_mm512_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
|
||||
{
|
||||
__m512i mullo = _mm512_mullo_epi16(a.val, b.val);
|
||||
__m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
|
||||
__m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
|
||||
__m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
|
||||
|
||||
__m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
|
||||
__m512i p13 = _mm512_srli_epi64(mul0, 32);
|
||||
__m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
|
||||
__m512i p57 = _mm512_srli_epi64(mul1, 32);
|
||||
|
||||
__m512i p15_ = _mm512_add_epi64(p02, p13);
|
||||
__m512i p9d_ = _mm512_add_epi64(p46, p57);
|
||||
|
||||
return v_uint64x8(_mm512_add_epi64(
|
||||
_mm512_unpacklo_epi64(p15_, p9d_),
|
||||
_mm512_unpackhi_epi64(p15_, p9d_)
|
||||
));
|
||||
}
|
||||
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
|
||||
{
|
||||
__m512i prod = _mm512_madd_epi16(a.val, b.val);
|
||||
__m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
|
||||
__m512i odd = _mm512_srai_epi64(prod, 32);
|
||||
return v_int64x8(_mm512_add_epi64(even, odd));
|
||||
}
|
||||
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
|
||||
{
|
||||
__m512i mullo = _mm512_mullo_epi16(a.val, b.val);
|
||||
__m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
|
||||
__m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
|
||||
__m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
|
||||
|
||||
__m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
|
||||
__m512i p13 = _mm512_srli_epi64(mul0, 32);
|
||||
__m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
|
||||
__m512i p57 = _mm512_srli_epi64(mul1, 32);
|
||||
|
||||
__m512i p15_ = _mm512_add_epi64(p02, p13);
|
||||
__m512i p9d_ = _mm512_add_epi64(p46, p57);
|
||||
return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
|
||||
}
|
||||
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
|
||||
#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
|
||||
v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
|
||||
|
||||
|
@@ -171,7 +171,8 @@ Different type conversions and casts:

### Matrix operations

In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_dotprod_fast,
@ref v_dotprod_expand, @ref v_dotprod_expand_fast, @ref v_matmul, @ref v_transpose4x4

### Usability

@@ -195,7 +196,10 @@ Regular integers:
|mul_expand | x | x | x | x | x | |
|compare | x | x | x | x | x | x |
|shift | | | x | x | x | x |
|dotprod | | | | x | | |
|dotprod | | | | x | | x |
|dotprod_fast | | | | x | | x |
|dotprod_expand | x | x | x | x | | x |
|dotprod_expand_fast| x | x | x | x | | x |
|logical | x | x | x | x | x | x |
|min, max | x | x | x | x | x | x |
|absdiff | x | x | x | x | x | x |
@@ -222,6 +226,7 @@ Big integers:
|logical | x | x |
|extract | x | x |
|rotate (lanes) | x | x |
|cvt_flt64 | | x |

Floating point:
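Since the table now advertises dotprod/dotprod_fast/dotprod_expand on more element types, a minimal usage sketch may help. It assumes an OpenCV build with CV_SIMD enabled and a version where the v_int16/v_int32 aliases and their nlanes constants are available; the helper name and data layout are illustrative:

#include <opencv2/core/hal/intrin.hpp>

// Dot product of two int16 arrays; n is assumed to be a multiple of cv::v_int16::nlanes.
static int dot_i16(const short* a, const short* b, int n)
{
    cv::v_int32 acc = cv::vx_setzero_s32();
    for (int i = 0; i < n; i += cv::v_int16::nlanes)
    {
        cv::v_int16 va = cv::vx_load(a + i);
        cv::v_int16 vb = cv::vx_load(b + i);
        acc = cv::v_dotprod(va, vb, acc);  // 16-bit x 16-bit -> 32-bit pairwise products, accumulated
    }
    return cv::v_reduce_sum(acc);          // collapse the vector accumulator to a scalar
}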
@@ -853,17 +858,18 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
/** @brief Dot product of elements

Multiply values in two registers and sum adjacent result pairs.

Scheme:
@code
  {A1 A2 ...} // 16-bit
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2 ...} // 32-bit

@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<w_type, n/2> c;
@@ -881,12 +887,11 @@ Scheme:
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2+C1 ...} // 32-bit

@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
          const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<w_type, n/2> s;
@@ -895,6 +900,95 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
    return s;
}

/** @brief Fast Dot product of elements

Same as cv::v_dotprod, but it may perform an unordered sum between result pairs on some platforms.
This intrinsic can be used when only the sum over all lanes matters,
and it may yield better performance on the affected platforms.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{ return v_dotprod(a, b); }

/** @brief Fast Dot product of elements

Same as cv::v_dotprod_fast, but adds a third element to the sum of adjacent pairs.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
               const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{ return v_dotprod(a, b, c); }
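To make that contract concrete: only the total across all lanes of the fast variant is guaranteed to match the ordered v_dotprod; the per-lane pairing may differ by platform. A hypothetical check, not part of OpenCV, assuming a CV_SIMD build:

inline bool dotprod_fast_total_matches(const cv::v_int16& a, const cv::v_int16& b)
{
    int ordered = cv::v_reduce_sum(cv::v_dotprod(a, b));
    int fast    = cv::v_reduce_sum(cv::v_dotprod_fast(a, b));
    return ordered == fast;  // lane-by-lane equality is NOT guaranteed
}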
/** @brief Dot product of elements and expand

Multiply values in two registers and expand the sum of adjacent result pairs.

Scheme:
@code
  {A1 A2 A3 A4 ...} // 8-bit
x {B1 B2 B3 B4 ...} // 8-bit
-------------
{A1B1+A2B2+A3B3+A4B4 ...} // 32-bit

@endcode
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, n/4> s;
    for( int i = 0; i < (n/4); i++ )
        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
    return s;
}
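For concreteness: with 8-bit lanes a = {1, 2, 3, 4, ...} and b = {5, 6, 7, 8, ...}, the first output lane is 1*5 + 2*6 + 3*7 + 4*8 = 70. Each output lane collapses four input products, which is why the result uses q_type (a type four times wider) and has n/4 lanes.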
/** @brief Dot product of elements

Same as cv::v_dotprod_expand, but adds a third element to the sum of adjacent pairs.
Scheme:
@code
  {A1 A2 A3 A4 ...} // 8-bit
x {B1 B2 B3 B4 ...} // 8-bit
-------------
{A1B1+A2B2+A3B3+A4B4+C1 ...} // 32-bit
@endcode
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                 const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
{
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, n/4> s;
    for( int i = 0; i < (n/4); i++ )
        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
    return s;
}
/** @brief Fast Dot product of elements and expand

Multiply values in two registers and expand the sum of adjacent result pairs.

Same as cv::v_dotprod_expand, but it may perform an unordered sum between result pairs on some platforms.
This intrinsic can be used when only the sum over all lanes matters,
and it may yield better performance on the affected platforms.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{ return v_dotprod_expand(a, b); }

/** @brief Fast Dot product of elements

Same as cv::v_dotprod_expand_fast, but adds a third element to the sum of adjacent pairs.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                      const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
{ return v_dotprod_expand(a, b, c); }

/** @brief Multiply and expand

Multiply values in two registers and store results in two registers with wider pack type.
@@ -1810,6 +1904,17 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
    return c;
}

/** @brief Convert to double

Supported input type is cv::v_int64x2. */
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
{
    v_reg<double, n> c;
    for( int i = 0; i < n; i++ )
        c.s[i] = (double)a.s[i];
    return c;
}

template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
{
    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
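This scalar fallback mirrors what the platform-specific versions earlier in the commit do with native instructions or bit tricks. A minimal usage sketch, assuming a CV_SIMD build with 64-bit float support (CV_SIMD_64F); values and variable names are illustrative:

cv::v_int32 a = cv::vx_setall_s32(100000);
cv::v_int32 b = cv::vx_setall_s32(300000);
cv::v_int64 wide = cv::v_dotprod(a, b);    // 32-bit x 32-bit -> 64-bit pairwise products
cv::v_float64 f = cv::v_cvt_f64(wide);     // the conversion declared above
double buf[cv::v_float64::nlanes];
cv::v_store(buf, f);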
@@ -160,6 +160,16 @@ void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __
void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
#endif

// Conversions
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_INT32& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a, const __CV_V_FLOAT64& b);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT64& a);

/** Cleanup **/
#undef CV__SIMD_FORWARD
#undef __CV_VX
@ -461,15 +461,124 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
|
||||
msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16));
|
||||
}
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); }
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); }
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
return v_int32x4(msa_dotp_s_w(a.val, b.val));
|
||||
v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
|
||||
v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
|
||||
v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
|
||||
v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
|
||||
v4u32 prod = msa_dotp_u_w(even_a, even_b);
|
||||
return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{
|
||||
v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
|
||||
v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
|
||||
v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
|
||||
v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
|
||||
v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b);
|
||||
return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val));
|
||||
v8i16 prod = msa_dotp_s_h(a.val, b.val);
|
||||
return v_int32x4(msa_hadd_s32(prod, prod));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
|
||||
const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
|
||||
v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
|
||||
v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
|
||||
v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
|
||||
v2u64 prod = msa_dotp_u_d(even_a, even_b);
|
||||
return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
|
||||
const v_uint64x2& c)
|
||||
{
|
||||
v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
|
||||
v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
|
||||
v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
|
||||
v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
|
||||
v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b);
|
||||
return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
|
||||
}
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v4i32 prod = msa_dotp_s_w(a.val, b.val);
|
||||
return v_int64x2(msa_hadd_s64(prod, prod));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
|
||||
@@ -1311,6 +1420,11 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
    return v_float64x2(msa_cvtfhq_f64_f32(a.val));
}

inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{
    return v_float64x2(msa_cvtfintq_f64_s64(a.val));
}

////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
@ -62,23 +62,63 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
#define CV_SIMD128_64F 0
|
||||
#endif
|
||||
|
||||
// TODO
|
||||
#define CV_NEON_DOT 0
|
||||
|
||||
//////////// Utils ////////////
|
||||
|
||||
#if CV_SIMD128_64F
|
||||
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
|
||||
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
|
||||
{ c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
|
||||
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
|
||||
inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \
|
||||
{ c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
|
||||
#else
|
||||
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
|
||||
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
|
||||
{ _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
|
||||
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
|
||||
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
|
||||
{ _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
|
||||
#endif
|
||||
|
||||
#if CV_SIMD128_64F
|
||||
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
|
||||
template <typename T> static inline \
|
||||
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
|
||||
template <typename T> static inline \
|
||||
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
|
||||
template <typename T> static inline \
|
||||
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
|
||||
template <typename T> static inline \
|
||||
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
|
||||
#else
|
||||
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
|
||||
#if CV_SIMD128_64F
|
||||
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
|
||||
#endif
|
||||
|
||||
//////////// Types ////////////
|
||||
|
||||
struct v_uint8x16
|
||||
{
|
||||
typedef uchar lane_type;
|
||||
@ -528,20 +568,272 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
|
||||
));
|
||||
}
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
|
||||
int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
|
||||
int32x4x2_t cd = vuzpq_s32(c, d);
|
||||
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
|
||||
int16x8_t uzp1, uzp2;
|
||||
_v128_unzip(a.val, b.val, uzp1, uzp2);
|
||||
int16x4_t a0 = vget_low_s16(uzp1);
|
||||
int16x4_t b0 = vget_high_s16(uzp1);
|
||||
int16x4_t a1 = vget_low_s16(uzp2);
|
||||
int16x4_t b1 = vget_high_s16(uzp2);
|
||||
int32x4_t p = vmull_s16(a0, b0);
|
||||
return v_int32x4(vmlal_s16(p, a1, b1));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{
|
||||
v_int32x4 s = v_dotprod(a, b);
|
||||
return v_int32x4(vaddq_s32(s.val , c.val));
|
||||
int16x8_t uzp1, uzp2;
|
||||
_v128_unzip(a.val, b.val, uzp1, uzp2);
|
||||
int16x4_t a0 = vget_low_s16(uzp1);
|
||||
int16x4_t b0 = vget_high_s16(uzp1);
|
||||
int16x4_t a1 = vget_low_s16(uzp2);
|
||||
int16x4_t b1 = vget_high_s16(uzp2);
|
||||
int32x4_t p = vmlal_s16(c.val, a0, b0);
|
||||
return v_int32x4(vmlal_s16(p, a1, b1));
|
||||
}
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
int32x4_t uzp1, uzp2;
|
||||
_v128_unzip(a.val, b.val, uzp1, uzp2);
|
||||
int32x2_t a0 = vget_low_s32(uzp1);
|
||||
int32x2_t b0 = vget_high_s32(uzp1);
|
||||
int32x2_t a1 = vget_low_s32(uzp2);
|
||||
int32x2_t b1 = vget_high_s32(uzp2);
|
||||
int64x2_t p = vmull_s32(a0, b0);
|
||||
return v_int64x2(vmlal_s32(p, a1, b1));
|
||||
}
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{
|
||||
int32x4_t uzp1, uzp2;
|
||||
_v128_unzip(a.val, b.val, uzp1, uzp2);
|
||||
int32x2_t a0 = vget_low_s32(uzp1);
|
||||
int32x2_t b0 = vget_high_s32(uzp1);
|
||||
int32x2_t a1 = vget_low_s32(uzp2);
|
||||
int32x2_t b1 = vget_high_s32(uzp2);
|
||||
int64x2_t p = vmlal_s32(c.val, a0, b0);
|
||||
return v_int64x2(vmlal_s32(p, a1, b1));
|
||||
}
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
|
||||
#else
|
||||
const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0));
|
||||
const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
|
||||
const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
|
||||
const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
|
||||
|
||||
uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
|
||||
vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
|
||||
uint16x8_t odd = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
|
||||
vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));
|
||||
|
||||
uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
|
||||
vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
|
||||
uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
|
||||
vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
|
||||
return v_uint32x4(vaddq_u32(s0, s1));
|
||||
#endif
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
|
||||
const v_uint32x4& c)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
|
||||
#else
|
||||
return v_dotprod_expand(a, b) + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
|
||||
#else
|
||||
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
|
||||
int16x8_t uzp1, uzp2;
|
||||
_v128_unzip(p0, p1, uzp1, uzp2);
|
||||
int16x8_t sum = vaddq_s16(uzp1, uzp2);
|
||||
int16x4_t uzpl1, uzpl2;
|
||||
_v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
|
||||
return v_int32x4(vaddl_s16(uzpl1, uzpl2));
|
||||
#endif
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
|
||||
const v_int32x4& c)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_int32x4(vdotq_s32(c.val, a.val, b.val));
|
||||
#else
|
||||
return v_dotprod_expand(a, b) + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
|
||||
const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
|
||||
|
||||
uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
|
||||
vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
|
||||
uint32x4_t odd = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
|
||||
vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
|
||||
uint32x4_t uzp1, uzp2;
|
||||
_v128_unzip(even, odd, uzp1, uzp2);
|
||||
uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
|
||||
uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
|
||||
return v_uint64x2(vaddq_u64(s0, s1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
|
||||
int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
|
||||
|
||||
int32x4_t uzp1, uzp2;
|
||||
_v128_unzip(p0, p1, uzp1, uzp2);
|
||||
int32x4_t sum = vaddq_s32(uzp1, uzp2);
|
||||
|
||||
int32x2_t uzpl1, uzpl2;
|
||||
_v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
|
||||
return v_int64x2(vaddl_s32(uzpl1, uzpl2));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
|
||||
const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
#if CV_SIMD128_64F
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
|
||||
const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
#endif
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
int16x4_t a0 = vget_low_s16(a.val);
|
||||
int16x4_t a1 = vget_high_s16(a.val);
|
||||
int16x4_t b0 = vget_low_s16(b.val);
|
||||
int16x4_t b1 = vget_high_s16(b.val);
|
||||
int32x4_t p = vmull_s16(a0, b0);
|
||||
return v_int32x4(vmlal_s16(p, a1, b1));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{
|
||||
int16x4_t a0 = vget_low_s16(a.val);
|
||||
int16x4_t a1 = vget_high_s16(a.val);
|
||||
int16x4_t b0 = vget_low_s16(b.val);
|
||||
int16x4_t b1 = vget_high_s16(b.val);
|
||||
int32x4_t p = vmlal_s16(c.val, a0, b0);
|
||||
return v_int32x4(vmlal_s16(p, a1, b1));
|
||||
}
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
int32x2_t a0 = vget_low_s32(a.val);
|
||||
int32x2_t a1 = vget_high_s32(a.val);
|
||||
int32x2_t b0 = vget_low_s32(b.val);
|
||||
int32x2_t b1 = vget_high_s32(b.val);
|
||||
int64x2_t p = vmull_s32(a0, b0);
|
||||
return v_int64x2(vmlal_s32(p, a1, b1));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{
|
||||
int32x2_t a0 = vget_low_s32(a.val);
|
||||
int32x2_t a1 = vget_high_s32(a.val);
|
||||
int32x2_t b0 = vget_low_s32(b.val);
|
||||
int32x2_t b1 = vget_high_s32(b.val);
|
||||
int64x2_t p = vmlal_s32(c.val, a0, b0);
|
||||
return v_int64x2(vmlal_s32(p, a1, b1));
|
||||
}
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
|
||||
#else
|
||||
uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
|
||||
uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
|
||||
uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
|
||||
uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
|
||||
return v_uint32x4(vaddq_u32(s0, s1));
|
||||
#endif
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
|
||||
#else
|
||||
return v_dotprod_expand_fast(a, b) + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
|
||||
#else
|
||||
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
|
||||
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
|
||||
#endif
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{
|
||||
#if CV_NEON_DOT
|
||||
return v_int32x4(vdotq_s32(c.val, a.val, b.val));
|
||||
#else
|
||||
return v_dotprod_expand_fast(a, b) + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
|
||||
uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
|
||||
uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
|
||||
uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
|
||||
return v_uint64x2(vaddq_u64(s0, s1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
|
||||
prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
|
||||
return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
#if CV_SIMD128_64F
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
#endif
|
||||
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
|
||||
@ -1593,6 +1885,10 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
{
|
||||
return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
|
||||
}
|
||||
|
||||
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
|
||||
{ return v_float64x2(vcvtq_f64_s64(a.val)); }
|
||||
|
||||
#endif
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
@@ -225,9 +225,13 @@ struct v_uint64x2
    }
    uint64 get0() const
    {
    #if !defined(__x86_64__) && !defined(_M_X64)
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    #else
        return (uint64)_mm_cvtsi128_si64(val);
    #endif
    }

    __m128i val;
@@ -247,9 +251,13 @@ struct v_int64x2
    }
    int64 get0() const
    {
    #if !defined(__x86_64__) && !defined(_M_X64)
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    #else
        return _mm_cvtsi128_si64(val);
    #endif
    }

    __m128i val;
@@ -791,15 +799,195 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    return v_int32x4(_mm_madd_epi16(a.val, b.val));
}
//////// Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
#if CV_SSE4_1
    __m128i even = _mm_mul_epi32(a.val, b.val);
    __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    return v_int64x2(_mm_add_epi64(even, odd));
#else
    __m128i even_u = _mm_mul_epu32(a.val, b.val);
    __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    // convert unsigned to signed high multiplication (from: Agner Fog (veclib) and H. S. Warren: Hacker's Delight, 2003, p. 132)
    __m128i a_sign = _mm_srai_epi32(a.val, 31);
    __m128i b_sign = _mm_srai_epi32(b.val, 31);
    // |x * sign of x
    __m128i axb = _mm_and_si128(a.val, b_sign);
    __m128i bxa = _mm_and_si128(b.val, a_sign);
    // sum of sign corrections
    __m128i ssum = _mm_add_epi32(bxa, axb);
    __m128i even_ssum = _mm_slli_epi64(ssum, 32);
    __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));
    // convert to signed and prod
    return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
#endif
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b) + c; }
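The #else branch relies on the identity that a signed 32x32->64 product can be recovered from the unsigned product by subtracting the wrap-around terms contributed by negative inputs (the Hacker's Delight / Agner Fog reference in the comment). A scalar model of that correction, as an illustration only:

#include <cstdint>

static int64_t mul32x32_signed_via_unsigned(int32_t a, int32_t b)
{
    uint64_t prod_u = (uint64_t)(uint32_t)a * (uint32_t)b;  // what _mm_mul_epu32 computes per lane
    uint64_t corr = 0;
    if (a < 0) corr += (uint32_t)b;  // b * [a < 0]
    if (b < 0) corr += (uint32_t)a;  // a * [b < 0]
    // (uint)a * (uint)b == a*b + 2^32 * corr   (mod 2^64)
    return (int64_t)(prod_u - (corr << 32));    // two's-complement wrap yields the signed product
}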
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
__m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even
|
||||
__m128i a1 = _mm_srli_epi16(a.val, 8); // odd
|
||||
__m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
|
||||
__m128i b1 = _mm_srli_epi16(b.val, 8);
|
||||
__m128i p0 = _mm_madd_epi16(a0, b0);
|
||||
__m128i p1 = _mm_madd_epi16(a1, b1);
|
||||
return v_uint32x4(_mm_add_epi32(p0, p1));
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
__m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even
|
||||
__m128i a1 = _mm_srai_epi16(a.val, 8); // odd
|
||||
__m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
|
||||
__m128i b1 = _mm_srai_epi16(b.val, 8);
|
||||
__m128i p0 = _mm_madd_epi16(a0, b0);
|
||||
__m128i p1 = _mm_madd_epi16(a1, b1);
|
||||
return v_int32x4(_mm_add_epi32(p0, p1));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
v_uint32x4 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
|
||||
v_uint64x2 c0, c1, d0, d1;
|
||||
v_expand(c, c0, c1);
|
||||
v_expand(d, d0, d1);
|
||||
|
||||
c0 += c1; d0 += d1;
|
||||
return v_uint64x2(_mm_add_epi64(
|
||||
_mm_unpacklo_epi64(c0.val, d0.val),
|
||||
_mm_unpackhi_epi64(c0.val, d0.val)
|
||||
));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return v_int64x2(_mm_add_epi64(
|
||||
_mm_unpacklo_epi64(c.val, d.val),
|
||||
_mm_unpackhi_epi64(c.val, d.val)
|
||||
));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
#if CV_SSE4_1
|
||||
return v_cvt_f64(v_dotprod(a, b));
|
||||
#else
|
||||
v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
|
||||
v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
|
||||
|
||||
return v_float64x2(_mm_add_pd(
|
||||
_mm_unpacklo_pd(c.val, d.val),
|
||||
_mm_unpackhi_pd(c.val, d.val)
|
||||
));
|
||||
#endif
|
||||
}
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod_fast(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
__m128i a0 = v_expand_low(a).val;
|
||||
__m128i a1 = v_expand_high(a).val;
|
||||
__m128i b0 = v_expand_low(b).val;
|
||||
__m128i b1 = v_expand_high(b).val;
|
||||
__m128i p0 = _mm_madd_epi16(a0, b0);
|
||||
__m128i p1 = _mm_madd_epi16(a1, b1);
|
||||
return v_uint32x4(_mm_add_epi32(p0, p1));
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
#if CV_SSE4_1
|
||||
__m128i a0 = _mm_cvtepi8_epi16(a.val);
|
||||
__m128i a1 = v_expand_high(a).val;
|
||||
__m128i b0 = _mm_cvtepi8_epi16(b.val);
|
||||
__m128i b1 = v_expand_high(b).val;
|
||||
__m128i p0 = _mm_madd_epi16(a0, b0);
|
||||
__m128i p1 = _mm_madd_epi16(a1, b1);
|
||||
return v_int32x4(_mm_add_epi32(p0, p1));
|
||||
#else
|
||||
return v_dotprod_expand(a, b);
|
||||
#endif
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
v_uint32x4 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
|
||||
v_uint64x2 c0, c1, d0, d1;
|
||||
v_expand(c, c0, c1);
|
||||
v_expand(d, d0, d1);
|
||||
|
||||
c0 += c1; d0 += d1;
|
||||
return c0 + d0;
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return c + d;
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
|
||||
@ -2739,6 +2927,32 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
|
||||
}
|
||||
|
||||
// from (Mysticial and wim) https://stackoverflow.com/q/41144668
inline v_float64x2 v_cvt_f64(const v_int64x2& v)
{
    // constants encoded as floating-point
    __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
    __m128i magic_i_all  = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m128d magic_d_all  = _mm_castsi128_pd(magic_i_all);
    // Blend the 32 lowest significant bits of v with magic_i_lo
#if CV_SSE4_1
    __m128i magic_i_lo   = _mm_set1_epi64x(0x4330000000000000); // 2^52
    __m128i v_lo         = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
#else
    __m128i magic_i_lo   = _mm_set1_epi32(0x43300000); // 2^52
    __m128i v_lo         = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
#endif
    // Extract the 32 most significant bits of v
    __m128i v_hi = _mm_srli_epi64(v.val, 32);
    // Flip the msb of v_hi and blend with 0x45300000
    v_hi = _mm_xor_si128(v_hi, magic_i_hi32);
    // Compute in double precision
    __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
    // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
    __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
    return v_float64x2(result);
}
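// Why the magic constants work (a sketch, assuming IEEE-754 binary64):
//  * v_lo reinterpreted as double equals 2^52 + lo, where lo is the low 32 bits
//    of v; values below 2^32 sit exactly in the low mantissa bits of 2^52.
//  * After the XOR, v_hi reinterpreted as double equals
//    2^84 + 2^32*(hi + 2^31) = 2^84 + 2^63 + 2^32*hi, with hi the signed high
//    32 bits of v.
//  * Subtracting magic_d_all = 2^84 + 2^63 + 2^52 is exact and leaves
//    2^32*hi - 2^52, and the final addition of v_lo yields 2^32*hi + lo, i.e.
//    the original signed 64-bit value rounded once to double precision.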
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
||||
inline v_int8x16 v_lut(const schar* tab, const int* idx)
|
||||
|
@ -499,12 +499,6 @@ inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
|
||||
v_zip(p0, p1, c, d);
|
||||
}
|
||||
|
||||
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
|
||||
{
|
||||
c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
|
||||
d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
|
||||
}
|
||||
|
||||
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
vec_int4 p0 = vec_mule(a.val, b.val);
|
||||
@ -1043,14 +1037,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
|
||||
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
|
||||
|
||||
// The altivec intrinsic is missing for this 2.06 insn
|
||||
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
|
||||
{
|
||||
vec_double2 out;
|
||||
|
||||
__asm__ ("xvcvsxddp %x0,%x1" : "=wa"(out) : "wa"(a.val));
|
||||
return v_float64x2(out);
|
||||
}
|
||||
{ return v_float64x2(vec_ctd(a.val)); }
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
||||
@ -1322,12 +1310,134 @@ inline void v_cleanup() {}
|
||||
|
||||
////////// Matrix operations /////////
|
||||
|
||||
//////// Dot Product ////////
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
vec_dword2 even = vec_mule(a.val, b.val);
|
||||
vec_dword2 odd = vec_mulo(a.val, b.val);
|
||||
return v_int64x2(vec_add(even, odd));
|
||||
}
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_uint32x4(vec_msum(a.val, b.val, c.val)); }
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
const vec_ushort8 eight = vec_ushort8_sp(8);
|
||||
vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
|
||||
vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
|
||||
vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
|
||||
vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
|
||||
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{
|
||||
const vec_ushort8 eight = vec_ushort8_sp(8);
|
||||
vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
|
||||
vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
|
||||
vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
|
||||
vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
|
||||
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val)));
|
||||
}
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
const vec_uint4 zero = vec_uint4_z;
|
||||
vec_uint4 even = vec_mule(a.val, b.val);
|
||||
vec_uint4 odd = vec_mulo(a.val, b.val);
|
||||
vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero);
|
||||
vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero);
|
||||
vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero);
|
||||
vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero);
|
||||
vec_udword2 s0 = vec_add(e0, o0);
|
||||
vec_udword2 s1 = vec_add(e1, o1);
|
||||
return v_uint64x2(vec_add(s0, s1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
vec_short8 a0 = vec_unpackh(a.val);
|
||||
vec_short8 a1 = vec_unpackl(a.val);
|
||||
vec_short8 b0 = vec_unpackh(b.val);
|
||||
vec_short8 b1 = vec_unpackl(b.val);
|
||||
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return c + d;
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
|
||||
const v_float32x4& m1, const v_float32x4& m2,
|
||||
const v_float32x4& m3)
|
||||
|
@ -682,6 +682,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, n/4> s;
    for( int i = 0; i < (n/4); i++ )
        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
    return s;
}
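// Worked example of the reference semantics above (illustrative): for two
// v_uint8x16 inputs, lane 0 of the resulting v_uint32x4 is
//   a0*b0 + a1*b1 + a2*b2 + a3*b3
// evaluated in the widened q_type, lane 1 covers elements 4..7, and so on;
// the three-argument overload below just adds c to each lane.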
|
||||
|
||||
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
|
||||
v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
||||
const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
|
||||
{
|
||||
typedef typename V_TypeTraits<_Tp>::q_type q_type;
|
||||
v_reg<q_type, n/4> s;
|
||||
for( int i = 0; i < (n/4); i++ )
|
||||
s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
|
||||
(q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
||||
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
|
||||
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
|
||||
@ -1282,6 +1305,14 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
return c;
|
||||
}
|
||||
|
||||
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
|
||||
{
|
||||
v_float64x2 c;
|
||||
for( int i = 0; i < 2; i++ )
|
||||
c.s[i] = (double)a.s[i];
|
||||
return c;
|
||||
}
|
||||
|
||||
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
|
||||
{
|
||||
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
|
||||
@ -2398,6 +2429,8 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
|
||||
return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
|
||||
}
|
||||
|
||||
//////// Dot Product ////////
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
|
||||
@ -2410,15 +2443,140 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
|
||||
v128_t a1 = wasm_i32x4_shr(a.val, 16);
|
||||
v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
|
||||
v128_t b1 = wasm_i32x4_shr(b.val, 16);
|
||||
v128_t d = wasm_i32x4_mul(a0, b0);
|
||||
v128_t e = wasm_i32x4_mul(a1, b1);
|
||||
return v_int32x4(wasm_i32x4_add(wasm_i32x4_add(d, e), c.val));
|
||||
#ifdef __wasm_unimplemented_simd128__
|
||||
v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
|
||||
v128_t a1 = wasm_i64x2_shr(a.val, 32);
|
||||
v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
|
||||
v128_t b1 = wasm_i64x2_shr(b.val, 32);
|
||||
v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
|
||||
v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);
|
||||
return v_int64x2(wasm_i64x2_add(c, d));
|
||||
#else
|
||||
fallback::v_int32x4 a_(a);
|
||||
fallback::v_int32x4 b_(b);
|
||||
return fallback::v_dotprod(a_, b_);
|
||||
#endif
|
||||
}
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{
|
||||
#ifdef __wasm_unimplemented_simd128__
|
||||
return v_dotprod(a, b) + c;
|
||||
#else
|
||||
fallback::v_int32x4 a_(a);
|
||||
fallback::v_int32x4 b_(b);
|
||||
fallback::v_int64x2 c_(c);
|
||||
return fallback::v_dotprod(a_, b_, c_);
|
||||
#endif
|
||||
}
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
|
||||
v128_t a1 = wasm_u16x8_shr(a.val, 8);
|
||||
v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
|
||||
v128_t b1 = wasm_u16x8_shr(b.val, 8);
|
||||
return v_uint32x4((
|
||||
v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
|
||||
v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
|
||||
);
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
|
||||
v128_t a1 = wasm_i16x8_shr(a.val, 8);
|
||||
v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
|
||||
v128_t b1 = wasm_i16x8_shr(b.val, 8);
|
||||
return v_int32x4(
|
||||
v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
|
||||
v_dotprod(v_int16x8(a1), v_int16x8(b1))
|
||||
);
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
fallback::v_uint16x8 a_(a);
|
||||
fallback::v_uint16x8 b_(b);
|
||||
return fallback::v_dotprod_expand(a_, b_);
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{
|
||||
fallback::v_uint16x8 a_(a);
|
||||
fallback::v_uint16x8 b_(b);
|
||||
fallback::v_uint64x2 c_(c);
|
||||
return fallback::v_dotprod_expand(a_, b_, c_);
|
||||
}
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
fallback::v_int16x8 a_(a);
|
||||
fallback::v_int16x8 b_(b);
|
||||
return fallback::v_dotprod_expand(a_, b_);
|
||||
}
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{
|
||||
fallback::v_int16x8 a_(a);
|
||||
fallback::v_int16x8 b_(b);
|
||||
fallback::v_int64x2 c_(c);
|
||||
return fallback::v_dotprod_expand(a_, b_, c_);
|
||||
}
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
// 16 >> 32
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_dotprod(a, b, c); }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
|
||||
OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
|
||||
@ -3815,6 +3973,16 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
return fallback::v_cvt_f64_high(a_);
|
||||
}
|
||||
|
||||
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
|
||||
{
|
||||
#ifdef __wasm_unimplemented_simd128__
|
||||
return v_float64x2(wasm_convert_f64x2_i64x2(a.val));
|
||||
#else
|
||||
fallback::v_int64x2 a_(a);
|
||||
return fallback::v_cvt_f64(a_);
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////// Lookup table access ////////////////////
|
||||
|
||||
inline v_int8x16 v_lut(const schar* tab, const int* idx)
|
||||
|
@ -72,7 +72,9 @@
|
||||
# include "opencv2/core/cuda_stream_accessor.hpp"
|
||||
# include "opencv2/core/cuda/common.hpp"
|
||||
|
||||
# ifndef NPP_VERSION
|
||||
# define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)
|
||||
# endif
|
||||
|
||||
# define CUDART_MINIMUM_REQUIRED_VERSION 6050
|
||||
|
||||
|
@ -144,10 +144,10 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mulo, __builtin_vec_mulo)

// dword2 support arrived in ISA 2.07 and GCC 8+
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulesw, vec_mule)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mule)
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulosw, vec_mulo)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mulo)
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulosw, vec_mule)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulesw, vec_mulo)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)

#endif
|
||||
|
@ -9,7 +9,7 @@ typedef TestBaseWithParam<MatType_Length_t> MatType_Length;
|
||||
|
||||
PERF_TEST_P( MatType_Length, dot,
|
||||
testing::Combine(
|
||||
testing::Values( CV_8UC1, CV_32SC1, CV_32FC1 ),
|
||||
testing::Values( CV_8UC1, CV_8SC1, CV_16SC1, CV_16UC1, CV_32SC1, CV_32FC1 ),
|
||||
testing::Values( 32, 64, 128, 256, 512, 1024 )
|
||||
))
|
||||
{
|
||||
|
@ -46,6 +46,7 @@
|
||||
#undef CV_LOG_STRIP_LEVEL
|
||||
#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_VERBOSE + 1
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
#include <opencv2/core/utils/configuration.private.hpp>
|
||||
|
||||
#define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "alloc.cpp: " << __VA_ARGS__)
|
||||
#include "opencv2/core/utils/allocator_stats.impl.hpp"
|
||||
@ -81,6 +82,38 @@ cv::utils::AllocatorStatisticsInterface& getAllocatorStatistics()
|
||||
return allocator_stats;
|
||||
}
|
||||
|
||||
#if defined HAVE_POSIX_MEMALIGN || defined HAVE_MEMALIGN
static bool readMemoryAlignmentParameter()
{
    bool value = true;
#if defined(__GLIBC__) && defined(__linux__) \
    && !defined(CV_STATIC_ANALYSIS) \
    && !defined(OPENCV_ENABLE_MEMORY_SANITIZER) \
    && !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) /* oss-fuzz */ \
    && !defined(_WIN32) /* MinGW? */
    {
        // https://github.com/opencv/opencv/issues/15526
        value = false;
    }
#endif
    value = cv::utils::getConfigurationParameterBool("OPENCV_ENABLE_MEMALIGN", value); // should not call fastMalloc() internally
    // TODO add checks for valgrind, ASAN if value == false
    return value;
}
static inline
bool isAlignedAllocationEnabled()
{
    static bool initialized = false;
    static bool useMemalign = true;
    if (!initialized)
    {
        initialized = true; // set before reading the parameter so a re-entrant allocation does not loop (works only if allocations are scope based)
        useMemalign = readMemoryAlignmentParameter();
    }
    return useMemalign;
}
#endif
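// Usage sketch (assuming OpenCV's usual environment-variable configuration
// mechanism; ./app stands for any OpenCV-based program). The parameter is read
// once, on the first allocation, and cached afterwards:
//   OPENCV_ENABLE_MEMALIGN=0 ./app   // force the plain malloc()+alignPtr() fallback
//   OPENCV_ENABLE_MEMALIGN=1 ./app   // use posix_memalign()/memalign() when available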
|
||||
|
||||
#ifdef OPENCV_ALLOC_ENABLE_STATISTICS
|
||||
static inline
|
||||
void* fastMalloc_(size_t size)
|
||||
@ -89,25 +122,30 @@ void* fastMalloc(size_t size)
|
||||
#endif
|
||||
{
|
||||
#ifdef HAVE_POSIX_MEMALIGN
|
||||
void* ptr = NULL;
|
||||
if(posix_memalign(&ptr, CV_MALLOC_ALIGN, size))
|
||||
ptr = NULL;
|
||||
if(!ptr)
|
||||
return OutOfMemoryError(size);
|
||||
return ptr;
|
||||
if (isAlignedAllocationEnabled())
|
||||
{
|
||||
void* ptr = NULL;
|
||||
if(posix_memalign(&ptr, CV_MALLOC_ALIGN, size))
|
||||
ptr = NULL;
|
||||
if(!ptr)
|
||||
return OutOfMemoryError(size);
|
||||
return ptr;
|
||||
}
|
||||
#elif defined HAVE_MEMALIGN
|
||||
void* ptr = memalign(CV_MALLOC_ALIGN, size);
|
||||
if(!ptr)
|
||||
return OutOfMemoryError(size);
|
||||
return ptr;
|
||||
#else
|
||||
if (isAlignedAllocationEnabled())
|
||||
{
|
||||
void* ptr = memalign(CV_MALLOC_ALIGN, size);
|
||||
if(!ptr)
|
||||
return OutOfMemoryError(size);
|
||||
return ptr;
|
||||
}
|
||||
#endif
|
||||
uchar* udata = (uchar*)malloc(size + sizeof(void*) + CV_MALLOC_ALIGN);
|
||||
if(!udata)
|
||||
return OutOfMemoryError(size);
|
||||
uchar** adata = alignPtr((uchar**)udata + 1, CV_MALLOC_ALIGN);
|
||||
adata[-1] = udata;
|
||||
return adata;
|
||||
#endif
|
||||
}
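// Bookkeeping of the fallback path above (descriptive note): malloc() reserves
// sizeof(void*) + CV_MALLOC_ALIGN extra bytes, alignPtr() rounds the address just
// past one void* slot up to a CV_MALLOC_ALIGN boundary, and the raw pointer is
// stashed at adata[-1] so that fastFree() below can recover it and call free()
// on the original allocation.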
|
||||
|
||||
#ifdef OPENCV_ALLOC_ENABLE_STATISTICS
|
||||
@ -118,8 +156,12 @@ void fastFree(void* ptr)
|
||||
#endif
|
||||
{
|
||||
#if defined HAVE_POSIX_MEMALIGN || defined HAVE_MEMALIGN
|
||||
free(ptr);
|
||||
#else
|
||||
if (isAlignedAllocationEnabled())
|
||||
{
|
||||
free(ptr);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if(ptr)
|
||||
{
|
||||
uchar* udata = ((uchar**)ptr)[-1];
|
||||
@ -127,7 +169,6 @@ void fastFree(void* ptr)
|
||||
((uchar*)ptr - udata) <= (ptrdiff_t)(sizeof(void*)+CV_MALLOC_ALIGN));
|
||||
free(udata);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef OPENCV_ALLOC_ENABLE_STATISTICS
|
||||
|
@ -782,36 +782,10 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
|
||||
flipHoriz( dst.ptr(), dst.step, dst.ptr(), dst.step, dst.size(), esz );
|
||||
}
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
static bool ocl_rotate(InputArray _src, OutputArray _dst, int rotateMode)
|
||||
{
|
||||
switch (rotateMode)
|
||||
{
|
||||
case ROTATE_90_CLOCKWISE:
|
||||
transpose(_src, _dst);
|
||||
flip(_dst, _dst, 1);
|
||||
break;
|
||||
case ROTATE_180:
|
||||
flip(_src, _dst, -1);
|
||||
break;
|
||||
case ROTATE_90_COUNTERCLOCKWISE:
|
||||
transpose(_src, _dst);
|
||||
flip(_dst, _dst, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
void rotate(InputArray _src, OutputArray _dst, int rotateMode)
|
||||
{
|
||||
CV_Assert(_src.dims() <= 2);
|
||||
|
||||
CV_OCL_RUN(_dst.isUMat(), ocl_rotate(_src, _dst, rotateMode))
|
||||
|
||||
switch (rotateMode)
|
||||
{
|
||||
case ROTATE_90_CLOCKWISE:
|
||||
|
@ -2320,26 +2320,22 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_int32 v_sum = vx_setzero_s32();
|
||||
v_uint32 v_sum = vx_setzero_u32();
|
||||
const int cWidth = v_uint16::nlanes;
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
|
||||
{
|
||||
v_uint16 v_src10, v_src20, v_src11, v_src21;
|
||||
v_expand(vx_load(src1 + j), v_src10, v_src11);
|
||||
v_expand(vx_load(src2 + j), v_src20, v_src21);
|
||||
|
||||
v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
|
||||
v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
|
||||
v_uint8 v_src1 = vx_load(src1 + j);
|
||||
v_uint8 v_src2 = vx_load(src2 + j);
|
||||
v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
|
||||
}
|
||||
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
{
|
||||
v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
|
||||
v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
|
||||
|
||||
v_sum += v_dotprod(v_src10, v_src20);
|
||||
v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20));
|
||||
}
|
||||
r += (double)v_reduce_sum(v_sum);
|
||||
|
||||
@ -2348,48 +2344,6 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
|
||||
i += blockSize;
|
||||
}
|
||||
vx_cleanup();
|
||||
#elif CV_NEON
|
||||
if( cv::checkHardwareSupport(CV_CPU_NEON) )
|
||||
{
|
||||
int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
|
||||
uint32x4_t v_zero = vdupq_n_u32(0u);
|
||||
CV_DECL_ALIGNED(16) uint buf[4];
|
||||
|
||||
while( i < len0 )
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
uint32x4_t v_sum = v_zero;
|
||||
|
||||
int j = 0;
|
||||
for( ; j <= blockSize - 16; j += 16 )
|
||||
{
|
||||
uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j);
|
||||
|
||||
uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2));
|
||||
v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
|
||||
v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
|
||||
|
||||
v_src10 = vmovl_u8(vget_high_u8(v_src1));
|
||||
v_src20 = vmovl_u8(vget_high_u8(v_src2));
|
||||
v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
|
||||
v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 8; j += 8 )
|
||||
{
|
||||
uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j));
|
||||
v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2));
|
||||
v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2));
|
||||
}
|
||||
|
||||
vst1q_u32(buf, v_sum);
|
||||
r += buf[0] + buf[1] + buf[2] + buf[3];
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
}
|
||||
@ -2412,20 +2366,16 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
|
||||
{
|
||||
v_int16 v_src10, v_src20, v_src11, v_src21;
|
||||
v_expand(vx_load(src1 + j), v_src10, v_src11);
|
||||
v_expand(vx_load(src2 + j), v_src20, v_src21);
|
||||
|
||||
v_sum += v_dotprod(v_src10, v_src20);
|
||||
v_sum += v_dotprod(v_src11, v_src21);
|
||||
v_int8 v_src1 = vx_load(src1 + j);
|
||||
v_int8 v_src2 = vx_load(src2 + j);
|
||||
v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
|
||||
}
|
||||
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
{
|
||||
v_int16 v_src10 = vx_load_expand(src1 + j);
|
||||
v_int16 v_src20 = vx_load_expand(src2 + j);
|
||||
|
||||
v_sum += v_dotprod(v_src10, v_src20);
|
||||
v_int16 v_src1 = vx_load_expand(src1 + j);
|
||||
v_int16 v_src2 = vx_load_expand(src2 + j);
|
||||
v_sum = v_dotprod_fast(v_src1, v_src2, v_sum);
|
||||
}
|
||||
r += (double)v_reduce_sum(v_sum);
|
||||
|
||||
@ -2434,87 +2384,6 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
|
||||
i += blockSize;
|
||||
}
|
||||
vx_cleanup();
|
||||
#elif CV_NEON
|
||||
if( cv::checkHardwareSupport(CV_CPU_NEON) )
|
||||
{
|
||||
int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
|
||||
int32x4_t v_zero = vdupq_n_s32(0);
|
||||
CV_DECL_ALIGNED(16) int buf[4];
|
||||
|
||||
while( i < len0 )
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
int32x4_t v_sum = v_zero;
|
||||
|
||||
int j = 0;
|
||||
for( ; j <= blockSize - 16; j += 16 )
|
||||
{
|
||||
int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j);
|
||||
|
||||
int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2));
|
||||
v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
|
||||
v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
|
||||
|
||||
v_src10 = vmovl_s8(vget_high_s8(v_src1));
|
||||
v_src20 = vmovl_s8(vget_high_s8(v_src2));
|
||||
v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
|
||||
v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 8; j += 8 )
|
||||
{
|
||||
int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j));
|
||||
v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2));
|
||||
v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2));
|
||||
}
|
||||
|
||||
vst1q_s32(buf, v_sum);
|
||||
r += buf[0] + buf[1] + buf[2] + buf[3];
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
}
|
||||
#elif CV_MSA
|
||||
int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
|
||||
v4i32 v_zero = msa_dupq_n_s32(0);
|
||||
CV_DECL_ALIGNED(16) int buf[4];
|
||||
|
||||
while( i < len0 )
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v4i32 v_sum = v_zero;
|
||||
|
||||
int j = 0;
|
||||
for( ; j <= blockSize - 16; j += 16 )
|
||||
{
|
||||
v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
|
||||
|
||||
v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
|
||||
|
||||
v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
|
||||
v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 8; j += 8 )
|
||||
{
|
||||
v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
|
||||
}
|
||||
|
||||
msa_st1q_s32(buf, v_sum);
|
||||
r += buf[0] + buf[1] + buf[2] + buf[3];
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
#endif
|
||||
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
@ -2522,42 +2391,97 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
|
||||
|
||||
double dotProd_16u(const ushort* src1, const ushort* src2, int len)
|
||||
{
|
||||
return dotProd_(src1, src2, len);
|
||||
double r = 0.0;
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize;
|
||||
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_uint64 v_sum = vx_setzero_u64();
|
||||
const int cWidth = v_uint16::nlanes;
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
{
|
||||
v_uint16 v_src1 = vx_load(src1 + j);
|
||||
v_uint16 v_src2 = vx_load(src2 + j);
|
||||
v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
|
||||
}
|
||||
r += (double)v_reduce_sum(v_sum);
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
vx_cleanup();
|
||||
#endif
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
}
|
||||
|
||||
double dotProd_16s(const short* src1, const short* src2, int len)
|
||||
{
|
||||
return dotProd_(src1, src2, len);
|
||||
double r = 0.0;
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize;
|
||||
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_int64 v_sum = vx_setzero_s64();
|
||||
const int cWidth = v_int16::nlanes;
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
{
|
||||
v_int16 v_src1 = vx_load(src1 + j);
|
||||
v_int16 v_src2 = vx_load(src2 + j);
|
||||
v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
|
||||
}
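// Overflow note (sketch based on the constants above): a block processes at most
// blockSize0 = 2^24 elements, and one product is bounded by |-32768|^2 = 2^30,
// so each v_int64 lane accumulates well under 2^24 * 2^30 = 2^54, far below the
// signed 64-bit limit; the accumulator cannot overflow before the reduction below.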
|
||||
r += (double)v_reduce_sum(v_sum);
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
vx_cleanup();
|
||||
#endif
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
}
|
||||
|
||||
double dotProd_32s(const int* src1, const int* src2, int len)
|
||||
{
|
||||
#if CV_SIMD128_64F
|
||||
double r = 0.0;
|
||||
#if CV_SIMD_64F
|
||||
double r = .0;
|
||||
int i = 0;
|
||||
int lenAligned = len & -v_int32x4::nlanes;
|
||||
v_float64x2 a(0.0, 0.0);
|
||||
v_float64x2 b(0.0, 0.0);
|
||||
|
||||
for( i = 0; i < lenAligned; i += v_int32x4::nlanes )
|
||||
{
|
||||
v_int32x4 s1 = v_load(src1);
|
||||
v_int32x4 s2 = v_load(src2);
|
||||
|
||||
#if CV_VSX
|
||||
// Do 32x32->64 multiplies, convert/round to double, accumulate
|
||||
// Potentially less precise than FMA, but 1.5x faster than fma below.
|
||||
a += v_cvt_f64(v_int64(vec_mule(s1.val, s2.val)));
|
||||
b += v_cvt_f64(v_int64(vec_mulo(s1.val, s2.val)));
|
||||
#else
|
||||
a = v_fma(v_cvt_f64(s1), v_cvt_f64(s2), a);
|
||||
b = v_fma(v_cvt_f64_high(s1), v_cvt_f64_high(s2), b);
|
||||
const int step = v_int32::nlanes;
|
||||
v_float64 v_sum0 = vx_setzero_f64();
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
const int wstep = step * 2;
|
||||
v_float64 v_sum1 = vx_setzero_f64();
|
||||
for (; i < len - wstep; i += wstep, src1 += wstep, src2 += wstep)
|
||||
{
|
||||
v_int32 v_src10 = vx_load(src1);
|
||||
v_int32 v_src20 = vx_load(src2);
|
||||
v_int32 v_src11 = vx_load(src1 + step);
|
||||
v_int32 v_src21 = vx_load(src2 + step);
|
||||
v_sum0 = v_dotprod_expand_fast(v_src10, v_src20, v_sum0);
|
||||
v_sum1 = v_dotprod_expand_fast(v_src11, v_src21, v_sum1);
|
||||
}
|
||||
v_sum0 += v_sum1;
|
||||
#endif
|
||||
src1 += v_int32x4::nlanes;
|
||||
src2 += v_int32x4::nlanes;
|
||||
}
|
||||
a += b;
|
||||
r = v_reduce_sum(a);
|
||||
for (; i < len - step; i += step, src1 += step, src2 += step)
|
||||
{
|
||||
v_int32 v_src1 = vx_load(src1);
|
||||
v_int32 v_src2 = vx_load(src2);
|
||||
v_sum0 = v_dotprod_expand_fast(v_src1, v_src2, v_sum0);
|
||||
}
|
||||
r = v_reduce_sum(v_sum0);
|
||||
vx_cleanup();
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
#else
|
||||
return dotProd_(src1, src2, len);
|
||||
|
@ -45,6 +45,13 @@
|
||||
#ifdef HAVE_OPENGL
|
||||
# include "gl_core_3_1.hpp"
|
||||
# ifdef HAVE_CUDA
|
||||
# if (defined(__arm__) || defined(__aarch64__)) \
|
||||
&& !defined(OPENCV_SKIP_CUDA_OPENGL_ARM_WORKAROUND)
|
||||
# include <GL/gl.h>
|
||||
# ifndef GL_VERSION
|
||||
# define GL_VERSION 0x1F02
|
||||
# endif
|
||||
# endif
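// Presumably cuda_gl_interop.h on these ARM platforms expects the classic
// <GL/gl.h> definitions; GL_VERSION is only defined here when the GL header
// does not provide it, and the whole workaround can be skipped by defining
// OPENCV_SKIP_CUDA_OPENGL_ARM_WORKAROUND.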
|
||||
# include <cuda_gl_interop.h>
|
||||
# endif
|
||||
#else // HAVE_OPENGL
|
||||
|
@ -603,12 +603,14 @@ template<typename R> struct TheTest
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_dot_prod()
|
||||
TheTest & test_dotprod()
|
||||
{
|
||||
typedef typename V_RegTraits<R>::w_reg Rx2;
|
||||
typedef typename Rx2::lane_type w_type;
|
||||
|
||||
Data<R> dataA, dataB(2);
|
||||
Data<R> dataA, dataB;
|
||||
dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
|
||||
dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<Rx2> dataC;
|
||||
@ -621,12 +623,95 @@ template<typename R> struct TheTest
|
||||
resE = v_dotprod(a, b, c);
|
||||
|
||||
const int n = R::nlanes / 2;
|
||||
w_type sumAB = 0, sumABC = 0, tmp_sum;
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
|
||||
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
|
||||
|
||||
tmp_sum = (w_type)dataA[i*2] * (w_type)dataB[i*2] +
|
||||
(w_type)dataA[i*2 + 1] * (w_type)dataB[i*2 + 1];
|
||||
sumAB += tmp_sum;
|
||||
EXPECT_EQ(tmp_sum, resD[i]);
|
||||
|
||||
tmp_sum = tmp_sum + dataC[i];
|
||||
sumABC += tmp_sum;
|
||||
EXPECT_EQ(tmp_sum, resE[i]);
|
||||
}
|
||||
|
||||
w_type resF = v_reduce_sum(v_dotprod_fast(a, b)),
|
||||
resG = v_reduce_sum(v_dotprod_fast(a, b, c));
|
||||
EXPECT_EQ(sumAB, resF);
|
||||
EXPECT_EQ(sumABC, resG);
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_dotprod_expand()
|
||||
{
|
||||
typedef typename V_RegTraits<R>::q_reg Rx4;
|
||||
typedef typename Rx4::lane_type l4_type;
|
||||
|
||||
Data<R> dataA, dataB;
|
||||
dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
|
||||
dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<Rx4> dataC;
|
||||
Rx4 c = dataC;
|
||||
|
||||
Data<Rx4> resD = v_dotprod_expand(a, b),
|
||||
resE = v_dotprod_expand(a, b, c);
|
||||
|
||||
l4_type sumAB = 0, sumABC = 0, tmp_sum;
|
||||
for (int i = 0; i < Rx4::nlanes; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
tmp_sum = (l4_type)dataA[i*4] * (l4_type)dataB[i*4] +
|
||||
(l4_type)dataA[i*4 + 1] * (l4_type)dataB[i*4 + 1] +
|
||||
(l4_type)dataA[i*4 + 2] * (l4_type)dataB[i*4 + 2] +
|
||||
(l4_type)dataA[i*4 + 3] * (l4_type)dataB[i*4 + 3];
|
||||
sumAB += tmp_sum;
|
||||
EXPECT_EQ(tmp_sum, resD[i]);
|
||||
|
||||
tmp_sum = tmp_sum + dataC[i];
|
||||
sumABC += tmp_sum;
|
||||
EXPECT_EQ(tmp_sum, resE[i]);
|
||||
}
|
||||
|
||||
l4_type resF = v_reduce_sum(v_dotprod_expand_fast(a, b)),
|
||||
resG = v_reduce_sum(v_dotprod_expand_fast(a, b, c));
|
||||
EXPECT_EQ(sumAB, resF);
|
||||
EXPECT_EQ(sumABC, resG);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_dotprod_expand_f64()
|
||||
{
|
||||
#if CV_SIMD_64F
|
||||
Data<R> dataA, dataB;
|
||||
dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
|
||||
dataB += std::numeric_limits<LaneType>::min();
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<v_float64> dataC;
|
||||
v_float64 c = dataC;
|
||||
|
||||
Data<v_float64> resA = v_dotprod_expand(a, a),
|
||||
resB = v_dotprod_expand(b, b),
|
||||
resC = v_dotprod_expand(a, b, c);
|
||||
|
||||
const int n = R::nlanes / 2;
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
EXPECT_EQ((double)dataA[i*2] * (double)dataA[i*2] +
|
||||
(double)dataA[i*2 + 1] * (double)dataA[i*2 + 1], resA[i]);
|
||||
EXPECT_EQ((double)dataB[i*2] * (double)dataB[i*2] +
|
||||
(double)dataB[i*2 + 1] * (double)dataB[i*2 + 1], resB[i]);
|
||||
EXPECT_EQ((double)dataA[i*2] * (double)dataB[i*2] +
|
||||
(double)dataA[i*2 + 1] * (double)dataB[i*2 + 1] + dataC[i], resC[i]);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -1165,6 +1250,29 @@ template<typename R> struct TheTest
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_cvt64_double()
|
||||
{
|
||||
#if CV_SIMD_64F
|
||||
Data<R> dataA(std::numeric_limits<LaneType>::max()),
|
||||
dataB(std::numeric_limits<LaneType>::min());
|
||||
dataB += R::nlanes;
|
||||
|
||||
R a = dataA, b = dataB;
|
||||
v_float64 c = v_cvt_f64(a), d = v_cvt_f64(b);
|
||||
|
||||
Data<v_float64> resC = c;
|
||||
Data<v_float64> resD = d;
|
||||
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
SCOPED_TRACE(cv::format("i=%d", i));
|
||||
EXPECT_EQ((double)dataA[i], resC[i]);
|
||||
EXPECT_EQ((double)dataB[i], resD[i]);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_matmul()
|
||||
{
|
||||
Data<R> dataV, dataA, dataB, dataC, dataD;
|
||||
@ -1341,6 +1449,7 @@ void test_hal_intrin_uint8()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
.test_logic()
|
||||
.test_dotprod_expand()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_reduce_sad()
|
||||
@ -1378,6 +1487,7 @@ void test_hal_intrin_int8()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
.test_logic()
|
||||
.test_dotprod_expand()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_absdiffs()
|
||||
@ -1408,6 +1518,7 @@ void test_hal_intrin_uint16()
|
||||
.test_cmp()
|
||||
.test_shift<1>()
|
||||
.test_shift<8>()
|
||||
.test_dotprod_expand()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
@ -1437,7 +1548,8 @@ void test_hal_intrin_int16()
|
||||
.test_cmp()
|
||||
.test_shift<1>()
|
||||
.test_shift<8>()
|
||||
.test_dot_prod()
|
||||
.test_dotprod()
|
||||
.test_dotprod_expand()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
@ -1497,6 +1609,8 @@ void test_hal_intrin_int32()
|
||||
.test_cmp()
|
||||
.test_popcount()
|
||||
.test_shift<1>().test_shift<8>()
|
||||
.test_dotprod()
|
||||
.test_dotprod_expand_f64()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
@ -1538,6 +1652,7 @@ void test_hal_intrin_int64()
|
||||
.test_logic()
|
||||
.test_extract<0>().test_extract<1>()
|
||||
.test_rotate<0>().test_rotate<1>()
|
||||
.test_cvt64_double()
|
||||
;
|
||||
}
|
||||
|
||||
|
@ -50,6 +50,8 @@ namespace cv
|
||||
|
||||
////////////////// Helper functions //////////////////////
|
||||
|
||||
#define CV_CLAMP_INT(v, vmin, vmax) (v < vmin ? vmin : (vmax < v ? vmax : v))
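// e.g. CV_CLAMP_INT(-3, 0, 255) == 0, CV_CLAMP_INT(300, 0, 255) == 255,
// CV_CLAMP_INT(42, 0, 255) == 42 (illustrative values)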
|
||||
|
||||
static const size_t OUT_OF_RANGE = (size_t)1 << (sizeof(size_t)*8 - 2);
|
||||
|
||||
static void
|
||||
@ -71,15 +73,18 @@ calcHistLookupTables_8u( const Mat& hist, const SparseMat& shist,
|
||||
int sz = !issparse ? hist.size[i] : shist.size(i);
|
||||
size_t step = !issparse ? hist.step[i] : 1;
|
||||
|
||||
double v_lo = ranges[i][0];
|
||||
double v_hi = ranges[i][1];
|
||||
|
||||
for( j = low; j < high; j++ )
|
||||
{
|
||||
int idx = cvFloor(j*a + b);
|
||||
size_t written_idx;
|
||||
if( (unsigned)idx < (unsigned)sz )
|
||||
size_t written_idx = OUT_OF_RANGE;
|
||||
if (j >= v_lo && j < v_hi)
|
||||
{
|
||||
idx = CV_CLAMP_INT(idx, 0, sz - 1);
|
||||
written_idx = idx*step;
|
||||
else
|
||||
written_idx = OUT_OF_RANGE;
|
||||
|
||||
}
|
||||
tab[i*(high - low) + j - low] = written_idx;
|
||||
}
|
||||
}
|
||||
@ -197,6 +202,10 @@ static void histPrepareImages( const Mat* images, int nimages, const int* channe
|
||||
double t = histSize[i]/(high - low);
|
||||
uniranges[i*2] = t;
|
||||
uniranges[i*2+1] = -t*low;
|
||||
#if 0 // This should be true by math, but it is not accurate numerically
|
||||
CV_Assert(cvFloor(low * uniranges[i*2] + uniranges[i*2+1]) == 0);
|
||||
CV_Assert((high * uniranges[i*2] + uniranges[i*2+1]) < histSize[i]);
|
||||
#endif
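// Worked example (illustrative numbers): histSize = 32 over the range [0, 256)
// gives t = 0.125 and offset 0, so a value of 255 falls into bin
// cvFloor(255*0.125) = 31 (the last bin), while values at or above 256 are
// rejected later by the explicit v >= v_hi range checks.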
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -243,22 +252,33 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
int sz = size[0], d0 = deltas[0], step0 = deltas[1];
|
||||
const T* p0 = (const T*)ptrs[0];
|
||||
|
||||
double v0_lo = _ranges[0][0];
|
||||
double v0_hi = _ranges[0][1];
|
||||
|
||||
for( ; imsize.height--; p0 += step0, mask += mstep )
|
||||
{
|
||||
if( !mask )
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0 )
|
||||
{
|
||||
int idx = cvFloor(*p0*a + b);
|
||||
if( (unsigned)idx < (unsigned)sz )
|
||||
((int*)H)[idx]++;
|
||||
double v0 = (double)*p0;
|
||||
int idx = cvFloor(v0*a + b);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
idx = CV_CLAMP_INT(idx, 0, sz - 1);
|
||||
CV_DbgAssert((unsigned)idx < (unsigned)sz);
|
||||
((int*)H)[idx]++;
|
||||
}
|
||||
else
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0 )
|
||||
if( mask[x] )
|
||||
{
|
||||
int idx = cvFloor(*p0*a + b);
|
||||
if( (unsigned)idx < (unsigned)sz )
|
||||
((int*)H)[idx]++;
|
||||
double v0 = (double)*p0;
|
||||
int idx = cvFloor(v0*a + b);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
idx = CV_CLAMP_INT(idx, 0, sz - 1);
|
||||
CV_DbgAssert((unsigned)idx < (unsigned)sz);
|
||||
((int*)H)[idx]++;
|
||||
}
|
||||
}
|
||||
return;
|
||||
@ -273,24 +293,45 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
const T* p0 = (const T*)ptrs[0];
|
||||
const T* p1 = (const T*)ptrs[1];
|
||||
|
||||
double v0_lo = _ranges[0][0];
|
||||
double v0_hi = _ranges[0][1];
|
||||
double v1_lo = _ranges[1][0];
|
||||
double v1_hi = _ranges[1][1];
|
||||
|
||||
for( ; imsize.height--; p0 += step0, p1 += step1, mask += mstep )
|
||||
{
|
||||
if( !mask )
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0, p1 += d1 )
|
||||
{
|
||||
int idx0 = cvFloor(*p0*a0 + b0);
|
||||
int idx1 = cvFloor(*p1*a1 + b1);
|
||||
if( (unsigned)idx0 < (unsigned)sz0 && (unsigned)idx1 < (unsigned)sz1 )
|
||||
((int*)(H + hstep0*idx0))[idx1]++;
|
||||
double v0 = (double)*p0;
|
||||
double v1 = (double)*p1;
|
||||
int idx0 = cvFloor(v0*a0 + b0);
|
||||
int idx1 = cvFloor(v1*a1 + b1);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
if (v1 < v1_lo || v1 >= v1_hi)
|
||||
continue;
|
||||
idx0 = CV_CLAMP_INT(idx0, 0, sz0 - 1);
|
||||
idx1 = CV_CLAMP_INT(idx1, 0, sz1 - 1);
|
||||
CV_DbgAssert((unsigned)idx0 < (unsigned)sz0 && (unsigned)idx1 < (unsigned)sz1);
|
||||
((int*)(H + hstep0*idx0))[idx1]++;
|
||||
}
|
||||
else
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0, p1 += d1 )
|
||||
if( mask[x] )
|
||||
{
|
||||
int idx0 = cvFloor(*p0*a0 + b0);
|
||||
int idx1 = cvFloor(*p1*a1 + b1);
|
||||
if( (unsigned)idx0 < (unsigned)sz0 && (unsigned)idx1 < (unsigned)sz1 )
|
||||
((int*)(H + hstep0*idx0))[idx1]++;
|
||||
double v0 = (double)*p0;
|
||||
double v1 = (double)*p1;
|
||||
int idx0 = cvFloor(v0*a0 + b0);
|
||||
int idx1 = cvFloor(v1*a1 + b1);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
if (v1 < v1_lo || v1 >= v1_hi)
|
||||
continue;
|
||||
idx0 = CV_CLAMP_INT(idx0, 0, sz0 - 1);
|
||||
idx1 = CV_CLAMP_INT(idx1, 0, sz1 - 1);
|
||||
CV_DbgAssert((unsigned)idx0 < (unsigned)sz0 && (unsigned)idx1 < (unsigned)sz1);
|
||||
((int*)(H + hstep0*idx0))[idx1]++;
|
||||
}
|
||||
}
|
||||
return;
|
||||
@ -309,30 +350,63 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
const T* p1 = (const T*)ptrs[1];
|
||||
const T* p2 = (const T*)ptrs[2];
|
||||
|
||||
double v0_lo = _ranges[0][0];
|
||||
double v0_hi = _ranges[0][1];
|
||||
double v1_lo = _ranges[1][0];
|
||||
double v1_hi = _ranges[1][1];
|
||||
double v2_lo = _ranges[2][0];
|
||||
double v2_hi = _ranges[2][1];
|
||||
|
||||
for( ; imsize.height--; p0 += step0, p1 += step1, p2 += step2, mask += mstep )
|
||||
{
|
||||
if( !mask )
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0, p1 += d1, p2 += d2 )
|
||||
{
|
||||
int idx0 = cvFloor(*p0*a0 + b0);
|
||||
int idx1 = cvFloor(*p1*a1 + b1);
|
||||
int idx2 = cvFloor(*p2*a2 + b2);
|
||||
if( (unsigned)idx0 < (unsigned)sz0 &&
|
||||
double v0 = (double)*p0;
|
||||
double v1 = (double)*p1;
|
||||
double v2 = (double)*p2;
|
||||
int idx0 = cvFloor(v0*a0 + b0);
|
||||
int idx1 = cvFloor(v1*a1 + b1);
|
||||
int idx2 = cvFloor(v2*a2 + b2);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
if (v1 < v1_lo || v1 >= v1_hi)
|
||||
continue;
|
||||
if (v2 < v2_lo || v2 >= v2_hi)
|
||||
continue;
|
||||
idx0 = CV_CLAMP_INT(idx0, 0, sz0 - 1);
|
||||
idx1 = CV_CLAMP_INT(idx1, 0, sz1 - 1);
|
||||
idx2 = CV_CLAMP_INT(idx2, 0, sz2 - 1);
|
||||
CV_DbgAssert(
|
||||
(unsigned)idx0 < (unsigned)sz0 &&
|
||||
(unsigned)idx1 < (unsigned)sz1 &&
|
||||
(unsigned)idx2 < (unsigned)sz2 )
|
||||
((int*)(H + hstep0*idx0 + hstep1*idx1))[idx2]++;
|
||||
(unsigned)idx2 < (unsigned)sz2);
|
||||
((int*)(H + hstep0*idx0 + hstep1*idx1))[idx2]++;
|
||||
}
|
||||
else
|
||||
for( x = 0; x < imsize.width; x++, p0 += d0, p1 += d1, p2 += d2 )
|
||||
if( mask[x] )
|
||||
{
|
||||
int idx0 = cvFloor(*p0*a0 + b0);
|
||||
int idx1 = cvFloor(*p1*a1 + b1);
|
||||
int idx2 = cvFloor(*p2*a2 + b2);
|
||||
if( (unsigned)idx0 < (unsigned)sz0 &&
|
||||
(unsigned)idx1 < (unsigned)sz1 &&
|
||||
(unsigned)idx2 < (unsigned)sz2 )
|
||||
((int*)(H + hstep0*idx0 + hstep1*idx1))[idx2]++;
|
||||
double v0 = (double)*p0;
|
||||
double v1 = (double)*p1;
|
||||
double v2 = (double)*p2;
|
||||
int idx0 = cvFloor(v0*a0 + b0);
|
||||
int idx1 = cvFloor(v1*a1 + b1);
|
||||
int idx2 = cvFloor(v2*a2 + b2);
|
||||
if (v0 < v0_lo || v0 >= v0_hi)
|
||||
continue;
|
||||
if (v1 < v1_lo || v1 >= v1_hi)
|
||||
continue;
|
||||
if (v2 < v2_lo || v2 >= v2_hi)
|
||||
continue;
|
||||
idx0 = CV_CLAMP_INT(idx0, 0, sz0 - 1);
|
||||
idx1 = CV_CLAMP_INT(idx1, 0, sz1 - 1);
|
||||
idx2 = CV_CLAMP_INT(idx2, 0, sz2 - 1);
|
||||
CV_DbgAssert(
|
||||
(unsigned)idx0 < (unsigned)sz0 &&
|
||||
(unsigned)idx1 < (unsigned)sz1 &&
|
||||
(unsigned)idx2 < (unsigned)sz2);
|
||||
((int*)(H + hstep0*idx0 + hstep1*idx1))[idx2]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -346,9 +420,14 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
uchar* Hptr = H;
|
||||
for( i = 0; i < dims; i++ )
|
||||
{
|
||||
int idx = cvFloor(*ptrs[i]*uniranges[i*2] + uniranges[i*2+1]);
|
||||
if( (unsigned)idx >= (unsigned)size[i] )
|
||||
double v_lo = _ranges[i][0];
|
||||
double v_hi = _ranges[i][1];
|
||||
double v = *ptrs[i];
|
||||
if (v < v_lo || v >= v_hi)
|
||||
break;
|
||||
int idx = cvFloor(v*uniranges[i*2] + uniranges[i*2+1]);
|
||||
idx = CV_CLAMP_INT(idx, 0, size[i] - 1);
|
||||
CV_DbgAssert((unsigned)idx < (unsigned)size[i]);
|
||||
ptrs[i] += deltas[i*2];
|
||||
Hptr += idx*hstep[i];
|
||||
}
|
||||
@ -367,9 +446,14 @@ calcHist_( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
|
||||
if( mask[x] )
|
||||
for( ; i < dims; i++ )
|
||||
{
|
||||
int idx = cvFloor(*ptrs[i]*uniranges[i*2] + uniranges[i*2+1]);
|
||||
if( (unsigned)idx >= (unsigned)size[i] )
|
||||
double v_lo = _ranges[i][0];
|
||||
double v_hi = _ranges[i][1];
|
||||
double v = *ptrs[i];
|
||||
if (v < v_lo || v >= v_hi)
|
||||
break;
|
||||
int idx = cvFloor(v*uniranges[i*2] + uniranges[i*2+1]);
|
||||
idx = CV_CLAMP_INT(idx, 0, size[i] - 1);
|
||||
CV_DbgAssert((unsigned)idx < (unsigned)size[i]);
|
||||
ptrs[i] += deltas[i*2];
|
||||
Hptr += idx*hstep[i];
|
||||
}
|
||||
|
@ -1307,9 +1307,18 @@ cvTsCalcHist( const vector<Mat>& images, CvHistogram* hist, Mat mask, const vect
|
||||
for( k = 0; k < cdims; k++ )
|
||||
{
|
||||
double v = val[k], lo = hist->thresh[k][0], hi = hist->thresh[k][1];
|
||||
idx[k] = cvFloor((v - lo)*dims[k]/(hi - lo));
|
||||
if( idx[k] < 0 || idx[k] >= dims[k] )
|
||||
if (v < lo || v >= hi)
|
||||
break;
|
||||
double idx_ = (v - lo)*dims[k]/(hi - lo);
|
||||
idx[k] = cvFloor(idx_);
|
||||
if (idx[k] < 0)
|
||||
{
|
||||
idx[k] = 0;
|
||||
}
|
||||
if (idx[k] >= dims[k])
|
||||
{
|
||||
idx[k] = dims[k] - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else