mirror of https://github.com/opencv/opencv.git, synced 2025-06-11 11:45:30 +08:00

Merge remote-tracking branch 'upstream/3.4' into merge-3.4

commit 2e0150e601
@@ -1133,6 +1133,41 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
     return v_float32x8(_mm256_hadd_ps(ab, cd));
 }

+inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
+{
+    return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(a.val, b.val));
+}
+inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
+{
+    __m256i half = _mm256_set1_epi8(0x7f);
+    return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half)));
+}
+inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
+{
+    v_uint32x8 l, h;
+    v_expand(v_add_wrap(a - b, b - a), l, h);
+    return v_reduce_sum(l + h);
+}
+inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
+{
+    v_uint32x8 l, h;
+    v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
+    return v_reduce_sum(l + h);
+}
+inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
+{
+    return v_reduce_sum(v_max(a, b) - v_min(a, b));
+}
+inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
+{
+    v_int32x8 m = a < b;
+    return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
+}
+inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
+{
+    return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+}
+
 /** Popcount **/
 #define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
 inline v_uint32x8 v_popcount(const _Tpvec& a) \
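The v_int32x8 overload above gets |a - b| without a dedicated absolute-difference instruction: the comparison mask m is all-ones where a < b, and ((a - b) ^ m) - m conditionally negates the difference in two's complement. A minimal scalar model of that trick (not part of this commit; names are illustrative):

#include <cassert>
#include <cstdint>

// m is 0 or -1 (all-ones); (d ^ m) - m negates d exactly when m is all-ones.
static uint32_t absdiff_branchless(int32_t x, int32_t y)
{
    int32_t d = (int32_t)((uint32_t)x - (uint32_t)y); // wrapping subtract, like the vector op
    int32_t m = -(int32_t)(x < y);                    // scalar stand-in for the lane-wise compare mask
    return (uint32_t)((d ^ m) - m);
}

int main()
{
    assert(absdiff_branchless(3, 10) == 7);
    assert(absdiff_branchless(-5, 5) == 10);
    return 0;
}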
@@ -686,10 +686,10 @@ OPENCV_HAL_IMPL_CMP_OP(!=)
 template<int n>
 inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
 {
     typedef typename V_TypeTraits<float>::int_type itype;
     v_reg<float, n> c;
     for (int i = 0; i < n; i++)
         c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
     return c;
 }
 template<int n>
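v_not_nan relies on the IEEE-754 rule that NaN is the only value that compares unequal to itself, so -(int)(x == x) yields an all-ones lane mask for ordinary numbers and zero for NaN. A scalar model of one lane (not commit code):

#include <cstdint>

static uint32_t not_nan_mask(float x)
{
    return (uint32_t)-(int)(x == x); // 0xFFFFFFFF if x is not NaN, 0 otherwise
}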
@@ -1063,6 +1063,21 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     return r;
 }

+/** @brief Sum absolute differences of values
+
+Scheme:
+@code
+{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{abs(A1-B1),abs(A2-B2),abs(A3-B3),...}
+@endcode
+For all types except 64-bit types. */
+template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
+    for (int i = 1; i < n; i++)
+        c += _absdiff(a.s[i], b.s[i]);
+    return c;
+}
+
 /** @brief Get negative values mask

 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
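A hypothetical usage sketch of the operation documented above (assumes the v_reduce_sad overloads added by this merge are available through the universal-intrinsics header):

#include <opencv2/core/hal/intrin.hpp>

unsigned sad16(const unsigned char* a, const unsigned char* b)
{
    // One 16-lane SAD: sum of |a[i] - b[i]| over i = 0..15, widened to unsigned.
    cv::v_uint8x16 va = cv::v_load(a);
    cv::v_uint8x16 vb = cv::v_load(b);
    return cv::v_reduce_sad(va, vb);
}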
@@ -999,6 +999,49 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     return v_float32x4(vaddq_f32(v0, v1));
 }

+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    uint32x4_t t0 = vabdq_u32(a.val, b.val);
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    float32x4_t t0 = vabdq_f32(a.val, b.val);
+    float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
+    return vget_lane_f32(vpadd_f32(t1, t1), 0);
+}
+
 #define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
 inline v_uint32x4 v_popcount(const _Tpvec& a) \
 { \
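A scalar model of the NEON reduction chain used above (an assumption, not commit code): vabdq_u8 takes |a - b| per byte, vpaddlq_u8/vpaddlq_u16 pairwise-add while widening (u8 -> u16 -> u32), and the final vpadd_u32 steps fold four u32 lanes into one.

#include <cstdint>
#include <cstdlib>

static uint32_t reduce_sad_u8x16_model(const uint8_t a[16], const uint8_t b[16])
{
    uint16_t w16[8];
    for (int i = 0; i < 8; i++)       // vpaddlq_u8: widen and pairwise add
        w16[i] = (uint16_t)(std::abs(a[2*i] - b[2*i]) + std::abs(a[2*i+1] - b[2*i+1]));
    uint32_t w32[4];
    for (int i = 0; i < 4; i++)       // vpaddlq_u16
        w32[i] = (uint32_t)w16[2*i] + w16[2*i+1];
    return (w32[0] + w32[1]) + (w32[2] + w32[3]); // the two vpadd_u32 folds
}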
@@ -1477,6 +1477,41 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)

+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(a.val, b.val));
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    __m128i half = _mm_set1_epi8(0x7f);
+    return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(_mm_add_epi8(a.val, half),
+                                                    _mm_add_epi8(b.val, half)));
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v_uint32x4 l, h;
+    v_expand(v_absdiff(a, b), l, h);
+    return v_reduce_sum(l + h);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    v_uint32x4 l, h;
+    v_expand(v_absdiff(a, b), l, h);
+    return v_reduce_sum(l + h);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    return v_reduce_sum(v_absdiff(a, b));
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    return v_reduce_sum(v_absdiff(a, b));
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    return v_reduce_sum(v_absdiff(a, b));
+}
+
 #define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
 inline v_uint32x4 v_popcount(const _Tpvec& a) \
 { \
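Why the u16 overloads expand to u32 before summing (a sketch, not commit code): eight u16 absolute differences can sum to 8 * 65535 = 524280, which overflows 16-bit lanes, so v_expand splits them into two u32 halves first.

#include <cstdint>

static uint32_t reduce_sad_u16x8_model(const uint16_t a[8], const uint16_t b[8])
{
    uint32_t lo = 0, hi = 0;
    for (int i = 0; i < 4; i++)  // v_expand: low half widened to u32
        lo += (uint32_t)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
    for (int i = 4; i < 8; i++)  // high half
        hi += (uint32_t)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
    return lo + hi;              // v_reduce_sum(l + h)
}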
@@ -1930,13 +1965,11 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&

 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
 {
-    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
-
     __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
     __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3

-    a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
-    b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
+    a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
+    b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 b2 b3
 }

 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
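How the inlined shuffle masks deinterleave (a sketch under the same lane order): _MM_SHUFFLE(2,0,2,0) = 0x88 picks the even lanes and _MM_SHUFFLE(3,1,3,1) = 0xDD the odd lanes, taking the low pair from the first operand and the high pair from the second.

static void deinterleave2_model(const float u0[4], const float u1[4], float a[4], float b[4])
{
    a[0] = u0[0]; a[1] = u0[2]; a[2] = u1[0]; a[3] = u1[2]; // _MM_SHUFFLE(2, 0, 2, 0)
    b[0] = u0[1]; b[1] = u0[3]; b[2] = u1[1]; b[3] = u1[3]; // _MM_SHUFFLE(3, 1, 3, 1)
}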
@@ -739,6 +739,50 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     return v_float32x4(vec_mergeh(ac, bd));
 }

+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const vec_uint4 zero4 = vec_uint4_z;
+    vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
+    return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    const vec_int4 zero4 = vec_int4_z;
+    vec_char16 ad = vec_abss(vec_subs(a.val, b.val));
+    vec_int4 sum4 = vec_sum4s(ad, zero4);
+    return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vec_ushort8 ad = vec_absd(a.val, b.val);
+    VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)), vec_int4_c(vec_unpacklu(ad)));
+    return (unsigned)vec_extract(sum, 3);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    const vec_int4 zero4 = vec_int4_z;
+    vec_short8 ad = vec_abss(vec_subs(a.val, b.val));
+    vec_int4 sum4 = vec_sum4s(ad, zero4);
+    return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const vec_uint4 ad = vec_absd(a.val, b.val);
+    const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
+    return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    vec_int4 ad = vec_abss(vec_sub(a.val, b.val));
+    return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3);
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
+    const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
+    return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
+}
+
 /** Popcount **/
 template<typename _Tpvec>
 inline v_uint32x4 v_popcount(const _Tpvec& a)
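The u32 and f32 VSX reductions above use a log2-step rotate-and-add: vec_sld(v, v, 8) rotates the vector by 8 bytes (two lanes), so one add folds lanes {0,2} and {1,3}, and a 4-byte rotate plus add finishes the horizontal sum. A scalar model (assumption, not commit code):

#include <cstdint>

static uint32_t hsum4_model(const uint32_t v[4])
{
    uint32_t r[4], s[4];
    for (int i = 0; i < 4; i++) r[i] = v[i] + v[(i + 2) & 3]; // add vec_sld(v, v, 8)
    for (int i = 0; i < 4; i++) s[i] = r[i] + r[(i + 1) & 3]; // add vec_sld(r, r, 4)
    return s[0];                                              // vec_extract(..., 0)
}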
@@ -567,7 +567,7 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m

 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
 {
-    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };

     __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
     __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
@@ -588,7 +588,7 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12
 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                               __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
 {
-    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };

     __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
     __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
@@ -615,7 +615,7 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                               __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
 {
-    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };

     __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
     __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
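The enum replaces a const int presumably so that mask_lo and mask_hi are integral constant expressions of a kind every supported compiler will fold into the immediate operand _mm_shuffle_ps requires; the generated shuffles themselves are unchanged.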
@@ -253,4 +253,53 @@ PERF_TEST_P( Size_MatType, normalize_minmax, TYPICAL_MATS )
     SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
 }

+typedef TestBaseWithParam< int > test_len;
+PERF_TEST_P(test_len, hal_normL1_u8,
+            testing::Values(300000, 2000000)
+           )
+{
+    int len = GetParam();
+
+    Mat src1(1, len, CV_8UC1);
+    Mat src2(1, len, CV_8UC1);
+
+    declare.in(src1, src2, WARMUP_RNG);
+    double n;
+    TEST_CYCLE() n = hal::normL1_(src1.ptr<uchar>(0), src2.ptr<uchar>(0), len);
+    CV_UNUSED(n);
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(test_len, hal_normL1_f32,
+            testing::Values(300000, 2000000)
+           )
+{
+    int len = GetParam();
+
+    Mat src1(1, len, CV_32FC1);
+    Mat src2(1, len, CV_32FC1);
+
+    declare.in(src1, src2, WARMUP_RNG);
+    double n;
+    TEST_CYCLE() n = hal::normL1_(src1.ptr<float>(0), src2.ptr<float>(0), len);
+    CV_UNUSED(n);
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(test_len, hal_normL2Sqr,
+            testing::Values(300000, 2000000)
+           )
+{
+    int len = GetParam();
+
+    Mat src1(1, len, CV_32FC1);
+    Mat src2(1, len, CV_32FC1);
+
+    declare.in(src1, src2, WARMUP_RNG);
+    double n;
+    TEST_CYCLE() n = hal::normL2Sqr_(src1.ptr<float>(0), src2.ptr<float>(0), len);
+    CV_UNUSED(n);
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
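Once built, these benchmarks can be run selectively through the Google-test filter the perf binaries expose, e.g. opencv_perf_core --gtest_filter=*hal_normL1* (binary name assumed from the module layout).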
@@ -98,43 +98,15 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 float normL2Sqr_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_AVX2
-    float CV_DECL_ALIGNED(32) buf[8];
-    __m256 d0 = _mm256_setzero_ps();
-
-    for( ; j <= n - 8; j += 8 )
+#if CV_SIMD
+    v_float32 v_d = vx_setzero_f32();
+    for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
     {
-        __m256 t0 = _mm256_sub_ps(_mm256_loadu_ps(a + j), _mm256_loadu_ps(b + j));
-#if CV_FMA3
-        d0 = _mm256_fmadd_ps(t0, t0, d0);
-#else
-        d0 = _mm256_add_ps(d0, _mm256_mul_ps(t0, t0));
+        v_float32 t = vx_load(a + j) - vx_load(b + j);
+        v_d = v_muladd(t, t, v_d);
+    }
+    d = v_reduce_sum(v_d);
 #endif
-    }
-    _mm256_store_ps(buf, d0);
-    d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
-#elif CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
-    }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
-
     for( ; j < n; j++ )
     {
         float t = a[j] - b[j];
@@ -147,38 +119,12 @@ float normL2Sqr_(const float* a, const float* b, int n)
 float normL1_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-    __m128 absmask = _mm_load_ps((const float*)absbuf);
-
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
-        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
-    }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#elif CV_NEON
-    float32x4_t v_sum = vdupq_n_f32(0.0f);
-    for ( ; j <= n - 4; j += 4)
-        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
-
-    float CV_DECL_ALIGNED(16) buf[4];
-    vst1q_f32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+#if CV_SIMD
+    v_float32 v_d = vx_setzero_f32();
+    for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
+        v_d += v_absdiff(vx_load(a + j), vx_load(b + j));
+    d = v_reduce_sum(v_d);
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
-
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -187,46 +133,10 @@ float normL1_(const float* a, const float* b, int n)
 int normL1_(const uchar* a, const uchar* b, int n)
 {
     int j = 0, d = 0;
-#if CV_SSE
-    __m128i d0 = _mm_setzero_si128();
-
-    for( ; j <= n - 16; j += 16 )
-    {
-        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
-        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
-
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-
-    for( ; j <= n - 4; j += 4 )
-    {
-        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
-        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
-
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
-#elif CV_NEON
-    uint32x4_t v_sum = vdupq_n_u32(0.0f);
-    for ( ; j <= n - 16; j += 16)
-    {
-        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
-        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
-    }
-
-    uint CV_DECL_ALIGNED(16) buf[4];
-    vst1q_u32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+#if CV_SIMD
+    for (; j <= n - v_uint8::nlanes; j += v_uint8::nlanes)
+        d += v_reduce_sad(vx_load(a + j), vx_load(b + j));
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
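The pattern these rewrites follow, in isolation (a self-contained sketch assuming the CV_SIMD universal intrinsics from opencv2/core/hal/intrin.hpp; the function name is illustrative): process v_float32::nlanes elements per iteration with one width-agnostic code path, then finish the remainder in a scalar tail.

#include <cmath>
#include <opencv2/core/hal/intrin.hpp>

float sum_abs_diff(const float* a, const float* b, int n)
{
    int j = 0; float d = 0.f;
#if CV_SIMD
    cv::v_float32 v_d = cv::vx_setzero_f32();
    for (; j <= n - cv::v_float32::nlanes; j += cv::v_float32::nlanes)
        v_d += cv::v_absdiff(cv::vx_load(a + j), cv::vx_load(b + j));
    d = cv::v_reduce_sum(v_d);
#endif
    for (; j < n; j++)   // scalar tail for the last n % nlanes elements
        d += std::abs(a[j] - b[j]);
    return d;
}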
@@ -563,6 +563,7 @@ cv::String getCacheDirectory(const char* sub_directory_name, const char* configu
 cv::String canonical(const cv::String& /*path*/) { NOT_IMPLEMENTED }
 bool exists(const cv::String& /*path*/) { NOT_IMPLEMENTED }
 void remove_all(const cv::String& /*path*/) { NOT_IMPLEMENTED }
+cv::String getcwd() { NOT_IMPLEMENTED }
 bool createDirectory(const cv::String& /*path*/) { NOT_IMPLEMENTED }
 bool createDirectories(const cv::String& /*path*/) { NOT_IMPLEMENTED }
 cv::String getCacheDirectory(const char* /*sub_directory_name*/, const char* /*configuration_name = NULL*/) { NOT_IMPLEMENTED }
@@ -340,8 +340,8 @@ static void copy_convert_yv12_to_bgr(const VAImage& image, const unsigned char*
         1.5959997177f
     };

-    CV_CheckEQ(image.format.fourcc, VA_FOURCC_YV12, "Unexpected image format");
-    CV_CheckEQ(image.num_planes, 3, "");
+    CV_CheckEQ((size_t)image.format.fourcc, (size_t)VA_FOURCC_YV12, "Unexpected image format");
+    CV_CheckEQ((size_t)image.num_planes, (size_t)3, "");

     const size_t srcOffsetY = image.offsets[0];
     const size_t srcOffsetV = image.offsets[1];
@@ -417,8 +417,8 @@ static void copy_convert_bgr_to_yv12(const VAImage& image, const Mat& bgr, unsig
         -0.2909994125f, 0.438999176f, -0.3679990768f, -0.0709991455f
     };

-    CV_CheckEQ(image.format.fourcc, VA_FOURCC_YV12, "Unexpected image format");
-    CV_CheckEQ(image.num_planes, 3, "");
+    CV_CheckEQ((size_t)image.format.fourcc, (size_t)VA_FOURCC_YV12, "Unexpected image format");
+    CV_CheckEQ((size_t)image.num_planes, (size_t)3, "");

     const size_t dstOffsetY = image.offsets[0];
     const size_t dstOffsetV = image.offsets[1];
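Why the casts: CV_CheckEQ compares both operands as one common type, and image.format.fourcc (an unsigned 32-bit field) does not match the signed integer constants it is checked against. A minimal model of the failure mode (the hypothetical check_eq stands in for the real macro):

#include <cstddef>
#include <cstdint>

template <typename T> bool check_eq(T a, T b) { return a == b; }

bool demo(uint32_t fourcc)
{
    // check_eq(fourcc, 3);                     // would not compile: T deduced as both uint32_t and int
    return check_eq((size_t)fourcc, (size_t)3); // casting both sides picks one type, as in the diff
}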
@@ -3,7 +3,7 @@
 // of this distribution and at http://opencv.org/license.html.
 #include "perf_precomp.hpp"

-namespace opencv_test {
+namespace opencv_test { namespace {

 CV_ENUM(RetrMode, RETR_EXTERNAL, RETR_LIST, RETR_CCOMP, RETR_TREE)
 CV_ENUM(ApproxMode, CHAIN_APPROX_NONE, CHAIN_APPROX_SIMPLE, CHAIN_APPROX_TC89_L1, CHAIN_APPROX_TC89_KCOS)
@@ -84,4 +84,4 @@ PERF_TEST_P(TestFindContoursFF, findContours,
     SANITY_CHECK_NOTHING();
 }

-} // namespace
+} } // namespace
@@ -175,7 +175,6 @@ void GMM::addSample( int ci, const Vec3d color )

 void GMM::endLearning()
 {
-    CV_Assert(totalSampleCount > 0);
     for( int ci = 0; ci < componentsCount; ci++ )
     {
         int n = sampleCounts[ci];
@@ -183,6 +182,7 @@ void GMM::endLearning()
             coefs[ci] = 0;
         else
         {
+            CV_Assert(totalSampleCount > 0);
             double inv_n = 1.0 / n;
             coefs[ci] = (double)n/totalSampleCount;

@@ -74,30 +74,27 @@
 namespace cv
 {

-namespace
-{
-
-typedef ushort HT;
-
-/**
- * This structure represents a two-tier histogram. The first tier (known as the
- * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level)
- * is 8 bit wide. Pixels inserted in the fine level also get inserted into the
- * coarse bucket designated by the 4 MSBs of the fine bucket value.
- *
- * The structure is aligned on 16 bits, which is a prerequisite for SIMD
- * instructions. Each bucket is 16 bit wide, which means that extra care must be
- * taken to prevent overflow.
- */
-typedef struct
-{
-    HT coarse[16];
-    HT fine[16][16];
-} Histogram;
-
 static void
 medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
 {
+    typedef ushort HT;
+
+    /**
+     * This structure represents a two-tier histogram. The first tier (known as the
+     * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level)
+     * is 8 bit wide. Pixels inserted in the fine level also get inserted into the
+     * coarse bucket designated by the 4 MSBs of the fine bucket value.
+     *
+     * The structure is aligned on 16 bits, which is a prerequisite for SIMD
+     * instructions. Each bucket is 16 bit wide, which means that extra care must be
+     * taken to prevent overflow.
+     */
+    typedef struct
+    {
+        HT coarse[16];
+        HT fine[16][16];
+    } Histogram;
+
 /**
  * HOP is short for Histogram OPeration. This macro makes an operation \a op on
  * histogram \a h for pixel value \a x. It takes care of handling both levels.
@@ -136,7 +133,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
     for( c = 0; c < cn; c++ )
     {
         for( j = 0; j < n; j++ )
-            COP( c, j, src[cn*j+c], += (cv::HT)(r+2) );
+            COP( c, j, src[cn*j+c], += (HT)(r+2) );

         for( i = 1; i < r; i++ )
         {
@@ -172,7 +169,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     v_store(H[c].fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k] + 8));
 #else
                     for (int ind = 0; ind < 16; ++ind)
-                        H[c].fine[k][ind] += (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind];
+                        H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
 #endif
                 }

@@ -245,7 +242,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                         memset(&H[c].fine[k], 0, 16 * sizeof(HT));
 #endif
                         px = h_fine + 16 * (n*(16 * c + k) + j - r);
-                        for (luc[c][k] = cv::HT(j - r); luc[c][k] < MIN(j + r + 1, n); ++luc[c][k], px += 16)
+                        for (luc[c][k] = HT(j - r); luc[c][k] < MIN(j + r + 1, n); ++luc[c][k], px += 16)
                         {
 #if CV_SIMD256
                             v_fine += v256_load(px);
@@ -268,7 +265,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                             v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n)));
 #else
                             for (int ind = 0; ind < 16; ++ind)
-                                H[c].fine[k][ind] += (j + r + 1 - n) * px[ind];
+                                H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (j + r + 1 - n) * px[ind]);
 #endif
                             luc[c][k] = (HT)(j+r+1);
                         }
@@ -479,6 +476,8 @@ medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m )
 }


+namespace {
+
 struct MinMax8u
 {
     typedef uchar value_type;
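A scalar model of the two-tier histogram the comment describes (an assumption, not commit code): a pixel value x lands in fine bucket x and in coarse bucket x >> 4, so a median query scans at most 16 coarse buckets and then the 16 fine buckets of one segment.

#include <cstdint>

typedef unsigned short HT;
struct HistogramModel { HT coarse[16]; HT fine[16][16]; };

static void hist_add(HistogramModel& h, uint8_t x)
{
    h.coarse[x >> 4]++;        // 4 MSBs pick the coarse bucket
    h.fine[x >> 4][x & 15]++;  // full value picks the fine bucket
}

static int hist_median(const HistogramModel& h, int count) // count = pixels in window
{
    int t = (count + 1) / 2, sum = 0, k = 0;
    for (; k < 16; k++) { sum += h.coarse[k]; if (sum >= t) break; }
    sum -= h.coarse[k];
    for (int j = 0; j < 16; j++) { sum += h.fine[k][j]; if (sum >= t) return (k << 4) | j; }
    return (k << 4) | 15;
}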
@@ -3782,9 +3782,9 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
     Size ssize = _src.size();

     CV_Assert( !ssize.empty() );
-    CV_Assert( !dsize.empty() || (inv_scale_x > 0 && inv_scale_y > 0) );
     if( dsize.empty() )
     {
+        CV_Assert(inv_scale_x > 0); CV_Assert(inv_scale_y > 0);
         dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
                      saturate_cast<int>(ssize.height*inv_scale_y));
         CV_Assert( !dsize.empty() );
@@ -3793,6 +3793,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
     {
         inv_scale_x = (double)dsize.width/ssize.width;
         inv_scale_y = (double)dsize.height/ssize.height;
+        CV_Assert(inv_scale_x > 0); CV_Assert(inv_scale_y > 0);
     }

     if (interpolation == INTER_LINEAR_EXACT && (_src.depth() == CV_32F || _src.depth() == CV_64F))
@@ -45,9 +45,9 @@ TEST(Resize_Bitexact, Linear8U)
         { CV_8UC4, Size(   4,   3) },
         { CV_8UC1, Size( 342, 384) }, // 1/3 1/2
-        { CV_8UC1, Size( 342, 256) }, // 1/3 1/3
+        { CV_8UC1, Size( 342, 256) },
+        { CV_8UC2, Size( 342, 256) },
+        { CV_8UC3, Size( 342, 256) },
+        { CV_8UC4, Size( 342, 256) },
         { CV_8UC1, Size( 512, 256) }, // 1/2 1/3
         { CV_8UC1, Size( 146, 110) }, // 1/7 1/7
         { CV_8UC3, Size( 146, 110) },
@@ -83,13 +83,13 @@ TEST(Resize_Bitexact, Linear8U)
         softdouble scale_y = softdouble::one() / softdouble(inv_scale_y);

         Mat src(rows, cols, type), refdst(drows, dcols, type), dst;
-        RNG rnd(0x123456789abcdefULL);
         for (int j = 0; j < rows; j++)
         {
             uint8_t* line = src.ptr(j);
             for (int i = 0; i < cols; i++)
                 for (int c = 0; c < cn; c++)
                 {
+                    RNG rnd(0x123456789abcdefULL);
                     double val = j < rows / 2 ? ( i < cols / 2 ? ((sin((i + 1)*CV_PI / 256.)*sin((j + 1)*CV_PI / 256.)*sin((cn + 4)*CV_PI / 8.) + 1.)*128.) :
                                                  (((i / 128 + j / 128) % 2) * 250 + (j / 128) % 2) ) :
                                                  ( i < cols / 2 ? ((i / 128) * (85 - j / 256 * 40) * ((j / 128) % 2) + (7 - i / 128) * (85 - j / 256 * 40) * ((j / 128 + 1) % 2)) :
@@ -959,8 +959,7 @@ bool QRDecode::samplingForVersion()
     const int delta_rows = cvRound((postIntermediate.rows * 1.0) / version_size);
     const int delta_cols = cvRound((postIntermediate.cols * 1.0) / version_size);

-    vector<double> listFrequencyElem(version_size * version_size, 0);
-    int k = 0;
+    vector<double> listFrequencyElem;
    for (int r = 0; r < postIntermediate.rows; r += delta_rows)
    {
        for (int c = 0; c < postIntermediate.cols; c += delta_cols)
@@ -969,7 +968,7 @@ bool QRDecode::samplingForVersion()
                 Range(r, min(r + delta_rows, postIntermediate.rows)),
                 Range(c, min(c + delta_cols, postIntermediate.cols)));
             const double frequencyElem = (countNonZero(tile) * 1.0) / tile.total();
-            listFrequencyElem[k] = frequencyElem; k++;
+            listFrequencyElem.push_back(frequencyElem);
         }
     }

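Why push_back is safer here (a sketch, not commit code): with delta = cvRound(rows / size), the number of r += delta steps is the ceiling of rows / delta, which need not equal size, so a vector preallocated to size * size slots could be over- or under-run by the indexed writes; growing the vector per tile always matches the actual tile count.

static int tile_count(int rows, int delta)
{
    int count = 0;
    for (int r = 0; r < rows; r += delta)
        count++;
    return count; // == (rows + delta - 1) / delta, not necessarily rows / delta
}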
@@ -17,6 +17,8 @@

 #include "opencv2/core/utils/trace.hpp"

+#include "opencv2/core/hal/hal.hpp"
+
 #include <stdarg.h> // for va_list

 #include "cvconfig.h"
@@ -347,8 +347,17 @@ struct CvCaptureCAM_V4L CV_FINAL : public CvCapture

 /*********************** Implementations ***************************************/

-CvCaptureCAM_V4L::CvCaptureCAM_V4L() : deviceHandle(-1), bufferIndex(-1)
+CvCaptureCAM_V4L::CvCaptureCAM_V4L() :
+    deviceHandle(-1), bufferIndex(-1),
+    FirstCapture(true),
+    palette(0),
+    width(0), height(0), width_set(0), height_set(0),
+    bufferSize(DEFAULT_V4L_BUFFERS),
+    fps(0), convert_rgb(0), frame_allocated(false), returnFrame(false),
+    channelNumber(-1), normalizePropRange(false),
+    type(V4L2_BUF_TYPE_VIDEO_CAPTURE)
 {
     frame = cvIplImage();
     memset(&timestamp, 0, sizeof(timestamp));
 }
@@ -285,12 +285,15 @@ if __name__ == "__main__":
     parser.add_argument('--disable-bitcode', default=False, dest='bitcodedisabled', action='store_true', help='disable bitcode (enabled by default)')
     parser.add_argument('--iphoneos_deployment_target', default=os.environ.get('IPHONEOS_DEPLOYMENT_TARGET', IPHONEOS_DEPLOYMENT_TARGET), help='specify IPHONEOS_DEPLOYMENT_TARGET')
     parser.add_argument('--iphoneos_archs', default='armv7,armv7s,arm64', help='select iPhoneOS target ARCHS')
+    parser.add_argument('--iphonesimulator_archs', default='i386,x86_64', help='select iPhoneSimulator target ARCHS')
     args = parser.parse_args()

     os.environ['IPHONEOS_DEPLOYMENT_TARGET'] = args.iphoneos_deployment_target
     print('Using IPHONEOS_DEPLOYMENT_TARGET=' + os.environ['IPHONEOS_DEPLOYMENT_TARGET'])
     iphoneos_archs = args.iphoneos_archs.split(',')
     print('Using iPhoneOS ARCHS=' + str(iphoneos_archs))
+    iphonesimulator_archs = args.iphonesimulator_archs.split(',')
+    print('Using iPhoneSimulator ARCHS=' + str(iphonesimulator_archs))

     b = iOSBuilder(args.opencv, args.contrib, args.dynamic, args.bitcodedisabled, args.without,
         [
@@ -298,6 +301,6 @@ if __name__ == "__main__":
         ] if os.environ.get('BUILD_PRECOMMIT', None) else
         [
             (iphoneos_archs, "iPhoneOS"),
-            (["i386", "x86_64"], "iPhoneSimulator"),
+            (iphonesimulator_archs, "iPhoneSimulator"),
         ])
     b.build(args.out)
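With this change the simulator slices are no longer hard-coded: a host can restrict them with an invocation along the lines of python build_framework.py --iphonesimulator_archs x86_64 <output_dir> (invocation shape assumed from the argument parser above), while the default of 'i386,x86_64' preserves the previous behavior.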