mirror of
https://github.com/opencv/opencv.git
synced 2024-11-25 19:50:38 +08:00
use universal intrinsic and SSE4 popcount instruction in normHamming
- add v_popcount in universal intrinsic - add test for v_popcount - add wrapper of popcount for both MSVC and GCC
This commit is contained in:
parent
c0cde75d54
commit
8b22099da2
@ -188,8 +188,16 @@ enum CpuFeatures {
|
||||
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
# ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
# if defined(_M_X64)
|
||||
# define CV_POPCNT_U64 _mm_popcnt_u64
|
||||
# endif
|
||||
# define CV_POPCNT_U32 _mm_popcnt_u32
|
||||
# else
|
||||
# include <popcntintrin.h>
|
||||
# if defined(__x86_64__)
|
||||
# define CV_POPCNT_U64 __builtin_popcountll
|
||||
# endif
|
||||
# define CV_POPCNT_U32 __builtin_popcount
|
||||
# endif
|
||||
# define CV_POPCNT 1
|
||||
# endif
|
||||
|
@ -149,7 +149,7 @@ Element-wise binary and unary operations.
|
||||
|
||||
Most of these operations return only one value.
|
||||
|
||||
- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
|
||||
- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
|
||||
- Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
|
||||
|
||||
### Other math
|
||||
@ -574,6 +574,49 @@ Scheme:
|
||||
For 32-bit integer and 32-bit floating point types. */
|
||||
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
|
||||
|
||||
static const unsigned char popCountTable[] =
|
||||
{
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
|
||||
};
|
||||
/** @brief Count the 1 bits in the vector and return 4 values
|
||||
|
||||
Scheme:
|
||||
@code
|
||||
{A1 A2 A3 ...} => popcount(A1)
|
||||
@endcode
|
||||
Any types but result will be in v_uint32x4*/
|
||||
template<typename _Tp, int n> inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a)
|
||||
{
|
||||
v_uint8x16 b;
|
||||
b = v_reinterpret_as_u8(a);
|
||||
for( int i = 0; i < v_uint8x16::nlanes; i++ )
|
||||
{
|
||||
b.s[i] = popCountTable[b.s[i]];
|
||||
}
|
||||
v_uint32x4 c;
|
||||
for( int i = 0; i < v_uint32x4::nlanes; i++ )
|
||||
{
|
||||
c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3];
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
//! @cond IGNORED
|
||||
template<typename _Tp, int n>
|
||||
inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
||||
|
@ -813,6 +813,22 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
|
||||
inline v_uint32x4 v_popcount(const _Tpvec& a) \
|
||||
{ \
|
||||
uint8x16_t t = vcntq_u8(cast(a.val)); \
|
||||
uint16x8_t t0 = vpaddlq_u8(t); /* 16 -> 8 */ \
|
||||
uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
|
||||
return v_uint32x4(t1); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
|
||||
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
|
||||
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
|
||||
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
|
||||
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
|
||||
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
|
||||
|
||||
inline int v_signmask(const v_uint8x16& a)
|
||||
{
|
||||
int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
|
||||
|
@ -1121,6 +1121,28 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
|
||||
inline v_uint32x4 v_popcount(const _Tpvec& a) \
|
||||
{ \
|
||||
__m128i m1 = _mm_set1_epi32(0x55555555); \
|
||||
__m128i m2 = _mm_set1_epi32(0x33333333); \
|
||||
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
|
||||
__m128i p = a.val; \
|
||||
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
|
||||
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
|
||||
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
|
||||
p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
|
||||
p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
|
||||
return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
|
||||
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
|
||||
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
|
||||
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
|
||||
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
|
||||
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
|
||||
inline int v_signmask(const _Tpvec& a) \
|
||||
{ \
|
||||
|
@ -44,6 +44,7 @@
|
||||
#include "precomp.hpp"
|
||||
#include <climits>
|
||||
#include <limits>
|
||||
#include "opencv2/core/hal/intrin.hpp"
|
||||
|
||||
#include "opencl_kernels_core.hpp"
|
||||
|
||||
@ -4238,22 +4239,8 @@ int normHamming(const uchar* a, int n)
|
||||
{
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_NEON
|
||||
{
|
||||
uint32x4_t bits = vmovq_n_u32(0);
|
||||
for (; i <= n - 16; i += 16) {
|
||||
uint8x16_t A_vec = vld1q_u8 (a + i);
|
||||
uint8x16_t bitsSet = vcntq_u8 (A_vec);
|
||||
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
|
||||
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
|
||||
bits = vaddq_u32(bits, bitSet4);
|
||||
}
|
||||
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
|
||||
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
|
||||
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
|
||||
}
|
||||
#elif CV_AVX2
|
||||
if (USE_AVX2)
|
||||
#if CV_AVX2
|
||||
if(USE_AVX2)
|
||||
{
|
||||
__m256i _r0 = _mm256_setzero_si256();
|
||||
__m256i _0 = _mm256_setzero_si256();
|
||||
@ -4274,12 +4261,40 @@ int normHamming(const uchar* a, int n)
|
||||
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
|
||||
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
|
||||
}
|
||||
#elif CV_POPCNT
|
||||
if(checkHardwareSupport(CV_CPU_POPCNT))
|
||||
{
|
||||
# if defined CV_POPCNT_U64
|
||||
for(; i <= n - 8; i += 8)
|
||||
{
|
||||
result += (int)CV_POPCNT_U64(*(uint64*)(a + i));
|
||||
}
|
||||
# endif
|
||||
for(; i <= n - 4; i += 4)
|
||||
{
|
||||
result += CV_POPCNT_U32(*(uint*)(a + i));
|
||||
}
|
||||
}
|
||||
#elif CV_SIMD128
|
||||
if(hasSIMD128())
|
||||
{
|
||||
v_uint32x4 t = v_setzero_u32();
|
||||
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
|
||||
{
|
||||
t += v_popcount(v_load(a + i));
|
||||
}
|
||||
result = v_reduce_sum(t);
|
||||
}
|
||||
#endif
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
|
||||
popCountTable[a[i+2]] + popCountTable[a[i+3]];
|
||||
for( ; i < n; i++ )
|
||||
for(; i <= n - 4; i += 4)
|
||||
{
|
||||
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
|
||||
popCountTable[a[i+2]] + popCountTable[a[i+3]];
|
||||
}
|
||||
for(; i < n; i++)
|
||||
{
|
||||
result += popCountTable[a[i]];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -4287,24 +4302,8 @@ int normHamming(const uchar* a, const uchar* b, int n)
|
||||
{
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_NEON
|
||||
{
|
||||
uint32x4_t bits = vmovq_n_u32(0);
|
||||
for (; i <= n - 16; i += 16) {
|
||||
uint8x16_t A_vec = vld1q_u8 (a + i);
|
||||
uint8x16_t B_vec = vld1q_u8 (b + i);
|
||||
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
|
||||
uint8x16_t bitsSet = vcntq_u8 (AxorB);
|
||||
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
|
||||
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
|
||||
bits = vaddq_u32(bits, bitSet4);
|
||||
}
|
||||
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
|
||||
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
|
||||
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
|
||||
}
|
||||
#elif CV_AVX2
|
||||
if (USE_AVX2)
|
||||
#if CV_AVX2
|
||||
if(USE_AVX2)
|
||||
{
|
||||
__m256i _r0 = _mm256_setzero_si256();
|
||||
__m256i _0 = _mm256_setzero_si256();
|
||||
@ -4328,12 +4327,40 @@ int normHamming(const uchar* a, const uchar* b, int n)
|
||||
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
|
||||
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
|
||||
}
|
||||
#elif CV_POPCNT
|
||||
if(checkHardwareSupport(CV_CPU_POPCNT))
|
||||
{
|
||||
# if defined CV_POPCNT_U64
|
||||
for(; i <= n - 8; i += 8)
|
||||
{
|
||||
result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
|
||||
}
|
||||
# endif
|
||||
for(; i <= n - 4; i += 4)
|
||||
{
|
||||
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
|
||||
}
|
||||
}
|
||||
#elif CV_SIMD128
|
||||
if(hasSIMD128())
|
||||
{
|
||||
v_uint32x4 t = v_setzero_u32();
|
||||
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
|
||||
{
|
||||
t += v_popcount(v_load(a + i) ^ v_load(b + i));
|
||||
}
|
||||
result = v_reduce_sum(t);
|
||||
}
|
||||
#endif
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
|
||||
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
|
||||
for( ; i < n; i++ )
|
||||
for(; i <= n - 4; i += 4)
|
||||
{
|
||||
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
|
||||
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
|
||||
}
|
||||
for(; i < n; i++)
|
||||
{
|
||||
result += popCountTable[a[i] ^ b[i]];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -404,6 +404,18 @@ template<typename R> struct TheTest
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_popcount()
|
||||
{
|
||||
static unsigned popcountTable[] = {0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33};
|
||||
Data<R> dataA;
|
||||
R a = dataA;
|
||||
|
||||
unsigned resB = (unsigned)v_reduce_sum(v_popcount(a));
|
||||
EXPECT_EQ(popcountTable[R::nlanes], resB);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_absdiff()
|
||||
{
|
||||
typedef typename V_RegTrait128<LaneType>::u_reg Ru;
|
||||
@ -798,6 +810,7 @@ TEST(hal_intrin, uint8x16) {
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_mask()
|
||||
.test_popcount()
|
||||
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
|
||||
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
|
||||
.test_unpack()
|
||||
@ -819,6 +832,7 @@ TEST(hal_intrin, int8x16) {
|
||||
.test_absdiff()
|
||||
.test_abs()
|
||||
.test_mask()
|
||||
.test_popcount()
|
||||
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
|
||||
.test_unpack()
|
||||
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
|
||||
@ -844,6 +858,7 @@ TEST(hal_intrin, uint16x8) {
|
||||
.test_absdiff()
|
||||
.test_reduce()
|
||||
.test_mask()
|
||||
.test_popcount()
|
||||
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
|
||||
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
|
||||
.test_unpack()
|
||||
@ -870,6 +885,7 @@ TEST(hal_intrin, int16x8) {
|
||||
.test_abs()
|
||||
.test_reduce()
|
||||
.test_mask()
|
||||
.test_popcount()
|
||||
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
|
||||
.test_unpack()
|
||||
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
|
||||
@ -894,6 +910,7 @@ TEST(hal_intrin, uint32x4) {
|
||||
.test_absdiff()
|
||||
.test_reduce()
|
||||
.test_mask()
|
||||
.test_popcount()
|
||||
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
|
||||
.test_unpack()
|
||||
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
|
||||
@ -910,6 +927,7 @@ TEST(hal_intrin, int32x4) {
|
||||
.test_mul()
|
||||
.test_abs()
|
||||
.test_cmp()
|
||||
.test_popcount()
|
||||
.test_shift<1>().test_shift<8>()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
|
Loading…
Reference in New Issue
Block a user