use universal intrinsic and SSE4 popcount instruction in normHamming

- add v_popcount in universal intrinsic
 - add test for v_popcount
 - add wrapper of popcount for both MSVC and GCC
This commit is contained in:
Tomoaki Teshima 2017-01-12 08:47:28 +09:00
parent c0cde75d54
commit 8b22099da2
6 changed files with 177 additions and 43 deletions

View File

@ -188,8 +188,16 @@ enum CpuFeatures {
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
# ifdef _MSC_VER
# include <nmmintrin.h>
# if defined(_M_X64)
# define CV_POPCNT_U64 _mm_popcnt_u64
# endif
# define CV_POPCNT_U32 _mm_popcnt_u32
# else
# include <popcntintrin.h>
# if defined(__x86_64__)
# define CV_POPCNT_U64 __builtin_popcountll
# endif
# define CV_POPCNT_U32 __builtin_popcount
# endif
# define CV_POPCNT 1
# endif

View File

@ -149,7 +149,7 @@ Element-wise binary and unary operations.
Most of these operations return only one value.
- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
- Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
### Other math
@ -574,6 +574,49 @@ Scheme:
For 32-bit integer and 32-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
static const unsigned char popCountTable[] =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
/** @brief Count the 1 bits in the vector and return 4 values
Scheme:
@code
{A1 A2 A3 ...} => popcount(A1)
@endcode
Any types but result will be in v_uint32x4*/
template<typename _Tp, int n> inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a)
{
v_uint8x16 b;
b = v_reinterpret_as_u8(a);
for( int i = 0; i < v_uint8x16::nlanes; i++ )
{
b.s[i] = popCountTable[b.s[i]];
}
v_uint32x4 c;
for( int i = 0; i < v_uint32x4::nlanes; i++ )
{
c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3];
}
return c;
}
//! @cond IGNORED
template<typename _Tp, int n>
inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,

View File

@ -813,6 +813,22 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
uint8x16_t t = vcntq_u8(cast(a.val)); \
uint16x8_t t0 = vpaddlq_u8(t); /* 16 -> 8 */ \
uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
return v_uint32x4(t1); \
}
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
inline int v_signmask(const v_uint8x16& a)
{
int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));

View File

@ -1121,6 +1121,28 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
__m128i m1 = _mm_set1_epi32(0x55555555); \
__m128i m2 = _mm_set1_epi32(0x33333333); \
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
__m128i p = a.val; \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
}
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
{ \

View File

@ -44,6 +44,7 @@
#include "precomp.hpp"
#include <climits>
#include <limits>
#include "opencv2/core/hal/intrin.hpp"
#include "opencl_kernels_core.hpp"
@ -4238,22 +4239,8 @@ int normHamming(const uchar* a, int n)
{
int i = 0;
int result = 0;
#if CV_NEON
{
uint32x4_t bits = vmovq_n_u32(0);
for (; i <= n - 16; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t bitsSet = vcntq_u8 (A_vec);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
}
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
#elif CV_AVX2
if (USE_AVX2)
#if CV_AVX2
if(USE_AVX2)
{
__m256i _r0 = _mm256_setzero_si256();
__m256i _0 = _mm256_setzero_si256();
@ -4274,12 +4261,40 @@ int normHamming(const uchar* a, int n)
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
}
#elif CV_POPCNT
if(checkHardwareSupport(CV_CPU_POPCNT))
{
# if defined CV_POPCNT_U64
for(; i <= n - 8; i += 8)
{
result += (int)CV_POPCNT_U64(*(uint64*)(a + i));
}
# endif
for(; i <= n - 4; i += 4)
{
result += CV_POPCNT_U32(*(uint*)(a + i));
}
}
#elif CV_SIMD128
if(hasSIMD128())
{
v_uint32x4 t = v_setzero_u32();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
{
t += v_popcount(v_load(a + i));
}
result = v_reduce_sum(t);
}
#endif
for( ; i <= n - 4; i += 4 )
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
popCountTable[a[i+2]] + popCountTable[a[i+3]];
for( ; i < n; i++ )
for(; i <= n - 4; i += 4)
{
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
popCountTable[a[i+2]] + popCountTable[a[i+3]];
}
for(; i < n; i++)
{
result += popCountTable[a[i]];
}
return result;
}
@ -4287,24 +4302,8 @@ int normHamming(const uchar* a, const uchar* b, int n)
{
int i = 0;
int result = 0;
#if CV_NEON
{
uint32x4_t bits = vmovq_n_u32(0);
for (; i <= n - 16; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t B_vec = vld1q_u8 (b + i);
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
uint8x16_t bitsSet = vcntq_u8 (AxorB);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
}
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
#elif CV_AVX2
if (USE_AVX2)
#if CV_AVX2
if(USE_AVX2)
{
__m256i _r0 = _mm256_setzero_si256();
__m256i _0 = _mm256_setzero_si256();
@ -4328,12 +4327,40 @@ int normHamming(const uchar* a, const uchar* b, int n)
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
}
#elif CV_POPCNT
if(checkHardwareSupport(CV_CPU_POPCNT))
{
# if defined CV_POPCNT_U64
for(; i <= n - 8; i += 8)
{
result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
}
# endif
for(; i <= n - 4; i += 4)
{
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
}
}
#elif CV_SIMD128
if(hasSIMD128())
{
v_uint32x4 t = v_setzero_u32();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
{
t += v_popcount(v_load(a + i) ^ v_load(b + i));
}
result = v_reduce_sum(t);
}
#endif
for( ; i <= n - 4; i += 4 )
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
for( ; i < n; i++ )
for(; i <= n - 4; i += 4)
{
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
}
for(; i < n; i++)
{
result += popCountTable[a[i] ^ b[i]];
}
return result;
}

View File

@ -404,6 +404,18 @@ template<typename R> struct TheTest
return *this;
}
TheTest & test_popcount()
{
static unsigned popcountTable[] = {0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33};
Data<R> dataA;
R a = dataA;
unsigned resB = (unsigned)v_reduce_sum(v_popcount(a));
EXPECT_EQ(popcountTable[R::nlanes], resB);
return *this;
}
TheTest & test_absdiff()
{
typedef typename V_RegTrait128<LaneType>::u_reg Ru;
@ -798,6 +810,7 @@ TEST(hal_intrin, uint8x16) {
.test_min_max()
.test_absdiff()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_unpack()
@ -819,6 +832,7 @@ TEST(hal_intrin, int8x16) {
.test_absdiff()
.test_abs()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
@ -844,6 +858,7 @@ TEST(hal_intrin, uint16x8) {
.test_absdiff()
.test_reduce()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
.test_unpack()
@ -870,6 +885,7 @@ TEST(hal_intrin, int16x8) {
.test_abs()
.test_reduce()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
@ -894,6 +910,7 @@ TEST(hal_intrin, uint32x4) {
.test_absdiff()
.test_reduce()
.test_mask()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
@ -910,6 +927,7 @@ TEST(hal_intrin, int32x4) {
.test_mul()
.test_abs()
.test_cmp()
.test_popcount()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_min_max()