From d7ff92439d6784b16ad2e8c4a3ca264fa70c075a Mon Sep 17 00:00:00 2001 From: Andrey Kamaev Date: Wed, 26 Oct 2011 13:04:53 +0000 Subject: [PATCH] Better NEON Hamming distance --- .../core/include/opencv2/core/internal.hpp | 2 +- modules/core/src/stat.cpp | 18 +++---- modules/flann/include/opencv2/flann/dist.h | 49 +++++++++---------- modules/flann/src/miniflann.cpp | 4 ++ modules/flann/src/precomp.hpp | 7 ++- 5 files changed, 42 insertions(+), 38 deletions(-) diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp index b16bcf30da..7c9a900f7d 100644 --- a/modules/core/include/opencv2/core/internal.hpp +++ b/modules/core/include/opencv2/core/internal.hpp @@ -119,7 +119,7 @@ CV_INLINE IppiSize ippiSize(int width, int height) #define CV_SSE3 0 #endif -#if defined ANDROID && defined __ARM_NEON__ +#if defined ANDROID && defined __ARM_NEON__ && defined __GNUC__ #include "arm_neon.h" #define CV_NEON 1 diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 381d9fb6e0..60da0a53a4 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -963,26 +963,22 @@ static const uchar popCountTable4[] = int normHamming(const uchar* a, const uchar* b, int n) { int i = 0, result = 0; -#if defined __GNUC__ && CV_NEON +#if CV_NEON if (CPU_HAS_NEON_FEATURE) { - result = 0; - for( ; i <= n - 16; i += 16 ) - { + uint32x4_t bits = vmovq_n_u32(0); + for (; i <= n - 16; i += 16) { uint8x16_t A_vec = vld1q_u8 (a + i); uint8x16_t B_vec = vld1q_u8 (b + i); - //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t) uint8x16_t AxorB = veorq_u8 (A_vec, B_vec); - uint8x16_t bitsSet = vcntq_u8 (AxorB); - //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t) uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); - - uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4); - result += vgetq_lane_u64 (bitSet2,0); - result += vgetq_lane_u64 (bitSet2,1); + bits = vaddq_u32(bits, bitSet4); } + uint64x2_t bitSet2 = vpaddlq_u32 (bits); + result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0); + result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2); } else #endif diff --git a/modules/flann/include/opencv2/flann/dist.h b/modules/flann/include/opencv2/flann/dist.h index 199d8833dd..b16ca394dd 100644 --- a/modules/flann/include/opencv2/flann/dist.h +++ b/modules/flann/include/opencv2/flann/dist.h @@ -421,43 +421,42 @@ struct Hamming { ResultType result = 0; #if __GNUC__ -#if ANDROID && HAVE_NEON - static uint64_t features = android_getCpuFeatures(); - if ((features& ANDROID_CPU_ARM_FEATURE_NEON)) { +#if CV_NEON + if (CPU_HAS_NEON_FEATURE) { + uint32x4_t bits = vmovq_n_u32(0); for (size_t i = 0; i < size; i += 16) { uint8x16_t A_vec = vld1q_u8 (a + i); uint8x16_t B_vec = vld1q_u8 (b + i); - //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t) uint8x16_t AxorB = veorq_u8 (A_vec, B_vec); - - uint8x16_t bitsSet += vcntq_u8 (AxorB); - //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t) + uint8x16_t bitsSet = vcntq_u8 (AxorB); uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); - - uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4); - result += vgetq_lane_u64 (bitSet2,0); - result += vgetq_lane_u64 (bitSet2,1); + bits = vaddq_u32(bits, bitSet4); } + uint64x2_t bitSet2 = vpaddlq_u32 (bits); + result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0); + result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2); } else #endif - //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll) - typedef unsigned long long pop_t; - const size_t modulo = size % sizeof(pop_t); - const pop_t* a2 = reinterpret_cast (a); - const pop_t* b2 = reinterpret_cast (b); - const pop_t* a2_end = a2 + (size / sizeof(pop_t)); + { + //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll) + typedef unsigned long long pop_t; + const size_t modulo = size % sizeof(pop_t); + const pop_t* a2 = reinterpret_cast (a); + const pop_t* b2 = reinterpret_cast (b); + const pop_t* a2_end = a2 + (size / sizeof(pop_t)); - for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2)); + for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2)); - if (modulo) { - //in the case where size is not dividable by sizeof(size_t) - //need to mask off the bits at the end - pop_t a_final = 0, b_final = 0; - memcpy(&a_final, a2, modulo); - memcpy(&b_final, b2, modulo); - result += __builtin_popcountll(a_final ^ b_final); + if (modulo) { + //in the case where size is not dividable by sizeof(size_t) + //need to mask off the bits at the end + pop_t a_final = 0, b_final = 0; + memcpy(&a_final, a2, modulo); + memcpy(&b_final, b2, modulo); + result += __builtin_popcountll(a_final ^ b_final); + } } #else HammingLUT lut; diff --git a/modules/flann/src/miniflann.cpp b/modules/flann/src/miniflann.cpp index e5ec067c2e..55bb8b6e64 100644 --- a/modules/flann/src/miniflann.cpp +++ b/modules/flann/src/miniflann.cpp @@ -312,7 +312,11 @@ buildIndex(void*& index, const Mat& data, const IndexParams& params, const Dista buildIndex_ >(index, data, params, dist); } +#if CV_NEON +typedef ::cvflann::Hamming HammingDistance; +#else typedef ::cvflann::HammingLUT HammingDistance; +#endif typedef ::cvflann::LshIndex LshIndex; Index::Index() diff --git a/modules/flann/src/precomp.hpp b/modules/flann/src/precomp.hpp index 7678a47ebc..72731af92f 100644 --- a/modules/flann/src/precomp.hpp +++ b/modules/flann/src/precomp.hpp @@ -5,6 +5,12 @@ #include #include +#ifdef HAVE_CVCONFIG_H +# include "cvconfig.h" +#endif +#include "opencv2/core/core.hpp" +#include "opencv2/core/internal.hpp" + #include "opencv2/flann/miniflann.hpp" #include "opencv2/flann/dist.h" #include "opencv2/flann/index_testing.h" @@ -15,7 +21,6 @@ // index types #include "opencv2/flann/all_indices.h" - #include "opencv2/flann/flann_base.hpp" #endif