From d7ff92439d6784b16ad2e8c4a3ca264fa70c075a Mon Sep 17 00:00:00 2001
From: Andrey Kamaev <no@email>
Date: Wed, 26 Oct 2011 13:04:53 +0000
Subject: [PATCH] Better NEON Hamming distance

---
 .../core/include/opencv2/core/internal.hpp    |  2 +-
 modules/core/src/stat.cpp                     | 18 +++----
 modules/flann/include/opencv2/flann/dist.h    | 49 +++++++++----------
 modules/flann/src/miniflann.cpp               |  4 ++
 modules/flann/src/precomp.hpp                 |  7 ++-
 5 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp
index b16bcf30da..7c9a900f7d 100644
--- a/modules/core/include/opencv2/core/internal.hpp
+++ b/modules/core/include/opencv2/core/internal.hpp
@@ -119,7 +119,7 @@ CV_INLINE IppiSize ippiSize(int width, int height)
 #define CV_SSE3 0
 #endif
 
-#if defined ANDROID && defined __ARM_NEON__
+#if defined ANDROID && defined __ARM_NEON__ && defined __GNUC__
 #include "arm_neon.h"
 #define CV_NEON 1
 
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 381d9fb6e0..60da0a53a4 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -963,26 +963,22 @@ static const uchar popCountTable4[] =
 int normHamming(const uchar* a, const uchar* b, int n)
 {
     int i = 0, result = 0;
-#if defined __GNUC__ && CV_NEON
+#if CV_NEON
     if (CPU_HAS_NEON_FEATURE)
     {
-        result = 0;  
-        for( ; i <= n - 16; i += 16 )
-        {
+        uint32x4_t bits = vmovq_n_u32(0);
+        for (; i <= n - 16; i += 16) {
             uint8x16_t A_vec = vld1q_u8 (a + i);
             uint8x16_t B_vec = vld1q_u8 (b + i);
-            //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
             uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
-            
             uint8x16_t bitsSet = vcntq_u8 (AxorB);
-            //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
-            
-            uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);
-            result += vgetq_lane_u64 (bitSet2,0);
-            result += vgetq_lane_u64 (bitSet2,1);
+            bits = vaddq_u32(bits, bitSet4);
         }
+        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
     }
     else
 #endif
diff --git a/modules/flann/include/opencv2/flann/dist.h b/modules/flann/include/opencv2/flann/dist.h
index 199d8833dd..b16ca394dd 100644
--- a/modules/flann/include/opencv2/flann/dist.h
+++ b/modules/flann/include/opencv2/flann/dist.h
@@ -421,43 +421,42 @@ struct Hamming
     {
         ResultType result = 0;
 #if __GNUC__
-#if ANDROID && HAVE_NEON
-        static uint64_t features = android_getCpuFeatures();
-        if ((features& ANDROID_CPU_ARM_FEATURE_NEON)) {
+#if CV_NEON
+        if (CPU_HAS_NEON_FEATURE) {
+            uint32x4_t bits = vmovq_n_u32(0);
             for (size_t i = 0; i < size; i += 16) {
                 uint8x16_t A_vec = vld1q_u8 (a + i);
                 uint8x16_t B_vec = vld1q_u8 (b + i);
-                //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
                 uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
-
-                uint8x16_t bitsSet += vcntq_u8 (AxorB);
-                //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
+                uint8x16_t bitsSet = vcntq_u8 (AxorB);
                 uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
                 uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
-
-                uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);
-                result += vgetq_lane_u64 (bitSet2,0);
-                result += vgetq_lane_u64 (bitSet2,1);
+                bits = vaddq_u32(bits, bitSet4);
             }
+            uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+            result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+            result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
         }
         else
 #endif
-        //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
-        typedef unsigned long long pop_t;
-        const size_t modulo = size % sizeof(pop_t);
-        const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
-        const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
-        const pop_t* a2_end = a2 + (size / sizeof(pop_t));
+        {
+            //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
+            typedef unsigned long long pop_t;
+            const size_t modulo = size % sizeof(pop_t);
+            const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
+            const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
+            const pop_t* a2_end = a2 + (size / sizeof(pop_t));
 
-        for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));
+            for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));
 
-        if (modulo) {
-            //in the case where size is not dividable by sizeof(size_t)
-            //need to mask off the bits at the end
-            pop_t a_final = 0, b_final = 0;
-            memcpy(&a_final, a2, modulo);
-            memcpy(&b_final, b2, modulo);
-            result += __builtin_popcountll(a_final ^ b_final);
+            if (modulo) {
+                //in the case where size is not dividable by sizeof(size_t)
+                //need to mask off the bits at the end
+                pop_t a_final = 0, b_final = 0;
+                memcpy(&a_final, a2, modulo);
+                memcpy(&b_final, b2, modulo);
+                result += __builtin_popcountll(a_final ^ b_final);
+            }
         }
 #else
         HammingLUT lut;
diff --git a/modules/flann/src/miniflann.cpp b/modules/flann/src/miniflann.cpp
index e5ec067c2e..55bb8b6e64 100644
--- a/modules/flann/src/miniflann.cpp
+++ b/modules/flann/src/miniflann.cpp
@@ -312,7 +312,11 @@ buildIndex(void*& index, const Mat& data, const IndexParams& params, const Dista
     buildIndex_<Distance, ::cvflann::Index<Distance> >(index, data, params, dist);
 }
 
+#if CV_NEON
+typedef ::cvflann::Hamming<uchar> HammingDistance;
+#else
 typedef ::cvflann::HammingLUT HammingDistance;
+#endif
 typedef ::cvflann::LshIndex<HammingDistance> LshIndex;
 
 Index::Index()
diff --git a/modules/flann/src/precomp.hpp b/modules/flann/src/precomp.hpp
index 7678a47ebc..72731af92f 100644
--- a/modules/flann/src/precomp.hpp
+++ b/modules/flann/src/precomp.hpp
@@ -5,6 +5,12 @@
 #include <cstdarg>
 #include <sstream>
 
+#ifdef HAVE_CVCONFIG_H
+# include "cvconfig.h"
+#endif
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/internal.hpp"
+
 #include "opencv2/flann/miniflann.hpp"
 #include "opencv2/flann/dist.h"
 #include "opencv2/flann/index_testing.h"
@@ -15,7 +21,6 @@
 
 // index types
 #include "opencv2/flann/all_indices.h"
-
 #include "opencv2/flann/flann_base.hpp"
 
 #endif