Merge remote-tracking branch 'upstream/3.4' into merge-3.4

Alexander Alekhin 2020-07-13 19:54:49 +00:00
commit e5e767abc1
13 changed files with 668 additions and 128 deletions

View File

@@ -116,7 +116,18 @@ if(CUDA_FOUND)
if(OPENCV_CUDA_DETECTION_NVCC_FLAGS MATCHES "-ccbin")
# already specified by user
elseif(CUDA_HOST_COMPILER AND EXISTS "${CUDA_HOST_COMPILER}")
LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${CUDA_HOST_COMPILER}")
get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH)
# The C compiler doesn't work with the --run option, so force the C++ compiler instead
if(CUDA_HOST_COMPILER STREQUAL c_compiler_realpath OR CUDA_HOST_COMPILER STREQUAL CMAKE_C_COMPILER)
if(DEFINED CMAKE_CXX_COMPILER)
get_filename_component(cxx_compiler_realpath "${CMAKE_CXX_COMPILER}" REALPATH)
LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${cxx_compiler_realpath}")
else()
message(STATUS "CUDA: CMAKE_CXX_COMPILER is not available. You may need to specify CUDA_HOST_COMPILER.")
endif()
else()
LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${CUDA_HOST_COMPILER}")
endif()
elseif(WIN32 AND CMAKE_LINKER) # Workaround for VS cl.exe not being in the env. path
get_filename_component(host_compiler_bindir ${CMAKE_LINKER} DIRECTORY)
LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${host_compiler_bindir}")
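For downstream builds, the fallback above can be made a no-op by pinning the host compiler to a C++ toolchain before OpenCV's CUDA detection runs; a hedged CMake sketch (the path is illustrative):

# Pre-set the cache entry consumed by the detection logic above.
set(CUDA_HOST_COMPILER "/usr/bin/g++" CACHE FILEPATH "Host compiler passed to nvcc via -ccbin")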

View File

@@ -72,9 +72,13 @@ static int64 getTimestamp()
return (int64)((t - g_zero_timestamp) * tick_to_ns);
}
// TODO lazy configuration flags
static bool param_traceEnable = utils::getConfigurationParameterBool("OPENCV_TRACE", false);
static bool getParameterTraceEnable()
{
static bool param_traceEnable = utils::getConfigurationParameterBool("OPENCV_TRACE", false);
return param_traceEnable;
}
// TODO lazy configuration flags
static int param_maxRegionDepthOpenCV = (int)utils::getConfigurationParameterSizeT("OPENCV_TRACE_DEPTH_OPENCV", 1);
static int param_maxRegionChildrenOpenCV = (int)utils::getConfigurationParameterSizeT("OPENCV_TRACE_MAX_CHILDREN_OPENCV", 1000);
static int param_maxRegionChildren = (int)utils::getConfigurationParameterSizeT("OPENCV_TRACE_MAX_CHILDREN", 10000);
@@ -841,7 +845,7 @@ TraceManager::TraceManager()
CV_LOG("TraceManager ctor: " << (void*)this);
CV_LOG("TraceManager configure()");
activated = param_traceEnable;
activated = getParameterTraceEnable();
if (activated)
trace_storage.reset(new SyncTraceStorage(std::string(param_traceLocation) + ".txt"));
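The wrapper introduced here is the classic construct-on-first-use idiom. A minimal sketch of the same pattern with hypothetical names (std::getenv stands in for utils::getConfigurationParameterBool):

#include <cstdlib>

// A function-local static is initialized on first call (thread-safe since
// C++11), so the configuration read no longer runs during static init.
static bool getFlagFromEnv()
{
    static const bool value = std::getenv("MY_FLAG") != NULL;  // evaluated once
    return value;
}

Deferring the read to the first call sidesteps static-initialization-order issues between translation units.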

View File

@@ -831,18 +831,18 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
CV_LOG_INFO(NULL, "DNN-IE: Can't register OpenCV custom layers extension: " << e.what());
}
#endif
#ifndef _WIN32
// Limit the number of CPU threads.
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
#ifndef _WIN32
enginePtr->SetConfig({{
InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
}}, 0);
#endif // _WIN32
#else
if (device_name == "CPU")
ie.SetConfig({{
InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
}}, device_name);
#endif
#endif
}
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)

View File

@@ -257,7 +257,7 @@ void SimpleBlobDetectorImpl::findBlobs(InputArray _image, InputArray _binaryImage,
{
std::vector < Point > hull;
convexHull(contours[contourIdx], hull);
double area = contourArea(contours[contourIdx]);
double area = moms.m00;
double hullArea = contourArea(hull);
if (fabs(hullArea) < DBL_EPSILON)
continue;
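The change above takes the blob area from the already computed moments (m00 is the zeroth spatial moment, i.e. the contour area) instead of calling contourArea() a second time on the same contour. A hedged sketch of the resulting convexity computation; convexityOf is a hypothetical helper, not OpenCV API:

#include <cfloat>
#include <cmath>
#include <opencv2/imgproc.hpp>

static double convexityOf(const std::vector<cv::Point>& contour)
{
    const cv::Moments moms = cv::moments(contour);
    const double area = moms.m00;                  // contour area from the moments
    std::vector<cv::Point> hull;
    cv::convexHull(contour, hull);
    const double hullArea = cv::contourArea(hull);
    return (std::fabs(hullArea) < DBL_EPSILON) ? 0.0 : area / hullArea;
}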

View File

@@ -625,15 +625,20 @@ void DescriptorMatcher::checkMasks( InputArrayOfArrays _masks, int queryDescriptorsCount )
if( isMaskSupported() && !masks.empty() )
{
// Check masks
size_t imageCount = std::max(trainDescCollection.size(), utrainDescCollection.size() );
const size_t imageCount = std::max(trainDescCollection.size(), utrainDescCollection.size() );
CV_Assert( masks.size() == imageCount );
for( size_t i = 0; i < imageCount; i++ )
{
if( !masks[i].empty() && (!trainDescCollection[i].empty() || !utrainDescCollection[i].empty() ) )
if (masks[i].empty())
continue;
const bool hasTrainDesc = !trainDescCollection.empty() && !trainDescCollection[i].empty();
const bool hasUTrainDesc = !utrainDescCollection.empty() && !utrainDescCollection[i].empty();
if (hasTrainDesc || hasUTrainDesc)
{
int rows = trainDescCollection[i].empty() ? utrainDescCollection[i].rows : trainDescCollection[i].rows;
CV_Assert( masks[i].rows == queryDescriptorsCount &&
masks[i].cols == rows && masks[i].type() == CV_8UC1);
const int rows = hasTrainDesc ? trainDescCollection[i].rows : utrainDescCollection[i].rows;
CV_Assert(masks[i].type() == CV_8UC1
&& masks[i].rows == queryDescriptorsCount
&& masks[i].cols == rows);
}
}
}
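A hedged usage sketch consistent with the checks above (query, train0, train1 are hypothetical descriptor matrices): each non-empty mask must be CV_8UC1 with one row per query descriptor and one column per train descriptor, and empty masks are now simply skipped.

#include <opencv2/features2d.hpp>

cv::BFMatcher matcher(cv::NORM_L2);
matcher.add(std::vector<cv::Mat>{train0, train1});           // two train images
std::vector<cv::Mat> masks(2);
masks[0] = cv::Mat::ones(query.rows, train0.rows, CV_8UC1);
// masks[1] left empty -> no constraint for the second train image
std::vector<std::vector<cv::DMatch> > matches;
matcher.knnMatch(query, matches, 2, masks);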

View File

@@ -73,6 +73,7 @@
#include <opencv2/core/hal/hal.hpp>
#include "opencv2/core/hal/intrin.hpp"
#include <opencv2/core/utils/buffer_area.private.hpp>
namespace cv {
@@ -167,23 +168,17 @@ float calcOrientationHist(
int i, j, k, len = (radius*2+1)*(radius*2+1);
float expf_scale = -1.f/(2.f * sigma * sigma);
#if CV_SIMD
AutoBuffer<float> bufX(len + v_float32::nlanes);
AutoBuffer<float> bufY(len + v_float32::nlanes);
AutoBuffer<float> bufO(len + v_float32::nlanes);
AutoBuffer<float> bufW(len + v_float32::nlanes);
AutoBuffer<float> bufT(n+4 + v_float32::nlanes);
float *X = alignPtr(bufX.data(), CV_SIMD_WIDTH);
float *Y = alignPtr(bufY.data(), CV_SIMD_WIDTH);
float *Mag = X;
float *Ori = alignPtr(bufO.data(), CV_SIMD_WIDTH);
float *W = alignPtr(bufW.data(), CV_SIMD_WIDTH);
float *temphist = alignPtr(bufT.data(), CV_SIMD_WIDTH)+2;
#else
AutoBuffer<float> buf(len*4 + n+4);
float *X = buf.data(), *Y = X + len, *Mag = X, *Ori = Y + len, *W = Ori + len;
float* temphist = W + len + 2;
#endif
cv::utils::BufferArea area;
float *X = 0, *Y = 0, *Mag, *Ori = 0, *W = 0, *temphist = 0;
area.allocate(X, len, CV_SIMD_WIDTH);
area.allocate(Y, len, CV_SIMD_WIDTH);
area.allocate(Ori, len, CV_SIMD_WIDTH);
area.allocate(W, len, CV_SIMD_WIDTH);
area.allocate(temphist, n+4, CV_SIMD_WIDTH);
area.commit();
temphist += 2;
Mag = X;
for( i = 0; i < n; i++ )
temphist[i] = 0.f;
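cv::utils::BufferArea (from the private header included above) replaces the per-array AutoBuffer/alignPtr pairs with a register-then-commit scheme backed by a single allocation. A hedged sketch of the pattern:

cv::utils::BufferArea area;
float* x = 0;
float* y = 0;
area.allocate(x, 256, CV_SIMD_WIDTH);   // pointer, element count, alignment
area.allocate(y, 256, CV_SIMD_WIDTH);
area.commit();                          // x and y now point into one aligned block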
@@ -656,7 +651,7 @@ void calcSIFTDescriptor(
v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
v_int32 idx = v_fma(v_fma(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
v_int32 idx = v_muladd(v_muladd(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
v_store_aligned(idx_buf, idx);
v_store_aligned(rco_buf, v_rco000);

View File

@@ -603,7 +603,6 @@ TEST(Features2d_DMatch, issue_11855)
1, 1, 1);
Mat targets = (Mat_<uchar>(2, 3) << 1, 1, 1,
0, 0, 0);
Ptr<BFMatcher> bf = BFMatcher::create(NORM_HAMMING, true);
vector<vector<DMatch> > match;
bf->knnMatch(sources, targets, match, 1, noArray(), true);
@@ -615,4 +614,18 @@ TEST(Features2d_DMatch, issue_11855)
EXPECT_EQ(0.0f, match[0][0].distance);
}
TEST(Features2d_DMatch, issue_17771)
{
Mat sources = (Mat_<uchar>(2, 3) << 1, 1, 0,
1, 1, 1);
Mat targets = (Mat_<uchar>(2, 3) << 1, 1, 1,
0, 0, 0);
UMat usources = sources.getUMat(ACCESS_READ);
UMat utargets = targets.getUMat(ACCESS_READ);
vector<vector<DMatch> > match;
Ptr<BFMatcher> ubf = BFMatcher::create(NORM_HAMMING);
Mat mask = (Mat_<uchar>(2, 2) << 1, 0, 0, 1);
EXPECT_NO_THROW(ubf->knnMatch(usources, utargets, match, 1, mask, true));
}
}} // namespace

View File

@@ -536,7 +536,7 @@ private:
@param features The points to be clustered. The matrix must have elements of type
Distance::ElementType.
@param centers The centers of the clusters obtained. The matrix must have type
Distance::ResultType. The number of rows in this matrix represents the number of clusters desired,
Distance::CentersType. The number of rows in this matrix represents the number of clusters desired,
however, because of the way the cut in the hierarchical tree is chosen, the number of clusters
computed will be the highest number of the form (branching-1)\*k+1 that's lower than the number of
clusters desired, where branching is the tree's branching factor (see description of the
@@ -553,15 +553,15 @@ int hierarchicalClustering(const Mat& features, Mat& centers, const ::cvflann::KMeansIndexParams& params,
Distance d = Distance())
{
typedef typename Distance::ElementType ElementType;
typedef typename Distance::ResultType DistanceType;
typedef typename Distance::CentersType CentersType;
CV_Assert(features.type() == CvType<ElementType>::type());
CV_Assert(features.isContinuous());
::cvflann::Matrix<ElementType> m_features((ElementType*)features.ptr<ElementType>(0), features.rows, features.cols);
CV_Assert(centers.type() == CvType<DistanceType>::type());
CV_Assert(centers.type() == CvType<CentersType>::type());
CV_Assert(centers.isContinuous());
::cvflann::Matrix<DistanceType> m_centers((DistanceType*)centers.ptr<DistanceType>(0), centers.rows, centers.cols);
::cvflann::Matrix<CentersType> m_centers((CentersType*)centers.ptr<CentersType>(0), centers.rows, centers.cols);
return ::cvflann::hierarchicalClustering<Distance>(m_features, m_centers, params, d);
}
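A hedged usage sketch for the updated documentation: with a binary distance such as cvflann::Hamming<unsigned char>, CentersType is unsigned char, so the centers matrix is CV_8U (previously it had to match ResultType, i.e. CV_32S):

#include <opencv2/flann.hpp>

cv::Mat features(1000, 32, CV_8U);      // 1000 binary descriptors, 256 bits each
cv::randu(features, 0, 255);
cv::Mat centers(33, 32, CV_8U);         // CentersType == unsigned char
int found = cv::flann::hierarchicalClustering< cvflann::Hamming<unsigned char> >(
                features, centers, cvflann::KMeansIndexParams());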

View File

@@ -130,6 +130,9 @@ struct index_creator<False,False,Distance>
case FLANN_INDEX_LINEAR:
nnIndex = new LinearIndex<Distance>(dataset, params, distance);
break;
case FLANN_INDEX_KMEANS:
nnIndex = new KMeansIndex<Distance>(dataset, params, distance);
break;
case FLANN_INDEX_HIERARCHICAL:
nnIndex = new HierarchicalClusteringIndex<Distance>(dataset, params, distance);
break;
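A hedged sketch of what the new case enables: a k-means index over binary data, built with a distance that is neither a kd-tree nor a vector-space distance (buf stands in for real descriptor storage):

unsigned char buf[1000 * 32];                        // 1000 x 32-byte descriptors
cvflann::Matrix<unsigned char> data(buf, 1000, 32);
cvflann::Index< cvflann::Hamming<unsigned char> > index(
        data, cvflann::KMeansIndexParams(/*branching=*/32));
index.buildIndex();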

View File

@@ -1,4 +1,4 @@
/***********************************************************************
* Software License Agreement (BSD License)
*
* Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
@@ -68,6 +68,63 @@ inline float abs<float>(float x) { return fabsf(x); }
template<>
inline double abs<double>(double x) { return fabs(x); }
template<typename TargetType>
inline TargetType round(float x) { return static_cast<TargetType>(x); }
template<>
inline unsigned int round<unsigned int>(float x) { return static_cast<unsigned int>(x + 0.5f); }
template<>
inline unsigned short round<unsigned short>(float x) { return static_cast<unsigned short>(x + 0.5f); }
template<>
inline unsigned char round<unsigned char>(float x) { return static_cast<unsigned char>(x + 0.5f); }
template<>
inline long long round<long long>(float x) { return static_cast<long long>(x + 0.5f); }
template<>
inline long round<long>(float x) { return static_cast<long>(x + 0.5f); }
template<>
inline int round<int>(float x) { return static_cast<int>(x + 0.5f) - (x<0); }
template<>
inline short round<short>(float x) { return static_cast<short>(x + 0.5f) - (x<0); }
template<>
inline char round<char>(float x) { return static_cast<char>(x + 0.5f) - (x<0); }
template<typename TargetType>
inline TargetType round(double x) { return static_cast<TargetType>(x); }
template<>
inline unsigned int round<unsigned int>(double x) { return static_cast<unsigned int>(x + 0.5); }
template<>
inline unsigned short round<unsigned short>(double x) { return static_cast<unsigned short>(x + 0.5); }
template<>
inline unsigned char round<unsigned char>(double x) { return static_cast<unsigned char>(x + 0.5); }
template<>
inline long long round<long long>(double x) { return static_cast<long long>(x + 0.5); }
template<>
inline long round<long>(double x) { return static_cast<long>(x + 0.5); }
template<>
inline int round<int>(double x) { return static_cast<int>(x + 0.5) - (x<0); }
template<>
inline short round<short>(double x) { return static_cast<short>(x + 0.5) - (x<0); }
template<>
inline char round<char>(double x) { return static_cast<char>(x + 0.5) - (x<0); }
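A quick worked check of the signed specializations (illustrative values, not from the commit): adding 0.5 and truncating handles non-negative inputs, and the extra "- (x<0)" term compensates for truncation toward zero on negatives.

// Hedged sanity check; assumes <cassert> is available.
inline void round_sanity_check()
{
    assert(cvflann::round<int>( 2.6f) ==  3);          // (int) 3.1f
    assert(cvflann::round<int>(-2.4f) == -2);          // (int)-1.9f - 1
    assert(cvflann::round<int>(-2.6f) == -3);          // (int)-2.1f - 1
    assert(cvflann::round<unsigned char>(3.6f) == 4);  // (uchar) 4.1f
}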
template<typename T>
struct Accumulator { typedef T Type; };
template<>
@@ -88,13 +145,57 @@ struct Accumulator<int> { typedef float Type; };
class True
{
public:
static const bool val = true;
};
class False
{
public:
static const bool val = false;
};
/*
* This is a "zero iterator". It basically behaves like a zero filled
* array to all algorithms that use arrays as iterators (STL style).
* It's useful when there's a need to compute the distance between feature
* and origin it and allows for better compiler optimisation than using a
* zero-filled array.
*/
template <typename T>
struct ZeroIterator
{
T operator*()
{
return 0;
}
T operator[](int)
{
return 0;
}
const ZeroIterator<T>& operator ++()
{
return *this;
}
ZeroIterator<T> operator ++(int)
{
return *this;
}
ZeroIterator<T>& operator+=(int)
{
return *this;
}
};
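A usage sketch for the relocated struct (hypothetical values): the squared norm of a vector, computed as its distance to the origin without materialising a zero array. L2_Simple is defined just below in this header:

float v[4] = { 1.f, 2.f, 3.f, 4.f };
cvflann::L2_Simple<float> d;
float sq_norm = d(v, cvflann::ZeroIterator<float>(), 4);   // 1+4+9+16 = 30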
/**
* Squared Euclidean distance functor.
*
@@ -109,6 +210,7 @@ struct L2_Simple
typedef T ElementType;
typedef typename Accumulator<T>::Type ResultType;
typedef ResultType CentersType;
template <typename Iterator1, typename Iterator2>
ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
@@ -142,6 +244,7 @@ struct L2
typedef T ElementType;
typedef typename Accumulator<T>::Type ResultType;
typedef ResultType CentersType;
/**
* Compute the squared Euclidean distance between two vectors.
@@ -207,6 +310,7 @@ struct L1
typedef T ElementType;
typedef typename Accumulator<T>::Type ResultType;
typedef ResultType CentersType;
/**
* Compute the Manhattan (L_1) distance between two vectors.
@@ -264,6 +368,7 @@ struct MinkowskiDistance
typedef T ElementType;
typedef typename Accumulator<T>::Type ResultType;
typedef ResultType CentersType;
int order;
@@ -328,6 +433,7 @@ struct MaxDistance
typedef T ElementType;
typedef typename Accumulator<T>::Type ResultType;
typedef ResultType CentersType;
/**
* Compute the max distance (L_infinity) between two vectors.
@@ -385,10 +491,12 @@ struct HammingLUT
typedef unsigned char ElementType;
typedef int ResultType;
typedef ElementType CentersType;
/** this will count the bits in a ^ b
*/
ResultType operator()(const unsigned char* a, const unsigned char* b, size_t size) const
template<typename Iterator2>
ResultType operator()(const unsigned char* a, const Iterator2 b, size_t size) const
{
static const uchar popCountTable[] =
{
@@ -402,8 +510,31 @@ struct HammingLUT
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
ResultType result = 0;
const unsigned char* b2 = reinterpret_cast<const unsigned char*> (b);
for (size_t i = 0; i < size; i++) {
result += popCountTable[a[i] ^ b[i]];
result += popCountTable[a[i] ^ b2[i]];
}
return result;
}
ResultType operator()(const unsigned char* a, const ZeroIterator<unsigned char> b, size_t size) const
{
(void)b;
static const uchar popCountTable[] =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
ResultType result = 0;
for (size_t i = 0; i < size; i++) {
result += popCountTable[a[i]];
}
return result;
}
@@ -422,17 +553,20 @@ struct Hamming
typedef T ElementType;
typedef int ResultType;
typedef ElementType CentersType;
template<typename Iterator1, typename Iterator2>
ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
ResultType operator()(const Iterator1 a, const Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
{
ResultType result = 0;
#if defined(__ARM_NEON__) && !defined(__CUDACC__)
{
const unsigned char* a2 = reinterpret_cast<const unsigned char*> (a);
const unsigned char* b2 = reinterpret_cast<const unsigned char*> (b);
uint32x4_t bits = vmovq_n_u32(0);
for (size_t i = 0; i < size; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t B_vec = vld1q_u8 (b + i);
uint8x16_t A_vec = vld1q_u8 (a2 + i);
uint8x16_t B_vec = vld1q_u8 (b2 + i);
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
uint8x16_t bitsSet = vcntq_u8 (AxorB);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
@@ -470,6 +604,52 @@ struct Hamming
#endif
return result;
}
template<typename Iterator1>
ResultType operator()(const Iterator1 a, ZeroIterator<unsigned char> b, size_t size, ResultType /*worst_dist*/ = -1) const
{
(void)b;
ResultType result = 0;
#if defined(__ARM_NEON__) && !defined(__CUDACC__)
{
const unsigned char* a2 = reinterpret_cast<const unsigned char*> (a);
uint32x4_t bits = vmovq_n_u32(0);
for (size_t i = 0; i < size; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a2 + i);
uint8x16_t bitsSet = vcntq_u8 (A_vec);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
}
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
#elif defined(__GNUC__)
{
//for portability just use unsigned long long and __builtin_popcountll (see the GCC docs for __builtin_popcountll)
typedef unsigned long long pop_t;
const size_t modulo = size % sizeof(pop_t);
const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
const pop_t* a2_end = a2 + (size / sizeof(pop_t));
for (; a2 != a2_end; ++a2) result += __builtin_popcountll(*a2);
if (modulo) {
//in the case where size is not divisible by sizeof(pop_t),
//zero-pad the remaining bytes before counting their bits
pop_t a_final = 0;
memcpy(&a_final, a2, modulo);
result += __builtin_popcountll(a_final);
}
}
#else // NO NEON and NOT GNUC
HammingLUT lut;
result = lut(reinterpret_cast<const unsigned char*> (a), b, size);
#endif
return result;
}
};
template<typename T>
@@ -480,6 +660,7 @@ struct Hamming2
typedef T ElementType;
typedef int ResultType;
typedef ElementType CentersType;
/** This is popcount_3() from:
* http://en.wikipedia.org/wiki/Hamming_weight */
@@ -500,7 +681,7 @@ struct Hamming2
#endif
template <typename Iterator1, typename Iterator2>
ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
ResultType operator()(const Iterator1 a, const Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
{
#ifdef FLANN_PLATFORM_64_BIT
const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
@@ -526,6 +707,31 @@ struct Hamming2
return result;
}
template <typename Iterator1>
ResultType operator()(const Iterator1 a, ZeroIterator<unsigned char> b, size_t size, ResultType /*worst_dist*/ = -1) const
{
(void)b;
#ifdef FLANN_PLATFORM_64_BIT
const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
ResultType result = 0;
size /= long_word_size_;
for(size_t i = 0; i < size; ++i ) {
result += popcnt64(*pa);
++pa;
}
#else
const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
ResultType result = 0;
size /= long_word_size_;
for(size_t i = 0; i < size; ++i ) {
result += popcnt32(*pa);
++pa;
}
#endif
return result;
}
private:
#ifdef FLANN_PLATFORM_64_BIT
static const size_t long_word_size_ = sizeof(uint64_t)/sizeof(unsigned char);
@@ -546,6 +752,7 @@ struct HistIntersectionDistance
typedef T ElementType;
typedef typename Accumulator<T>::Type ResultType;
typedef ResultType CentersType;
/**
* Compute the histogram intersection distance
@@ -601,6 +808,7 @@ struct HellingerDistance
typedef T ElementType;
typedef typename Accumulator<T>::Type ResultType;
typedef ResultType CentersType;
/**
* Compute the Hellinger distance
@@ -650,6 +858,7 @@ struct ChiSquareDistance
typedef T ElementType;
typedef typename Accumulator<T>::Type ResultType;
typedef ResultType CentersType;
/**
* Compute the chi-square distance
@@ -704,6 +913,7 @@ struct KL_Divergence
typedef T ElementType;
typedef typename Accumulator<T>::Type ResultType;
typedef ResultType CentersType;
/**
* Compute the Kullback-Leibler divergence
@@ -749,46 +959,6 @@ struct KL_Divergence
};
/*
* This is a "zero iterator". It basically behaves like a zero filled
* array to all algorithms that use arrays as iterators (STL style).
* It's useful when there's a need to compute the distance between feature
* and origin it and allows for better compiler optimisation than using a
* zero-filled array.
*/
template <typename T>
struct ZeroIterator
{
T operator*()
{
return 0;
}
T operator[](int)
{
return 0;
}
const ZeroIterator<T>& operator ++()
{
return *this;
}
ZeroIterator<T> operator ++(int)
{
return *this;
}
ZeroIterator<T>& operator+=(int)
{
return *this;
}
};
/*
* Depending on the distance in use, some distances are already squared (e.g. L2)
* and some are not (e.g. Hamming). In KMeans++ for instance we want to be sure
@@ -849,6 +1019,58 @@ typename Distance::ResultType ensureSquareDistance( typename Distance::ResultType dist )
}
/*
* ...a template to tell the user whether the distance they are working with is actually squared
*/
template <typename Distance, typename ElementType>
struct isSquareDist
{
bool operator()() { return false; }
};
template <typename ElementType>
struct isSquareDist<L2_Simple<ElementType>, ElementType>
{
bool operator()() { return true; }
};
template <typename ElementType>
struct isSquareDist<L2<ElementType>, ElementType>
{
bool operator()() { return true; }
};
template <typename ElementType>
struct isSquareDist<MinkowskiDistance<ElementType>, ElementType>
{
bool operator()() { return true; }
};
template <typename ElementType>
struct isSquareDist<HellingerDistance<ElementType>, ElementType>
{
bool operator()() { return true; }
};
template <typename ElementType>
struct isSquareDist<ChiSquareDistance<ElementType>, ElementType>
{
bool operator()() { return true; }
};
template <typename Distance>
bool isSquareDistance()
{
typedef typename Distance::ElementType ElementType;
isSquareDist<Distance, ElementType> dummy;
return dummy();
}
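A hedged illustration of the trait in action (the values follow from the specializations above):

bool l2_is_squared      = cvflann::isSquareDistance< cvflann::L2<float> >();              // true
bool hamming_is_squared = cvflann::isSquareDistance< cvflann::Hamming<unsigned char> >(); // false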
/*
* ...and a template to ensure the user processes the normal distance,
* not the squared one, without losing processing time calling sqrt(ensureSquareDistance)

View File

@@ -282,7 +282,7 @@ private:
* of the form (branching-1)*K+1 smaller than clusters.rows).
*/
template <typename Distance>
int hierarchicalClustering(const Matrix<typename Distance::ElementType>& points, Matrix<typename Distance::ResultType>& centers,
int hierarchicalClustering(const Matrix<typename Distance::ElementType>& points, Matrix<typename Distance::CentersType>& centers,
const KMeansIndexParams& params, Distance d = Distance())
{
KMeansIndex<Distance> kmeans(points, params, d);

View File

@@ -49,6 +49,8 @@
#include "saving.h"
#include "logger.h"
#define BITS_PER_CHAR 8
namespace cvflann
{
@@ -83,6 +85,10 @@ class KMeansIndex : public NNIndex<Distance>
public:
typedef typename Distance::ElementType ElementType;
typedef typename Distance::ResultType DistanceType;
typedef typename Distance::CentersType CentersType;
typedef typename Distance::is_kdtree_distance is_kdtree_distance;
typedef typename Distance::is_vector_space_distance is_vector_space_distance;
@@ -272,12 +278,14 @@ public:
return FLANN_INDEX_KMEANS;
}
template<class CentersContainerType>
class KMeansDistanceComputer : public cv::ParallelLoopBody
{
public:
KMeansDistanceComputer(Distance _distance, const Matrix<ElementType>& _dataset,
const int _branching, const int* _indices, const Matrix<double>& _dcenters, const size_t _veclen,
std::vector<int> &_new_centroids, std::vector<DistanceType> &_sq_dists)
const int _branching, const int* _indices, const CentersContainerType& _dcenters,
const size_t _veclen, std::vector<int> &_new_centroids,
std::vector<DistanceType> &_sq_dists)
: distance(_distance)
, dataset(_dataset)
, branching(_branching)
@@ -315,7 +323,7 @@ public:
const Matrix<ElementType>& dataset;
const int branching;
const int* indices;
const Matrix<double>& dcenters;
const CentersContainerType& dcenters;
const size_t veclen;
std::vector<int> &new_centroids;
std::vector<DistanceType> &sq_dists;
@@ -429,8 +437,16 @@ public:
root_ = pool_.allocate<KMeansNode>();
std::memset(root_, 0, sizeof(KMeansNode));
computeNodeStatistics(root_, indices_, (int)size_);
computeClustering(root_, indices_, (int)size_, branching_,0);
if(is_kdtree_distance::val || is_vector_space_distance::val)
{
computeNodeStatistics(root_, indices_, (unsigned int)size_);
computeClustering(root_, indices_, (int)size_, branching_,0);
}
else
{
computeBitfieldNodeStatistics(root_, indices_, (unsigned int)size_);
computeBitfieldClustering(root_, indices_, (int)size_, branching_,0);
}
}
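A hedged illustration, not part of the commit: Hamming typedefs both traits as False, so for binary descriptors buildIndex() takes the new bitfield path above.

static_assert(!cvflann::Hamming<unsigned char>::is_kdtree_distance::val,
              "Hamming is not a kd-tree distance");
static_assert(!cvflann::Hamming<unsigned char>::is_vector_space_distance::val,
              "Hamming is not a vector-space distance");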
@@ -515,7 +531,7 @@ public:
* numClusters = number of clusters to have in the clustering computed
* Returns: number of cluster centers
*/
int getClusterCenters(Matrix<DistanceType>& centers)
int getClusterCenters(Matrix<CentersType>& centers)
{
int numClusters = centers.rows;
if (numClusters<1) {
@@ -530,7 +546,7 @@ public:
Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount);
for (int i=0; i<clusterCount; ++i) {
DistanceType* center = clusters[i]->pivot;
CentersType* center = clusters[i]->pivot;
for (size_t j=0; j<veclen_; ++j) {
centers[i][j] = center[j];
}
@@ -555,7 +571,7 @@ private:
/**
* The cluster center.
*/
DistanceType* pivot;
CentersType* pivot;
/**
* The cluster radius.
*/
@@ -615,7 +631,7 @@ private:
{
node = pool_.allocate<KMeansNode>();
load_value(stream, *node);
node->pivot = new DistanceType[veclen_];
node->pivot = new CentersType[veclen_];
load_value(stream, *(node->pivot), (int)veclen_);
if (node->childs==NULL) {
int indices_offset;
@@ -652,32 +668,31 @@ private:
* indices = array of indices of the points belonging to the node
* indices_length = number of indices in the array
*/
void computeNodeStatistics(KMeansNodePtr node, int* indices, int indices_length)
void computeNodeStatistics(KMeansNodePtr node, int* indices, unsigned int indices_length)
{
DistanceType radius = 0;
DistanceType variance = 0;
DistanceType* mean = new DistanceType[veclen_];
memoryCounter_ += int(veclen_*sizeof(DistanceType));
CentersType* mean = new CentersType[veclen_];
memoryCounter_ += int(veclen_*sizeof(CentersType));
memset(mean,0,veclen_*sizeof(DistanceType));
memset(mean,0,veclen_*sizeof(CentersType));
for (int i=0; i<indices_length; ++i) {
for (unsigned int i=0; i<indices_length; ++i) {
ElementType* vec = dataset_[indices[i]];
for (size_t j=0; j<veclen_; ++j) {
mean[j] += vec[j];
}
variance += distance_(vec, ZeroIterator<ElementType>(), veclen_);
}
float length = static_cast<float>(indices_length);
for (size_t j=0; j<veclen_; ++j) {
mean[j] /= indices_length;
mean[j] = cvflann::round<CentersType>( mean[j] / static_cast<double>(indices_length) );
}
variance /= indices_length;
variance /= static_cast<DistanceType>( length );
variance -= distance_(mean, ZeroIterator<ElementType>(), veclen_);
DistanceType tmp = 0;
for (int i=0; i<indices_length; ++i) {
tmp = distance_(mean, dataset_[indices[i]], veclen_);
DistanceType radius = 0;
for (unsigned int i=0; i<indices_length; ++i) {
DistanceType tmp = distance_(mean, dataset_[indices[i]], veclen_);
if (tmp>radius) {
radius = tmp;
}
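A note on the variance computed above, assuming distance_ returns squared Euclidean distances: it is the König-Huygens identity, taking the mean squared distance to the origin and subtracting the squared norm of the mean:

\[ \frac{1}{n}\sum_{i=1}^{n}\lVert x_i \rVert^2 \;-\; \lVert \mu \rVert^2 \;=\; \frac{1}{n}\sum_{i=1}^{n}\lVert x_i - \mu \rVert^2, \qquad \mu = \frac{1}{n}\sum_{i=1}^{n} x_i \]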
@@ -689,6 +704,70 @@ private:
}
void computeBitfieldNodeStatistics(KMeansNodePtr node, int* indices,
unsigned int indices_length)
{
const unsigned int accumulator_veclen = static_cast<unsigned int>(
veclen_*sizeof(CentersType)*BITS_PER_CHAR);
unsigned long long variance = 0ull;
CentersType* mean = new CentersType[veclen_];
memoryCounter_ += int(veclen_*sizeof(CentersType));
unsigned int* mean_accumulator = new unsigned int[accumulator_veclen];
memset(mean_accumulator, 0, accumulator_veclen);
for (unsigned int i=0; i<indices_length; ++i) {
variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>(
distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_)));
unsigned char* vec = (unsigned char*)dataset_[indices[i]];
for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
mean_accumulator[k] += (vec[l]) & 0x01;
mean_accumulator[k+1] += (vec[l]>>1) & 0x01;
mean_accumulator[k+2] += (vec[l]>>2) & 0x01;
mean_accumulator[k+3] += (vec[l]>>3) & 0x01;
mean_accumulator[k+4] += (vec[l]>>4) & 0x01;
mean_accumulator[k+5] += (vec[l]>>5) & 0x01;
mean_accumulator[k+6] += (vec[l]>>6) & 0x01;
mean_accumulator[k+7] += (vec[l]>>7) & 0x01;
}
}
double cnt = static_cast<double>(indices_length);
unsigned char* char_mean = (unsigned char*)mean;
for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
char_mean[l] = static_cast<unsigned char>(
(((int)(0.5 + (double)(mean_accumulator[k]) / cnt)))
| (((int)(0.5 + (double)(mean_accumulator[k+1]) / cnt))<<1)
| (((int)(0.5 + (double)(mean_accumulator[k+2]) / cnt))<<2)
| (((int)(0.5 + (double)(mean_accumulator[k+3]) / cnt))<<3)
| (((int)(0.5 + (double)(mean_accumulator[k+4]) / cnt))<<4)
| (((int)(0.5 + (double)(mean_accumulator[k+5]) / cnt))<<5)
| (((int)(0.5 + (double)(mean_accumulator[k+6]) / cnt))<<6)
| (((int)(0.5 + (double)(mean_accumulator[k+7]) / cnt))<<7));
}
variance = static_cast<unsigned long long>(
0.5 + static_cast<double>(variance) / static_cast<double>(indices_length));
variance -= static_cast<unsigned long long>(
ensureSquareDistance<Distance>(
distance_(mean, ZeroIterator<ElementType>(), veclen_)));
DistanceType radius = 0;
for (unsigned int i=0; i<indices_length; ++i) {
DistanceType tmp = distance_(mean, dataset_[indices[i]], veclen_);
if (tmp>radius) {
radius = tmp;
}
}
node->variance = static_cast<DistanceType>(variance);
node->radius = radius;
node->pivot = mean;
delete[] mean_accumulator;
}
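Each byte of the pivot computed above packs eight majority votes: a bit of the mean is set when it is set in at least half (rounded) of the member descriptors. A hedged toy example, not part of the commit:

inline unsigned char bitMajorityMeanDemo()
{
    unsigned char v[3] = { 0xC1, 0xC2, 0x43 };   // three 8-bit "descriptors"
    unsigned int acc[8] = { 0 };
    for (int i = 0; i < 3; ++i)
        for (int b = 0; b < 8; ++b)
            acc[b] += (v[i] >> b) & 0x01;        // per-bit popularity counts
    unsigned char mean = 0;
    for (int b = 0; b < 8; ++b)
        mean |= static_cast<unsigned char>((int)(0.5 + acc[b] / 3.0) << b);
    return mean;   // 0xC3: bits 0, 1, 6, 7 appear in at least 2 of the 3 inputs
}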
/**
* The method responsible with actually doing the recursive hierarchical
* clustering
@@ -737,7 +816,6 @@ private:
cv::AutoBuffer<int> belongs_to_buf(indices_length);
int* belongs_to = belongs_to_buf.data();
for (int i=0; i<indices_length; ++i) {
DistanceType sq_dist = distance_(dataset_[indices[i]], dataset_[centers_idx[0]], veclen_);
belongs_to[i] = 0;
for (int j=1; j<branching; ++j) {
@@ -791,7 +869,7 @@ private:
std::vector<DistanceType> sq_dists(indices_length);
// reassign points to clusters
KMeansDistanceComputer invoker(distance_, dataset_, branching, indices, dcenters, veclen_, new_centroids, sq_dists);
KMeansDistanceComputer<Matrix<double> > invoker(distance_, dataset_, branching, indices, dcenters, veclen_, new_centroids, sq_dists);
parallel_for_(cv::Range(0, (int)indices_length), invoker);
for (int i=0; i < (int)indices_length; ++i) {
@@ -834,13 +912,13 @@ private:
}
DistanceType** centers = new DistanceType*[branching];
CentersType** centers = new CentersType*[branching];
for (int i=0; i<branching; ++i) {
centers[i] = new DistanceType[veclen_];
memoryCounter_ += (int)(veclen_*sizeof(DistanceType));
centers[i] = new CentersType[veclen_];
memoryCounter_ += (int)(veclen_*sizeof(CentersType));
for (size_t k=0; k<veclen_; ++k) {
centers[i][k] = (DistanceType)dcenters[i][k];
centers[i][k] = (CentersType)dcenters[i][k];
}
}
@@ -858,7 +936,7 @@ private:
if (belongs_to[i]==c) {
DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_);
variance += d;
mean_radius += sqrt(d);
mean_radius += static_cast<DistanceType>( sqrt(d) );
std::swap(indices[i],indices[end]);
std::swap(belongs_to[i],belongs_to[end]);
end++;
@@ -883,6 +961,204 @@ private:
void computeBitfieldClustering(KMeansNodePtr node, int* indices,
int indices_length, int branching, int level)
{
node->size = indices_length;
node->level = level;
if (indices_length < branching) {
node->indices = indices;
std::sort(node->indices,node->indices+indices_length);
node->childs = NULL;
return;
}
cv::AutoBuffer<int> centers_idx_buf(branching);
int* centers_idx = centers_idx_buf.data();
int centers_length;
(this->*chooseCenters)(branching, indices, indices_length, centers_idx, centers_length);
if (centers_length<branching) {
node->indices = indices;
std::sort(node->indices,node->indices+indices_length);
node->childs = NULL;
return;
}
const unsigned int accumulator_veclen = static_cast<unsigned int>(
veclen_*sizeof(ElementType)*BITS_PER_CHAR);
cv::AutoBuffer<unsigned int> dcenters_buf(branching*accumulator_veclen);
Matrix<unsigned int> dcenters(dcenters_buf.data(), branching, accumulator_veclen);
CentersType** centers = new CentersType*[branching];
for (int i=0; i<branching; ++i) {
centers[i] = new CentersType[veclen_];
memoryCounter_ += (int)(veclen_*sizeof(CentersType));
}
std::vector<DistanceType> radiuses(branching);
cv::AutoBuffer<int> count_buf(branching);
int* count = count_buf.data();
for (int i=0; i<branching; ++i) {
radiuses[i] = 0;
count[i] = 0;
}
// assign points to clusters
cv::AutoBuffer<int> belongs_to_buf(indices_length);
int* belongs_to = belongs_to_buf.data();
for (int i=0; i<indices_length; ++i) {
DistanceType dist = distance_(dataset_[indices[i]], dataset_[centers_idx[0]], veclen_);
belongs_to[i] = 0;
for (int j=1; j<branching; ++j) {
DistanceType new_dist = distance_(dataset_[indices[i]], dataset_[centers_idx[j]], veclen_);
if (dist>new_dist) {
belongs_to[i] = j;
dist = new_dist;
}
}
if (dist>radiuses[belongs_to[i]]) {
radiuses[belongs_to[i]] = dist;
}
count[belongs_to[i]]++;
}
bool converged = false;
int iteration = 0;
while (!converged && iteration<iterations_) {
converged = true;
iteration++;
// compute the new cluster centers
for (int i=0; i<branching; ++i) {
memset(dcenters[i],0,sizeof(unsigned int)*accumulator_veclen);
radiuses[i] = 0;
}
for (int i=0; i<indices_length; ++i) {
unsigned char* vec = (unsigned char*)dataset_[indices[i]];
unsigned int* dcenter = dcenters[belongs_to[i]];
for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
dcenter[k] += (vec[l]) & 0x01;
dcenter[k+1] += (vec[l]>>1) & 0x01;
dcenter[k+2] += (vec[l]>>2) & 0x01;
dcenter[k+3] += (vec[l]>>3) & 0x01;
dcenter[k+4] += (vec[l]>>4) & 0x01;
dcenter[k+5] += (vec[l]>>5) & 0x01;
dcenter[k+6] += (vec[l]>>6) & 0x01;
dcenter[k+7] += (vec[l]>>7) & 0x01;
}
}
for (int i=0; i<branching; ++i) {
double cnt = static_cast<double>(count[i]);
unsigned int* dcenter = dcenters[i];
unsigned char* charCenter = (unsigned char*)centers[i];
for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
charCenter[l] = static_cast<unsigned char>(
(((int)(0.5 + (double)(dcenter[k]) / cnt)))
| (((int)(0.5 + (double)(dcenter[k+1]) / cnt))<<1)
| (((int)(0.5 + (double)(dcenter[k+2]) / cnt))<<2)
| (((int)(0.5 + (double)(dcenter[k+3]) / cnt))<<3)
| (((int)(0.5 + (double)(dcenter[k+4]) / cnt))<<4)
| (((int)(0.5 + (double)(dcenter[k+5]) / cnt))<<5)
| (((int)(0.5 + (double)(dcenter[k+6]) / cnt))<<6)
| (((int)(0.5 + (double)(dcenter[k+7]) / cnt))<<7));
}
}
std::vector<int> new_centroids(indices_length);
std::vector<DistanceType> dists(indices_length);
// reassign points to clusters
KMeansDistanceComputer<ElementType**> invoker(distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
parallel_for_(cv::Range(0, (int)indices_length), invoker);
for (int i=0; i < indices_length; ++i) {
DistanceType dist(dists[i]);
int new_centroid(new_centroids[i]);
if (dist > radiuses[new_centroid]) {
radiuses[new_centroid] = dist;
}
if (new_centroid != belongs_to[i]) {
count[belongs_to[i]]--;
count[new_centroid]++;
belongs_to[i] = new_centroid;
converged = false;
}
}
for (int i=0; i<branching; ++i) {
// if one cluster converges to an empty cluster,
// move an element into that cluster
if (count[i]==0) {
int j = (i+1)%branching;
while (count[j]<=1) {
j = (j+1)%branching;
}
for (int k=0; k<indices_length; ++k) {
if (belongs_to[k]==j) {
// for cluster j, we move the furthest element from the center to the empty cluster i
if ( distance_(dataset_[indices[k]], centers[j], veclen_) == radiuses[j] ) {
belongs_to[k] = i;
count[j]--;
count[i]++;
break;
}
}
}
converged = false;
}
}
}
// compute kmeans clustering for each of the resulting clusters
node->childs = pool_.allocate<KMeansNodePtr>(branching);
int start = 0;
int end = start;
for (int c=0; c<branching; ++c) {
int s = count[c];
unsigned long long variance = 0ull;
DistanceType mean_radius =0;
for (int i=0; i<indices_length; ++i) {
if (belongs_to[i]==c) {
DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_);
variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>(d) );
mean_radius += ensureSimpleDistance<Distance>(d);
std::swap(indices[i],indices[end]);
std::swap(belongs_to[i],belongs_to[end]);
end++;
}
}
mean_radius = static_cast<DistanceType>(
0.5f + static_cast<float>(mean_radius) / static_cast<float>(s));
variance = static_cast<unsigned long long>(
0.5 + static_cast<double>(variance) / static_cast<double>(s));
variance -= static_cast<unsigned long long>(
ensureSquareDistance<Distance>(
distance_(centers[c], ZeroIterator<ElementType>(), veclen_)));
node->childs[c] = pool_.allocate<KMeansNode>();
std::memset(node->childs[c], 0, sizeof(KMeansNode));
node->childs[c]->radius = radiuses[c];
node->childs[c]->pivot = centers[c];
node->childs[c]->variance = static_cast<DistanceType>(variance);
node->childs[c]->mean_radius = mean_radius;
computeBitfieldClustering(node->childs[c],indices+start, end-start, branching, level+1);
start=end;
}
delete[] centers;
}
/**
* Performs one descent in the hierarchical k-means tree. The branches not
* visited are stored in a priority queue.
@@ -905,12 +1181,16 @@ private:
DistanceType rsq = node->radius;
DistanceType wsq = result.worstDist();
DistanceType val = bsq-rsq-wsq;
DistanceType val2 = val*val-4*rsq*wsq;
//if (val>0) {
if ((val>0)&&(val2>0)) {
return;
if (isSquareDistance<Distance>())
{
DistanceType val = bsq-rsq-wsq;
if ((val>0) && (val*val > 4*rsq*wsq))
return;
}
else
{
if (bsq-rsq > wsq)
return;
}
}
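A worked note on the test above, writing b = bsq, r = rsq, w = wsq: for squared distances the subtree can be pruned when even the nearest point of the node's ball is worse than the current worst result, i.e. sqrt(b) - sqrt(r) > sqrt(w); squaring twice gives the radical-free form used in the code, while plain (non-squared) distances reduce to b - r > w:

\[ \sqrt{b} - \sqrt{r} > \sqrt{w} \;\Longleftrightarrow\; (b - r - w) > 0 \ \text{ and } \ (b - r - w)^2 > 4\,r\,w \]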
@@ -956,7 +1236,8 @@ private:
// float* best_center = node->childs[best_index]->pivot;
for (int i=0; i<branching_; ++i) {
if (i != best_index) {
domain_distances[i] -= cb_index_*node->childs[i]->variance;
domain_distances[i] -= cvflann::round<DistanceType>(
cb_index_*node->childs[i]->variance );
// float dist_to_border = getDistanceToBorder(node.childs[i].pivot,best_center,q);
// if (domain_distances[i]<dist_to_border) {
@@ -981,12 +1262,16 @@ private:
DistanceType rsq = node->radius;
DistanceType wsq = result.worstDist();
DistanceType val = bsq-rsq-wsq;
DistanceType val2 = val*val-4*rsq*wsq;
// if (val>0) {
if ((val>0)&&(val2>0)) {
return;
if (isSquareDistance<Distance>())
{
DistanceType val = bsq-rsq-wsq;
if ((val>0) && (val*val > 4*rsq*wsq))
return;
}
else
{
if (bsq-rsq > wsq)
return;
}
}
@@ -1024,7 +1309,8 @@ private:
DistanceType dist = distance_(q, node->childs[i]->pivot, veclen_);
int j=0;
while (domain_distances[j]<dist && j<i) j++;
while (domain_distances[j]<dist && j<i)
j++;
for (int k=i; k>j; --k) {
domain_distances[k] = domain_distances[k-1];
sort_indices[k] = sort_indices[k-1];

View File

@@ -1048,15 +1048,16 @@ bool CvCaptureCAM_V4L::grabFrame()
return false;
}
// No need to skip this if the first read returns false
/* preparation is ok */
FirstCapture = false;
#if defined(V4L_ABORT_BADJPEG)
// skip the first frame. it is often bad -- this goes unnoticed in traditional apps,
// but could be fatal if bad jpeg handling is enabled
if (!read_frame_v4l2())
return false;
#endif
/* preparation is ok */
FirstCapture = false;
}
// In the case that the grab frame was without retrieveFrame
if (bufferIndex >= 0)