mirror of https://github.com/opencv/opencv.git
synced 2025-01-18 06:03:15 +08:00
Merge pull request #18019 from pemmanuelviel:pev--multiple-kmeans-trees

* Add the possibility to set more than one tree for the hierarchical KMeans (the default is still 1 tree). This particularly improves NN-retrieval results with binary vectors, giving better quality than LSH at similar processing time when speed is the criterion.
* Add explanations of FLANN's hierarchical KMeans for binary data.

This commit is contained in:
parent 3b337a12c9
commit 793e7c0d9f
@@ -191,8 +191,28 @@ public:
         KDTreeIndexParams( int trees = 4 );
     };
     @endcode
+-   **HierarchicalClusteringIndexParams** When passing an object of this type the index constructed
+    will be a hierarchical tree of clusters, dividing each set of points into n clusters whose centers
+    are picked among the points without further refinement of their position.
+    This algorithm fits floating-point, integer and binary vectors. :
+    @code
+    struct HierarchicalClusteringIndexParams : public IndexParams
+    {
+        HierarchicalClusteringIndexParams(
+            int branching = 32,
+            flann_centers_init_t centers_init = CENTERS_RANDOM,
+            int trees = 4,
+            int leaf_size = 100);
+
+    };
+    @endcode
 -   **KMeansIndexParams** When passing an object of this type the index constructed will be a
-    hierarchical k-means tree. :
+    hierarchical k-means tree (one tree by default), dividing each set of points into n clusters
+    whose barycenters are refined iteratively.
+    Note that this algorithm has been extended to support binary vectors, as an alternative
+    to LSH when knn-search speed is the criterion. It will also outperform LSH when processing
+    directly (i.e. without the use of MCA/PCA) datasets whose points share mostly the same values
+    for most of the dimensions. It is recommended to set more than one tree with binary data. :
     @code
     struct KMeansIndexParams : public IndexParams
     {
@@ -201,6 +221,13 @@ public:
             int iterations = 11,
             flann_centers_init_t centers_init = CENTERS_RANDOM,
             float cb_index = 0.2 );
+
+        KMeansIndexParams(
+            int branching,
+            int iterations,
+            flann_centers_init_t centers_init,
+            float cb_index,
+            int trees );
     };
     @endcode
 -   **CompositeIndexParams** When using a parameters object of this type the index created
@@ -219,7 +246,8 @@ public:
 -   **LshIndexParams** When using a parameters object of this type the index created uses
     multi-probe LSH (by Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity Search
     by Qin Lv, William Josephson, Zhe Wang, Moses Charikar, Kai Li., Proceedings of the 33rd
-    International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007) :
+    International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007).
+    This algorithm is designed for binary vectors. :
     @code
     struct LshIndexParams : public IndexParams
     {
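For context, a minimal usage sketch of the behaviour documented above (illustrative only, not part of the patch: the function and variable names, the choice of 4 trees, 2 neighbours and 32 checks are assumptions, and it presumes the five-parameter KMeansIndexParams constructor documented above is exposed through the cv::flann wrapper):

// Hypothetical example: build a multi-tree k-means index over binary descriptors
// (CV_8U rows, e.g. ORB) and query it with Hamming distance.
#include <opencv2/core.hpp>
#include <opencv2/flann.hpp>

void knnOnBinaryDescriptors(const cv::Mat& descriptors,  // CV_8U dataset
                            const cv::Mat& queries)      // CV_8U queries, same width
{
    // branching, iterations, centers_init, cb_index and the new 'trees' count
    // (4 trees instead of the default single tree, as recommended for binary data).
    cv::flann::KMeansIndexParams params(32, 11, cvflann::FLANN_CENTERS_RANDOM, 0.2f, 4);
    cv::flann::Index index(descriptors, params, cvflann::FLANN_DIST_HAMMING);

    cv::Mat indices, dists;
    index.knnSearch(queries, indices, dists, 2, cv::flann::SearchParams(32));
}

Per the commit message, the default remains a single tree; passing a larger trees value is what improves retrieval quality on binary vectors while keeping the processing time comparable to LSH.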
@@ -57,8 +57,8 @@ namespace cvflann
 
 struct KMeansIndexParams : public IndexParams
 {
-    KMeansIndexParams(int branching = 32, int iterations = 11,
-                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    void indexParams(int branching, int iterations,
+                     flann_centers_init_t centers_init, float cb_index, int trees)
     {
         (*this)["algorithm"] = FLANN_INDEX_KMEANS;
         // branching factor
@@ -69,6 +69,20 @@ struct KMeansIndexParams : public IndexParams
         (*this)["centers_init"] = centers_init;
         // cluster boundary index. Used when searching the kmeans tree
         (*this)["cb_index"] = cb_index;
+        // number of kmeans trees to search in
+        (*this)["trees"] = trees;
     }
+
+    KMeansIndexParams(int branching = 32, int iterations = 11,
+                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    {
+        indexParams(branching, iterations, centers_init, cb_index, 1);
+    }
+
+    KMeansIndexParams(int branching, int iterations,
+                      flann_centers_init_t centers_init, float cb_index, int trees)
+    {
+        indexParams(branching, iterations, centers_init, cb_index, trees);
+    }
 };
 
@@ -347,6 +361,7 @@ public:
         veclen_ = dataset_.cols;
 
         branching_ = get_param(params,"branching",32);
+        trees_ = get_param(params,"trees",1);
         iterations_ = get_param(params,"iterations",11);
         if (iterations_<0) {
             iterations_ = (std::numeric_limits<int>::max)();
@@ -367,6 +382,13 @@ public:
         }
         cb_index_ = 0.4f;
 
+        root_ = new KMeansNodePtr[trees_];
+        indices_ = new int*[trees_];
+
+        for (int i=0; i<trees_; ++i) {
+            root_[i] = NULL;
+            indices_[i] = NULL;
+        }
     }
 
 
@@ -382,9 +404,11 @@ public:
     virtual ~KMeansIndex()
     {
         if (root_ != NULL) {
-            free_centers(root_);
+            free_centers();
+            delete[] root_;
         }
         if (indices_!=NULL) {
+            free_indices();
             delete[] indices_;
         }
     }
@@ -429,23 +453,24 @@ public:
             throw FLANNException("Branching factor must be at least 2");
         }
 
-        indices_ = new int[size_];
-        for (size_t i=0; i<size_; ++i) {
-            indices_[i] = int(i);
-        }
+        free_indices();
 
-        root_ = pool_.allocate<KMeansNode>();
-        std::memset(root_, 0, sizeof(KMeansNode));
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            for (size_t j=0; j<size_; ++j) {
+                indices_[i][j] = int(j);
+            }
+            root_[i] = pool_.allocate<KMeansNode>();
+            std::memset(root_[i], 0, sizeof(KMeansNode));
 
-        if(is_kdtree_distance::val || is_vector_space_distance::val)
-        {
-            computeNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeClustering(root_, indices_, (int)size_, branching_,0);
-        }
-        else
-        {
-            computeBitfieldNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeBitfieldClustering(root_, indices_, (int)size_, branching_,0);
+            if(is_kdtree_distance::val || is_vector_space_distance::val) {
+                computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
+            else {
+                computeBitfieldNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeBitfieldClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
         }
     }
 
@@ -456,35 +481,43 @@ public:
         save_value(stream, iterations_);
         save_value(stream, memoryCounter_);
         save_value(stream, cb_index_);
-        save_value(stream, *indices_, (int)size_);
-
-        save_tree(stream, root_);
+        save_value(stream, trees_);
+        for (int i=0; i<trees_; ++i) {
+            save_value(stream, *indices_[i], (int)size_);
+            save_tree(stream, root_[i], i);
+        }
     }
 
 
     void loadIndex(FILE* stream) CV_OVERRIDE
     {
+        if (indices_!=NULL) {
+            free_indices();
+            delete[] indices_;
+        }
+        if (root_!=NULL) {
+            free_centers();
+        }
+
         load_value(stream, branching_);
         load_value(stream, iterations_);
         load_value(stream, memoryCounter_);
         load_value(stream, cb_index_);
-        if (indices_!=NULL) {
-            delete[] indices_;
-        }
-        indices_ = new int[size_];
-        load_value(stream, *indices_, size_);
+        load_value(stream, trees_);
 
-        if (root_!=NULL) {
-            free_centers(root_);
-        }
-        load_tree(stream, root_);
+        indices_ = new int*[trees_];
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            load_value(stream, *indices_[i], size_);
+            load_tree(stream, root_[i], i);
+        }
 
         index_params_["algorithm"] = getType();
         index_params_["branching"] = branching_;
+        index_params_["trees"] = trees_;
         index_params_["iterations"] = iterations_;
         index_params_["centers_init"] = centers_init_;
         index_params_["cb_index"] = cb_index_;
 
     }
 
@@ -500,17 +533,21 @@ public:
     void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
     {
 
-        int maxChecks = get_param(searchParams,"checks",32);
+        const int maxChecks = get_param(searchParams,"checks",32);
 
         if (maxChecks==FLANN_CHECKS_UNLIMITED) {
-            findExactNN(root_, result, vec);
+            findExactNN(root_[0], result, vec);
         }
         else {
             // Priority queue storing intermediate branches in the best-bin-first search
             Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
 
             int checks = 0;
-            findNN(root_, result, vec, checks, maxChecks, heap);
+            for (int i=0; i<trees_; ++i) {
+                findNN(root_[i], result, vec, checks, maxChecks, heap);
+                if ((checks >= maxChecks) && result.full())
+                    break;
+            }
 
             BranchSt branch;
             while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
@@ -521,7 +558,6 @@ public:
 
             CV_Assert(result.full());
         }
-
     }
 
     /**
@@ -541,7 +577,7 @@ public:
         DistanceType variance;
         KMeansNodePtr* clusters = new KMeansNodePtr[numClusters];
 
-        int clusterCount = getMinVarianceClusters(root_, clusters, numClusters, variance);
+        int clusterCount = getMinVarianceClusters(root_[0], clusters, numClusters, variance);
 
         Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount);
 
@@ -611,23 +647,23 @@ private:
 
 
 
-    void save_tree(FILE* stream, KMeansNodePtr node)
+    void save_tree(FILE* stream, KMeansNodePtr node, int num)
     {
         save_value(stream, *node);
         save_value(stream, *(node->pivot), (int)veclen_);
         if (node->childs==NULL) {
-            int indices_offset = (int)(node->indices - indices_);
+            int indices_offset = (int)(node->indices - indices_[num]);
             save_value(stream, indices_offset);
         }
         else {
             for(int i=0; i<branching_; ++i) {
-                save_tree(stream, node->childs[i]);
+                save_tree(stream, node->childs[i], num);
             }
         }
     }
 
 
-    void load_tree(FILE* stream, KMeansNodePtr& node)
+    void load_tree(FILE* stream, KMeansNodePtr& node, int num)
     {
         node = pool_.allocate<KMeansNode>();
         load_value(stream, *node);
@@ -636,12 +672,12 @@ private:
         if (node->childs==NULL) {
             int indices_offset;
             load_value(stream, indices_offset);
-            node->indices = indices_ + indices_offset;
+            node->indices = indices_[num] + indices_offset;
         }
         else {
             node->childs = pool_.allocate<KMeansNodePtr>(branching_);
             for(int i=0; i<branching_; ++i) {
-                load_tree(stream, node->childs[i]);
+                load_tree(stream, node->childs[i], num);
             }
         }
     }
@@ -660,6 +696,32 @@ private:
         }
     }
 
+    void free_centers()
+    {
+        if (root_ != NULL) {
+            for(int i=0; i<trees_; ++i) {
+                if (root_[i] != NULL) {
+                    free_centers(root_[i]);
+                }
+            }
+        }
+    }
+
+    /**
+     * Release the inner elements of indices[]
+     */
+    void free_indices()
+    {
+        if (indices_!=NULL) {
+            for(int i=0; i<trees_; ++i) {
+                if (indices_[i]!=NULL) {
+                    delete[] indices_[i];
+                    indices_[i] = NULL;
+                }
+            }
+        }
+    }
+
     /**
      * Computes the statistics of a node (mean, radius, variance).
     *
@@ -960,7 +1022,45 @@ private:
     }
 
 
 
+    /**
+     * The method responsible for doing the recursive hierarchical clustering on
+     * binary vectors.
+     * As some might have heard that KMeans on binary data doesn't make sense,
+     * it's worth a little explanation of why it actually works fairly well. As
+     * with the Hierarchical Clustering algorithm, we seed several centers for the
+     * current node by picking some of its points. Then, in a first pass, each point
+     * of the node is assigned to its closest center. Now let's have a look at
+     * the 5 central dimensions of the 9 following points:
+     *
+     * xxxxxx11100xxxxx (1)
+     * xxxxxx11010xxxxx (2)
+     * xxxxxx11001xxxxx (3)
+     * xxxxxx10110xxxxx (4)
+     * xxxxxx10101xxxxx (5)
+     * xxxxxx10011xxxxx (6)
+     * xxxxxx01110xxxxx (7)
+     * xxxxxx01101xxxxx (8)
+     * xxxxxx01011xxxxx (9)
+     *       sum  _____
+     *       of 1: 66555
+     *
+     * Even if the barycenter notion doesn't apply, we can set a center
+     * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
+     * on for these points.
+     *
+     * Note that convergence isn't ensured anymore. In practice, using Gonzales
+     * as the seeding algorithm should be fine for getting convergence (the "iterations"
+     * value can be set to -1). But with KMeans++ seeding you should definitely
+     * set a maximum number of iterations (but make it higher than the "iterations"
+     * default value of 11).
+     *
+     * Params:
+     *     node = the node to cluster
+     *     indices = indices of the points belonging to the current node
+     *     indices_length = number of points in the current node
+     *     branching = the branching factor to use in the clustering
+     *     level = 0 for the root node, it increases with the subdivision levels
+     */
     void computeBitfieldClustering(KMeansNodePtr node, int* indices,
                                    int indices_length, int branching, int level)
     {
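To make the majority-vote idea in the comment above concrete, here is a small standalone sketch (illustrative only; it is not the routine added by this patch, and computeMajorityCenter is a made-up helper name). It counts, per bit position, how many of a cluster's packed binary descriptors have that bit set and keeps the majority value, which is how a center such as xxxxxx11111xxxxx can be formed even though a true barycenter does not exist for binary data:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helper: per-bit majority vote over packed binary descriptors.
std::vector<uint8_t> computeMajorityCenter(const std::vector<std::vector<uint8_t> >& points)
{
    const size_t bytes = points.empty() ? 0 : points[0].size();
    std::vector<size_t> ones(bytes * 8, 0);              // count of 1s per bit position

    for (const std::vector<uint8_t>& p : points)          // accumulate bit counts
        for (size_t b = 0; b < bytes; ++b)
            for (int k = 0; k < 8; ++k)
                ones[b * 8 + k] += (p[b] >> k) & 1u;

    std::vector<uint8_t> center(bytes, 0);
    for (size_t i = 0; i < ones.size(); ++i)               // keep the majority value
        if (2 * ones[i] > points.size())
            center[i / 8] |= static_cast<uint8_t>(1u << (i % 8));
    return center;
}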
@@ -1195,8 +1295,8 @@ private:
         }
 
         if (node->childs==NULL) {
-            if (checks>=maxChecks) {
-                if (result.full()) return;
+            if ((checks>=maxChecks) && result.full()) {
+                return;
             }
             checks += node->size;
             for (int i=0; i<node->size; ++i) {
@@ -1397,6 +1497,9 @@ private:
     /** The branching factor used in the hierarchical k-means clustering */
     int branching_;
 
+    /** Number of kmeans trees (default is one) */
+    int trees_;
+
     /** Maximum number of iterations to use when performing k-means clustering */
     int iterations_;
 
@@ -1432,12 +1535,12 @@ private:
     /**
     * The root node in the tree.
     */
-    KMeansNodePtr root_;
+    KMeansNodePtr* root_;
 
     /**
     * Array of indices to vectors in the dataset.
     */
-    int* indices_;
+    int** indices_;
 
     /**
     * The distance