mirror of https://github.com/opencv/opencv.git
synced 2025-01-18 06:03:15 +08:00
Merge pull request #18019 from pemmanuelviel:pev--multiple-kmeans-trees

* Add the possibility to set more than one tree for the hierarchical KMeans (the default is still 1 tree). This particularly improves NN-retrieval results with binary vectors, giving better quality than LSH at similar processing time when speed is the criterion.
* Add explanations of FLANN's hierarchical KMeans for binary data.

This commit is contained in:
parent 3b337a12c9
commit 793e7c0d9f
@@ -191,8 +191,28 @@ public:
         KDTreeIndexParams( int trees = 4 );
     };
     @endcode
+-   **HierarchicalClusteringIndexParams** When passing an object of this type the index constructed
+    will be a hierarchical tree of clusters, dividing each set of points into n clusters whose centers
+    are picked among the points without further refinement of their position.
+    This algorithm fits floating-point, integer and binary vectors. :
+    @code
+    struct HierarchicalClusteringIndexParams : public IndexParams
+    {
+        HierarchicalClusteringIndexParams(
+            int branching = 32,
+            flann_centers_init_t centers_init = CENTERS_RANDOM,
+            int trees = 4,
+            int leaf_size = 100);
+
+    };
+    @endcode
 -   **KMeansIndexParams** When passing an object of this type the index constructed will be a
-    hierarchical k-means tree. :
+    hierarchical k-means tree (one tree by default), dividing each set of points into n clusters
+    whose barycenters are refined iteratively.
+    Note that this algorithm has been extended to support binary vectors, as an alternative
+    to LSH when knn-search speed is the criterion. It will also outperform LSH when processing
+    directly (i.e. without the use of MCA/PCA) datasets whose points share mostly the same values
+    for most of the dimensions. It is recommended to set more than one tree with binary data. :
     @code
     struct KMeansIndexParams : public IndexParams
     {
@@ -201,6 +221,13 @@ public:
             int iterations = 11,
             flann_centers_init_t centers_init = CENTERS_RANDOM,
             float cb_index = 0.2 );
+
+        KMeansIndexParams(
+            int branching,
+            int iterations,
+            flann_centers_init_t centers_init,
+            float cb_index,
+            int trees );
     };
     @endcode
 -   **CompositeIndexParams** When using a parameters object of this type the index created
@@ -219,7 +246,8 @@ public:
 -   **LshIndexParams** When using a parameters object of this type the index created uses
     multi-probe LSH (by Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity Search
     by Qin Lv, William Josephson, Zhe Wang, Moses Charikar, Kai Li., Proceedings of the 33rd
-    International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007) :
+    International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007).
+    This algorithm is designed for binary vectors. :
     @code
     struct LshIndexParams : public IndexParams
     {
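For context, a minimal usage sketch of the behaviour documented above (illustrative only, not part of the patch: the function and variable names, the choice of 4 trees, 2 neighbours and 32 checks are assumptions, and it presumes the five-parameter KMeansIndexParams constructor documented above is exposed through the cv::flann wrapper):

// Hypothetical example: build a multi-tree k-means index over binary descriptors
// (CV_8U rows, e.g. ORB) and query it with Hamming distance.
#include <opencv2/core.hpp>
#include <opencv2/flann.hpp>

void knnOnBinaryDescriptors(const cv::Mat& descriptors,  // CV_8U dataset
                            const cv::Mat& queries)      // CV_8U queries, same width
{
    // branching, iterations, centers_init, cb_index and the new 'trees' count
    // (4 trees instead of the default single tree, as recommended for binary data).
    cv::flann::KMeansIndexParams params(32, 11, cvflann::FLANN_CENTERS_RANDOM, 0.2f, 4);
    cv::flann::Index index(descriptors, params, cvflann::FLANN_DIST_HAMMING);

    cv::Mat indices, dists;
    index.knnSearch(queries, indices, dists, 2, cv::flann::SearchParams(32));
}

Per the commit message, the default remains a single tree; passing a larger trees value is what improves retrieval quality on binary vectors while keeping the processing time comparable to LSH.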
@@ -57,8 +57,8 @@ namespace cvflann
 
 struct KMeansIndexParams : public IndexParams
 {
-    KMeansIndexParams(int branching = 32, int iterations = 11,
-                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    void indexParams(int branching, int iterations,
+                     flann_centers_init_t centers_init, float cb_index, int trees)
     {
         (*this)["algorithm"] = FLANN_INDEX_KMEANS;
         // branching factor
@@ -69,6 +69,20 @@ struct KMeansIndexParams : public IndexParams
         (*this)["centers_init"] = centers_init;
         // cluster boundary index. Used when searching the kmeans tree
         (*this)["cb_index"] = cb_index;
+        // number of kmeans trees to search in
+        (*this)["trees"] = trees;
     }
+
+    KMeansIndexParams(int branching = 32, int iterations = 11,
+                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    {
+        indexParams(branching, iterations, centers_init, cb_index, 1);
+    }
+
+    KMeansIndexParams(int branching, int iterations,
+                      flann_centers_init_t centers_init, float cb_index, int trees)
+    {
+        indexParams(branching, iterations, centers_init, cb_index, trees);
+    }
 };
 
@@ -347,6 +361,7 @@ public:
         veclen_ = dataset_.cols;
 
         branching_ = get_param(params,"branching",32);
+        trees_ = get_param(params,"trees",1);
         iterations_ = get_param(params,"iterations",11);
         if (iterations_<0) {
             iterations_ = (std::numeric_limits<int>::max)();
@@ -367,6 +382,13 @@ public:
         }
         cb_index_ = 0.4f;
 
+        root_ = new KMeansNodePtr[trees_];
+        indices_ = new int*[trees_];
+
+        for (int i=0; i<trees_; ++i) {
+            root_[i] = NULL;
+            indices_[i] = NULL;
+        }
     }
 
 
@@ -382,9 +404,11 @@ public:
     virtual ~KMeansIndex()
     {
         if (root_ != NULL) {
-            free_centers(root_);
+            free_centers();
+            delete[] root_;
         }
         if (indices_!=NULL) {
+            free_indices();
             delete[] indices_;
         }
     }
@@ -429,23 +453,24 @@ public:
             throw FLANNException("Branching factor must be at least 2");
         }
 
-        indices_ = new int[size_];
-        for (size_t i=0; i<size_; ++i) {
-            indices_[i] = int(i);
-        }
+        free_indices();
 
-        root_ = pool_.allocate<KMeansNode>();
-        std::memset(root_, 0, sizeof(KMeansNode));
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            for (size_t j=0; j<size_; ++j) {
+                indices_[i][j] = int(j);
+            }
+            root_[i] = pool_.allocate<KMeansNode>();
+            std::memset(root_[i], 0, sizeof(KMeansNode));
 
-        if(is_kdtree_distance::val || is_vector_space_distance::val)
-        {
-            computeNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeClustering(root_, indices_, (int)size_, branching_,0);
-        }
-        else
-        {
-            computeBitfieldNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeBitfieldClustering(root_, indices_, (int)size_, branching_,0);
+            if(is_kdtree_distance::val || is_vector_space_distance::val) {
+                computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
+            else {
+                computeBitfieldNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeBitfieldClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
         }
     }
 
@@ -456,35 +481,43 @@ public:
         save_value(stream, iterations_);
         save_value(stream, memoryCounter_);
         save_value(stream, cb_index_);
-        save_value(stream, *indices_, (int)size_);
-
-        save_tree(stream, root_);
+        save_value(stream, trees_);
+        for (int i=0; i<trees_; ++i) {
+            save_value(stream, *indices_[i], (int)size_);
+            save_tree(stream, root_[i], i);
+        }
     }
 
 
     void loadIndex(FILE* stream) CV_OVERRIDE
     {
+        if (indices_!=NULL) {
+            free_indices();
+            delete[] indices_;
+        }
+        if (root_!=NULL) {
+            free_centers();
+        }
+
         load_value(stream, branching_);
         load_value(stream, iterations_);
         load_value(stream, memoryCounter_);
         load_value(stream, cb_index_);
-        if (indices_!=NULL) {
-            delete[] indices_;
-        }
-        indices_ = new int[size_];
-        load_value(stream, *indices_, size_);
+        load_value(stream, trees_);
 
-        if (root_!=NULL) {
-            free_centers(root_);
-        }
-        load_tree(stream, root_);
+        indices_ = new int*[trees_];
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            load_value(stream, *indices_[i], size_);
+            load_tree(stream, root_[i], i);
+        }
 
         index_params_["algorithm"] = getType();
         index_params_["branching"] = branching_;
+        index_params_["trees"] = trees_;
         index_params_["iterations"] = iterations_;
         index_params_["centers_init"] = centers_init_;
         index_params_["cb_index"] = cb_index_;
 
     }
 
@@ -500,17 +533,21 @@ public:
     void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
     {
 
-        int maxChecks = get_param(searchParams,"checks",32);
+        const int maxChecks = get_param(searchParams,"checks",32);
 
         if (maxChecks==FLANN_CHECKS_UNLIMITED) {
-            findExactNN(root_, result, vec);
+            findExactNN(root_[0], result, vec);
         }
         else {
             // Priority queue storing intermediate branches in the best-bin-first search
             Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
 
             int checks = 0;
-            findNN(root_, result, vec, checks, maxChecks, heap);
+            for (int i=0; i<trees_; ++i) {
+                findNN(root_[i], result, vec, checks, maxChecks, heap);
+                if ((checks >= maxChecks) && result.full())
+                    break;
+            }
 
             BranchSt branch;
             while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
@@ -521,7 +558,6 @@ public:
 
             CV_Assert(result.full());
         }
-
     }
 
     /**
@@ -541,7 +577,7 @@ public:
         DistanceType variance;
         KMeansNodePtr* clusters = new KMeansNodePtr[numClusters];
 
-        int clusterCount = getMinVarianceClusters(root_, clusters, numClusters, variance);
+        int clusterCount = getMinVarianceClusters(root_[0], clusters, numClusters, variance);
 
         Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount);
 
@@ -611,23 +647,23 @@ private:
 
 
 
-    void save_tree(FILE* stream, KMeansNodePtr node)
+    void save_tree(FILE* stream, KMeansNodePtr node, int num)
     {
         save_value(stream, *node);
         save_value(stream, *(node->pivot), (int)veclen_);
         if (node->childs==NULL) {
-            int indices_offset = (int)(node->indices - indices_);
+            int indices_offset = (int)(node->indices - indices_[num]);
             save_value(stream, indices_offset);
         }
         else {
             for(int i=0; i<branching_; ++i) {
-                save_tree(stream, node->childs[i]);
+                save_tree(stream, node->childs[i], num);
             }
         }
     }
 
 
-    void load_tree(FILE* stream, KMeansNodePtr& node)
+    void load_tree(FILE* stream, KMeansNodePtr& node, int num)
     {
         node = pool_.allocate<KMeansNode>();
         load_value(stream, *node);
@@ -636,12 +672,12 @@ private:
         if (node->childs==NULL) {
             int indices_offset;
             load_value(stream, indices_offset);
-            node->indices = indices_ + indices_offset;
+            node->indices = indices_[num] + indices_offset;
         }
         else {
             node->childs = pool_.allocate<KMeansNodePtr>(branching_);
             for(int i=0; i<branching_; ++i) {
-                load_tree(stream, node->childs[i]);
+                load_tree(stream, node->childs[i], num);
             }
         }
     }
@@ -660,6 +696,32 @@ private:
         }
     }
 
+    void free_centers()
+    {
+        if (root_ != NULL) {
+            for(int i=0; i<trees_; ++i) {
+                if (root_[i] != NULL) {
+                    free_centers(root_[i]);
+                }
+            }
+        }
+    }
+
+    /**
+     * Release the inner elements of indices[]
+     */
+    void free_indices()
+    {
+        if (indices_!=NULL) {
+            for(int i=0; i<trees_; ++i) {
+                if (indices_[i]!=NULL) {
+                    delete[] indices_[i];
+                    indices_[i] = NULL;
+                }
+            }
+        }
+    }
+
     /**
      * Computes the statistics of a node (mean, radius, variance).
     *
@@ -960,7 +1022,45 @@ private:
     }
 
 
 
+    /**
+     * The method responsible for doing the recursive hierarchical clustering on
+     * binary vectors.
+     * As some might have heard that KMeans on binary data doesn't make sense,
+     * it's worth a little explanation of why it actually works fairly well. As
+     * with the Hierarchical Clustering algorithm, we seed several centers for the
+     * current node by picking some of its points. Then, in a first pass, each point
+     * of the node is assigned to its closest center. Now let's have a look at
+     * the 5 central dimensions of the 9 following points:
+     *
+     * xxxxxx11100xxxxx (1)
+     * xxxxxx11010xxxxx (2)
+     * xxxxxx11001xxxxx (3)
+     * xxxxxx10110xxxxx (4)
+     * xxxxxx10101xxxxx (5)
+     * xxxxxx10011xxxxx (6)
+     * xxxxxx01110xxxxx (7)
+     * xxxxxx01101xxxxx (8)
+     * xxxxxx01011xxxxx (9)
+     *       sum  _____
+     *       of 1: 66555
+     *
+     * Even if the barycenter notion doesn't apply, we can set a center
+     * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
+     * on for these points.
+     *
+     * Note that convergence isn't ensured anymore. In practice, using Gonzales
+     * as the seeding algorithm should be fine for getting convergence (the "iterations"
+     * value can be set to -1). But with KMeans++ seeding you should definitely
+     * set a maximum number of iterations (but make it higher than the "iterations"
+     * default value of 11).
+     *
+     * Params:
+     *     node = the node to cluster
+     *     indices = indices of the points belonging to the current node
+     *     indices_length = number of points in the current node
+     *     branching = the branching factor to use in the clustering
+     *     level = 0 for the root node, it increases with the subdivision levels
+     */
     void computeBitfieldClustering(KMeansNodePtr node, int* indices,
                                    int indices_length, int branching, int level)
     {
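To make the majority-vote idea in the comment above concrete, here is a small standalone sketch (illustrative only; it is not the routine added by this patch, and computeMajorityCenter is a made-up helper name). It counts, per bit position, how many of a cluster's packed binary descriptors have that bit set and keeps the majority value, which is how a center such as xxxxxx11111xxxxx can be formed even though a true barycenter does not exist for binary data:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helper: per-bit majority vote over packed binary descriptors.
std::vector<uint8_t> computeMajorityCenter(const std::vector<std::vector<uint8_t> >& points)
{
    const size_t bytes = points.empty() ? 0 : points[0].size();
    std::vector<size_t> ones(bytes * 8, 0);              // count of 1s per bit position

    for (const std::vector<uint8_t>& p : points)          // accumulate bit counts
        for (size_t b = 0; b < bytes; ++b)
            for (int k = 0; k < 8; ++k)
                ones[b * 8 + k] += (p[b] >> k) & 1u;

    std::vector<uint8_t> center(bytes, 0);
    for (size_t i = 0; i < ones.size(); ++i)               // keep the majority value
        if (2 * ones[i] > points.size())
            center[i / 8] |= static_cast<uint8_t>(1u << (i % 8));
    return center;
}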
@@ -1195,8 +1295,8 @@ private:
         }
 
         if (node->childs==NULL) {
-            if (checks>=maxChecks) {
-                if (result.full()) return;
+            if ((checks>=maxChecks) && result.full()) {
+                return;
             }
             checks += node->size;
             for (int i=0; i<node->size; ++i) {
@@ -1397,6 +1497,9 @@ private:
     /** The branching factor used in the hierarchical k-means clustering */
     int branching_;
 
+    /** Number of kmeans trees (default is one) */
+    int trees_;
+
     /** Maximum number of iterations to use when performing k-means clustering */
     int iterations_;
 
@@ -1432,12 +1535,12 @@ private:
     /**
     * The root node in the tree.
     */
-    KMeansNodePtr root_;
+    KMeansNodePtr* root_;
 
     /**
     * Array of indices to vectors in the dataset.
     */
-    int* indices_;
+    int** indices_;
 
     /**
     * The distance