opencv/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
pemmanuelviel e6ec42d462
Merge pull request #17735 from pemmanuelviel:pev-fix-trees-descent
* Fix trees parsing behavior in hierarchical_clustering_index:
Before, when maxCheck was reached in the first descent of a tree, time was still wasted parsing
the next trees till their best leaf, just to skip the points stored there.
Now we can choose either to keep this behavior, and so we skip parsing other trees after reaching
maxCheck, or we choose to do one descent in each tree, even if in one tree we reach maxCheck.

* Apply the same change to kdtree.
As each leaf contains only 1 point (unlike hierarchical_clustering), difference is visible if trees > maxCheck

* Add the new explore_all_trees parameters to miniflann

* Adapt the FlannBasedMatcher read_write test to the additional search parameter

* Adapt java tests to the additional parameter in SearchParams

* Fix the ABI dumps failure on SearchParams interface change

* Support of ctor calling another ctor of the class is only fully supported from C+11
2020-08-03 18:00:59 +00:00

854 lines
26 KiB
C++

/***********************************************************************
* Software License Agreement (BSD License)
*
* Copyright 2008-2011 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
* Copyright 2008-2011 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
*
* THE BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/
#ifndef OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
#define OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
//! @cond IGNORED
#include <algorithm>
#include <map>
#include <limits>
#include <cmath>
#include "general.h"
#include "nn_index.h"
#include "dist.h"
#include "matrix.h"
#include "result_set.h"
#include "heap.h"
#include "allocator.h"
#include "random.h"
#include "saving.h"
namespace cvflann
{
struct HierarchicalClusteringIndexParams : public IndexParams
{
HierarchicalClusteringIndexParams(int branching = 32,
flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM,
int trees = 4, int leaf_size = 100)
{
(*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL;
// The branching factor used in the hierarchical clustering
(*this)["branching"] = branching;
// Algorithm used for picking the initial cluster centers
(*this)["centers_init"] = centers_init;
// number of parallel trees to build
(*this)["trees"] = trees;
// maximum leaf size
(*this)["leaf_size"] = leaf_size;
}
};
/**
* Hierarchical index
*
* Contains a tree constructed through a hierarchical clustering
* and other information for indexing a set of points for nearest-neighbour matching.
*/
template <typename Distance>
class HierarchicalClusteringIndex : public NNIndex<Distance>
{
public:
typedef typename Distance::ElementType ElementType;
typedef typename Distance::ResultType DistanceType;
private:
typedef void (HierarchicalClusteringIndex::* centersAlgFunction)(int, int*, int, int*, int&);
/**
* The function used for choosing the cluster centers.
*/
centersAlgFunction chooseCenters;
/**
* Chooses the initial centers in the k-means clustering in a random manner.
*
* Params:
* k = number of centers
* vecs = the dataset of points
* indices = indices in the dataset
* indices_length = length of indices vector
*
*/
void chooseCentersRandom(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
{
UniqueRandom r(indices_length);
int index;
for (index=0; index<k; ++index) {
bool duplicate = true;
int rnd;
while (duplicate) {
duplicate = false;
rnd = r.next();
if (rnd<0) {
centers_length = index;
return;
}
centers[index] = dsindices[rnd];
for (int j=0; j<index; ++j) {
DistanceType sq = distance(dataset[centers[index]], dataset[centers[j]], dataset.cols);
if (sq<1e-16) {
duplicate = true;
}
}
}
}
centers_length = index;
}
/**
* Chooses the initial centers in the k-means using Gonzales' algorithm
* so that the centers are spaced apart from each other.
*
* Params:
* k = number of centers
* vecs = the dataset of points
* indices = indices in the dataset
* Returns:
*/
void chooseCentersGonzales(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
{
int n = indices_length;
int rnd = rand_int(n);
CV_DbgAssert(rnd >=0 && rnd < n);
centers[0] = dsindices[rnd];
int index;
for (index=1; index<k; ++index) {
int best_index = -1;
DistanceType best_val = 0;
for (int j=0; j<n; ++j) {
DistanceType dist = distance(dataset[centers[0]],dataset[dsindices[j]],dataset.cols);
for (int i=1; i<index; ++i) {
DistanceType tmp_dist = distance(dataset[centers[i]],dataset[dsindices[j]],dataset.cols);
if (tmp_dist<dist) {
dist = tmp_dist;
}
}
if (dist>best_val) {
best_val = dist;
best_index = j;
}
}
if (best_index!=-1) {
centers[index] = dsindices[best_index];
}
else {
break;
}
}
centers_length = index;
}
/**
* Chooses the initial centers in the k-means using the algorithm
* proposed in the KMeans++ paper:
* Arthur, David; Vassilvitskii, Sergei - k-means++: The Advantages of Careful Seeding
*
* Implementation of this function was converted from the one provided in Arthur's code.
*
* Params:
* k = number of centers
* vecs = the dataset of points
* indices = indices in the dataset
* Returns:
*/
void chooseCentersKMeanspp(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
{
int n = indices_length;
double currentPot = 0;
DistanceType* closestDistSq = new DistanceType[n];
// Choose one random center and set the closestDistSq values
int index = rand_int(n);
CV_DbgAssert(index >=0 && index < n);
centers[0] = dsindices[index];
// Computing distance^2 will have the advantage of even higher probability further to pick new centers
// far from previous centers (and this complies to "k-means++: the advantages of careful seeding" article)
for (int i = 0; i < n; i++) {
closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] );
currentPot += closestDistSq[i];
}
const int numLocalTries = 1;
// Choose each center
int centerCount;
for (centerCount = 1; centerCount < k; centerCount++) {
// Repeat several trials
double bestNewPot = -1;
int bestNewIndex = 0;
for (int localTrial = 0; localTrial < numLocalTries; localTrial++) {
// Choose our center - have to be slightly careful to return a valid answer even accounting
// for possible rounding errors
double randVal = rand_double(currentPot);
for (index = 0; index < n-1; index++) {
if (randVal <= closestDistSq[index]) break;
else randVal -= closestDistSq[index];
}
// Compute the new potential
double newPot = 0;
for (int i = 0; i < n; i++) {
DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
}
// Store the best result
if ((bestNewPot < 0)||(newPot < bestNewPot)) {
bestNewPot = newPot;
bestNewIndex = index;
}
}
// Add the appropriate center
centers[centerCount] = dsindices[bestNewIndex];
currentPot = bestNewPot;
for (int i = 0; i < n; i++) {
DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols);
closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
}
}
centers_length = centerCount;
delete[] closestDistSq;
}
/**
* Chooses the initial centers in a way inspired by Gonzales (by Pierre-Emmanuel Viel):
* select the first point of the list as a candidate, then parse the points list. If another
* point is further than current candidate from the other centers, test if it is a good center
* of a local aggregation. If it is, replace current candidate by this point. And so on...
*
* Used with KMeansIndex that computes centers coordinates by averaging positions of clusters points,
* this doesn't make a real difference with previous methods. But used with HierarchicalClusteringIndex
* class that pick centers among existing points instead of computing the barycenters, there is a real
* improvement.
*
* Params:
* k = number of centers
* vecs = the dataset of points
* indices = indices in the dataset
* Returns:
*/
void GroupWiseCenterChooser(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
{
const float kSpeedUpFactor = 1.3f;
int n = indices_length;
DistanceType* closestDistSq = new DistanceType[n];
// Choose one random center and set the closestDistSq values
int index = rand_int(n);
CV_DbgAssert(index >=0 && index < n);
centers[0] = dsindices[index];
for (int i = 0; i < n; i++) {
closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
}
// Choose each center
int centerCount;
for (centerCount = 1; centerCount < k; centerCount++) {
// Repeat several trials
double bestNewPot = -1;
int bestNewIndex = 0;
DistanceType furthest = 0;
for (index = 0; index < n; index++) {
// We will test only the potential of the points further than current candidate
if( closestDistSq[index] > kSpeedUpFactor * (float)furthest ) {
// Compute the new potential
double newPot = 0;
for (int i = 0; i < n; i++) {
newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols)
, closestDistSq[i] );
}
// Store the best result
if ((bestNewPot < 0)||(newPot <= bestNewPot)) {
bestNewPot = newPot;
bestNewIndex = index;
furthest = closestDistSq[index];
}
}
}
// Add the appropriate center
centers[centerCount] = dsindices[bestNewIndex];
for (int i = 0; i < n; i++) {
closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols)
, closestDistSq[i] );
}
}
centers_length = centerCount;
delete[] closestDistSq;
}
public:
/**
* Index constructor
*
* Params:
* inputData = dataset with the input features
* params = parameters passed to the hierarchical k-means algorithm
*/
HierarchicalClusteringIndex(const Matrix<ElementType>& inputData, const IndexParams& index_params = HierarchicalClusteringIndexParams(),
Distance d = Distance())
: dataset(inputData), params(index_params), root(NULL), indices(NULL), distance(d)
{
memoryCounter = 0;
size_ = dataset.rows;
veclen_ = dataset.cols;
branching_ = get_param(params,"branching",32);
centers_init_ = get_param(params,"centers_init", FLANN_CENTERS_RANDOM);
trees_ = get_param(params,"trees",4);
leaf_size_ = get_param(params,"leaf_size",100);
if (centers_init_==FLANN_CENTERS_RANDOM) {
chooseCenters = &HierarchicalClusteringIndex::chooseCentersRandom;
}
else if (centers_init_==FLANN_CENTERS_GONZALES) {
chooseCenters = &HierarchicalClusteringIndex::chooseCentersGonzales;
}
else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;
}
else if (centers_init_==FLANN_CENTERS_GROUPWISE) {
chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser;
}
else {
throw FLANNException("Unknown algorithm for choosing initial centers.");
}
root = new NodePtr[trees_];
indices = new int*[trees_];
for (int i=0; i<trees_; ++i) {
root[i] = NULL;
indices[i] = NULL;
}
}
HierarchicalClusteringIndex(const HierarchicalClusteringIndex&);
HierarchicalClusteringIndex& operator=(const HierarchicalClusteringIndex&);
/**
* Index destructor.
*
* Release the memory used by the index.
*/
virtual ~HierarchicalClusteringIndex()
{
free_elements();
if (root!=NULL) {
delete[] root;
}
if (indices!=NULL) {
delete[] indices;
}
}
/**
* Release the inner elements of indices[]
*/
void free_elements()
{
if (indices!=NULL) {
for(int i=0; i<trees_; ++i) {
if (indices[i]!=NULL) {
delete[] indices[i];
indices[i] = NULL;
}
}
}
}
/**
* Returns size of index.
*/
size_t size() const CV_OVERRIDE
{
return size_;
}
/**
* Returns the length of an index feature.
*/
size_t veclen() const CV_OVERRIDE
{
return veclen_;
}
/**
* Computes the inde memory usage
* Returns: memory used by the index
*/
int usedMemory() const CV_OVERRIDE
{
return pool.usedMemory+pool.wastedMemory+memoryCounter;
}
/**
* Builds the index
*/
void buildIndex() CV_OVERRIDE
{
if (branching_<2) {
throw FLANNException("Branching factor must be at least 2");
}
free_elements();
for (int i=0; i<trees_; ++i) {
indices[i] = new int[size_];
for (size_t j=0; j<size_; ++j) {
indices[i][j] = (int)j;
}
root[i] = pool.allocate<Node>();
computeClustering(root[i], indices[i], (int)size_, branching_,0);
}
}
flann_algorithm_t getType() const CV_OVERRIDE
{
return FLANN_INDEX_HIERARCHICAL;
}
void saveIndex(FILE* stream) CV_OVERRIDE
{
save_value(stream, branching_);
save_value(stream, trees_);
save_value(stream, centers_init_);
save_value(stream, leaf_size_);
save_value(stream, memoryCounter);
for (int i=0; i<trees_; ++i) {
save_value(stream, *indices[i], size_);
save_tree(stream, root[i], i);
}
}
void loadIndex(FILE* stream) CV_OVERRIDE
{
free_elements();
if (root!=NULL) {
delete[] root;
}
if (indices!=NULL) {
delete[] indices;
}
load_value(stream, branching_);
load_value(stream, trees_);
load_value(stream, centers_init_);
load_value(stream, leaf_size_);
load_value(stream, memoryCounter);
indices = new int*[trees_];
root = new NodePtr[trees_];
for (int i=0; i<trees_; ++i) {
indices[i] = new int[size_];
load_value(stream, *indices[i], size_);
load_tree(stream, root[i], i);
}
params["algorithm"] = getType();
params["branching"] = branching_;
params["trees"] = trees_;
params["centers_init"] = centers_init_;
params["leaf_size"] = leaf_size_;
}
/**
* Find set of nearest neighbors to vec. Their indices are stored inside
* the result object.
*
* Params:
* result = the result object in which the indices of the nearest-neighbors are stored
* vec = the vector for which to search the nearest neighbors
* searchParams = parameters that influence the search algorithm (checks)
*/
void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
{
const int maxChecks = get_param(searchParams,"checks",32);
const bool explore_all_trees = get_param(searchParams,"explore_all_trees",false);
// Priority queue storing intermediate branches in the best-bin-first search
Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
std::vector<bool> checked(size_,false);
int checks = 0;
for (int i=0; i<trees_; ++i) {
findNN(root[i], result, vec, checks, maxChecks, heap, checked, explore_all_trees);
if (!explore_all_trees && (checks >= maxChecks) && result.full())
break;
}
BranchSt branch;
while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
NodePtr node = branch.node;
findNN(node, result, vec, checks, maxChecks, heap, checked, false);
}
delete heap;
CV_Assert(result.full());
}
IndexParams getParameters() const CV_OVERRIDE
{
return params;
}
private:
/**
* Structure representing a node in the hierarchical k-means tree.
*/
struct Node
{
/**
* The cluster center index
*/
int pivot;
/**
* The cluster size (number of points in the cluster)
*/
int size;
/**
* Child nodes (only for non-terminal nodes)
*/
Node** childs;
/**
* Node points (only for terminal nodes)
*/
int* indices;
/**
* Level
*/
int level;
};
typedef Node* NodePtr;
/**
* Alias definition for a nicer syntax.
*/
typedef BranchStruct<NodePtr, DistanceType> BranchSt;
void save_tree(FILE* stream, NodePtr node, int num)
{
save_value(stream, *node);
if (node->childs==NULL) {
int indices_offset = (int)(node->indices - indices[num]);
save_value(stream, indices_offset);
}
else {
for(int i=0; i<branching_; ++i) {
save_tree(stream, node->childs[i], num);
}
}
}
void load_tree(FILE* stream, NodePtr& node, int num)
{
node = pool.allocate<Node>();
load_value(stream, *node);
if (node->childs==NULL) {
int indices_offset;
load_value(stream, indices_offset);
node->indices = indices[num] + indices_offset;
}
else {
node->childs = pool.allocate<NodePtr>(branching_);
for(int i=0; i<branching_; ++i) {
load_tree(stream, node->childs[i], num);
}
}
}
void computeLabels(int* dsindices, int indices_length, int* centers, int centers_length, int* labels, DistanceType& cost)
{
cost = 0;
for (int i=0; i<indices_length; ++i) {
ElementType* point = dataset[dsindices[i]];
DistanceType dist = distance(point, dataset[centers[0]], veclen_);
labels[i] = 0;
for (int j=1; j<centers_length; ++j) {
DistanceType new_dist = distance(point, dataset[centers[j]], veclen_);
if (dist>new_dist) {
labels[i] = j;
dist = new_dist;
}
}
cost += dist;
}
}
/**
* The method responsible with actually doing the recursive hierarchical
* clustering
*
* Params:
* node = the node to cluster
* indices = indices of the points belonging to the current node
* branching = the branching factor to use in the clustering
*
* TODO: for 1-sized clusters don't store a cluster center (it's the same as the single cluster point)
*/
void computeClustering(NodePtr node, int* dsindices, int indices_length, int branching, int level)
{
node->size = indices_length;
node->level = level;
if (indices_length < leaf_size_) { // leaf node
node->indices = dsindices;
std::sort(node->indices,node->indices+indices_length);
node->childs = NULL;
return;
}
std::vector<int> centers(branching);
std::vector<int> labels(indices_length);
int centers_length;
(this->*chooseCenters)(branching, dsindices, indices_length, &centers[0], centers_length);
if (centers_length<branching) {
node->indices = dsindices;
std::sort(node->indices,node->indices+indices_length);
node->childs = NULL;
return;
}
// assign points to clusters
DistanceType cost;
computeLabels(dsindices, indices_length, &centers[0], centers_length, &labels[0], cost);
node->childs = pool.allocate<NodePtr>(branching);
int start = 0;
int end = start;
for (int i=0; i<branching; ++i) {
for (int j=0; j<indices_length; ++j) {
if (labels[j]==i) {
std::swap(dsindices[j],dsindices[end]);
std::swap(labels[j],labels[end]);
end++;
}
}
node->childs[i] = pool.allocate<Node>();
node->childs[i]->pivot = centers[i];
node->childs[i]->indices = NULL;
computeClustering(node->childs[i],dsindices+start, end-start, branching, level+1);
start=end;
}
}
/**
* Performs one descent in the hierarchical k-means tree. The branches not
* visited are stored in a priority queue.
*
* Params:
* node = node to explore
* result = container for the k-nearest neighbors found
* vec = query points
* checks = how many points in the dataset have been checked so far
* maxChecks = maximum dataset points to checks
*/
void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
Heap<BranchSt>* heap, std::vector<bool>& checked, bool explore_all_trees = false)
{
if (node->childs==NULL) {
if (!explore_all_trees && (checks>=maxChecks) && result.full()) {
return;
}
for (int i=0; i<node->size; ++i) {
int index = node->indices[i];
if (!checked[index]) {
DistanceType dist = distance(dataset[index], vec, veclen_);
result.addPoint(dist, index);
checked[index] = true;
++checks;
}
}
}
else {
DistanceType* domain_distances = new DistanceType[branching_];
int best_index = 0;
domain_distances[best_index] = distance(vec, dataset[node->childs[best_index]->pivot], veclen_);
for (int i=1; i<branching_; ++i) {
domain_distances[i] = distance(vec, dataset[node->childs[i]->pivot], veclen_);
if (domain_distances[i]<domain_distances[best_index]) {
best_index = i;
}
}
for (int i=0; i<branching_; ++i) {
if (i!=best_index) {
heap->insert(BranchSt(node->childs[i],domain_distances[i]));
}
}
delete[] domain_distances;
findNN(node->childs[best_index],result,vec, checks, maxChecks, heap, checked, explore_all_trees);
}
}
private:
/**
* The dataset used by this index
*/
const Matrix<ElementType> dataset;
/**
* Parameters used by this index
*/
IndexParams params;
/**
* Number of features in the dataset.
*/
size_t size_;
/**
* Length of each feature.
*/
size_t veclen_;
/**
* The root node in the tree.
*/
NodePtr* root;
/**
* Array of indices to vectors in the dataset.
*/
int** indices;
/**
* The distance
*/
Distance distance;
/**
* Pooled memory allocator.
*
* Using a pooled memory allocator is more efficient
* than allocating memory directly when there is a large
* number small of memory allocations.
*/
PooledAllocator pool;
/**
* Memory occupied by the index.
*/
int memoryCounter;
/** index parameters */
int branching_;
int trees_;
flann_centers_init_t centers_init_;
int leaf_size_;
};
}
//! @endcond
#endif /* OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ */