Add a new method for initializing KMeans centers that leads to better clusters and thus better retrieval when final centers have to be existing keypoints instead of clusters barycenters.

This commit is contained in:
Pierre-Emmanuel Viel 2013-12-26 02:56:28 +01:00
parent 65b020f0d3
commit 112d63ae96
2 changed files with 82 additions and 0 deletions

View File

@ -107,6 +107,7 @@ enum flann_centers_init_t
FLANN_CENTERS_RANDOM = 0,
FLANN_CENTERS_GONZALES = 1,
FLANN_CENTERS_KMEANSPP = 2,
FLANN_CENTERS_GROUPWISE = 3,
// deprecated constants, should use the FLANN_CENTERS_* ones instead
CENTERS_RANDOM = 0,

View File

@ -257,6 +257,84 @@ private:
}
/**
* Chooses the initial centers in a way inspired by Gonzales (by Pierre-Emmanuel Viel):
* select the first point of the list as a candidate, then parse the points list. If another
* point is further than current candidate from the other centers, test if it is a good center
* of a local aggregation. If it is, replace current candidate by this point. And so on...
*
* Used with KMeansIndex that computes centers coordinates by averaging positions of clusters points,
* this doesn't make a real difference with previous methods. But used with HierarchicalClusteringIndex
* class that pick centers among existing points instead of computing the barycenters, there is a real
* improvement.
*
* Params:
* k = number of centers
* vecs = the dataset of points
* indices = indices in the dataset
* Returns:
*/
void GroupWiseCenterChooser(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
{
const float kSpeedUpFactor = 1.3f;
int n = indices_length;
DistanceType* closestDistSq = new DistanceType[n];
// Choose one random center and set the closestDistSq values
int index = rand_int(n);
assert(index >=0 && index < n);
centers[0] = dsindices[index];
for (int i = 0; i < n; i++) {
closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
}
// Choose each center
int centerCount;
for (centerCount = 1; centerCount < k; centerCount++) {
// Repeat several trials
double bestNewPot = -1;
int bestNewIndex = 0;
DistanceType furthest = 0;
for (index = 0; index < n; index++) {
// We will test only the potential of the points further than current candidate
if( closestDistSq[index] > kSpeedUpFactor * (float)furthest ) {
// Compute the new potential
double newPot = 0;
for (int i = 0; i < n; i++) {
newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols)
, closestDistSq[i] );
}
// Store the best result
if ((bestNewPot < 0)||(newPot <= bestNewPot)) {
bestNewPot = newPot;
bestNewIndex = index;
furthest = closestDistSq[index];
}
}
}
// Add the appropriate center
centers[centerCount] = dsindices[bestNewIndex];
for (int i = 0; i < n; i++) {
closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols)
, closestDistSq[i] );
}
}
centers_length = centerCount;
delete[] closestDistSq;
}
public:
@ -290,6 +368,9 @@ public:
else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;
}
else if (centers_init_==FLANN_CENTERS_GROUPWISE) {
chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser;
}
else {
throw FLANNException("Unknown algorithm for choosing initial centers.");
}