mirror of
https://github.com/opencv/opencv.git
synced 2024-11-28 21:20:18 +08:00
Merge pull request #18211 from pemmanuelviel:pev--handle-dna-vectors
* DNA-mode: update miniflann to handle DNA * DNA-mode: update hierarchical kmeans to handle DNA sequences
This commit is contained in:
parent
698b2bf729
commit
31dc3e9256
@ -50,6 +50,9 @@
|
||||
#include "logger.h"
|
||||
|
||||
#define BITS_PER_CHAR 8
|
||||
#define BITS_PER_BASE 2 // for DNA/RNA sequences
|
||||
#define BASE_PER_CHAR (BITS_PER_CHAR/BITS_PER_BASE)
|
||||
#define HISTOS_PER_BASE (1<<BITS_PER_BASE)
|
||||
|
||||
|
||||
namespace cvflann
|
||||
@ -825,6 +828,73 @@ private:
|
||||
}
|
||||
|
||||
|
||||
void computeDnaNodeStatistics(KMeansNodePtr node, int* indices,
|
||||
unsigned int indices_length)
|
||||
{
|
||||
const unsigned int histos_veclen = static_cast<unsigned int>(
|
||||
veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR));
|
||||
|
||||
unsigned long long variance = 0ull;
|
||||
unsigned int* histograms = new unsigned int[histos_veclen];
|
||||
memset(histograms, 0, sizeof(unsigned int)*histos_veclen);
|
||||
|
||||
for (unsigned int i=0; i<indices_length; ++i) {
|
||||
variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>(
|
||||
distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_)));
|
||||
|
||||
unsigned char* vec = (unsigned char*)dataset_[indices[i]];
|
||||
for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
|
||||
histograms[k + ((vec[l]) & 0x03)]++;
|
||||
histograms[k + 4 + ((vec[l]>>2) & 0x03)]++;
|
||||
histograms[k + 8 + ((vec[l]>>4) & 0x03)]++;
|
||||
histograms[k +12 + ((vec[l]>>6) & 0x03)]++;
|
||||
}
|
||||
}
|
||||
|
||||
CentersType* mean = new CentersType[veclen_];
|
||||
memoryCounter_ += int(veclen_*sizeof(CentersType));
|
||||
unsigned char* char_mean = (unsigned char*)mean;
|
||||
unsigned int* h = histograms;
|
||||
for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
|
||||
char_mean[l] = (h[k] > h[k+1] ? h[k+2] > h[k+3] ? h[k] > h[k+2] ? 0x00 : 0x10
|
||||
: h[k] > h[k+3] ? 0x00 : 0x11
|
||||
: h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10
|
||||
: h[k+1] > h[k+3] ? 0x01 : 0x11)
|
||||
| (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000
|
||||
: h[k+4] > h[k+7] ? 0x00 : 0x1100
|
||||
: h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000
|
||||
: h[k+5] > h[k+7] ? 0x0100 : 0x1100)
|
||||
| (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000
|
||||
: h[k+8] >h[k+11] ? 0x00 : 0x110000
|
||||
: h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000
|
||||
: h[k+9] >h[k+11] ? 0x010000 : 0x110000)
|
||||
| (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000
|
||||
: h[k+12] >h[k+15] ? 0x00 : 0x11000000
|
||||
: h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000
|
||||
: h[k+13] >h[k+15] ? 0x01000000 : 0x11000000);
|
||||
}
|
||||
variance = static_cast<unsigned long long>(
|
||||
0.5 + static_cast<double>(variance) / static_cast<double>(indices_length));
|
||||
variance -= static_cast<unsigned long long>(
|
||||
ensureSquareDistance<Distance>(
|
||||
distance_(mean, ZeroIterator<ElementType>(), veclen_)));
|
||||
|
||||
DistanceType radius = 0;
|
||||
for (unsigned int i=0; i<indices_length; ++i) {
|
||||
DistanceType tmp = distance_(mean, dataset_[indices[i]], veclen_);
|
||||
if (tmp>radius) {
|
||||
radius = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
node->variance = static_cast<DistanceType>(variance);
|
||||
node->radius = radius;
|
||||
node->pivot = mean;
|
||||
|
||||
delete[] histograms;
|
||||
}
|
||||
|
||||
|
||||
template<typename DistType>
|
||||
void computeNodeStatistics(KMeansNodePtr node, int* indices,
|
||||
unsigned int indices_length,
|
||||
@ -858,6 +928,22 @@ private:
|
||||
computeBitfieldNodeStatistics(node, indices, indices_length);
|
||||
}
|
||||
|
||||
void computeNodeStatistics(KMeansNodePtr node, int* indices,
|
||||
unsigned int indices_length,
|
||||
const cvflann::DNAmmingLUT* identifier)
|
||||
{
|
||||
(void)identifier;
|
||||
computeDnaNodeStatistics(node, indices, indices_length);
|
||||
}
|
||||
|
||||
void computeNodeStatistics(KMeansNodePtr node, int* indices,
|
||||
unsigned int indices_length,
|
||||
const cvflann::DNAmming2<unsigned char>* identifier)
|
||||
{
|
||||
(void)identifier;
|
||||
computeDnaNodeStatistics(node, indices, indices_length);
|
||||
}
|
||||
|
||||
|
||||
void refineClustering(int* indices, int indices_length, int branching, CentersType** centers,
|
||||
std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
|
||||
@ -1051,6 +1137,112 @@ private:
|
||||
}
|
||||
|
||||
|
||||
void refineDnaClustering(int* indices, int indices_length, int branching, CentersType** centers,
|
||||
std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
|
||||
{
|
||||
for (int i=0; i<branching; ++i) {
|
||||
centers[i] = new CentersType[veclen_];
|
||||
memoryCounter_ += (int)(veclen_*sizeof(CentersType));
|
||||
}
|
||||
|
||||
const unsigned int histos_veclen = static_cast<unsigned int>(
|
||||
veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR));
|
||||
cv::AutoBuffer<unsigned int> histos_buf(branching*histos_veclen);
|
||||
Matrix<unsigned int> histos(histos_buf.data(), branching, histos_veclen);
|
||||
|
||||
bool converged = false;
|
||||
int iteration = 0;
|
||||
while (!converged && iteration<iterations_) {
|
||||
converged = true;
|
||||
iteration++;
|
||||
|
||||
// compute the new cluster centers
|
||||
for (int i=0; i<branching; ++i) {
|
||||
memset(histos[i],0,sizeof(unsigned int)*histos_veclen);
|
||||
radiuses[i] = 0;
|
||||
}
|
||||
for (int i=0; i<indices_length; ++i) {
|
||||
unsigned char* vec = (unsigned char*)dataset_[indices[i]];
|
||||
unsigned int* h = histos[belongs_to[i]];
|
||||
for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
|
||||
h[k + ((vec[l]) & 0x03)]++;
|
||||
h[k + 4 + ((vec[l]>>2) & 0x03)]++;
|
||||
h[k + 8 + ((vec[l]>>4) & 0x03)]++;
|
||||
h[k +12 + ((vec[l]>>6) & 0x03)]++;
|
||||
}
|
||||
}
|
||||
for (int i=0; i<branching; ++i) {
|
||||
unsigned int* h = histos[i];
|
||||
unsigned char* charCenter = (unsigned char*)centers[i];
|
||||
for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
|
||||
charCenter[l]= (h[k] > h[k+1] ? h[k+2] > h[k+3] ? h[k] > h[k+2] ? 0x00 : 0x10
|
||||
: h[k] > h[k+3] ? 0x00 : 0x11
|
||||
: h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10
|
||||
: h[k+1] > h[k+3] ? 0x01 : 0x11)
|
||||
| (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000
|
||||
: h[k+4] > h[k+7] ? 0x00 : 0x1100
|
||||
: h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000
|
||||
: h[k+5] > h[k+7] ? 0x0100 : 0x1100)
|
||||
| (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000
|
||||
: h[k+8] >h[k+11] ? 0x00 : 0x110000
|
||||
: h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000
|
||||
: h[k+9] >h[k+11] ? 0x010000 : 0x110000)
|
||||
| (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000
|
||||
: h[k+12] >h[k+15] ? 0x00 : 0x11000000
|
||||
: h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000
|
||||
: h[k+13] >h[k+15] ? 0x01000000 : 0x11000000);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<int> new_centroids(indices_length);
|
||||
std::vector<DistanceType> dists(indices_length);
|
||||
|
||||
// reassign points to clusters
|
||||
KMeansDistanceComputer<ElementType**> invoker(
|
||||
distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
|
||||
parallel_for_(cv::Range(0, (int)indices_length), invoker);
|
||||
|
||||
for (int i=0; i < indices_length; ++i) {
|
||||
DistanceType dist(dists[i]);
|
||||
int new_centroid(new_centroids[i]);
|
||||
if (dist > radiuses[new_centroid]) {
|
||||
radiuses[new_centroid] = dist;
|
||||
}
|
||||
if (new_centroid != belongs_to[i]) {
|
||||
count[belongs_to[i]]--;
|
||||
count[new_centroid]++;
|
||||
belongs_to[i] = new_centroid;
|
||||
converged = false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<branching; ++i) {
|
||||
// if one cluster converges to an empty cluster,
|
||||
// move an element into that cluster
|
||||
if (count[i]==0) {
|
||||
int j = (i+1)%branching;
|
||||
while (count[j]<=1) {
|
||||
j = (j+1)%branching;
|
||||
}
|
||||
|
||||
for (int k=0; k<indices_length; ++k) {
|
||||
if (belongs_to[k]==j) {
|
||||
// for cluster j, we move the furthest element from the center to the empty cluster i
|
||||
if ( distance_(dataset_[indices[k]], centers[j], veclen_) == radiuses[j] ) {
|
||||
belongs_to[k] = i;
|
||||
count[j]--;
|
||||
count[i]++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
converged = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void computeSubClustering(KMeansNodePtr node, int* indices, int indices_length,
|
||||
int branching, int level, CentersType** centers,
|
||||
std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
|
||||
@ -1150,7 +1342,7 @@ private:
|
||||
/**
|
||||
* The methods responsible with doing the recursive hierarchical clustering on
|
||||
* binary vectors.
|
||||
* As some might have heared that KMeans on binary data doesn't make sense,
|
||||
* As some might have heard that KMeans on binary data doesn't make sense,
|
||||
* it's worth a little explanation why it actually fairly works. As
|
||||
* with the Hierarchical Clustering algortihm, we seed several centers for the
|
||||
* current node by picking some of its points. Then in a first pass each point
|
||||
@ -1233,6 +1425,34 @@ private:
|
||||
}
|
||||
|
||||
|
||||
void refineAndSplitClustering(
|
||||
KMeansNodePtr node, int* indices, int indices_length, int branching,
|
||||
int level, CentersType** centers, std::vector<DistanceType>& radiuses,
|
||||
int* belongs_to, int* count, const cvflann::DNAmmingLUT* identifier)
|
||||
{
|
||||
(void)identifier;
|
||||
refineDnaClustering(
|
||||
indices, indices_length, branching, centers, radiuses, belongs_to, count);
|
||||
|
||||
computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
|
||||
level, centers, radiuses, belongs_to, count);
|
||||
}
|
||||
|
||||
|
||||
void refineAndSplitClustering(
|
||||
KMeansNodePtr node, int* indices, int indices_length, int branching,
|
||||
int level, CentersType** centers, std::vector<DistanceType>& radiuses,
|
||||
int* belongs_to, int* count, const cvflann::DNAmming2<unsigned char>* identifier)
|
||||
{
|
||||
(void)identifier;
|
||||
refineDnaClustering(
|
||||
indices, indices_length, branching, centers, radiuses, belongs_to, count);
|
||||
|
||||
computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
|
||||
level, centers, radiuses, belongs_to, count);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The method responsible with actually doing the recursive hierarchical
|
||||
* clustering
|
||||
|
@ -345,6 +345,7 @@ typedef ::cvflann::Hamming<uchar> HammingDistance;
|
||||
#else
|
||||
typedef ::cvflann::HammingLUT HammingDistance;
|
||||
#endif
|
||||
typedef ::cvflann::DNAmming2<uchar> DNAmmingDistance;
|
||||
|
||||
Index::Index()
|
||||
{
|
||||
@ -397,6 +398,9 @@ void Index::build(InputArray _data, const IndexParams& params, flann_distance_t
|
||||
buildIndex< ::cvflann::L1<float> >(index, data, params);
|
||||
break;
|
||||
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
|
||||
case FLANN_DIST_DNAMMING:
|
||||
buildIndex< DNAmmingDistance >(index, data, params);
|
||||
break;
|
||||
case FLANN_DIST_MAX:
|
||||
buildIndex< ::cvflann::MaxDistance<float> >(index, data, params);
|
||||
break;
|
||||
@ -452,6 +456,9 @@ void Index::release()
|
||||
deleteIndex< ::cvflann::L1<float> >(index);
|
||||
break;
|
||||
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
|
||||
case FLANN_DIST_DNAMMING:
|
||||
deleteIndex< DNAmmingDistance >(index);
|
||||
break;
|
||||
case FLANN_DIST_MAX:
|
||||
deleteIndex< ::cvflann::MaxDistance<float> >(index);
|
||||
break;
|
||||
@ -573,7 +580,8 @@ void Index::knnSearch(InputArray _query, OutputArray _indices,
|
||||
CV_INSTRUMENT_REGION();
|
||||
|
||||
Mat query = _query.getMat(), indices, dists;
|
||||
int dtype = distType == FLANN_DIST_HAMMING ? CV_32S : CV_32F;
|
||||
int dtype = (distType == FLANN_DIST_HAMMING)
|
||||
|| (distType == FLANN_DIST_DNAMMING) ? CV_32S : CV_32F;
|
||||
|
||||
createIndicesDists( _indices, _dists, indices, dists, query.rows, knn, knn, dtype );
|
||||
|
||||
@ -589,6 +597,9 @@ void Index::knnSearch(InputArray _query, OutputArray _indices,
|
||||
runKnnSearch< ::cvflann::L1<float> >(index, query, indices, dists, knn, params);
|
||||
break;
|
||||
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
|
||||
case FLANN_DIST_DNAMMING:
|
||||
runKnnSearch<DNAmmingDistance>(index, query, indices, dists, knn, params);
|
||||
break;
|
||||
case FLANN_DIST_MAX:
|
||||
runKnnSearch< ::cvflann::MaxDistance<float> >(index, query, indices, dists, knn, params);
|
||||
break;
|
||||
@ -617,7 +628,8 @@ int Index::radiusSearch(InputArray _query, OutputArray _indices,
|
||||
CV_INSTRUMENT_REGION();
|
||||
|
||||
Mat query = _query.getMat(), indices, dists;
|
||||
int dtype = distType == FLANN_DIST_HAMMING ? CV_32S : CV_32F;
|
||||
int dtype = (distType == FLANN_DIST_HAMMING)
|
||||
|| (distType == FLANN_DIST_DNAMMING) ? CV_32S : CV_32F;
|
||||
CV_Assert( maxResults > 0 );
|
||||
createIndicesDists( _indices, _dists, indices, dists, query.rows, maxResults, INT_MAX, dtype );
|
||||
|
||||
@ -634,6 +646,8 @@ int Index::radiusSearch(InputArray _query, OutputArray _indices,
|
||||
case FLANN_DIST_L1:
|
||||
return runRadiusSearch< ::cvflann::L1<float> >(index, query, indices, dists, radius, params);
|
||||
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
|
||||
case FLANN_DIST_DNAMMING:
|
||||
return runRadiusSearch< DNAmmingDistance >(index, query, indices, dists, radius, params);
|
||||
case FLANN_DIST_MAX:
|
||||
return runRadiusSearch< ::cvflann::MaxDistance<float> >(index, query, indices, dists, radius, params);
|
||||
case FLANN_DIST_HIST_INTERSECT:
|
||||
@ -697,6 +711,9 @@ void Index::save(const String& filename) const
|
||||
saveIndex< ::cvflann::L1<float> >(this, index, fout);
|
||||
break;
|
||||
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
|
||||
case FLANN_DIST_DNAMMING:
|
||||
saveIndex< DNAmmingDistance >(this, index, fout);
|
||||
break;
|
||||
case FLANN_DIST_MAX:
|
||||
saveIndex< ::cvflann::MaxDistance<float> >(this, index, fout);
|
||||
break;
|
||||
@ -778,6 +795,7 @@ bool Index::load(InputArray _data, const String& filename)
|
||||
distType = (flann_distance_t)idistType;
|
||||
|
||||
if( !((distType == FLANN_DIST_HAMMING && featureType == CV_8U) ||
|
||||
(distType == FLANN_DIST_DNAMMING && featureType == CV_8U) ||
|
||||
(distType != FLANN_DIST_HAMMING && featureType == CV_32F)) )
|
||||
{
|
||||
fprintf(stderr, "Reading FLANN index error: unsupported feature type %d for the index type %d\n", featureType, algo);
|
||||
@ -797,6 +815,9 @@ bool Index::load(InputArray _data, const String& filename)
|
||||
loadIndex< ::cvflann::L1<float> >(this, index, data, fin);
|
||||
break;
|
||||
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
|
||||
case FLANN_DIST_DNAMMING:
|
||||
loadIndex< DNAmmingDistance >(this, index, data, fin);
|
||||
break;
|
||||
case FLANN_DIST_MAX:
|
||||
loadIndex< ::cvflann::MaxDistance<float> >(this, index, data, fin);
|
||||
break;
|
||||
|
Loading…
Reference in New Issue
Block a user