opencv/doc/core_clustering_search.tex

\section{Clustering}

\ifCPy

\cvCPyFunc{KMeans2}
Splits set of vectors by a given number of clusters.

\cvdefC{int cvKMeans2(const CvArr* samples, int nclusters,\par
                      CvArr* labels, CvTermCriteria termcrit,\par
                      int attempts=1, CvRNG* rng=0, \par
                      int flags=0, CvArr* centers=0,\par
                      double* compactness=0);}
\cvdefPy{KMeans2(samples,nclusters,labels,termcrit)-> None}

\begin{description}
\cvarg{samples}{Floating-point matrix of input samples, one row per sample}
\cvarg{nclusters}{Number of clusters to split the set by}
\cvarg{labels}{Output integer vector storing cluster indices for every sample}
\cvarg{termcrit}{Specifies maximum number of iterations and/or accuracy (distance the centers can move by between subsequent iterations)}
\ifC
\cvarg{attempts}{How many times the algorithm is executed using different initial labelings. The algorithm returns labels that yield the best compactness (see the last function parameter)}
\cvarg{rng}{Optional external random number generator; can be used to fully control the function behaviour}
\cvarg{flags}{Can be 0 or \texttt{CV\_KMEANS\_USE\_INITIAL\_LABELS}. The latter
value means that during the first (and possibly the only) attempt, the
function uses the user-supplied labels as the initial approximation
instead of generating random labels. For the second and further attempts,
the function will use randomly generated labels in any case}
\cvarg{centers}{The optional output array of the cluster centers}
\cvarg{compactness}{The optional output parameter, which is computed as
$\sum_i ||\texttt{samples}_i - \texttt{centers}_{\texttt{labels}_i}||^2$
after every attempt; the best (minimum) value is chosen and the
corresponding labels are returned by the function. Basically, the
user can use only the core of the function, set the number of
attempts to 1, initialize labels each time using a custom algorithm
(\texttt{flags=CV\_KMEAN\_USE\_INITIAL\_LABELS}) and, based on the output compactness
or any other criteria, choose the best clustering.}
\fi
\end{description}

The function \texttt{cvKMeans2} implements a k-means algorithm that finds the
centers of \texttt{nclusters} clusters and groups the input samples
around the clusters. On output, $\texttt{labels}_i$ contains a cluster index for
samples stored in the i-th row of the \texttt{samples} matrix.

\ifC
% Example: Clustering random samples of multi-gaussian distribution with k-means
\begin{lstlisting}
#include "cxcore.h"
#include "highgui.h"

void main( int argc, char** argv )
{
    #define MAX_CLUSTERS 5
    CvScalar color_tab[MAX_CLUSTERS];
    IplImage* img = cvCreateImage( cvSize( 500, 500 ), 8, 3 );
    CvRNG rng = cvRNG(0xffffffff);

    color_tab[0] = CV_RGB(255,0,0);
    color_tab[1] = CV_RGB(0,255,0);
    color_tab[2] = CV_RGB(100,100,255);
    color_tab[3] = CV_RGB(255,0,255);
    color_tab[4] = CV_RGB(255,255,0);

    cvNamedWindow( "clusters", 1 );

    for(;;)
    {
        int k, cluster_count = cvRandInt(&rng)%MAX_CLUSTERS + 1;
        int i, sample_count = cvRandInt(&rng)%1000 + 1;
        CvMat* points = cvCreateMat( sample_count, 1, CV_32FC2 );
        CvMat* clusters = cvCreateMat( sample_count, 1, CV_32SC1 );

        /* generate random sample from multigaussian distribution */
        for( k = 0; k < cluster_count; k++ )
        {
            CvPoint center;
            CvMat point_chunk;
            center.x = cvRandInt(&rng)%img->width;
            center.y = cvRandInt(&rng)%img->height;
            cvGetRows( points,
                       &point_chunk,
                       k*sample_count/cluster_count,
                       (k == (cluster_count - 1)) ?
                           sample_count :
                           (k+1)*sample_count/cluster_count );
            cvRandArr( &rng, &point_chunk, CV_RAND_NORMAL,
                       cvScalar(center.x,center.y,0,0),
                       cvScalar(img->width/6, img->height/6,0,0) );
        }

        /* shuffle samples */
        for( i = 0; i < sample_count/2; i++ )
        {
            CvPoint2D32f* pt1 =
                (CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count;
            CvPoint2D32f* pt2 =
                (CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count;
            CvPoint2D32f temp;
            CV_SWAP( *pt1, *pt2, temp );
        }

        cvKMeans2( points, cluster_count, clusters,
                   cvTermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10, 1.0 ));

        cvZero( img );

        for( i = 0; i < sample_count; i++ )
        {
            CvPoint2D32f pt = ((CvPoint2D32f*)points->data.fl)[i];
            int cluster_idx = clusters->data.i[i];
            cvCircle( img,
                      cvPointFrom32f(pt),
                      2,
                      color_tab[cluster_idx],
                      CV_FILLED );
        }

        cvReleaseMat( &points );
        cvReleaseMat( &clusters );

        cvShowImage( "clusters", img );

        int key = cvWaitKey(0);
        if( key == 27 )
            break;
    }
}
\end{lstlisting}

\cvCPyFunc{SeqPartition}
Splits a sequence into equivalency classes.

\cvdefC{
int cvSeqPartition( \par const CvSeq* seq,\par CvMemStorage* storage,\par CvSeq** labels,\par CvCmpFunc is\_equal,\par void* userdata );
}

\begin{description}
\cvarg{seq}{The sequence to partition}
\cvarg{storage}{The storage block to store the sequence of equivalency classes. If it is NULL, the function uses \texttt{seq->storage} for output labels}
\cvarg{labels}{Ouput parameter. Double pointer to the sequence of 0-based labels of input sequence elements}
\cvarg{is\_equal}{The relation function that should return non-zero if the two particular sequence elements are from the same class, and zero otherwise. The partitioning algorithm uses transitive closure of the relation function as an equivalency critria}
\cvarg{userdata}{Pointer that is transparently passed to the \texttt{is\_equal} function}
\end{description}

\begin{lstlisting}
typedef int (CV_CDECL* CvCmpFunc)(const void* a, const void* b, void* userdata);
\end{lstlisting}

The function \texttt{cvSeqPartition} implements a quadratic algorithm for
splitting a set into one or more equivalancy classes. The function
returns the number of equivalency classes.

% Example: Partitioning a 2d point set
\begin{lstlisting}

#include "cxcore.h"
#include "highgui.h"
#include <stdio.h>

CvSeq* point_seq = 0;
IplImage* canvas = 0;
CvScalar* colors = 0;
int pos = 10;

int is_equal( const void* _a, const void* _b, void* userdata )
{
    CvPoint a = *(const CvPoint*)_a;
    CvPoint b = *(const CvPoint*)_b;
    double threshold = *(double*)userdata;
    return (double)((a.x - b.x)*(a.x - b.x) + (a.y - b.y)*(a.y - b.y)) <=
        threshold;
}

void on_track( int pos )
{
    CvSeq* labels = 0;
    double threshold = pos*pos;
    int i, class_count = cvSeqPartition( point_seq,
                                         0,
                                         &labels,
                                         is_equal,
                                         &threshold );
    printf("%4d classes\n", class_count );
    cvZero( canvas );

    for( i = 0; i < labels->total; i++ )
    {
        CvPoint pt = *(CvPoint*)cvGetSeqElem( point_seq, i );
        CvScalar color = colors[*(int*)cvGetSeqElem( labels, i )];
        cvCircle( canvas, pt, 1, color, -1 );
    }

    cvShowImage( "points", canvas );
}

int main( int argc, char** argv )
{
    CvMemStorage* storage = cvCreateMemStorage(0);
    point_seq = cvCreateSeq( CV_32SC2,
                             sizeof(CvSeq),
                             sizeof(CvPoint),
                             storage );
    CvRNG rng = cvRNG(0xffffffff);

    int width = 500, height = 500;
    int i, count = 1000;
    canvas = cvCreateImage( cvSize(width,height), 8, 3 );

    colors = (CvScalar*)cvAlloc( count*sizeof(colors[0]) );
    for( i = 0; i < count; i++ )
    {
        CvPoint pt;
        int icolor;
        pt.x = cvRandInt( &rng ) % width;
        pt.y = cvRandInt( &rng ) % height;
        cvSeqPush( point_seq, &pt );
        icolor = cvRandInt( &rng ) | 0x00404040;
        colors[i] = CV_RGB(icolor & 255,
                           (icolor >> 8)&255,
                           (icolor >> 16)&255);
    }

    cvNamedWindow( "points", 1 );
    cvCreateTrackbar( "threshold", "points", &pos, 50, on_track );
    on_track(pos);
    cvWaitKey(0);
    return 0;
}
\end{lstlisting}

\fi

\fi

\ifCpp

\cvCppFunc{kmeans}
Finds the centers of clusters and groups the input samples around the clusters.
\cvdefCpp{double kmeans( const Mat\& samples, int clusterCount, Mat\& labels,\par
               TermCriteria termcrit, int attempts,\par
               int flags, Mat* centers );}
\begin{description}
\cvarg{samples}{Floating-point matrix of input samples, one row per sample}
\cvarg{clusterCount}{The number of clusters to split the set by}
\cvarg{labels}{The input/output integer array that will store the cluster indices for every sample}
\cvarg{termcrit}{Specifies maximum number of iterations and/or accuracy (distance the centers can move by between subsequent iterations)}

\cvarg{attempts}{How many times the algorithm is executed using different initial labelings. The algorithm returns the labels that yield the best compactness (see the last function parameter)}
\cvarg{flags}{It can take the following values:
\begin{description}
\cvarg{KMEANS\_RANDOM\_CENTERS}{Random initial centers are selected in each attempt}
\cvarg{KMEANS\_PP\_CENTERS}{Use kmeans++ center initialization by Arthur and Vassilvitskii}
\cvarg{KMEANS\_USE\_INITIAL\_LABELS}{During the first (and possibly the only) attempt, the
function uses the user-supplied labels instaed of computing them from the initial centers. For the second and further attempts, the function will use the random or semi-random centers (use one of \texttt{KMEANS\_*\_CENTERS} flag to specify the exact method)}
\end{description}}
\cvarg{centers}{The output matrix of the cluster centers, one row per each cluster center}
\end{description}

The function \texttt{kmeans} implements a k-means algorithm that finds the
centers of \texttt{clusterCount} clusters and groups the input samples
around the clusters. On output, $\texttt{labels}_i$ contains a 0-based cluster index for
the sample stored in the $i^{th}$ row of the \texttt{samples} matrix.

The function returns the compactness measure, which is computed as
\[
\sum_i \|\texttt{samples}_i - \texttt{centers}_{\texttt{labels}_i}\|^2
\]
after every attempt; the best (minimum) value is chosen and the
corresponding labels and the compactness value are returned by the function.
Basically, the user can use only the core of the function, set the number of
attempts to 1, initialize labels each time using some custom algorithm and pass them with
\par (\texttt{flags}=\texttt{KMEANS\_USE\_INITIAL\_LABELS}) flag, and then choose the best (most-compact) clustering.

\cvCppFunc{partition}
Splits an element set into equivalency classes.

\cvdefCpp{template<typename \_Tp, class \_EqPredicate> int\newline
    partition( const vector<\_Tp>\& vec, vector<int>\& labels,\par
               \_EqPredicate predicate=\_EqPredicate());}
\begin{description}
\cvarg{vec}{The set of elements stored as a vector}
\cvarg{labels}{The output vector of labels; will contain as many elements as \texttt{vec}. Each label \texttt{labels[i]} is 0-based cluster index of \texttt{vec[i]}}
\cvarg{predicate}{The equivalence predicate (i.e. pointer to a boolean function of two arguments or an instance of the class that has the method \texttt{bool operator()(const \_Tp\& a, const \_Tp\& b)}. The predicate returns true when the elements are certainly if the same class, and false if they may or may not be in the same class}
\end{description}

The generic function \texttt{partition} implements an $O(N^2)$ algorithm for
splitting a set of $N$ elements into one or more equivalency classes, as described in \url{http://en.wikipedia.org/wiki/Disjoint-set_data_structure}. The function
returns the number of equivalency classes.


\fi