BFMatcher

match radiusMatch
2025-06-11 03:33:28 +08:00 · 2013-12-27 13:04:02 +04:00 · 2013-12-27 13:04:02 +04:00 · ca5689e0db
commit ca5689e0db
parent ee331001f5
8 changed files with 1947 additions and 63 deletions
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@ -113,6 +113,7 @@ public:
    virtual Mat getMat(int idx=-1) const;
    virtual UMat getUMat(int idx=-1) const;
    virtual void getMatVector(std::vector<Mat>& mv) const;
    virtual void getUMatVector(std::vector<UMat>& umv) const;
    virtual cuda::GpuMat getGpuMat() const;
    virtual ogl::Buffer getOGlBuffer() const;
    void* getObj() const;
@ -134,7 +135,7 @@ public:
    virtual size_t step(int i=-1) const;
    bool isMat() const;
    bool isUMat() const;
-    bool isMatVectot() const;
+    bool isMatVector() const;
    bool isUMatVector() const;
    bool isMatx();
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@ -110,7 +110,7 @@ inline _InputArray::~_InputArray() {}
 inline bool _InputArray::isMat() const { return kind() == _InputArray::MAT; }
 inline bool _InputArray::isUMat() const  { return kind() == _InputArray::UMAT; }
-inline bool _InputArray::isMatVectot() const { return kind() == _InputArray::STD_VECTOR_MAT; }
+inline bool _InputArray::isMatVector() const { return kind() == _InputArray::STD_VECTOR_MAT; }
 inline bool _InputArray::isUMatVector() const  { return kind() == _InputArray::STD_VECTOR_UMAT; }
 inline bool _InputArray::isMatx()  { return kind() == _InputArray::MATX; }
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@ -1324,6 +1324,42 @@ void _InputArray::getMatVector(std::vector<Mat>& mv) const
    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
 }
 void _InputArray::getUMatVector(std::vector<UMat>& umv) const
 {
    int k = kind();
    int accessFlags = flags & ACCESS_MASK;
    if( k == NONE )
    {
        umv.clear();
        return;
    }
    if( k == STD_VECTOR_MAT )
    {
        const std::vector<Mat>& v = *(const std::vector<Mat>*)obj;
        size_t i, n = v.size();
        umv.resize(n);
        for( i = 0; i < n; i++ )
            umv[i] = v[i].getUMat(accessFlags);
        return;
    }
    if( k == STD_VECTOR_UMAT )
    {
        const std::vector<UMat>& v = *(const std::vector<UMat>*)obj;
        size_t i, n = v.size();
        umv.resize(n);
        for( i = 0; i < n; i++ )
            umv[i] = v[i];
        return;
    }
    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
 }
 cuda::GpuMat _InputArray::getGpuMat() const
 {
    int k = kind();
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@ -998,7 +998,7 @@ public:
     * Add descriptors to train descriptor collection.
     * descriptors      Descriptors to add. Each descriptors[i] is a descriptors set from one image.
     */
-    CV_WRAP virtual void add( const std::vector<Mat>& descriptors );
+    CV_WRAP virtual void add( InputArray descriptors );
    /*
     * Get train descriptors collection.
     */
@ -1034,29 +1034,29 @@ public:
     * Method train() is run in this methods.
     */
    // Find one best match for each query descriptor (if mask is empty).
-    CV_WRAP void match( const Mat& queryDescriptors, const Mat& trainDescriptors,
+    CV_WRAP void match( InputArray queryDescriptors, InputArray trainDescriptors,
-                CV_OUT std::vector<DMatch>& matches, const Mat& mask=Mat() ) const;
+                CV_OUT std::vector<DMatch>& matches, InputArray mask=Mat() ) const;
    // Find k best matches for each query descriptor (in increasing order of distances).
    // compactResult is used when mask is not empty. If compactResult is false matches
    // vector will have the same size as queryDescriptors rows. If compactResult is true
    // matches vector will not contain matches for fully masked out query descriptors.
-    CV_WRAP void knnMatch( const Mat& queryDescriptors, const Mat& trainDescriptors,
+    CV_WRAP void knnMatch( InputArray queryDescriptors, InputArray trainDescriptors,
                   CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
-                   const Mat& mask=Mat(), bool compactResult=false ) const;
+                   InputArray mask=Mat(), bool compactResult=false ) const;
    // Find best matches for each query descriptor which have distance less than
    // maxDistance (in increasing order of distances).
-    void radiusMatch( const Mat& queryDescriptors, const Mat& trainDescriptors,
+    void radiusMatch( InputArray queryDescriptors, InputArray trainDescriptors,
                      std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                      const Mat& mask=Mat(), bool compactResult=false ) const;
+                      InputArray mask=Mat(), bool compactResult=false ) const;
    /*
     * Group of methods to match descriptors from one image to image set.
     * See description of similar methods for matching image pair above.
     */
-    CV_WRAP void match( const Mat& queryDescriptors, CV_OUT std::vector<DMatch>& matches,
+    CV_WRAP void match( InputArray queryDescriptors, CV_OUT std::vector<DMatch>& matches,
                const std::vector<Mat>& masks=std::vector<Mat>() );
-    CV_WRAP void knnMatch( const Mat& queryDescriptors, CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
+    CV_WRAP void knnMatch( InputArray queryDescriptors, CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
-    void radiusMatch( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+    void radiusMatch( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
                   const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
    // Reads matcher object from a file node
@ -1101,10 +1101,10 @@ protected:
    // In fact the matching is implemented only by the following two methods. These methods suppose
    // that the class object has been trained already. Public match methods call these methods
    // after calling train().
-    virtual void knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
-           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false ) = 0;
+        InputArrayOfArrays masks=std::vector<Mat>(), bool compactResult=false ) = 0;
-    virtual void radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false ) = 0;
+        InputArrayOfArrays masks=std::vector<Mat>(), bool compactResult=false ) = 0;
    static bool isPossibleMatch( const Mat& mask, int queryIdx, int trainIdx );
    static bool isMaskedOut( const std::vector<Mat>& masks, int queryIdx );
@ -1114,6 +1114,7 @@ protected:
    // Collection of descriptors from train images.
    std::vector<Mat> trainDescCollection;
    std::vector<UMat> utrainDescCollection;
 };
 /*
@ -1137,10 +1138,16 @@ public:
    AlgorithmInfo* info() const;
 protected:
-    virtual void knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
-           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
+        InputArrayOfArrays masks=std::vector<Mat>(), bool compactResult=false );
-    virtual void radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
+        InputArrayOfArrays masks=std::vector<Mat>(), bool compactResult=false );
    bool ocl_knnMatch(InputArray query, InputArray train, std::vector< std::vector<DMatch> > &matches,
        int k, int dstType, bool compactResult=false);
    bool ocl_radiusMatch(InputArray query, InputArray train, std::vector< std::vector<DMatch> > &matches,
        float maxDistance, int dstType, bool compactResult=false);
    bool ocl_match(InputArray query, InputArray train, std::vector< std::vector<DMatch> > &matches, int dstType);
    int normType;
    bool crossCheck;
@ -1175,10 +1182,10 @@ protected:
                                   const Mat& indices, const Mat& distances,
                                   std::vector<std::vector<DMatch> >& matches );
-    virtual void knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
-                   const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
+        InputArrayOfArrays masks=std::vector<Mat>(), bool compactResult=false );
-    virtual void radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                   const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
+        InputArrayOfArrays masks=std::vector<Mat>(), bool compactResult=false );
    Ptr<flann::IndexParams> indexParams;
    Ptr<flann::SearchParams> searchParams;
--- a/modules/features2d/src/matchers.cpp
+++ b/modules/features2d/src/matchers.cpp
--- a/modules/features2d/src/opencl/brute_force_match.cl
+++ b/modules/features2d/src/opencl/brute_force_match.cl
@ -0,0 +1,789 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Nathan, liujun@multicorewareinc.com
 //    Peng Xiao, pengxiao@outlook.com
 //    Baichuan Su, baichuan@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
 #define MAX_FLOAT 3.40282e+038f
 #ifndef T
 #define T float
 #endif
 #ifndef BLOCK_SIZE
 #define BLOCK_SIZE 16
 #endif
 #ifndef MAX_DESC_LEN
 #define MAX_DESC_LEN 64
 #endif
 #ifndef DIST_TYPE
 #define DIST_TYPE 2
 #endif
 // dirty fix for non-template support
 #if   (DIST_TYPE == 2) // L1Dist
 #   ifdef T_FLOAT
 #       define DIST(x, y) fabs((x) - (y))
        typedef float value_type;
        typedef float result_type;
 #   else
 #       define DIST(x, y) abs((x) - (y))
        typedef int value_type;
        typedef int result_type;
 #   endif
 #define DIST_RES(x) (x)
 #elif (DIST_TYPE == 4) // L2Dist
 #define DIST(x, y) (((x) - (y)) * ((x) - (y)))
 typedef float value_type;
 typedef float result_type;
 #define DIST_RES(x) sqrt(x)
 #elif (DIST_TYPE == 6) // Hamming
 //http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
 inline int bit1Count(int v)
 {
    v = v - ((v >> 1) & 0x55555555);                    // reuse input as temporary
    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);     // temp
    return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
 }
 #define DIST(x, y) bit1Count( (x) ^ (y) )
 typedef int value_type;
 typedef int result_type;
 #define DIST_RES(x) (x)
 #endif
 inline result_type reduce_block(
    __local value_type *s_query,
    __local value_type *s_train,
    int lidx,
    int lidy
    )
 {
    result_type result = 0;
    #pragma unroll
    for (int j = 0 ; j < BLOCK_SIZE ; j++)
    {
        result += DIST(
            s_query[lidy * BLOCK_SIZE + j],
            s_train[j * BLOCK_SIZE + lidx]);
    }
    return DIST_RES(result);
 }
 inline result_type reduce_block_match(
    __local value_type *s_query,
    __local value_type *s_train,
    int lidx,
    int lidy
    )
 {
    result_type result = 0;
    #pragma unroll
    for (int j = 0 ; j < BLOCK_SIZE ; j++)
    {
        result += DIST(
            s_query[lidy * BLOCK_SIZE + j],
            s_train[j * BLOCK_SIZE + lidx]);
    }
    return (result);
 }
 inline result_type reduce_multi_block(
    __local value_type *s_query,
    __local value_type *s_train,
    int block_index,
    int lidx,
    int lidy
    )
 {
    result_type result = 0;
    #pragma unroll
    for (int j = 0 ; j < BLOCK_SIZE ; j++)
    {
        result += DIST(
            s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j],
            s_train[j * BLOCK_SIZE + lidx]);
    }
    return result;
 }
 /* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
 local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
 */
 __kernel void BruteForceMatch_UnrollMatch(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
 )
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    int queryIdx = groupidx * BLOCK_SIZE + lidy;
    // load the query into local memory.
    #pragma unroll
    for (int i = 0 ;  i <  MAX_DESC_LEN / BLOCK_SIZE; i ++)
    {
        int loadx = lidx + i * BLOCK_SIZE;
        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
    }
    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;
    // loopUnrolledCached to find the best trainIdx and best distance.
    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
    {
        result_type result = 0;
        #pragma unroll
        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
            const int loadx = lidx + i * BLOCK_SIZE;
            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);
            result += reduce_multi_block(s_query, s_train, i, lidx, lidy);
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        result = DIST_RES(result);
        int trainIdx = t * BLOCK_SIZE + lidx;
        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
        {
            myBestDistance = result;
            myBestTrainIdx = trainIdx;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    __local float *s_distance = (__local float*)(sharebuffer);
    __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
    //find BestMatch
    s_distance += lidy * BLOCK_SIZE;
    s_trainIdx += lidy * BLOCK_SIZE;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;
    barrier(CLK_LOCAL_MEM_FENCE);
    //reduce -- now all reduce implement in each threads.
    #pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE; k++)
    {
        if (myBestDistance > s_distance[k])
        {
            myBestDistance = s_distance[k];
            myBestTrainIdx = s_trainIdx[k];
        }
    }
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
 }
 __kernel void BruteForceMatch_Match(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
 )
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
    // loop
    for (int t = 0 ;  t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
    {
        result_type result = 0;
        for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
        {
            const int loadx = lidx + i * BLOCK_SIZE;
            //load query and train into local memory
            s_query[lidy * BLOCK_SIZE + lidx] = 0;
            s_train[lidx * BLOCK_SIZE + lidy] = 0;
            if (loadx < query_cols)
            {
                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            result += reduce_block_match(s_query, s_train, lidx, lidy);
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        result = DIST_RES(result);
        const int trainIdx = t * BLOCK_SIZE + lidx;
        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
        {
            myBestDistance = result;
            myBestTrainIdx = trainIdx;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
    //findBestMatch
    s_distance += lidy * BLOCK_SIZE;
    s_trainIdx += lidy * BLOCK_SIZE;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;
    barrier(CLK_LOCAL_MEM_FENCE);
    //reduce -- now all reduce implement in each threads.
    for (int k = 0 ; k < BLOCK_SIZE; k++)
    {
        if (myBestDistance > s_distance[k])
        {
            myBestDistance = s_distance[k];
            myBestTrainIdx = s_trainIdx[k];
        }
    }
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
 }
 //radius_unrollmatch
 __kernel void BruteForceMatch_RadiusUnrollMatch(
    __global T *query,
    __global T *train,
    float maxDistance,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int bestTrainIdx_cols,
    int step,
    int ostep
 )
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int groupidy = get_group_id(1);
    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
    const int trainIdx = groupidx * BLOCK_SIZE + lidx;
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
    result_type result = 0;
    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
    {
        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
        const int loadx = lidx + i * BLOCK_SIZE;
        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);
        result += reduce_block(s_query, s_train, lidx, lidy);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (queryIdx < query_rows && trainIdx < train_rows &&
        convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
    {
        int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
        if(ind < bestTrainIdx_cols)
        {
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
        }
    }
 }
 //radius_match
 __kernel void BruteForceMatch_RadiusMatch(
    __global T *query,
    __global T *train,
    float maxDistance,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int bestTrainIdx_cols,
    int step,
    int ostep
 )
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int groupidy = get_group_id(1);
    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
    const int trainIdx = groupidx * BLOCK_SIZE + lidx;
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
    result_type result = 0;
    for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
    {
        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
        const int loadx = lidx + i * BLOCK_SIZE;
        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);
        result += reduce_block(s_query, s_train, lidx, lidy);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (queryIdx < query_rows && trainIdx < train_rows &&
        convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
    {
        int ind = atom_inc(nMatches + queryIdx);
        if(ind < bestTrainIdx_cols)
        {
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
        }
    }
 }
 __kernel void BruteForceMatch_knnUnrollMatch(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
 )
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    // load the query into local memory.
    for (int i = 0 ;  i <  MAX_DESC_LEN / BLOCK_SIZE; i ++)
    {
        int loadx = lidx + i * BLOCK_SIZE;
        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
    }
    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    //loopUnrolledCached
    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
    {
        result_type result = 0;
        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
            const int loadx = lidx + i * BLOCK_SIZE;
            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);
            result += reduce_multi_block(s_query, s_train, i, lidx, lidy);
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        result = DIST_RES(result);
        const int trainIdx = t * BLOCK_SIZE + lidx;
        if (queryIdx < query_rows && trainIdx < train_rows)
        {
            if (result < myBestDistance1)
            {
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestDistance1 = result;
                myBestTrainIdx1 = trainIdx;
            }
            else if (result < myBestDistance2)
            {
                myBestDistance2 = result;
                myBestTrainIdx2 = trainIdx;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    __local float *s_distance = (local float *)sharebuffer;
    __local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
    // find BestMatch
    s_distance += lidy * BLOCK_SIZE;
    s_trainIdx += lidy * BLOCK_SIZE;
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;
    float bestDistance1 = MAX_FLOAT;
    float bestDistance2 = MAX_FLOAT;
    int bestTrainIdx1 = -1;
    int bestTrainIdx2 = -1;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;
                bestDistance1 = val;
                bestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    s_distance[lidx] = myBestDistance2;
    s_trainIdx[lidx] = myBestTrainIdx2;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }
    myBestDistance1 = bestDistance1;
    myBestDistance2 = bestDistance2;
    myBestTrainIdx1 = bestTrainIdx1;
    myBestTrainIdx2 = bestTrainIdx2;
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
 }
 __kernel void BruteForceMatch_knnMatch(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
 )
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    //loop
    for (int  t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
    {
        result_type result = 0.0f;
        for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
        {
            const int loadx = lidx + i * BLOCK_SIZE;
            //load query and train into local memory
            s_query[lidy * BLOCK_SIZE + lidx] = 0;
            s_train[lidx * BLOCK_SIZE + lidy] = 0;
            if (loadx < query_cols)
            {
                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            result += reduce_block_match(s_query, s_train, lidx, lidy);
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        result = DIST_RES(result);
        const int trainIdx = t * BLOCK_SIZE + lidx;
        if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
        {
            if (result < myBestDistance1)
            {
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestDistance1 = result;
                myBestTrainIdx1 = trainIdx;
            }
            else if (result < myBestDistance2)
            {
                myBestDistance2 = result;
                myBestTrainIdx2 = trainIdx;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
    //findBestMatch
    s_distance += lidy * BLOCK_SIZE;
    s_trainIdx += lidy * BLOCK_SIZE;
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;
    float bestDistance1 = MAX_FLOAT;
    float bestDistance2 = MAX_FLOAT;
    int bestTrainIdx1 = -1;
    int bestTrainIdx2 = -1;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;
                bestDistance1 = val;
                bestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    s_distance[lidx] = myBestDistance2;
    s_trainIdx[lidx] = myBestTrainIdx2;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }
    myBestDistance1 = bestDistance1;
    myBestDistance2 = bestDistance2;
    myBestTrainIdx1 = bestTrainIdx1;
    myBestTrainIdx2 = bestTrainIdx2;
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
 }
 kernel void BruteForceMatch_calcDistanceUnrolled(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step)
 {
    /* Todo */
 }
 kernel void BruteForceMatch_calcDistance(
    __global T *query,
    __global T *train,
    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step)
 {
    /* Todo */
 }
 kernel void BruteForceMatch_findBestMatch(
    __global float *allDist,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    int k
 )
 {
    /* Todo */
 }
--- a/modules/features2d/src/precomp.hpp
+++ b/modules/features2d/src/precomp.hpp
@ -48,6 +48,7 @@
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.hpp"
 #include "opencv2/core/ocl.hpp"
 #include <algorithm>
--- a/modules/features2d/test/ocl/test_brute_force_matcher.cpp
+++ b/modules/features2d/test/ocl/test_brute_force_matcher.cpp
@ -0,0 +1,213 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Niko Li, newlife20080214@gmail.com
 //    Jia Haipeng, jiahaipeng95@gmail.com
 //    Zero Lin, Zero.Lin@amd.com
 //    Zhang Ying, zhangying913@gmail.com
 //    Yao Wang, bitwangyaoyao@gmail.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include "test_precomp.hpp"
 #include "cvconfig.h"
 #include "opencv2/ts/ocl_test.hpp"
 #ifdef HAVE_OPENCL
 namespace cvtest {
 namespace ocl {
 PARAM_TEST_CASE(BruteForceMatcher, int, int)
 {
    int distType;
    int dim;
    int queryDescCount;
    int countFactor;
    Mat query, train;
    UMat uquery, utrain;
    virtual void SetUp()
    {
        distType = GET_PARAM(0);
        dim = GET_PARAM(1);
        queryDescCount = 300; // must be even number because we split train data in some cases in two
        countFactor = 4; // do not change it
        cv::Mat queryBuf, trainBuf;
        // Generate query descriptors randomly.
        // Descriptor vector elements are integer values.
        queryBuf.create(queryDescCount, dim, CV_32SC1);
        rng.fill(queryBuf, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3));
        queryBuf.convertTo(queryBuf, CV_32FC1);
        // Generate train decriptors as follows:
        // copy each query descriptor to train set countFactor times
        // and perturb some one element of the copied descriptors in
        // in ascending order. General boundaries of the perturbation
        // are (0.f, 1.f).
        trainBuf.create(queryDescCount * countFactor, dim, CV_32FC1);
        float step = 1.f / countFactor;
        for (int qIdx = 0; qIdx < queryDescCount; qIdx++)
        {
            cv::Mat queryDescriptor = queryBuf.row(qIdx);
            for (int c = 0; c < countFactor; c++)
            {
                int tIdx = qIdx * countFactor + c;
                cv::Mat trainDescriptor = trainBuf.row(tIdx);
                queryDescriptor.copyTo(trainDescriptor);
                int elem = rng(dim);
                float diff = rng.uniform(step * c, step * (c + 1));
                trainDescriptor.at<float>(0, elem) += diff;
            }
        }
        queryBuf.convertTo(query, CV_32F);
        trainBuf.convertTo(train, CV_32F);
        query.copyTo(uquery);
        train.copyTo(utrain);
    }
 };
 #ifdef ANDROID
 OCL_TEST_P(BruteForceMatcher, DISABLED_Match_Single)
 #else
 OCL_TEST_P(BruteForceMatcher, Match_Single)
 #endif
 {
    BFMatcher matcher(distType);
    std::vector<cv::DMatch> matches;
    matcher.match(uquery, utrain,  matches);
    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
    int badCount = 0;
    for (size_t i = 0; i < matches.size(); i++)
    {
        cv::DMatch match = matches[i];
        if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
            badCount++;
    }
    ASSERT_EQ(0, badCount);
 }
 #ifdef ANDROID
 OCL_TEST_P(BruteForceMatcher, DISABLED_KnnMatch_2_Single)
 #else
 OCL_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
 #endif
 {
    const int knn = 2;
    BFMatcher matcher(distType);
    std::vector< std::vector<cv::DMatch> > matches;
    matcher.knnMatch(uquery, utrain, matches, knn);
    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
    int badCount = 0;
    for (size_t i = 0; i < matches.size(); i++)
    {
        if ((int)matches[i].size() != knn)
            badCount++;
        else
        {
            int localBadCount = 0;
            for (int k = 0; k < knn; k++)
            {
                cv::DMatch match = matches[i][k];
                if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k) || (match.imgIdx != 0))
                    localBadCount++;
            }
            badCount += localBadCount > 0 ? 1 : 0;
        }
    }
    ASSERT_EQ(0, badCount);
 }
 #ifdef ANDROID
 OCL_TEST_P(BruteForceMatcher, DISABLED_RadiusMatch_Single)
 #else
 OCL_TEST_P(BruteForceMatcher, RadiusMatch_Single)
 #endif
 {
    float radius = 1.f / countFactor;
    BFMatcher matcher(distType);
    std::vector< std::vector<cv::DMatch> > matches;
    matcher.radiusMatch(uquery, utrain, matches, radius);
    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
    int badCount = 0;
    for (size_t i = 0; i < matches.size(); i++)
    {
        if ((int)matches[i].size() != 1)
        {
            badCount++;
        }
        else
        {
            cv::DMatch match = matches[i][0];
            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
                badCount++;
        }
    }
    ASSERT_EQ(0, badCount);
 }
 OCL_INSTANTIATE_TEST_CASE_P(Matcher, BruteForceMatcher, Combine( Values((int)NORM_L1, (int)NORM_L2),
                                                                Values(57, 64, 83, 128, 179, 256, 304) ) );
 }//ocl
 }//cvtest
 #endif //HAVE_OPENCL