diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index c81e342d35..76b1436d29 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -51,7 +51,6 @@ using namespace cv; using namespace cv::ocl; using namespace std; -using namespace std; namespace cv { namespace ocl @@ -62,7 +61,7 @@ namespace cv } template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > -void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask, +void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -77,7 +76,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); @@ -103,7 +102,7 @@ void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int } template < int BLOCK_SIZE, typename T/*, typename Mask*/ > -void match(const oclMat &query, const oclMat &train, const oclMat &mask, +void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -117,7 +116,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask, { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); @@ -143,7 +142,7 @@ void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const o //radius_matchUnrolledCached template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > -void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, +void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -159,7 +158,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); @@ -183,7 +182,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist //radius_match template < int BLOCK_SIZE, typename T/*, typename Mask*/ > -void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, +void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -198,7 +197,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); @@ -472,7 +471,7 @@ void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxD //knn match Dispatcher template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > -void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask, +void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -487,7 +486,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); @@ -507,7 +506,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl } template < int BLOCK_SIZE, typename T/*, typename Mask*/ > -void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask, +void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -521,7 +520,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask, { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); @@ -540,7 +539,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask, } template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > -void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType) +void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType) { cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -554,7 +553,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data )); args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); @@ -573,7 +572,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat } template < int BLOCK_SIZE, typename T/*, typename Mask*/ > -void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType) +void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType) { cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -586,7 +585,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data )); args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); @@ -691,7 +690,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o } } -static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType) +void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType) { findKnnMatch<256>(k, trainIdx, distance, allDist, distType); } @@ -1007,6 +1006,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, vector &matches, const oclMat &mask) { + assert(mask.empty()); // mask is not supported at the moment oclMat trainIdx, distance; matchSingle(query, train, trainIdx, distance, mask); matchDownload(trainIdx, distance, matches); @@ -1696,4 +1696,6 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto oclMat trainIdx, imgIdx, distance, nMatches; radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks); radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult); -} \ No newline at end of file +} + + diff --git a/modules/ocl/src/kernels/brute_force_match.cl b/modules/ocl/src/kernels/brute_force_match.cl index a5d0d7b516..e5dd29ee0a 100644 --- a/modules/ocl/src/kernels/brute_force_match.cl +++ b/modules/ocl/src/kernels/brute_force_match.cl @@ -3,14 +3,16 @@ int bit1Count(float x) { - int c = 0; - int ix = (int)x; - for (int i = 0 ; i < 32 ; i++) - { - c += ix & 0x1; - ix >>= 1; - } - return (float)c; + int c = 0; + int ix = (int)x; + + for (int i = 0 ; i < 32 ; i++) + { + c += ix & 0x1; + ix >>= 1; + } + + return (float)c; } /* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size local size: dim0 is block_size, dim1 is block_size. @@ -18,7 +20,7 @@ local size: dim0 is block_size, dim1 is block_size. __kernel void BruteForceMatch_UnrollMatch( __global float *query, __global float *train, - __global float *mask, + //__global float *mask, __global int *bestTrainIdx, __global float *bestDistance, __local float *sharebuffer, @@ -30,113 +32,122 @@ __kernel void BruteForceMatch_UnrollMatch( int train_cols, int step, int distType - ) +) { - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - - __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * max_desc_len; - - int queryIdx = groupidx * block_size + lidy; - // load the query into local memory. - for (int i = 0 ; i < max_desc_len / block_size; i ++) - { - int loadx = lidx + i * block_size; - s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - } - - float myBestDistance = MAX_FLOAT; - int myBestTrainIdx = -1; - - // loopUnrolledCached to find the best trainIdx and best distance. - volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) - { - float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; i++) - { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; - - //synchronize to make sure each elem for reduceIteration in share memory is written already. - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch(distType) - { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - } - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - int trainIdx = t * block_size + lidx; - - if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) - { - //bestImgIdx = imgIdx; - myBestDistance = result; - myBestTrainIdx = trainIdx; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - __local float *s_distance = (__local float*)(sharebuffer); - __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); - - //find BestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; - s_distance[lidx] = myBestDistance; - s_trainIdx[lidx] = myBestTrainIdx; - - barrier(CLK_LOCAL_MEM_FENCE); - - //reduce -- now all reduce implement in each threads. - for (int k = 0 ; k < block_size; k++) - { - if (myBestDistance > s_distance[k]) - { - myBestDistance = s_distance[k]; - myBestTrainIdx = s_trainIdx[k]; - } - } - - if (queryIdx < query_rows && lidx == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + __local float *s_query = sharebuffer; + __local float *s_train = sharebuffer + block_size * max_desc_len; + + int queryIdx = groupidx * block_size + lidy; + + // load the query into local memory. + for (int i = 0 ; i < max_desc_len / block_size; i ++) + { + int loadx = lidx + i * block_size; + s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + } + + float myBestDistance = MAX_FLOAT; + int myBestTrainIdx = -1; + + // loopUnrolledCached to find the best trainIdx and best distance. + volatile int imgIdx = 0; + + for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + { + float result = 0; + + for (int i = 0 ; i < max_desc_len / block_size ; i++) + { + //load a block_size * block_size block into local train. + const int loadx = lidx + i * block_size; + s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + int trainIdx = t * block_size + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) + { + //bestImgIdx = imgIdx; + myBestDistance = result; + myBestTrainIdx = trainIdx; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + __local float *s_distance = (__local float *)(sharebuffer); + __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + + //find BestMatch + s_distance += lidy * block_size; + s_trainIdx += lidy * block_size; + s_distance[lidx] = myBestDistance; + s_trainIdx[lidx] = myBestTrainIdx; + + barrier(CLK_LOCAL_MEM_FENCE); + + //reduce -- now all reduce implement in each threads. + for (int k = 0 ; k < block_size; k++) + { + if (myBestDistance > s_distance[k]) + { + myBestDistance = s_distance[k]; + myBestTrainIdx = s_trainIdx[k]; + } + } + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } } __kernel void BruteForceMatch_Match( __global float *query, __global float *train, - __global float *mask, + //__global float *mask, __global int *bestTrainIdx, __global float *bestDistance, __local float *sharebuffer, @@ -147,108 +158,115 @@ __kernel void BruteForceMatch_Match( int train_cols, int step, int distType - ) +) { - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - - const int queryIdx = groupidx * block_size + lidy; - - float myBestDistance = MAX_FLOAT; - int myBestTrainIdx = -1; - - __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; - - // loop - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) - { - //Dist dist; - float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) - { - const int loadx = lidx + i * block_size; - //load query and train into local memory - s_query[lidy * block_size + lidx] = 0; - s_train[lidx * block_size + lidy] = 0; - - if (loadx < query_cols) - { - s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; - s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch(distType) - { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); - } - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - const int trainIdx = t * block_size + lidx; - - if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) - { - //myBestImgidx = imgIdx; - myBestDistance = result; - myBestTrainIdx = trainIdx; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - __local float *s_distance = (__local float *)sharebuffer; - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); - - //findBestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; - s_distance[lidx] = myBestDistance; - s_trainIdx[lidx] = myBestTrainIdx; - - barrier(CLK_LOCAL_MEM_FENCE); - - //reduce -- now all reduce implement in each threads. - for (int k = 0 ; k < block_size; k++) - { - if (myBestDistance > s_distance[k]) - { - myBestDistance = s_distance[k]; - myBestTrainIdx = s_trainIdx[k]; - } - } - - if (queryIdx < query_rows && lidx == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + const int queryIdx = groupidx * block_size + lidy; + + float myBestDistance = MAX_FLOAT; + int myBestTrainIdx = -1; + + __local float *s_query = sharebuffer; + __local float *s_train = sharebuffer + block_size * block_size; + + // loop + for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + { + //Dist dist; + float result = 0; + + for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) + { + const int loadx = lidx + i * block_size; + //load query and train into local memory + s_query[lidy * block_size + lidx] = 0; + s_train[lidx * block_size + lidy] = 0; + + if (loadx < query_cols) + { + s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; + s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + const int trainIdx = t * block_size + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) + { + //myBestImgidx = imgIdx; + myBestDistance = result; + myBestTrainIdx = trainIdx; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + __local float *s_distance = (__local float *)sharebuffer; + __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + + //findBestMatch + s_distance += lidy * block_size; + s_trainIdx += lidy * block_size; + s_distance[lidx] = myBestDistance; + s_trainIdx[lidx] = myBestTrainIdx; + + barrier(CLK_LOCAL_MEM_FENCE); + + //reduce -- now all reduce implement in each threads. + for (int k = 0 ; k < block_size; k++) + { + if (myBestDistance > s_distance[k]) + { + myBestDistance = s_distance[k]; + myBestTrainIdx = s_trainIdx[k]; + } + } + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } } //radius_unrollmatch @@ -256,7 +274,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( __global float *query, __global float *train, float maxDistance, - __global float *mask, + //__global float *mask, __global int *bestTrainIdx, __global float *bestDistance, __global int *nMatches, @@ -271,71 +289,78 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( int step, int ostep, int distType - ) +) { - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - const int groupidy = get_group_id(1); - - const int queryIdx = groupidy * block_size + lidy; - const int trainIdx = groupidx * block_size + lidx; - - __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; - - float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; ++i) - { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - - s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; - - //synchronize to make sure each elem for reduceIteration in share memory is written already. - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch(distType) - { - case 0: - for (int j = 0 ; j < block_size ; ++j) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; ++j) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; ++j) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - } - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) - { - unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); - - if(ind < bestTrainIdx_cols) - { - //bestImgIdx = imgIdx; - bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; - bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; - } - } + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + const int groupidy = get_group_id(1); + + const int queryIdx = groupidy * block_size + lidy; + const int trainIdx = groupidx * block_size + lidx; + + __local float *s_query = sharebuffer; + __local float *s_train = sharebuffer + block_size * block_size; + + float result = 0; + + for (int i = 0 ; i < max_desc_len / block_size ; ++i) + { + //load a block_size * block_size block into local train. + const int loadx = lidx + i * block_size; + + s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; ++j) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; ++j) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; ++j) + { + result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) + { + unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); + + if (ind < bestTrainIdx_cols) + { + //bestImgIdx = imgIdx; + bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; + bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; + } + } } //radius_match @@ -343,7 +368,7 @@ __kernel void BruteForceMatch_RadiusMatch( __global float *query, __global float *train, float maxDistance, - __global float *mask, + //__global float *mask, __global int *bestTrainIdx, __global float *bestDistance, __global int *nMatches, @@ -357,78 +382,85 @@ __kernel void BruteForceMatch_RadiusMatch( int step, int ostep, int distType - ) +) { - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - const int groupidy = get_group_id(1); - - const int queryIdx = groupidy * block_size + lidy; - const int trainIdx = groupidx * block_size + lidx; - - __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; - - float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) - { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - - s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; - - //synchronize to make sure each elem for reduceIteration in share memory is written already. - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch(distType) - { - case 0: - for (int j = 0 ; j < block_size ; ++j) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; ++j) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; ++j) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - } - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) - { - unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); - - if(ind < bestTrainIdx_cols) - { - //bestImgIdx = imgIdx; - bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; - bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; - } - } + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + const int groupidy = get_group_id(1); + + const int queryIdx = groupidy * block_size + lidy; + const int trainIdx = groupidx * block_size + lidx; + + __local float *s_query = sharebuffer; + __local float *s_train = sharebuffer + block_size * block_size; + + float result = 0; + + for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) + { + //load a block_size * block_size block into local train. + const int loadx = lidx + i * block_size; + + s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; ++j) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; ++j) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; ++j) + { + result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) + { + unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); + + if (ind < bestTrainIdx_cols) + { + //bestImgIdx = imgIdx; + bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; + bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; + } + } } __kernel void BruteForceMatch_knnUnrollMatch( __global float *query, __global float *train, - __global float *mask, + //__global float *mask, __global int2 *bestTrainIdx, __global float2 *bestDistance, __local float *sharebuffer, @@ -440,169 +472,178 @@ __kernel void BruteForceMatch_knnUnrollMatch( int train_cols, int step, int distType - ) +) { - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - - const int queryIdx = groupidx * block_size + lidy; - local float *s_query = sharebuffer; - local float *s_train = sharebuffer + block_size * max_desc_len; - - // load the query into local memory. - for (int i = 0 ; i < max_desc_len / block_size; i ++) - { - int loadx = lidx + i * block_size; - s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - } - - float myBestDistance1 = MAX_FLOAT; - float myBestDistance2 = MAX_FLOAT; - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - - //loopUnrolledCached - volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) - { - float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; i++) - { - const int loadX = lidx + i * block_size; - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; - - //synchronize to make sure each elem for reduceIteration in share memory is written already. - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch(distType) - { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - } - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - const int trainIdx = t * block_size + lidx; - - if (queryIdx < query_rows && trainIdx < train_rows) - { - if (result < myBestDistance1) - { - myBestDistance2 = myBestDistance1; - myBestTrainIdx2 = myBestTrainIdx1; - myBestDistance1 = result; - myBestTrainIdx1 = trainIdx; - } - else if (result < myBestDistance2) - { - myBestDistance2 = result; - myBestTrainIdx2 = trainIdx; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - local float *s_distance = (local float *)sharebuffer; - local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); - - // find BestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; - - s_distance[lidx] = myBestDistance1; - s_trainIdx[lidx] = myBestTrainIdx1; - - float bestDistance1 = MAX_FLOAT; - float bestDistance2 = MAX_FLOAT; - int bestTrainIdx1 = -1; - int bestTrainIdx2 = -1; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lidx == 0) - { - for (int i = 0 ; i < block_size ; i++) - { - float val = s_distance[i]; - if (val < bestDistance1) - { - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestDistance1 = val; - bestTrainIdx1 = s_trainIdx[i]; - } - else if (val < bestDistance2) - { - bestDistance2 = val; - bestTrainIdx2 = s_trainIdx[i]; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - s_distance[lidx] = myBestDistance2; - s_trainIdx[lidx] = myBestTrainIdx2; - - barrier(CLK_LOCAL_MEM_FENCE); - - if (lidx == 0) - { - for (int i = 0 ; i < block_size ; i++) - { - float val = s_distance[i]; - - if (val < bestDistance2) - { - bestDistance2 = val; - bestTrainIdx2 = s_trainIdx[i]; - } - } - } - - myBestDistance1 = bestDistance1; - myBestDistance2 = bestDistance2; - - myBestTrainIdx1 = bestTrainIdx1; - myBestTrainIdx2 = bestTrainIdx2; - - if (queryIdx < query_rows && lidx == 0) - { - bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); - } + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + const int queryIdx = groupidx * block_size + lidy; + local float *s_query = sharebuffer; + local float *s_train = sharebuffer + block_size * max_desc_len; + + // load the query into local memory. + for (int i = 0 ; i < max_desc_len / block_size; i ++) + { + int loadx = lidx + i * block_size; + s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + } + + float myBestDistance1 = MAX_FLOAT; + float myBestDistance2 = MAX_FLOAT; + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + + //loopUnrolledCached + volatile int imgIdx = 0; + + for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + { + float result = 0; + + for (int i = 0 ; i < max_desc_len / block_size ; i++) + { + const int loadX = lidx + i * block_size; + //load a block_size * block_size block into local train. + const int loadx = lidx + i * block_size; + s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + const int trainIdx = t * block_size + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows) + { + if (result < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + myBestDistance1 = result; + myBestTrainIdx1 = trainIdx; + } + else if (result < myBestDistance2) + { + myBestDistance2 = result; + myBestTrainIdx2 = trainIdx; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + local float *s_distance = (local float *)sharebuffer; + local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); + + // find BestMatch + s_distance += lidy * block_size; + s_trainIdx += lidy * block_size; + + s_distance[lidx] = myBestDistance1; + s_trainIdx[lidx] = myBestTrainIdx1; + + float bestDistance1 = MAX_FLOAT; + float bestDistance2 = MAX_FLOAT; + int bestTrainIdx1 = -1; + int bestTrainIdx2 = -1; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lidx == 0) + { + for (int i = 0 ; i < block_size ; i++) + { + float val = s_distance[i]; + + if (val < bestDistance1) + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestDistance1 = val; + bestTrainIdx1 = s_trainIdx[i]; + } + else if (val < bestDistance2) + { + bestDistance2 = val; + bestTrainIdx2 = s_trainIdx[i]; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + s_distance[lidx] = myBestDistance2; + s_trainIdx[lidx] = myBestTrainIdx2; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (lidx == 0) + { + for (int i = 0 ; i < block_size ; i++) + { + float val = s_distance[i]; + + if (val < bestDistance2) + { + bestDistance2 = val; + bestTrainIdx2 = s_trainIdx[i]; + } + } + } + + myBestDistance1 = bestDistance1; + myBestDistance2 = bestDistance2; + + myBestTrainIdx1 = bestTrainIdx1; + myBestTrainIdx2 = bestTrainIdx2; + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); + } } __kernel void BruteForceMatch_knnMatch( __global float *query, __global float *train, - __global float *mask, + //__global float *mask, __global int2 *bestTrainIdx, __global float2 *bestDistance, __local float *sharebuffer, @@ -613,166 +654,174 @@ __kernel void BruteForceMatch_knnMatch( int train_cols, int step, int distType - ) +) { - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - - const int queryIdx = groupidx * block_size + lidy; - local float *s_query = sharebuffer; - local float *s_train = sharebuffer + block_size * block_size; - - float myBestDistance1 = MAX_FLOAT; - float myBestDistance2 = MAX_FLOAT; - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - - //loop - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) - { - float result = 0.0f; - for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++) - { - const int loadx = lidx + i * block_size; - //load query and train into local memory - s_query[lidy * block_size + lidx] = 0; - s_train[lidx * block_size + lidy] = 0; - - if (loadx < query_cols) - { - s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; - s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch(distType) - { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); - } - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - const int trainIdx = t * block_size + lidx; - - if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) - { - if (result < myBestDistance1) - { - myBestDistance2 = myBestDistance1; - myBestTrainIdx2 = myBestTrainIdx1; - myBestDistance1 = result; - myBestTrainIdx1 = trainIdx; - } - else if (result < myBestDistance2) - { - myBestDistance2 = result; - myBestTrainIdx2 = trainIdx; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - __local float *s_distance = (__local float *)sharebuffer; - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); - - //findBestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; - - s_distance[lidx] = myBestDistance1; - s_trainIdx[lidx] = myBestTrainIdx1; - - float bestDistance1 = MAX_FLOAT; - float bestDistance2 = MAX_FLOAT; - int bestTrainIdx1 = -1; - int bestTrainIdx2 = -1; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lidx == 0) - { - for (int i = 0 ; i < block_size ; i++) - { - float val = s_distance[i]; - if (val < bestDistance1) - { - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestDistance1 = val; - bestTrainIdx1 = s_trainIdx[i]; - } - else if (val < bestDistance2) - { - bestDistance2 = val; - bestTrainIdx2 = s_trainIdx[i]; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - s_distance[lidx] = myBestDistance2; - s_trainIdx[lidx] = myBestTrainIdx2; - - barrier(CLK_LOCAL_MEM_FENCE); - - if (lidx == 0) - { - for (int i = 0 ; i < block_size ; i++) - { - float val = s_distance[i]; - - if (val < bestDistance2) - { - bestDistance2 = val; - bestTrainIdx2 = s_trainIdx[i]; - } - } - } - - myBestDistance1 = bestDistance1; - myBestDistance2 = bestDistance2; - - myBestTrainIdx1 = bestTrainIdx1; - myBestTrainIdx2 = bestTrainIdx2; - - if (queryIdx < query_rows && lidx == 0) - { - bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); - } + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + const int queryIdx = groupidx * block_size + lidy; + local float *s_query = sharebuffer; + local float *s_train = sharebuffer + block_size * block_size; + + float myBestDistance1 = MAX_FLOAT; + float myBestDistance2 = MAX_FLOAT; + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + + //loop + for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + { + float result = 0.0f; + + for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) + { + const int loadx = lidx + i * block_size; + //load query and train into local memory + s_query[lidy * block_size + lidx] = 0; + s_train[lidx * block_size + lidy] = 0; + + if (loadx < query_cols) + { + s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; + s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + const int trainIdx = t * block_size + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) + { + if (result < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + myBestDistance1 = result; + myBestTrainIdx1 = trainIdx; + } + else if (result < myBestDistance2) + { + myBestDistance2 = result; + myBestTrainIdx2 = trainIdx; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + __local float *s_distance = (__local float *)sharebuffer; + __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + + //findBestMatch + s_distance += lidy * block_size; + s_trainIdx += lidy * block_size; + + s_distance[lidx] = myBestDistance1; + s_trainIdx[lidx] = myBestTrainIdx1; + + float bestDistance1 = MAX_FLOAT; + float bestDistance2 = MAX_FLOAT; + int bestTrainIdx1 = -1; + int bestTrainIdx2 = -1; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lidx == 0) + { + for (int i = 0 ; i < block_size ; i++) + { + float val = s_distance[i]; + + if (val < bestDistance1) + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestDistance1 = val; + bestTrainIdx1 = s_trainIdx[i]; + } + else if (val < bestDistance2) + { + bestDistance2 = val; + bestTrainIdx2 = s_trainIdx[i]; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + s_distance[lidx] = myBestDistance2; + s_trainIdx[lidx] = myBestTrainIdx2; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (lidx == 0) + { + for (int i = 0 ; i < block_size ; i++) + { + float val = s_distance[i]; + + if (val < bestDistance2) + { + bestDistance2 = val; + bestTrainIdx2 = s_trainIdx[i]; + } + } + } + + myBestDistance1 = bestDistance1; + myBestDistance2 = bestDistance2; + + myBestTrainIdx1 = bestTrainIdx1; + myBestTrainIdx2 = bestTrainIdx2; + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); + } } kernel void BruteForceMatch_calcDistanceUnrolled( __global float *query, __global float *train, - __global float *mask, + //__global float *mask, __global float *allDist, __local float *sharebuffer, int block_size, @@ -784,13 +833,13 @@ kernel void BruteForceMatch_calcDistanceUnrolled( int step, int distType) { - /* Todo */ + /* Todo */ } kernel void BruteForceMatch_calcDistance( __global float *query, __global float *train, - __global float *mask, + //__global float *mask, __global float *allDist, __local float *sharebuffer, int block_size, @@ -801,16 +850,16 @@ kernel void BruteForceMatch_calcDistance( int step, int distType) { - /* Todo */ + /* Todo */ } kernel void BruteForceMatch_findBestMatch( __global float *allDist, __global int *bestTrainIdx, __global float *bestDistance, - int k, - int block_size - ) + int k, + int block_size +) { - /* Todo */ + /* Todo */ } \ No newline at end of file