#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

#define MAX_FLOAT 1e7f

// Count the bits set to 1 in a 32-bit word (used for the Hamming distance).
int bit1Count(uint x)
{
    int c = 0;
    for (int i = 0; i < 32; i++)
    {
        c += x & 0x1;
        x >>= 1;
    }
    return c;
}

/*
   2-D launch:
     global size: dim0 = ((query_rows + block_size - 1) / block_size) * block_size, dim1 = block_size
     local size:  dim0 = block_size, dim1 = block_size
*/
__kernel void BruteForceMatch_UnrollMatch(
    __global float *query,
    __global float *train,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
    int block_size,
    int max_desc_len,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step,
    int distType
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    __local float *s_query = sharebuffer;
    __local float *s_train = sharebuffer + block_size * max_desc_len;

    int queryIdx = groupidx * block_size + lidy;

    // Load this row's full query descriptor into local memory.
    for (int i = 0; i < max_desc_len / block_size; i++)
    {
        int loadx = lidx + i * block_size;
        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ?
            query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
    }

    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;

    // Loop-unrolled, cached search for the best trainIdx and best distance.
    volatile int imgIdx = 0;
    for (int t = 0; t < (train_rows + block_size - 1) / block_size; t++)
    {
        float result = 0;
        for (int i = 0; i < max_desc_len / block_size; i++)
        {
            // Load a block_size x block_size tile of train into local memory.
            const int loadx = lidx + i * block_size;
            s_train[lidx * block_size + lidy] = loadx < train_cols ?
                train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

            // Synchronize so every element of the tile is written before the reduction reads it.
            barrier(CLK_LOCAL_MEM_FENCE);

            /* The reducer supports three distance types:
               0: L1 distance, the sum of fabs(v1 - v2);
               1: L2 distance (squared), the sum of (v1 - v2) * (v1 - v2);
               2: Hamming distance, the popcount of v1 ^ v2 (the number of bits set to 1). */
            switch (distType)
            {
            case 0:
                for (int j = 0; j < block_size; j++)
                    result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
                break;
            case 1:
                for (int j = 0; j < block_size; j++)
                {
                    float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
                    result += qr * qr;
                }
                break;
            case 2:
                for (int j = 0; j < block_size; j++)
                    //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                    result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                break;
            }

            barrier(CLK_LOCAL_MEM_FENCE);
        }

        int trainIdx = t * block_size + lidx;

        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
        {
            //bestImgIdx = imgIdx;
            myBestDistance = result;
            myBestTrainIdx = trainIdx;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Find the best match: reuse the shared buffer for the per-row reduction.
    __local float *s_distance = (__local float *)(sharebuffer);
    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);

    s_distance += lidy * block_size;
    s_trainIdx += lidy * block_size;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduce: every work-item scans its row's candidates; work-item 0 writes the result.
    for (int k = 0; k < block_size; k++)
    {
        if (myBestDistance > s_distance[k])
        {
            myBestDistance = s_distance[k];
            myBestTrainIdx = s_trainIdx[k];
        }
    }

    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
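/*
   Illustrative host-side setup for BruteForceMatch_UnrollMatch -- a sketch inferred from
   the launch comment above and from the kernel's pointer arithmetic, not part of the
   original source. The __local scratch holds block_size * max_desc_len floats for
   s_query plus block_size * block_size floats for s_train. Variable names (kernel,
   queue, block_size, ...) are hypothetical.

       size_t localMem = (block_size * max_desc_len + block_size * block_size) * sizeof(cl_float);
       size_t local_size[2]  = { block_size, block_size };
       size_t global_size[2] = { ((query_rows + block_size - 1) / block_size) * block_size, block_size };

       clSetKernelArg(kernel, 4, localMem, NULL);               // arg 4: sharebuffer (__local), given the current signature
       clSetKernelArg(kernel, 5, sizeof(cl_int), &block_size);  // remaining scalars follow the signature order
       clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
*/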
__kernel void BruteForceMatch_Match(
    __global float *query,
    __global float *train,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
    int block_size,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step,
    int distType
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    const int queryIdx = groupidx * block_size + lidy;

    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;

    __local float *s_query = sharebuffer;
    __local float *s_train = sharebuffer + block_size * block_size;

    // Loop over all train tiles.
    for (int t = 0; t < (train_rows + block_size - 1) / block_size; t++)
    {
        float result = 0;
        for (int i = 0; i < (query_cols + block_size - 1) / block_size; i++)
        {
            const int loadx = lidx + i * block_size;

            // Load a block_size x block_size tile of query and of train into local memory.
            s_query[lidy * block_size + lidx] = 0;
            s_train[lidx * block_size + lidy] = 0;

            if (loadx < query_cols)
            {
                s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
                s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            /* The reducer supports three distance types:
               0: L1 distance, the sum of fabs(v1 - v2);
               1: L2 distance (squared), the sum of (v1 - v2) * (v1 - v2);
               2: Hamming distance, the popcount of v1 ^ v2 (the number of bits set to 1). */
            switch (distType)
            {
            case 0:
                for (int j = 0; j < block_size; j++)
                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
                break;
            case 1:
                for (int j = 0; j < block_size; j++)
                {
                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
                    result += qr * qr;
                }
                break;
            case 2:
                for (int j = 0; j < block_size; j++)
                    //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                    result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                break;
            }

            barrier(CLK_LOCAL_MEM_FENCE);
        }

        const int trainIdx = t * block_size + lidx;

        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
        {
            //myBestImgidx = imgIdx;
            myBestDistance = result;
            myBestTrainIdx = trainIdx;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Find the best match: reuse the shared buffer for the per-row reduction.
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);

    s_distance += lidy * block_size;
    s_trainIdx += lidy * block_size;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduce: every work-item scans its row's candidates; work-item 0 writes the result.
    for (int k = 0; k < block_size; k++)
    {
        if (myBestDistance > s_distance[k])
        {
            myBestDistance = s_distance[k];
            myBestTrainIdx = s_trainIdx[k];
        }
    }

    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
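/*
   Worked example of the three distType reductions used throughout this file
   (illustrative only). For the 2-element vectors q = (1, 2) and t = (3, 1):
     distType 0 (L1):        |1 - 3| + |2 - 1|                    = 3
     distType 1 (L2, squared): (1 - 3)^2 + (2 - 1)^2              = 5
     distType 2 (Hamming):   popcount(1 ^ 3) + popcount(2 ^ 1)    = 1 + 2 = 3
   Note that case 1 accumulates the squared L2 distance; no square root is taken here.
*/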
// Radius match, loop-unrolled over max_desc_len.
__kernel void BruteForceMatch_RadiusUnrollMatch(
    __global float *query,
    __global float *train,
    float maxDistance,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
    __local float *sharebuffer,
    int block_size,
    int max_desc_len,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int bestTrainIdx_cols,
    int step,
    int ostep,
    int distType
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int groupidy = get_group_id(1);

    const int queryIdx = groupidy * block_size + lidy;
    const int trainIdx = groupidx * block_size + lidx;

    __local float *s_query = sharebuffer;
    __local float *s_train = sharebuffer + block_size * block_size;

    float result = 0;
    for (int i = 0; i < max_desc_len / block_size; ++i)
    {
        // Load a block_size x block_size tile of query and of train into local memory.
        const int loadx = lidx + i * block_size;
        s_query[lidy * block_size + lidx] = loadx < query_cols ?
            query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
        s_train[lidx * block_size + lidy] = loadx < query_cols ?
            train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

        // Synchronize so every element of the tile is written before the reduction reads it.
        barrier(CLK_LOCAL_MEM_FENCE);

        /* The reducer supports three distance types:
           0: L1 distance, the sum of fabs(v1 - v2);
           1: L2 distance (squared), the sum of (v1 - v2) * (v1 - v2);
           2: Hamming distance, the popcount of v1 ^ v2 (the number of bits set to 1). */
        switch (distType)
        {
        case 0:
            for (int j = 0; j < block_size; ++j)
                result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
            break;
        case 1:
            for (int j = 0; j < block_size; ++j)
            {
                float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
                result += qr * qr;
            }
            break;
        case 2:
            for (int j = 0; j < block_size; ++j)
                result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
            break;
        }

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance /*&& mask(queryIdx, trainIdx)*/)
    {
        unsigned int ind = atom_inc(nMatches + queryIdx /*, (unsigned int) -1*/);

        if (ind < bestTrainIdx_cols)
        {
            //bestImgIdx = imgIdx;
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
        }
    }
}

// Radius match, generic descriptor length.
__kernel void BruteForceMatch_RadiusMatch(
    __global float *query,
    __global float *train,
    float maxDistance,
    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
    __local float *sharebuffer,
    int block_size,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int bestTrainIdx_cols,
    int step,
    int ostep,
    int distType
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int groupidy = get_group_id(1);

    const int queryIdx = groupidy * block_size + lidy;
    const int trainIdx = groupidx * block_size + lidx;

    __local float *s_query = sharebuffer;
    __local float *s_train = sharebuffer + block_size * block_size;

    float result = 0;
    for (int i = 0; i < (query_cols + block_size - 1) / block_size; ++i)
    {
        // Load a block_size x block_size tile of query and of train into local memory.
        const int loadx = lidx + i * block_size;
        s_query[lidy * block_size + lidx] = loadx < query_cols ?
            query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
        s_train[lidx * block_size + lidy] = loadx < query_cols ?
            train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

        // Synchronize so every element of the tile is written before the reduction reads it.
        barrier(CLK_LOCAL_MEM_FENCE);

        /* The reducer supports three distance types:
           0: L1 distance, the sum of fabs(v1 - v2);
           1: L2 distance (squared), the sum of (v1 - v2) * (v1 - v2);
           2: Hamming distance, the popcount of v1 ^ v2 (the number of bits set to 1). */
        switch (distType)
        {
        case 0:
            for (int j = 0; j < block_size; ++j)
                result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
            break;
        case 1:
            for (int j = 0; j < block_size; ++j)
            {
                float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
                result += qr * qr;
            }
            break;
        case 2:
            for (int j = 0; j < block_size; ++j)
                result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
            break;
        }

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance /*&& mask(queryIdx, trainIdx)*/)
    {
        unsigned int ind = atom_inc(nMatches + queryIdx /*, (unsigned int) -1*/);

        if (ind < bestTrainIdx_cols)
        {
            //bestImgIdx = imgIdx;
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
        }
    }
}
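/*
   Illustrative host-side handling for the two radius kernels above -- an assumption
   drawn from the code, not part of the original source. nMatches is only ever
   incremented with atom_inc, so the host presumably zeroes it before launch, and a
   query row can report more candidates than the bestTrainIdx_cols output columns can
   hold, so the count should be clamped when reading back. Buffer names are hypothetical.

       cl_int zero = 0;
       clEnqueueFillBuffer(queue, nMatchesBuf, &zero, sizeof(zero), 0,
                           query_rows * sizeof(cl_int), 0, NULL, NULL);   // OpenCL 1.2+
       // ... launch the radius kernel, then read nMatches back ...
       // valid matches for query row q: min(nMatches[q], bestTrainIdx_cols)
*/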
__kernel void BruteForceMatch_knnUnrollMatch(
    __global float *query,
    __global float *train,
    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
    int block_size,
    int max_desc_len,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step,
    int distType
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    const int queryIdx = groupidx * block_size + lidy;

    __local float *s_query = sharebuffer;
    __local float *s_train = sharebuffer + block_size * max_desc_len;

    // Load this row's full query descriptor into local memory.
    for (int i = 0; i < max_desc_len / block_size; i++)
    {
        int loadx = lidx + i * block_size;
        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ?
            query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
    }

    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Loop-unrolled, cached search for the two best matches.
    volatile int imgIdx = 0;
    for (int t = 0; t < (train_rows + block_size - 1) / block_size; t++)
    {
        float result = 0;
        for (int i = 0; i < max_desc_len / block_size; i++)
        {
            // Load a block_size x block_size tile of train into local memory.
            const int loadx = lidx + i * block_size;
            s_train[lidx * block_size + lidy] = loadx < train_cols ?
                train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

            // Synchronize so every element of the tile is written before the reduction reads it.
            barrier(CLK_LOCAL_MEM_FENCE);

            /* The reducer supports three distance types:
               0: L1 distance, the sum of fabs(v1 - v2);
               1: L2 distance (squared), the sum of (v1 - v2) * (v1 - v2);
               2: Hamming distance, the popcount of v1 ^ v2 (the number of bits set to 1). */
            switch (distType)
            {
            case 0:
                for (int j = 0; j < block_size; j++)
                    result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
                break;
            case 1:
                for (int j = 0; j < block_size; j++)
                {
                    float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
                    result += qr * qr;
                }
                break;
            case 2:
                for (int j = 0; j < block_size; j++)
                    //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                    result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                break;
            }

            barrier(CLK_LOCAL_MEM_FENCE);
        }

        const int trainIdx = t * block_size + lidx;

        if (queryIdx < query_rows && trainIdx < train_rows)
        {
            if (result < myBestDistance1)
            {
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestDistance1 = result;
                myBestTrainIdx1 = trainIdx;
            }
            else if (result < myBestDistance2)
            {
                myBestDistance2 = result;
                myBestTrainIdx2 = trainIdx;
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Find the two best matches: reuse the shared buffer for the per-row reduction.
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);

    s_distance += lidy * block_size;
    s_trainIdx += lidy * block_size;
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;

    float bestDistance1 = MAX_FLOAT;
    float bestDistance2 = MAX_FLOAT;
    int bestTrainIdx1 = -1;
    int bestTrainIdx2 = -1;

    barrier(CLK_LOCAL_MEM_FENCE);

    // First pass: work-item 0 of each row scans the per-work-item best candidates.
    if (lidx == 0)
    {
        for (int i = 0; i < block_size; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;
                bestDistance1 = val;
                bestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Second pass: scan the per-work-item second-best candidates.
    s_distance[lidx] = myBestDistance2;
    s_trainIdx[lidx] = myBestTrainIdx2;

    barrier(CLK_LOCAL_MEM_FENCE);

    if (lidx == 0)
    {
        for (int i = 0; i < block_size; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    myBestDistance1 = bestDistance1;
    myBestDistance2 = bestDistance2;
    myBestTrainIdx1 = bestTrainIdx1;
    myBestTrainIdx2 = bestTrainIdx2;

    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
}
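/*
   Both knn kernels (above and below) reduce to the two best candidates per query row in
   two passes over the same local scratch: the per-work-item best distances and indices
   are written and scanned by work-item 0 of the row, then the per-work-item second-best
   values are written and scanned again, so no extra local memory beyond the aliased
   s_distance / s_trainIdx regions is needed.
*/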
__kernel void BruteForceMatch_knnMatch(
    __global float *query,
    __global float *train,
    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
    int block_size,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step,
    int distType
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    const int queryIdx = groupidx * block_size + lidy;

    __local float *s_query = sharebuffer;
    __local float *s_train = sharebuffer + block_size * block_size;

    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Loop over all train tiles.
    for (int t = 0; t < (train_rows + block_size - 1) / block_size; t++)
    {
        float result = 0.0f;
        for (int i = 0; i < (query_cols + block_size - 1) / block_size; i++)
        {
            const int loadx = lidx + i * block_size;

            // Load a block_size x block_size tile of query and of train into local memory.
            s_query[lidy * block_size + lidx] = 0;
            s_train[lidx * block_size + lidy] = 0;

            if (loadx < query_cols)
            {
                s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
                s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            /* The reducer supports three distance types:
               0: L1 distance, the sum of fabs(v1 - v2);
               1: L2 distance (squared), the sum of (v1 - v2) * (v1 - v2);
               2: Hamming distance, the popcount of v1 ^ v2 (the number of bits set to 1). */
            switch (distType)
            {
            case 0:
                for (int j = 0; j < block_size; j++)
                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
                break;
            case 1:
                for (int j = 0; j < block_size; j++)
                {
                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
                    result += qr * qr;
                }
                break;
            case 2:
                for (int j = 0; j < block_size; j++)
                    //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                    result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                break;
            }

            barrier(CLK_LOCAL_MEM_FENCE);
        }

        const int trainIdx = t * block_size + lidx;

        if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
        {
            if (result < myBestDistance1)
            {
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestDistance1 = result;
                myBestTrainIdx1 = trainIdx;
            }
            else if (result < myBestDistance2)
            {
                myBestDistance2 = result;
                myBestTrainIdx2 = trainIdx;
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Find the two best matches: reuse the shared buffer for the per-row reduction.
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);

    s_distance += lidy * block_size;
    s_trainIdx += lidy * block_size;
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;

    float bestDistance1 = MAX_FLOAT;
    float bestDistance2 = MAX_FLOAT;
    int bestTrainIdx1 = -1;
    int bestTrainIdx2 = -1;

    barrier(CLK_LOCAL_MEM_FENCE);

    // First pass: work-item 0 of each row scans the per-work-item best candidates.
    if (lidx == 0)
    {
        for (int i = 0; i < block_size; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;
                bestDistance1 = val;
                bestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Second pass: scan the per-work-item second-best candidates.
    s_distance[lidx] = myBestDistance2;
    s_trainIdx[lidx] = myBestTrainIdx2;

    barrier(CLK_LOCAL_MEM_FENCE);

    if (lidx == 0)
    {
        for (int i = 0; i < block_size; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    myBestDistance1 = bestDistance1;
    myBestDistance2 = bestDistance2;
    myBestTrainIdx1 = bestTrainIdx1;
    myBestTrainIdx2 = bestTrainIdx2;

    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
}
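/*
   Illustrative host-side readback for the knn kernels -- an assumption based on the
   output types above, not part of the original source. Each query row receives its two
   best candidates packed as int2 / float2, with -1 and MAX_FLOAT left in unused slots.
   Buffer and variable names are hypothetical.

       cl_int2   *idx  = (cl_int2 *)  malloc(query_rows * sizeof(cl_int2));
       cl_float2 *dist = (cl_float2 *)malloc(query_rows * sizeof(cl_float2));
       clEnqueueReadBuffer(queue, bestTrainIdxBuf, CL_TRUE, 0,
                           query_rows * sizeof(cl_int2), idx, 0, NULL, NULL);
       clEnqueueReadBuffer(queue, bestDistanceBuf, CL_TRUE, 0,
                           query_rows * sizeof(cl_float2), dist, 0, NULL, NULL);
       // idx[q].s[0] / dist[q].s[0] hold the best match, .s[1] the second best.
*/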
__kernel void BruteForceMatch_calcDistanceUnrolled(
    __global float *query,
    __global float *train,
    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int block_size,
    int max_desc_len,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step,
    int distType)
{
    /* Todo */
}

__kernel void BruteForceMatch_calcDistance(
    __global float *query,
    __global float *train,
    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int block_size,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step,
    int distType)
{
    /* Todo */
}
__kernel void BruteForceMatch_findBestMatch(
    __global float *allDist,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    int k,
    int block_size
)
{
    /* Todo */
}