diff --git a/modules/ocl/src/kernels/nonfree_surf.cl b/modules/ocl/src/kernels/nonfree_surf.cl index 5916b2557c..69f64795e9 100644 --- a/modules/ocl/src/kernels/nonfree_surf.cl +++ b/modules/ocl/src/kernels/nonfree_surf.cl @@ -78,7 +78,7 @@ uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int col // dynamically change the precision used for floating type -#if defined (__ATI__) || defined (__NVIDIA__) +#if defined DOUBLE_SUPPORT #define F double #else #define F float @@ -299,7 +299,7 @@ __kernel __global const float * det, __global const float * trace, __global int4 * maxPosBuffer, - volatile __global unsigned int* maxCounter, + volatile __global int* maxCounter, int counter_offset, int det_step, // the step of det in bytes int trace_step, // the step of trace in bytes @@ -408,7 +408,7 @@ __kernel if(condmax) { - unsigned int ind = atomic_inc(maxCounter); + int ind = atomic_inc(maxCounter); if (ind < c_max_candidates) { @@ -427,7 +427,7 @@ __kernel __global float * det, __global float * trace, __global int4 * maxPosBuffer, - volatile __global unsigned int* maxCounter, + volatile __global int* maxCounter, int counter_offset, int det_step, // the step of det in bytes int trace_step, // the step of trace in bytes @@ -525,7 +525,7 @@ __kernel if(condmax) { - unsigned int ind = atomic_inc(maxCounter); + int ind = atomic_inc(maxCounter); if (ind < c_max_candidates) { @@ -585,7 +585,7 @@ __kernel __global const float * det, __global const int4 * maxPosBuffer, __global float * keypoints, - volatile __global unsigned int * featureCounter, + volatile __global int * featureCounter, int det_step, int keypoints_step, int c_img_rows, @@ -684,7 +684,7 @@ __kernel if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) { // Get a new feature index. - unsigned int ind = atomic_inc(featureCounter); + int ind = atomic_inc(featureCounter); if (ind < c_max_features) { @@ -737,19 +737,19 @@ __constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448 __constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}}; __constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}}; -void reduce_32_sum(volatile __local float * data, float partial_reduction, int tid) +void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid) { -#define op(A, B) (A)+(B) - data[tid] = partial_reduction; +#define op(A, B) (*A)+(B) + data[tid] = *partial_reduction; barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]); + data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]); + data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]); + data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]); + data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]); } #undef op } @@ -831,7 +831,7 @@ __kernel { const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC; - float sumx = 0.0f, sumy = 0.0f; + volatile float sumx = 0.0f, sumy = 0.0f; int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir); if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) { @@ -856,8 +856,8 @@ __kernel sumx += s_X[get_local_id(0) + 96]; sumy += s_Y[get_local_id(0) + 96]; } - reduce_32_sum(s_sumx + get_local_id(1) * 32, sumx, get_local_id(0)); - reduce_32_sum(s_sumy + get_local_id(1) * 32, sumy, get_local_id(0)); + reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0)); + reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0)); const float temp_mod = sumx * sumx + sumy * sumy; if (temp_mod > best_mod) @@ -892,14 +892,32 @@ __kernel kp_dir += 2.0f * CV_PI_F; kp_dir *= 180.0f / CV_PI_F; - kp_dir = 360.0f - kp_dir; - if (fabs(kp_dir - 360.f) < FLT_EPSILON) - kp_dir = 0.f; + //kp_dir = 360.0f - kp_dir; + //if (fabs(kp_dir - 360.f) < FLT_EPSILON) + // kp_dir = 0.f; featureDir[get_group_id(0)] = kp_dir; } } + +__kernel + void icvSetUpright( + __global float * keypoints, + int keypoints_step, + int nFeatures + ) +{ + keypoints_step /= sizeof(*keypoints); + __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; + + if(get_global_id(0) <= nFeatures) + { + featureDir[get_global_id(0)] = 90.0f; + } +} + + #undef ORI_SEARCH_INC #undef ORI_WIN #undef ORI_SAMPLES @@ -993,10 +1011,7 @@ void calc_dx_dy( const float centerX = featureX[get_group_id(0)]; const float centerY = featureY[get_group_id(0)]; const float size = featureSize[get_group_id(0)]; - float descriptor_dir = 360.0f - featureDir[get_group_id(0)]; - if (fabs(descriptor_dir - 360.f) < FLT_EPSILON) - descriptor_dir = 0.f; - descriptor_dir *= (float)(CV_PI_F / 180.0f); + float descriptor_dir = featureDir[get_group_id(0)] * (float)(CV_PI_F / 180.0f); /* The sampling intervals and wavelet sized for selecting an orientation and building the keypoint descriptor are defined relative to 's' */ @@ -1125,11 +1140,15 @@ __kernel { sdxabs[tid] = fabs(sdx[tid]); // |dx| array sdyabs[tid] = fabs(sdy[tid]); // |dy| array - //barrier(CLK_LOCAL_MEM_FENCE); - + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 25) + { reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); - //barrier(CLK_LOCAL_MEM_FENCE); - + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 25) + { volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); // write dx, dy, |dx|, |dy| diff --git a/modules/ocl/src/surf.cpp b/modules/ocl/src/surf.cpp index 2e06f4439f..e2ac21b972 100644 --- a/modules/ocl/src/surf.cpp +++ b/modules/ocl/src/surf.cpp @@ -57,6 +57,21 @@ namespace cv { ///////////////////////////OpenCL kernel strings/////////////////////////// extern const char *nonfree_surf; + + const char* noImage2dOption = "-D DISABLE_IMAGE2D"; + + static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], + size_t localThreads[3], vector< pair > &args, int channels, int depth) + { + if(support_image2d()) + { + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth); + } + else + { + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption); + } + } } } @@ -80,10 +95,6 @@ static inline int calcSize(int octave, int layer) return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } -namespace -{ - const char* noImage2dOption = "-D DISABLE_IMAGE2D"; -} class SURF_OCL_Invoker { @@ -100,15 +111,16 @@ public: void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset, int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols); - void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter, + void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter, oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures); void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures); + void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures); + void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures); // end of kernel callers declarations - SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) : surf_(surf), img_cols(img.cols), img_rows(img.rows), @@ -182,8 +194,8 @@ public: icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave, octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols); - unsigned int maxCounter = Mat(counters).at(1 + octave); - maxCounter = std::min(maxCounter, static_cast(maxCandidates)); + int maxCounter = ((Mat)counters).at(1 + octave); + maxCounter = std::min(maxCounter, static_cast(maxCandidates)); if (maxCounter > 0) { @@ -191,15 +203,29 @@ public: keypoints, counters, octave, layer_rows, maxFeatures); } } - unsigned int featureCounter = Mat(counters).at(0); - featureCounter = std::min(featureCounter, static_cast(maxFeatures)); + int featureCounter = Mat(counters).at(0); + featureCounter = std::min(featureCounter, static_cast(maxFeatures)); keypoints.cols = featureCounter; if (surf_.upright) - keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0)); + { + //keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0)); + setUpright(keypoints); + } else + { findOrientation(keypoints); + } + } + + void setUpright(oclMat &keypoints) + { + const int nFeatures = keypoints.cols; + if(nFeatures > 0) + { + icvSetUpright_gpu(keypoints, keypoints.cols); + } } void findOrientation(oclMat &keypoints) @@ -484,14 +510,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2), 1 }; - if(support_image2d()) - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); - } - else - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); - } + openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); } void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset, @@ -537,17 +556,10 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat 1 }; - if(support_image2d()) - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); - } - else - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); - } + openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); } -void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter, +void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter, oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures) { Context *clCxt = det.clCxt; @@ -569,14 +581,7 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa size_t localThreads[3] = {3, 3, 3}; size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1}; - if(support_image2d()) - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); - } - else - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); - } + openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); } void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures) @@ -603,16 +608,27 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat size_t localThreads[3] = {32, 4, 1}; size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1}; - if(support_image2d()) - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); - } - else - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); - } + openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); } +void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures) +{ + Context *clCxt = counters.clCxt; + string kernelName = "icvSetUpright"; + + vector< pair > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data)); + args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&nFeatures)); + + size_t localThreads[3] = {256, 1, 1}; + size_t globalThreads[3] = {nFeatures, 1, 1}; + + openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); +} + + void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures) { // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D @@ -648,14 +664,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step)); - if(support_image2d()) - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); - } - else - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); - } + + openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); kernelName = "normalize_descriptors64"; @@ -668,14 +678,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const args.clear(); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); - if(support_image2d()) - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); - } - else - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); - } + + openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); } else { @@ -703,14 +707,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step)); - if(support_image2d()) - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); - } - else - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); - } + + openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); kernelName = "normalize_descriptors128"; @@ -723,14 +721,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const args.clear(); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); - if(support_image2d()) - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); - } - else - { - openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption); - } + + openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1); } }