Merge pull request #585 from bitwangyaoyao:2.4_SURF

This commit is contained in:
Andrey Kamaev 2013-02-28 18:06:13 +04:00 committed by OpenCV Buildbot
commit 4811988caf
2 changed files with 116 additions and 105 deletions

View File

@ -78,7 +78,7 @@ uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int col
// dynamically change the precision used for floating type // dynamically change the precision used for floating type
#if defined (__ATI__) || defined (__NVIDIA__) #if defined DOUBLE_SUPPORT
#define F double #define F double
#else #else
#define F float #define F float
@ -299,7 +299,7 @@ __kernel
__global const float * det, __global const float * det,
__global const float * trace, __global const float * trace,
__global int4 * maxPosBuffer, __global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter, volatile __global int* maxCounter,
int counter_offset, int counter_offset,
int det_step, // the step of det in bytes int det_step, // the step of det in bytes
int trace_step, // the step of trace in bytes int trace_step, // the step of trace in bytes
@ -408,7 +408,7 @@ __kernel
if(condmax) if(condmax)
{ {
unsigned int ind = atomic_inc(maxCounter); int ind = atomic_inc(maxCounter);
if (ind < c_max_candidates) if (ind < c_max_candidates)
{ {
@ -427,7 +427,7 @@ __kernel
__global float * det, __global float * det,
__global float * trace, __global float * trace,
__global int4 * maxPosBuffer, __global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter, volatile __global int* maxCounter,
int counter_offset, int counter_offset,
int det_step, // the step of det in bytes int det_step, // the step of det in bytes
int trace_step, // the step of trace in bytes int trace_step, // the step of trace in bytes
@ -525,7 +525,7 @@ __kernel
if(condmax) if(condmax)
{ {
unsigned int ind = atomic_inc(maxCounter); int ind = atomic_inc(maxCounter);
if (ind < c_max_candidates) if (ind < c_max_candidates)
{ {
@ -585,7 +585,7 @@ __kernel
__global const float * det, __global const float * det,
__global const int4 * maxPosBuffer, __global const int4 * maxPosBuffer,
__global float * keypoints, __global float * keypoints,
volatile __global unsigned int * featureCounter, volatile __global int * featureCounter,
int det_step, int det_step,
int keypoints_step, int keypoints_step,
int c_img_rows, int c_img_rows,
@ -684,7 +684,7 @@ __kernel
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
{ {
// Get a new feature index. // Get a new feature index.
unsigned int ind = atomic_inc(featureCounter); int ind = atomic_inc(featureCounter);
if (ind < c_max_features) if (ind < c_max_features)
{ {
@ -737,19 +737,19 @@ __constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448
__constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}}; __constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}}; __constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
void reduce_32_sum(volatile __local float * data, float partial_reduction, int tid) void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid)
{ {
#define op(A, B) (A)+(B) #define op(A, B) (*A)+(B)
data[tid] = partial_reduction; data[tid] = *partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) if (tid < 16)
{ {
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
} }
#undef op #undef op
} }
@ -831,7 +831,7 @@ __kernel
{ {
const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC; const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;
float sumx = 0.0f, sumy = 0.0f; volatile float sumx = 0.0f, sumy = 0.0f;
int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir); int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{ {
@ -856,8 +856,8 @@ __kernel
sumx += s_X[get_local_id(0) + 96]; sumx += s_X[get_local_id(0) + 96];
sumy += s_Y[get_local_id(0) + 96]; sumy += s_Y[get_local_id(0) + 96];
} }
reduce_32_sum(s_sumx + get_local_id(1) * 32, sumx, get_local_id(0)); reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
reduce_32_sum(s_sumy + get_local_id(1) * 32, sumy, get_local_id(0)); reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));
const float temp_mod = sumx * sumx + sumy * sumy; const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod) if (temp_mod > best_mod)
@ -892,14 +892,32 @@ __kernel
kp_dir += 2.0f * CV_PI_F; kp_dir += 2.0f * CV_PI_F;
kp_dir *= 180.0f / CV_PI_F; kp_dir *= 180.0f / CV_PI_F;
kp_dir = 360.0f - kp_dir; //kp_dir = 360.0f - kp_dir;
if (fabs(kp_dir - 360.f) < FLT_EPSILON) //if (fabs(kp_dir - 360.f) < FLT_EPSILON)
kp_dir = 0.f; // kp_dir = 0.f;
featureDir[get_group_id(0)] = kp_dir; featureDir[get_group_id(0)] = kp_dir;
} }
} }
__kernel
void icvSetUpright(
__global float * keypoints,
int keypoints_step,
int nFeatures
)
{
keypoints_step /= sizeof(*keypoints);
__global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
if(get_global_id(0) <= nFeatures)
{
featureDir[get_global_id(0)] = 90.0f;
}
}
#undef ORI_SEARCH_INC #undef ORI_SEARCH_INC
#undef ORI_WIN #undef ORI_WIN
#undef ORI_SAMPLES #undef ORI_SAMPLES
@ -993,10 +1011,7 @@ void calc_dx_dy(
const float centerX = featureX[get_group_id(0)]; const float centerX = featureX[get_group_id(0)];
const float centerY = featureY[get_group_id(0)]; const float centerY = featureY[get_group_id(0)];
const float size = featureSize[get_group_id(0)]; const float size = featureSize[get_group_id(0)];
float descriptor_dir = 360.0f - featureDir[get_group_id(0)]; float descriptor_dir = featureDir[get_group_id(0)] * (float)(CV_PI_F / 180.0f);
if (fabs(descriptor_dir - 360.f) < FLT_EPSILON)
descriptor_dir = 0.f;
descriptor_dir *= (float)(CV_PI_F / 180.0f);
/* The sampling intervals and wavelet sized for selecting an orientation /* The sampling intervals and wavelet sized for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */ and building the keypoint descriptor are defined relative to 's' */
@ -1125,11 +1140,15 @@ __kernel
{ {
sdxabs[tid] = fabs(sdx[tid]); // |dx| array sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array sdyabs[tid] = fabs(sdy[tid]); // |dy| array
//barrier(CLK_LOCAL_MEM_FENCE); }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
//barrier(CLK_LOCAL_MEM_FENCE); }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
// write dx, dy, |dx|, |dy| // write dx, dy, |dx|, |dy|

View File

@ -57,6 +57,21 @@ namespace cv
{ {
///////////////////////////OpenCL kernel strings/////////////////////////// ///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *nonfree_surf; extern const char *nonfree_surf;
const char* noImage2dOption = "-D DISABLE_IMAGE2D";
static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, int depth)
{
if(support_image2d())
{
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth);
}
else
{
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption);
}
}
} }
} }
@ -80,10 +95,6 @@ static inline int calcSize(int octave, int layer)
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
} }
namespace
{
const char* noImage2dOption = "-D DISABLE_IMAGE2D";
}
class SURF_OCL_Invoker class SURF_OCL_Invoker
{ {
@ -100,15 +111,16 @@ public:
void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset, void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols); int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter, void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures); oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures);
void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures); void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures);
void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures); void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
// end of kernel callers declarations // end of kernel callers declarations
SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) : SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
surf_(surf), surf_(surf),
img_cols(img.cols), img_rows(img.rows), img_cols(img.cols), img_rows(img.rows),
@ -182,8 +194,8 @@ public:
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave, icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols); octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);
unsigned int maxCounter = Mat(counters).at<unsigned int>(1 + octave); int maxCounter = ((Mat)counters).at<int>(1 + octave);
maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates)); maxCounter = std::min(maxCounter, static_cast<int>(maxCandidates));
if (maxCounter > 0) if (maxCounter > 0)
{ {
@ -191,16 +203,30 @@ public:
keypoints, counters, octave, layer_rows, maxFeatures); keypoints, counters, octave, layer_rows, maxFeatures);
} }
} }
unsigned int featureCounter = Mat(counters).at<unsigned int>(0); int featureCounter = Mat(counters).at<int>(0);
featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures)); featureCounter = std::min(featureCounter, static_cast<int>(maxFeatures));
keypoints.cols = featureCounter; keypoints.cols = featureCounter;
if (surf_.upright) if (surf_.upright)
keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0)); {
//keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
setUpright(keypoints);
}
else else
{
findOrientation(keypoints); findOrientation(keypoints);
} }
}
void setUpright(oclMat &keypoints)
{
const int nFeatures = keypoints.cols;
if(nFeatures > 0)
{
icvSetUpright_gpu(keypoints, keypoints.cols);
}
}
void findOrientation(oclMat &keypoints) void findOrientation(oclMat &keypoints)
{ {
@ -484,14 +510,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2), divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
1 1
}; };
if(support_image2d()) openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
} }
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset, void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
@ -537,17 +556,10 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
1 1
}; };
if(support_image2d()) openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
} }
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter, void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures) oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
{ {
Context *clCxt = det.clCxt; Context *clCxt = det.clCxt;
@ -569,14 +581,7 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
size_t localThreads[3] = {3, 3, 3}; size_t localThreads[3] = {3, 3, 3};
size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1}; size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
if(support_image2d()) openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
} }
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures) void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
@ -603,16 +608,27 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
size_t localThreads[3] = {32, 4, 1}; size_t localThreads[3] = {32, 4, 1};
size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1}; size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
if(support_image2d()) openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
} }
void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
{
Context *clCxt = counters.clCxt;
string kernelName = "icvSetUpright";
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&nFeatures));
size_t localThreads[3] = {256, 1, 1};
size_t globalThreads[3] = {nFeatures, 1, 1};
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures) void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
{ {
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
@ -648,14 +664,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
if(support_image2d())
{ openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
kernelName = "normalize_descriptors64"; kernelName = "normalize_descriptors64";
@ -668,14 +678,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.clear(); args.clear();
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
if(support_image2d())
{ openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
} }
else else
{ {
@ -703,14 +707,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
if(support_image2d())
{ openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
kernelName = "normalize_descriptors128"; kernelName = "normalize_descriptors128";
@ -723,14 +721,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.clear(); args.clear();
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
if(support_image2d())
{ openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
} }
} }