remove the device specific logic

This commit is contained in:
yao 2013-02-26 17:53:08 +08:00
parent 620c699456
commit ea433cc496
2 changed files with 360 additions and 236 deletions

View File

@ -43,10 +43,39 @@
//
//M*/
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Fallback for platforms without image2d_t support (Intel HD 4000, for example)
#ifdef DISABLE_IMAGE2D
#define IMAGE_INT32 __global uint *
#define IMAGE_INT8 __global uchar *
#else
#define IMAGE_INT32 image2d_t
#define IMAGE_INT8 image2d_t
#endif
// Reads one 32-bit element of an integral ("sum") image at integer position coord.
//
// With image2d_t support (DISABLE_IMAGE2D not defined) the value comes from
// read_imageui through the sampler `sam`; rows, cols and elemPerRow are unused.
// Without it, `img` is a plain __global uint buffer: the coordinate is clamped
// to [0, cols] x [0, rows] (both bounds inclusive) and the element is fetched
// at row * elemPerRow + col; `sam` is unused on that path.
// NOTE(review): the inclusive upper bounds presumably reflect an integral image
// that is one element larger than the source in each dimension - confirm.
uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow)
{
#ifndef DISABLE_IMAGE2D
    return read_imageui(img, sam, coord).x;
#else
    const int col = clamp(coord.x, 0, cols);
    const int row = clamp(coord.y, 0, rows);
    return img[row * elemPerRow + col];
#endif
}
// Reads one 8-bit pixel of the source image at floating-point position coord.
//
// With image2d_t support (DISABLE_IMAGE2D not defined) the pixel comes from
// read_imageui through the sampler `sam`; rows, cols and elemPerRow are unused.
// Without it, `img` is a plain __global uchar buffer: each coordinate is rounded
// with convert_int_rte, clamped to [0, cols - 1] / [0, rows - 1], and the pixel
// is fetched at row * elemPerRow + col; `sam` is unused on that path.
// NOTE(review): convert_int_rte rounds to nearest-even, whereas nearest-filter
// image sampling is specified in terms of floor(); confirm the two paths are
// meant to pick the same pixel.
uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
{
#ifndef DISABLE_IMAGE2D
    return (uchar)read_imageui(img, sam, coord).x;
#else
    const int col = clamp(convert_int_rte(coord.x), 0, cols - 1);
    const int row = clamp(convert_int_rte(coord.y), 0, rows - 1);
    return img[row * elemPerRow + col];
#endif
}
// dynamically change the precision used for floating type
#if defined (__ATI__) || defined (__NVIDIA__)
@ -58,14 +87,24 @@
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
#ifndef FLT_EPSILON
#define FLT_EPSILON (1e-15)
#define CV_PI_F 3.14159265f
#endif
#ifndef CV_PI_F
#define CV_PI_F 3.14159265f
#endif
// Use integral image to calculate haar wavelets.
// N = 2
// for simple Haar pattern
float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int oldSize, int newSize, int y, int x)
float icvCalcHaarPatternSum_2(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
@ -81,11 +120,10 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0;
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
@ -93,7 +131,13 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
}
// N = 3
float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int oldSize, int newSize, int y, int x)
// Haar-pattern response for the N = 3 case: `src` holds one row of five floats
// per box, so it is declared [3][5] as in the previous signature. The leading
// dimension of an array parameter is not enforced by the compiler, so this
// corrects the declaration without changing behavior.
// sumTex is the integral image; rows, cols and elemPerRow are forwarded
// unchanged to read_sumTex for the non-image2d_t code path.
float icvCalcHaarPatternSum_3(
IMAGE_INT32 sumTex,
__constant float src[3][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
@ -109,11 +153,10 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0;
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
@ -121,7 +164,13 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
}
// N = 4
float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int oldSize, int newSize, int y, int x)
// Haar-pattern response for the N = 4 case: `src` holds one row of five floats
// per box, so it is declared [4][5] as in the previous signature. The leading
// dimension of an array parameter is not enforced by the compiler, so this
// corrects the declaration without changing behavior.
// sumTex is the integral image; rows, cols and elemPerRow are forwarded
// unchanged to read_sumTex for the non-image2d_t code path.
float icvCalcHaarPatternSum_4(
IMAGE_INT32 sumTex,
__constant float src[4][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
@ -137,11 +186,10 @@ float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0;
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
@ -172,7 +220,7 @@ __inline int calcSize(int octave, int layer)
//calculate targeted layer per-pixel determinant and trace with an integral image
__kernel void icvCalcLayerDetAndTrace(
image2d_t sumTex, // input integral image
IMAGE_INT32 sumTex, // input integral image
__global float * det, // output Determinant
__global float * trace, // output trace
int det_step, // the step of det in bytes
@ -181,11 +229,13 @@ __kernel void icvCalcLayerDetAndTrace(
int c_img_cols,
int c_nOctaveLayers,
int c_octave,
int c_layer_rows
int c_layer_rows,
int sumTex_step
)
{
det_step /= sizeof(*det);
trace_step /= sizeof(*trace);
sumTex_step/= sizeof(uint);
// Determine the indices
const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2);
const int blockIdx_y = get_group_id(1) % gridDim_y;
@ -205,12 +255,12 @@ __kernel void icvCalcLayerDetAndTrace(
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
}
}
@ -220,7 +270,7 @@ __kernel void icvCalcLayerDetAndTrace(
__constant float c_DM[5] = {0, 0, 9, 9, 1};
bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
{
float ratio = (float)size / 9.0f;
@ -233,10 +283,10 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
float t = 0;
t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1)).x;
t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2)).x;
t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1)).x;
t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2)).x;
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
@ -246,9 +296,9 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
// Non-maximal suppression to further filtering the candidates from previous step
__kernel
void icvFindMaximaInLayer_withmask(
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
@ -261,7 +311,8 @@ __kernel
int c_layer_cols,
int c_max_candidates,
float c_hessianThreshold,
image2d_t maskSumTex
IMAGE_INT32 maskSumTex,
int mask_step
)
{
volatile __local float N9[768]; // threads.x * threads.y * 3
@ -269,6 +320,7 @@ __kernel
det_step /= sizeof(*det);
trace_step /= sizeof(*trace);
maxCounter += counter_offset;
mask_step /= sizeof(uint);
// Determine the indices
const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
@ -288,26 +340,26 @@ __kernel
// Is this thread within the hessian buffer?
const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
N9[localLin - zoff] =
det[det_step *
N9[localLin - zoff] =
det[det_step *
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin ] =
det[det_step *
N9[localLin ] =
det[det_step *
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin + zoff] =
det[det_step *
N9[localLin + zoff] =
det[det_step *
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
@ -321,7 +373,7 @@ __kernel
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (within_check(maskSumTex, sum_i, sum_j, size))
if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
{
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
@ -372,9 +424,9 @@ __kernel
__kernel
void icvFindMaximaInLayer(
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
@ -417,19 +469,19 @@ __kernel
int l_x = min(max(j, 0), c_img_cols - 1);
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
N9[localLin - zoff] =
N9[localLin - zoff] =
det[det_step * (l_y - c_layer_rows) + l_x];
N9[localLin ] =
N9[localLin ] =
det[det_step * (l_y ) + l_x];
N9[localLin + zoff] =
N9[localLin + zoff] =
det[det_step * (l_y + c_layer_rows) + l_x];
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
@ -497,17 +549,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
{
F invdet = 1.0 / det;
x[0] = invdet *
x[0] = invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
x[1] = invdet *
x[1] = invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
x[2] = invdet *
x[2] = invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
@ -528,9 +580,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__kernel
__kernel
void icvInterpolateKeypoint(
__global const float * det,
__global const float * det,
__global const int4 * maxPosBuffer,
__global float * keypoints,
volatile __global unsigned int * featureCounter,
@ -560,7 +612,7 @@ __kernel
volatile __local float N9[3][3][3];
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
det[det_step * (c_layer_rows * layer + i) + j];
barrier(CLK_LOCAL_MEM_FENCE);
@ -658,27 +710,27 @@ __kernel
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
0.001707611023448408f, 0.001455130288377404f};
@ -691,27 +743,29 @@ void reduce_32_sum(volatile __local float * data, float partial_reduction, int
data[tid] = partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
if (tid < 16)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
#undef op
}
__kernel
void icvCalcOrientation(
image2d_t sumTex,
IMAGE_INT32 sumTex,
__global float * keypoints,
int keypoints_step,
int c_img_rows,
int c_img_cols
int c_img_cols,
int sum_step
)
{
keypoints_step /= sizeof(*keypoints);
sum_step /= sizeof(uint);
__global float* featureX = keypoints + X_ROW * keypoints_step;
__global float* featureY = keypoints + Y_ROW * keypoints_step;
__global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
@ -754,8 +808,8 @@ __kernel
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
{
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
angle = atan2(Y, X);
@ -881,20 +935,20 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
// utility for linear filter
inline uchar readerGet(
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j, int rows, int cols, int elemPerRow
)
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return (uchar)read_imageui(src, sampler, (float2)(pixel_x, pixel_y)).x;
return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
}
inline float linearFilter(
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x, int rows, int cols, int elemPerRow
)
{
x -= 0.5f;
@ -907,30 +961,33 @@ inline float linearFilter(
const int x2 = x1 + 1;
const int y2 = y1 + 1;
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1);
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow);
out = out + src_reg * ((x2 - x) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2);
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow);
out = out + src_reg * ((x - x1) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1);
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow);
out = out + src_reg * ((x2 - x) * (y - y1));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2);
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow);
out = out + src_reg * ((x - x1) * (y - y1));
return out;
}
void calc_dx_dy(
image2d_t imgTex,
IMAGE_INT8 imgTex,
volatile __local float s_dx_bin[25],
volatile __local float s_dy_bin[25],
volatile __local float s_PATCH[6][6],
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureDir
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureDir,
int rows,
int cols,
int elemPerRow
)
{
const float centerX = featureX[get_group_id(0)];
@ -965,7 +1022,7 @@ void calc_dx_dy(
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo);
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
barrier(CLK_LOCAL_MEM_FENCE);
@ -976,26 +1033,26 @@ void calc_dx_dy(
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
* dw;
const float vy = (
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
* dw;
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
}
void reduce_sum25(
volatile __local float* sdata1,
volatile __local float* sdata2,
volatile __local float* sdata3,
volatile __local float* sdata4,
volatile __local float* sdata1,
volatile __local float* sdata2,
volatile __local float* sdata3,
volatile __local float* sdata4,
int tid
)
{
@ -1033,18 +1090,20 @@ void reduce_sum25(
}
}
__kernel
__kernel
void compute_descriptors64(
image2d_t imgTex,
volatile __global float * descriptors,
IMAGE_INT8 imgTex,
volatile __global float * descriptors,
__global const float * keypoints,
int descriptors_step,
int keypoints_step
int keypoints_step,
int rows,
int cols,
int img_step
)
{
descriptors_step /= sizeof(float);
keypoints_step /= sizeof(float);
__global const float * featureX = keypoints + X_ROW * keypoints_step;
__global const float * featureY = keypoints + Y_ROW * keypoints_step;
__global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
@ -1057,7 +1116,7 @@ __kernel
volatile __local float sdyabs[25];
volatile __local float s_PATCH[6][6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@ -1066,10 +1125,10 @@ __kernel
{
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
@ -1083,13 +1142,16 @@ __kernel
}
}
}
__kernel
__kernel
void compute_descriptors128(
image2d_t imgTex,
__global volatile float * descriptors,
IMAGE_INT8 imgTex,
__global volatile float * descriptors,
__global float * keypoints,
int descriptors_step,
int keypoints_step
int keypoints_step,
int rows,
int cols,
int img_step
)
{
descriptors_step /= sizeof(*descriptors);
@ -1111,7 +1173,7 @@ __kernel
volatile __local float sdabs2[25];
volatile __local float s_PATCH[6][6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@ -1132,10 +1194,10 @@ __kernel
sd2[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
@ -1162,10 +1224,10 @@ __kernel
sd2[tid] = sdy[tid];
sdabs2[tid] = fabs(sdy[tid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0)
@ -1178,7 +1240,7 @@ __kernel
}
}
__kernel
__kernel
void normalize_descriptors128(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);
@ -1219,7 +1281,7 @@ __kernel
// normalize and store in output
descriptor_base[get_local_id(0)] = lookup / len;
}
__kernel
__kernel
void normalize_descriptors64(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);

View File

@ -1,4 +1,4 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
/*M/////////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
@ -44,6 +44,7 @@
//M*/
#include <iomanip>
#include "precomp.hpp"
#include "mcwutil.hpp"
//#include "opencv2/highgui/highgui.hpp"
using namespace cv;
@ -71,7 +72,7 @@ static inline int calcSize(int octave, int layer)
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbours of a sample, the layers
This ensures that when looking for the neighbors of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
@ -79,6 +80,41 @@ static inline int calcSize(int octave, int layer)
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
namespace
{
const char* noImage2dOption = "-D DISABLE_IMAGE2D";
// default kernel name can be any kernel in nonfree_surf.cl
bool support_image2d(const char* kernel_name = "icvCalcLayerDetAndTrace")
{
static bool _isTested = false;
static bool _support = false;
if(_isTested)
{
return _support;
}
try
{
cv::ocl::openCLGetKernelFromSource(Context::getContext(), &nonfree_surf, kernel_name);
_support = true;
}
catch (cv::Exception& e)
{
if(e.code == -217)
{
_support = false;
}
else
{
// throw e once again
cv::error(e);
}
}
_isTested = true;
return _support;
}
}
class SURF_OCL_Invoker
{
public:
@ -88,7 +124,7 @@ public:
//void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
//void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
// kernel callers declearations
// kernel callers declarations
void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);
void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
@ -100,14 +136,14 @@ public:
void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
// end of kernel callers declearations
// end of kernel callers declarations
SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
surf_(surf),
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty()),
imgTex(NULL), sumTex(NULL), maskSumTex(NULL)
use_mask(!mask.empty()), counters(oclMat()),
imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
{
CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@ -131,12 +167,13 @@ public:
counters.create(1, surf_.nOctaves + 1, CV_32SC1);
counters.setTo(Scalar::all(0));
//loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
integral(img, surf_.sum);
if(support_image2d())
{
bindImgTex(img, imgTex);
bindImgTex(surf_.sum, sumTex);
}
bindImgTex(img, imgTex);
integral(img, surf_.sum); // the two argumented integral version is incorrect
bindImgTex(surf_.sum, sumTex);
maskSumTex = 0;
if (use_mask)
@ -155,7 +192,7 @@ public:
void detectKeypoints(oclMat &keypoints)
{
// create image pyramid buffers
// different layers have same sized buffers, but they are sampled from gaussin kernel.
// different layers have same sized buffers, but they are sampled from Gaussian kernel.
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
@ -222,7 +259,6 @@ public:
openCLFree(sumTex);
if(maskSumTex)
openCLFree(maskSumTex);
additioalParamBuffer.release();
}
private:
@ -236,13 +272,13 @@ private:
int maxFeatures;
oclMat counters;
// texture buffers
cl_mem imgTex;
cl_mem sumTex;
cl_mem maskSumTex;
oclMat additioalParamBuffer;
const oclMat _img; // make a copy for non-image2d_t supported platform
SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
{
@ -362,11 +398,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
{
if (!img.empty())
{
if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos)
{
cout << " Intel HD GPU device unsupported " << endl;
return;
}
SURF_OCL_Invoker surf(*this, img, mask);
surf.detectKeypoints(keypoints);
@ -378,11 +409,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
{
if (!img.empty())
{
if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos)
{
cout << " Intel HD GPU device unsupported " << endl;
return;
}
SURF_OCL_Invoker surf(*this, img, mask);
if (!useProvidedKeypoints)
@ -443,74 +469,11 @@ void cv::ocl::SURF_OCL::releaseMemory()
// bind source buffer to image object.
// Binds the device buffer behind `img` to an OpenCL image and stores the
// resulting handle in `texture`.
//
// img     - source matrix; its depth and channel count select the image format.
// texture - in/out handle; a non-null incoming value is passed to openCLFree
//           before a new handle is stored.
//
// Throws std::exception when the depth is not CV_8U / CV_32S / CV_32F or the
// channel count is not 1, 3 or 4.
void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
{
cl_image_format format;
int err;
int depth = img.depth();
int channels = img.channels();
// Map the matrix depth to an OpenCL channel data type.
switch(depth)
{
case CV_8U:
format.image_channel_data_type = CL_UNSIGNED_INT8;
break;
case CV_32S:
format.image_channel_data_type = CL_UNSIGNED_INT32;
break;
case CV_32F:
format.image_channel_data_type = CL_FLOAT;
break;
default:
throw std::exception();
break;
}
// Map the channel count to an OpenCL channel order.
switch(channels)
{
case 1:
format.image_channel_order = CL_R;
break;
case 3:
format.image_channel_order = CL_RGB;
break;
case 4:
format.image_channel_order = CL_RGBA;
break;
default:
throw std::exception();
break;
}
// Hand any previously stored handle to openCLFree before replacing it.
if(texture)
{
openCLFree(texture);
}
#ifdef CL_VERSION_1_2
// OpenCL 1.2 headers: describe the image with cl_image_desc and create it
// with clCreateImage. The width is the row step expressed in elements
// (step / elemSize), not img.cols.
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = img.step / img.elemSize();
desc.image_height = img.rows;
desc.image_depth = 0;
desc.image_array_size = 1;
desc.image_row_pitch = 0;
desc.image_slice_pitch = 0;
desc.buffer = NULL;
desc.num_mip_levels = 0;
desc.num_samples = 0;
texture = clCreateImage(Context::getContext()->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
#else
// Older headers: create the image with clCreateImage2D using the same
// width and height as above.
texture = clCreateImage2D(
Context::getContext()->impl->clContext,
CL_MEM_READ_WRITE,
&format,
img.step / img.elemSize(),
img.rows,
0,
NULL,
&err);
#endif
// Copy the buffer, starting at offset 0, into the full image region.
size_t origin[] = { 0, 0, 0 };
size_t region[] = { img.step / img.elemSize(), img.rows, 1 };
clEnqueueCopyBufferToImage(img.clCxt->impl->clCmdQueue, (cl_mem)img.data, texture, 0, origin, region, 0, NULL, 0);
// `err` is the status written by the image-creation call above; the return
// value of clEnqueueCopyBufferToImage itself is not checked.
openCLSafeCall(err);
// NOTE(review): this assignment replaces the handle created above without
// releasing it. The manual image setup and this bindTexture() call look like
// two alternative implementations of the same step (possibly old and new
// lines of a change merged together) - confirm which one is intended.
texture = bindTexture(img);
}
////////////////////////////
@ -525,7 +488,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
string kernelName = "icvCalcLayerDetAndTrace";
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
if(sumTex)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
}
else
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
}
args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
@ -535,6 +505,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
args.push_back( make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
args.push_back( make_pair( sizeof(cl_int), (void *)&c_layer_rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3] =
@ -543,7 +514,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
1
};
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
@ -573,16 +551,30 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
if(use_mask)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex));
if(maskSumTex)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex));
}
else
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
}
args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
}
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
1
};
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter,
@ -607,7 +599,14 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
size_t localThreads[3] = {3, 3, 3};
size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
@ -617,16 +616,31 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
if(sumTex)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
}
else
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
}
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
size_t localThreads[3] = {32, 4, 1};
size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
@ -649,12 +663,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
globalThreads[1] = 16 * localThreads[1];
args.clear();
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
if(imgTex)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
}
else
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
}
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
kernelName = "normalize_descriptors64";
@ -667,7 +698,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.clear();
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
else
{
@ -680,12 +718,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
globalThreads[1] = 16 * localThreads[1];
args.clear();
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
if(imgTex)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
}
else
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
}
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
kernelName = "normalize_descriptors128";
@ -698,7 +753,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.clear();
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
}