ocl_calcOpticalFlowPyrLK optimizations

1. decrease branch number in CL code by replacing them into weights
2. decrease local mem pressure in reduce operation by using private variables
3. decrease image sampler pressure by caching data into local memory
4. remove unnecessary sync point on the HOST side.
This commit is contained in:
krodyush 2014-03-19 19:31:14 +04:00
parent a2dec6c34e
commit 65d64af2a8
2 changed files with 116 additions and 96 deletions

View File

@ -989,7 +989,7 @@ namespace cv
idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y
idxArg = kernel.set(idxArg, (int)iters); // int c_iters
idxArg = kernel.set(idxArg, (char)calcErr); //char calcErr
return kernel.run(2, globalThreads, localThreads, true);
return kernel.run(2, globalThreads, localThreads, false);
}
private:
inline static bool isDeviceCPU()

View File

@ -45,11 +45,15 @@
//
//M*/
#define BUFFER 64
#define BUFFER2 BUFFER>>1
#define GRIDSIZE 3
#define LSx 8
#define LSy 8
#define BUFFER (LSx*LSy)
#define BUFFER2 BUFFER>>1
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
#ifdef CPU
inline void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
@ -128,24 +132,21 @@ inline void reduce3(float val1, float val2, float val3,
#if WAVE_SIZE <16
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
if (tid<1)
{
#endif
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
smem3[tid] += smem3[tid + 8];
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
smem3[tid] += smem3[tid + 4];
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
smem3[tid] += smem3[tid + 2];
smem1[tid] += smem1[tid + 1];
smem2[tid] += smem2[tid + 1];
smem3[tid] += smem3[tid + 1];
local float8* m1 = (local float8*)smem1;
local float8* m2 = (local float8*)smem2;
local float8* m3 = (local float8*)smem3;
float8 t1 = m1[0]+m1[1];
float8 t2 = m2[0]+m2[1];
float8 t3 = m3[0]+m3[1];
float4 t14 = t1.lo + t1.hi;
float4 t24 = t2.lo + t2.hi;
float4 t34 = t3.lo + t3.hi;
smem1[0] = t14.x+t14.y+t14.z+t14.w;
smem2[0] = t24.x+t24.y+t24.z+t24.w;
smem3[0] = t34.x+t34.y+t34.z+t34.w;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@ -171,20 +172,17 @@ inline void reduce2(float val1, float val2, __local volatile float* smem1, __loc
#if WAVE_SIZE <16
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
if (tid<1)
{
#endif
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
smem1[tid] += smem1[tid + 1];
smem2[tid] += smem2[tid + 1];
local float8* m1 = (local float8*)smem1;
local float8* m2 = (local float8*)smem2;
float8 t1 = m1[0]+m1[1];
float8 t2 = m2[0]+m2[1];
float4 t14 = t1.lo + t1.hi;
float4 t24 = t2.lo + t2.hi;
smem1[0] = t14.x+t14.y+t14.z+t14.w;
smem2[0] = t24.x+t24.y+t24.z+t24.w;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@ -207,13 +205,13 @@ inline void reduce1(float val1, __local volatile float* smem1, int tid)
#if WAVE_SIZE <16
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
if (tid<1)
{
#endif
smem1[tid] += smem1[tid + 8];
smem1[tid] += smem1[tid + 4];
smem1[tid] += smem1[tid + 2];
smem1[tid] += smem1[tid + 1];
local float8* m1 = (local float8*)smem1;
float8 t1 = m1[0]+m1[1];
float4 t14 = t1.lo + t1.hi;
smem1[0] = t14.x+t14.y+t14.z+t14.w;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@ -225,18 +223,21 @@ inline void reduce1(float val1, __local volatile float* smem1, int tid)
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
inline void SetPatch(image2d_t I, float x, float y,
// macro to get pixel value from local memory
#define VAL(_y,_x,_yy,_xx) (IPatchLocal[yid+((_y)*LSy)+1+(_yy)][xid+((_x)*LSx)+1+(_xx)])
inline void SetPatch(local float IPatchLocal[LSy*GRIDSIZE+2][LSx*GRIDSIZE+2], int TileY, int TileX,
float* Pch, float* Dx, float* Dy,
float* A11, float* A12, float* A22)
float* A11, float* A12, float* A22, float w)
{
*Pch = read_imagef(I, sampler, (float2)(x, y)).x;
unsigned int xid=get_local_id(0);
unsigned int yid=get_local_id(1);
*Pch = VAL(TileY,TileX,0,0);
float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
(3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
(3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
float dIdx = (3.0f*VAL(TileY,TileX,-1,1)+10.0f*VAL(TileY,TileX,0,1)+3.0f*VAL(TileY,TileX,+1,1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,0,-1)+3.0f*VAL(TileY,TileX,+1,-1));
float dIdy = (3.0f*VAL(TileY,TileX,1,-1)+10.0f*VAL(TileY,TileX,1,0)+3.0f*VAL(TileY,TileX,1,+1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,-1,0)+3.0f*VAL(TileY,TileX,-1,+1));
dIdx *= w;
dIdy *= w;
*Dx = dIdx;
*Dy = dIdy;
@ -245,6 +246,7 @@ inline void SetPatch(image2d_t I, float x, float y,
*A12 += dIdx * dIdy;
*A22 += dIdy * dIdy;
}
#undef VAL
inline void GetPatch(image2d_t J, float x, float y,
float* Pch, float* Dx, float* Dy,
@ -303,7 +305,38 @@ inline void GetError4(image2d_t J, const float x, const float y, const float4* P
*errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
}
#define GRIDSIZE 3
//macro to read pixel value into local memory.
#define READI(_y,_x) IPatchLocal[yid+((_y)*LSy)][xid+((_x)*LSx)] = read_imagef(I, sampler, (float2)(Point.x + xid+(_x)*LSx + 0.5f-1, Point.y + yid+(_y)*LSy+ 0.5f-1)).x;
void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float IPatchLocal[LSy*GRIDSIZE+2][LSx*GRIDSIZE+2])
{
unsigned int xid=get_local_id(0);
unsigned int yid=get_local_id(1);
//read (3*LSx)*(3*LSy) window. each macro call read LSx*LSy pixels block
READI(0,0);READI(0,1);READI(0,2);
READI(1,0);READI(1,1);READI(1,2);
READI(2,0);READI(2,1);READI(2,2);
if(xid<2)
{// read last 2 columns border. each macro call reads 2*LSy pixels block
READI(0,3);
READI(1,3);
READI(2,3);
}
if(yid<2)
{// read last 2 row. each macro call reads LSx*2 pixels block
READI(3,0);READI(3,1);READI(3,2);
}
if(yid<2 && xid<2)
{// read right bottom 2x2 corner. one macro call reads 2*2 pixels block
READI(3,3);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#undef READI
__attribute__((reqd_work_group_size(LSx, LSy, 1)))
__kernel void lkSparse(image2d_t I, image2d_t J,
__global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
@ -318,6 +351,8 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
unsigned int xsize=get_local_size(0);
unsigned int ysize=get_local_size(1);
int xBase, yBase, k;
float wx = ((xid+2*xsize)<c_winSize_x)?1:0;
float wy = ((yid+2*ysize)<c_winSize_y)?1:0;
float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
@ -346,65 +381,54 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
float dIdx_patch[GRIDSIZE][GRIDSIZE];
float dIdy_patch[GRIDSIZE][GRIDSIZE];
yBase=yid;
// local memory to read image with border to calc sobels
local float IPatchLocal[LSy*GRIDSIZE+2][LSx*GRIDSIZE+2];
ReadPatchIToLocalMem(I,prevPt,IPatchLocal);
{
xBase=xid;
SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
SetPatch(IPatchLocal, 0, 0,
&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
&A11, &A12, &A22);
&A11, &A12, &A22,1);
xBase+=xsize;
SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
SetPatch(IPatchLocal, 0, 1,
&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
&A11, &A12, &A22);
&A11, &A12, &A22,1);
xBase+=xsize;
if(xBase<c_winSize_x)
SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
&A11, &A12, &A22);
SetPatch(IPatchLocal, 0, 2,
&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
&A11, &A12, &A22,wx);
}
yBase+=ysize;
{
xBase=xid;
SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
SetPatch(IPatchLocal, 1, 0,
&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
&A11, &A12, &A22);
&A11, &A12, &A22,1);
xBase+=xsize;
SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
SetPatch(IPatchLocal, 1,1,
&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
&A11, &A12, &A22);
&A11, &A12, &A22,1);
xBase+=xsize;
if(xBase<c_winSize_x)
SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
&A11, &A12, &A22);
SetPatch(IPatchLocal, 1,2,
&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
&A11, &A12, &A22,wx);
}
yBase+=ysize;
if(yBase<c_winSize_y)
{
xBase=xid;
SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
SetPatch(IPatchLocal, 2,0,
&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
&A11, &A12, &A22);
&A11, &A12, &A22,wy);
xBase+=xsize;
SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
SetPatch(IPatchLocal, 2,1,
&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
&A11, &A12, &A22);
&A11, &A12, &A22,wy);
xBase+=xsize;
if(xBase<c_winSize_x)
SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
&A11, &A12, &A22);
SetPatch(IPatchLocal, 2,2,
&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
&A11, &A12, &A22,wx*wy);
}
reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
A11 = smem1[0];
@ -434,7 +458,7 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
{
if (tid == 0 && level == 0)
status[gid] = 0;
return;
break;
}
float b1 = 0;
@ -454,10 +478,9 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
&b1, &b2);
xBase+=xsize;
if(xBase<c_winSize_x)
GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
&b1, &b2);
GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
&b1, &b2);
}
yBase+=ysize;
{
@ -473,13 +496,11 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
&b1, &b2);
xBase+=xsize;
if(xBase<c_winSize_x)
GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
&b1, &b2);
GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
&b1, &b2);
}
yBase+=ysize;
if(yBase<c_winSize_y)
{
xBase=xid;
GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
@ -493,10 +514,9 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
&b1, &b2);
xBase+=xsize;
if(xBase<c_winSize_x)
GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
&b1, &b2);
GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
&b1, &b2);
}
reduce2(b1, b2, smem1, smem2, tid);