mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 14:13:15 +08:00
Merge pull request #808 from bitwangyaoyao:2.4_mac
This commit is contained in:
commit
36028bd8ad
@ -77,7 +77,7 @@ namespace cv
|
||||
|
||||
size_t wave_size = 0;
|
||||
queryDeviceInfo(WAVEFRONT_SIZE, &wave_size);
|
||||
std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast<int>(wave_size));
|
||||
std::sprintf(pSURF_OPTIONS, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
|
||||
OPTION_INIT = true;
|
||||
}
|
||||
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS);
|
||||
|
@ -277,8 +277,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
|
||||
char compile_option[128];
|
||||
sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s",
|
||||
anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
|
||||
rectKernel?"-D RECTKERNEL":"",
|
||||
s);
|
||||
s, rectKernel?"-D RECTKERNEL":"");
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
|
||||
|
@ -330,16 +330,14 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src
|
||||
if (x < thread_cols && y < rows)
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x) + src_offset);
|
||||
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
|
||||
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
|
||||
|
||||
uchar data0 = *(src + src_index_0);
|
||||
uchar data1 = *(src + src_index_1);
|
||||
|
||||
*(dst + dst_index_0) = data1;
|
||||
*(dst + dst_index_1) = data0;
|
||||
|
||||
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
|
||||
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
|
||||
uchar data1 = *(src + src_index_1);
|
||||
*(dst + dst_index_0) = data1;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset,
|
||||
|
@ -96,18 +96,18 @@ The info above maybe obsolete.
|
||||
***********************************************************************************/
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
|
||||
(__global const uchar * restrict src,
|
||||
__global float * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_x,
|
||||
const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int radiusy,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
|
||||
(__global const uchar * restrict src,
|
||||
__global float * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_x,
|
||||
const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int radiusy,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
|
||||
{
|
||||
int x = get_global_id(0)<<2;
|
||||
int y = get_global_id(1);
|
||||
@ -122,17 +122,17 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
uchar4 temp[READ_TIMES_ROW];
|
||||
|
||||
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
|
||||
#ifdef BORDER_CONSTANT
|
||||
#ifdef BORDER_CONSTANT
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
int current_addr = start_addr+i*LSIZE0*4;
|
||||
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
|
||||
temp[i] = *(__global uchar4*)&src[current_addr];
|
||||
}
|
||||
//judge if read out of boundary
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
temp[i].x= ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x);
|
||||
temp[i].y= ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y);
|
||||
@ -140,7 +140,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
temp[i].w= ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w);
|
||||
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
|
||||
}
|
||||
#else
|
||||
#else
|
||||
int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
|
||||
int4 index[READ_TIMES_ROW];
|
||||
int4 addr;
|
||||
@ -148,7 +148,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
if(not_all_in_range)
|
||||
{
|
||||
//judge if read out of boundary
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
index[i].x= ADDR_L(start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4);
|
||||
index[i].x= ADDR_R(start_x+i*LSIZE0*4,src_whole_cols,index[i].x);
|
||||
@ -162,7 +162,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
|
||||
s_y= ADDR_R(start_y,src_whole_rows,s_y);
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
|
||||
temp[i].x = src[addr.x];
|
||||
@ -174,15 +174,15 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
else
|
||||
{
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//save pixels to lds
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
|
||||
}
|
||||
@ -190,7 +190,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
|
||||
//read pixels from lds and calculate the result
|
||||
sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
|
||||
for(i=1;i<=RADIUSX;i++)
|
||||
for(i=1; i<=RADIUSX; i++)
|
||||
{
|
||||
temp[0]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset-i);
|
||||
temp[1]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset+i);
|
||||
@ -219,18 +219,18 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
}
|
||||
}
|
||||
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
|
||||
(__global const uchar4 * restrict src,
|
||||
__global float4 * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_x,
|
||||
const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int radiusy,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
|
||||
(__global const uchar4 * restrict src,
|
||||
__global float4 * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_x,
|
||||
const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int radiusy,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@ -244,26 +244,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
uchar4 temp[READ_TIMES_ROW];
|
||||
|
||||
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
|
||||
#ifdef BORDER_CONSTANT
|
||||
#ifdef BORDER_CONSTANT
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
int current_addr = start_addr+i*LSIZE0;
|
||||
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
|
||||
temp[i] = src[current_addr];
|
||||
}
|
||||
//judge if read out of boundary
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]);
|
||||
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
|
||||
}
|
||||
#else
|
||||
#else
|
||||
int index[READ_TIMES_ROW];
|
||||
int s_x,s_y;
|
||||
//judge if read out of boundary
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
|
||||
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
|
||||
@ -272,14 +272,14 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
index[i]=mad24(s_y,src_step_in_pixel,s_x);
|
||||
}
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
temp[i] = src[index[i]];
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//save pixels to lds
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
|
||||
}
|
||||
@ -287,7 +287,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
|
||||
//read pixels from lds and calculate the result
|
||||
sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
|
||||
for(i=1;i<=RADIUSX;i++)
|
||||
for(i=1; i<=RADIUSX; i++)
|
||||
{
|
||||
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
|
||||
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
|
||||
@ -302,18 +302,18 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
}
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
|
||||
(__global const float * restrict src,
|
||||
__global float * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_x,
|
||||
const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int radiusy,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
|
||||
(__global const float * restrict src,
|
||||
__global float * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_x,
|
||||
const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int radiusy,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@ -327,26 +327,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
float temp[READ_TIMES_ROW];
|
||||
|
||||
__local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
|
||||
#ifdef BORDER_CONSTANT
|
||||
#ifdef BORDER_CONSTANT
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
int current_addr = start_addr+i*LSIZE0;
|
||||
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
|
||||
temp[i] = src[current_addr];
|
||||
}
|
||||
//judge if read out of boundary
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
|
||||
temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
|
||||
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(float)0,temp[i]);
|
||||
temp[i]= ELEM(start_y,0,src_whole_rows,(float)0,temp[i]);
|
||||
}
|
||||
#else
|
||||
#else
|
||||
int index[READ_TIMES_ROW];
|
||||
int s_x,s_y;
|
||||
//judge if read out of boundary
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
|
||||
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
|
||||
@ -355,14 +355,14 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
index[i]=mad24(s_y,src_step_in_pixel,s_x);
|
||||
}
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
temp[i] = src[index[i]];
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//save pixels to lds
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
|
||||
}
|
||||
@ -370,7 +370,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
|
||||
//read pixels from lds and calculate the result
|
||||
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
|
||||
for(i=1;i<=RADIUSX;i++)
|
||||
for(i=1; i<=RADIUSX; i++)
|
||||
{
|
||||
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
|
||||
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
|
||||
@ -385,18 +385,18 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
}
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
|
||||
(__global const float4 * restrict src,
|
||||
__global float4 * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_x,
|
||||
const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int radiusy,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
|
||||
(__global const float4 * restrict src,
|
||||
__global float4 * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_x,
|
||||
const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int radiusy,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@ -410,26 +410,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
float4 temp[READ_TIMES_ROW];
|
||||
|
||||
__local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
|
||||
#ifdef BORDER_CONSTANT
|
||||
#ifdef BORDER_CONSTANT
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
int current_addr = start_addr+i*LSIZE0;
|
||||
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
|
||||
temp[i] = src[current_addr];
|
||||
}
|
||||
//judge if read out of boundary
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
|
||||
temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
|
||||
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(float4)0,temp[i]);
|
||||
temp[i]= ELEM(start_y,0,src_whole_rows,(float4)0,temp[i]);
|
||||
}
|
||||
#else
|
||||
#else
|
||||
int index[READ_TIMES_ROW];
|
||||
int s_x,s_y;
|
||||
//judge if read out of boundary
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
|
||||
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
|
||||
@ -438,14 +438,14 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
index[i]=mad24(s_y,src_step_in_pixel,s_x);
|
||||
}
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
temp[i] = src[index[i]];
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//save pixels to lds
|
||||
for(i = 0;i<READ_TIMES_ROW;i++)
|
||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||
{
|
||||
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
|
||||
}
|
||||
@ -453,7 +453,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
|
||||
//read pixels from lds and calculate the result
|
||||
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
|
||||
for(i=1;i<=RADIUSX;i++)
|
||||
for(i=1; i<=RADIUSX; i++)
|
||||
{
|
||||
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
|
||||
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
|
||||
@ -465,4 +465,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
start_addr = mad24(y,dst_step_in_pixel,x);
|
||||
dst[start_addr] = sum;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -114,7 +114,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
int groupX_size = get_local_size(0);
|
||||
int groupX_id = get_group_id(0);
|
||||
|
||||
#define dst_align (dst_offset_x & 3)
|
||||
#define dst_align (dst_offset_x & 3)
|
||||
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
|
||||
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
|
||||
|
||||
@ -125,7 +125,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
{
|
||||
if((rows_start_index - src_offset_y) + i < rows + ANY)
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
#ifdef BORDER_CONSTANT
|
||||
int selected_row = rows_start_index + i;
|
||||
int selected_cols = cols_start_index_group + lX;
|
||||
|
||||
@ -143,7 +143,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#else
|
||||
#else
|
||||
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
|
||||
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
|
||||
|
||||
@ -162,7 +162,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
data = *(src + selected_row * src_step + selected_cols);
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -185,17 +185,17 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
|
||||
for(int i = 0; i < ANCHOR; i++)
|
||||
{
|
||||
#pragma unroll 3
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
#pragma unroll 3
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
|
||||
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
||||
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
|
||||
}
|
||||
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
||||
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -207,7 +207,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
|
||||
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
|
||||
@ -225,7 +225,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
|
||||
int groupX_size = get_local_size(0);
|
||||
int groupX_id = get_group_id(0);
|
||||
|
||||
#define dst_align (dst_offset_x & 3)
|
||||
#define dst_align (dst_offset_x & 3)
|
||||
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
|
||||
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
|
||||
|
||||
@ -236,7 +236,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
|
||||
{
|
||||
if((rows_start_index - src_offset_y) + i < rows + ANY)
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
#ifdef BORDER_CONSTANT
|
||||
int selected_row = rows_start_index + i;
|
||||
int selected_cols = cols_start_index_group + lX;
|
||||
|
||||
@ -254,7 +254,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#else
|
||||
#else
|
||||
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
|
||||
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
|
||||
|
||||
@ -272,7 +272,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
|
||||
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -295,17 +295,17 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
|
||||
|
||||
for(int i = 0; i < ANCHOR; i++)
|
||||
{
|
||||
#pragma unroll 3
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
#pragma unroll 3
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
|
||||
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
||||
sum = sum + (mat_kernel[i * ANCHOR + j] * data);
|
||||
}
|
||||
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
||||
sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -318,7 +318,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
|
||||
|
||||
*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -337,7 +337,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
|
||||
int groupX_size = get_local_size(0);
|
||||
int groupX_id = get_group_id(0);
|
||||
|
||||
#define dst_align (dst_offset_x & 3)
|
||||
#define dst_align (dst_offset_x & 3)
|
||||
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
|
||||
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
|
||||
|
||||
@ -349,7 +349,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
|
||||
{
|
||||
if((rows_start_index - src_offset_y) + i < rows + ANY)
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
#ifdef BORDER_CONSTANT
|
||||
int selected_row = rows_start_index + i;
|
||||
int selected_cols = cols_start_index_group + lX;
|
||||
|
||||
@ -367,7 +367,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#else
|
||||
#else
|
||||
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
|
||||
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
|
||||
|
||||
@ -386,7 +386,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
|
||||
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -410,17 +410,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
|
||||
|
||||
for(int i = 0; i < ANCHOR; i++)
|
||||
{
|
||||
#pragma unroll 3
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
#pragma unroll 3
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
|
||||
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
|
||||
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
|
||||
}
|
||||
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
|
||||
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -468,7 +468,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
|
||||
{
|
||||
if((rows_start_index - src_offset_y) + i < rows + ANY)
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
#ifdef BORDER_CONSTANT
|
||||
int selected_row = rows_start_index + i;
|
||||
int selected_cols = cols_start_index_group + lX;
|
||||
|
||||
@ -486,7 +486,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#else
|
||||
#else
|
||||
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
|
||||
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
|
||||
|
||||
@ -504,7 +504,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
|
||||
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
|
||||
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -519,10 +519,10 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
|
||||
|
||||
for(int i = 0; i < ANCHOR; i++)
|
||||
{
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
int local_cols = lX + j;
|
||||
sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols];
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
int local_cols = lX + j;
|
||||
sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -44,7 +44,11 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
#define LSIZE 256
|
||||
#define LSIZE_1 255
|
||||
@ -71,13 +75,13 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
|
||||
gid = gid << 1;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
|
||||
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
|
||||
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0);
|
||||
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0);
|
||||
|
||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
@ -127,7 +131,8 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
|
||||
if(lid > 0 && (i+lid) <= rows){
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
||||
@ -169,15 +174,15 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
|
||||
src_step = src_step >> 4;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
|
||||
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0;
|
||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
|
||||
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0;
|
||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0;
|
||||
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
|
||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0;
|
||||
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
|
||||
|
||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
@ -228,14 +233,14 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(gid == 0 && (i + lid) <= rows)
|
||||
{
|
||||
sum[sum_offset + i + lid] = 0;
|
||||
sqsum[sqsum_offset + i + lid] = 0;
|
||||
sum[sum_offset + i + lid] = 0;
|
||||
sqsum[sqsum_offset + i + lid] = 0;
|
||||
}
|
||||
if(i + lid == 0)
|
||||
{
|
||||
int loc0 = gid * 2 * sum_step;
|
||||
int loc1 = gid * 2 * sqsum_step;
|
||||
for(int k = 1;k <= 8;k++)
|
||||
for(int k = 1; k <= 8; k++)
|
||||
{
|
||||
if(gid * 8 + k > cols) break;
|
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
|
||||
@ -244,7 +249,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
|
||||
}
|
||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
|
||||
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
|
||||
if(lid > 0 && (i+lid) <= rows){
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
||||
|
@ -47,8 +47,12 @@
|
||||
//warpAffine kernel
|
||||
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
|
||||
|
||||
#if defined DOUBLE_SUPPORT
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
typedef double F;
|
||||
typedef double4 F4;
|
||||
#define convert_F4 convert_double4
|
||||
@ -58,7 +62,6 @@ typedef float4 F4;
|
||||
#define convert_F4 convert_float4
|
||||
#endif
|
||||
|
||||
|
||||
#define INTER_BITS 5
|
||||
#define INTER_TAB_SIZE (1 << INTER_BITS)
|
||||
#define INTER_SCALE 1.f/INTER_TAB_SIZE
|
||||
@ -81,8 +84,8 @@ inline void interpolateCubic( float x, float* coeffs )
|
||||
/**********************************************8UC1*********************************************
|
||||
***********************************************************************************************/
|
||||
__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -123,14 +126,14 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
|
||||
sval.s1 = scon.s1 ? src[spos.s1] : 0;
|
||||
sval.s2 = scon.s2 ? src[spos.s2] : 0;
|
||||
sval.s3 = scon.s3 ? src[spos.s3] : 0;
|
||||
dval = convert_uchar4(dcon != 0) ? sval : dval;
|
||||
dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
|
||||
*d = dval;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -180,7 +183,7 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
|
||||
spos1 = src_offset + sy * srcStep + sx + 1;
|
||||
spos2 = src_offset + (sy+1) * srcStep + sx;
|
||||
spos3 = src_offset + (sy+1) * srcStep + sx + 1;
|
||||
|
||||
|
||||
v0.s0 = scon0.s0 ? src[spos0.s0] : 0;
|
||||
v1.s0 = scon1.s0 ? src[spos1.s0] : 0;
|
||||
v2.s0 = scon2.s0 ? src[spos2.s0] : 0;
|
||||
@ -200,22 +203,22 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
|
||||
v1.s3 = scon1.s3 ? src[spos1.s3] : 0;
|
||||
v2.s3 = scon2.s3 ? src[spos2.s3] : 0;
|
||||
v3.s3 = scon3.s3 ? src[spos3.s3] : 0;
|
||||
|
||||
|
||||
short4 itab0, itab1, itab2, itab3;
|
||||
float4 taby, tabx;
|
||||
taby = INTER_SCALE * convert_float4(ay);
|
||||
tabx = INTER_SCALE * convert_float4(ax);
|
||||
|
||||
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
|
||||
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
|
||||
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
|
||||
itab3 = convert_short4_sat(( taby*tabx * INTER_REMAP_COEF_SCALE ));
|
||||
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
|
||||
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE ));
|
||||
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
|
||||
itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE ));
|
||||
|
||||
|
||||
int4 val;
|
||||
uchar4 tval;
|
||||
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
|
||||
+ convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
|
||||
+ convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
|
||||
tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
|
||||
|
||||
__global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
|
||||
@ -228,8 +231,8 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
|
||||
}
|
||||
|
||||
__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -255,10 +258,10 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
|
||||
|
||||
#pragma unroll 4
|
||||
for(i=0; i<4; i++)
|
||||
for(j=0; j<4; j++)
|
||||
{
|
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
|
||||
}
|
||||
for(j=0; j<4; j++)
|
||||
{
|
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
|
||||
}
|
||||
|
||||
short itab[16];
|
||||
float tab1y[4], tab1x[4];
|
||||
@ -288,7 +291,7 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
|
||||
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
|
||||
mk1 = k1, mk2 = k2;
|
||||
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
|
||||
Mk1 = k1, Mk2 = k2;
|
||||
Mk1 = k1, Mk2 = k2;
|
||||
}
|
||||
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
|
||||
}
|
||||
@ -309,8 +312,8 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
|
||||
***********************************************************************************************/
|
||||
|
||||
__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -333,8 +336,8 @@ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global
|
||||
}
|
||||
|
||||
__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -386,8 +389,8 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo
|
||||
}
|
||||
|
||||
__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -418,10 +421,10 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
|
||||
int i,j;
|
||||
#pragma unroll 4
|
||||
for(i=0; i<4; i++)
|
||||
for(j=0; j<4; j++)
|
||||
{
|
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
|
||||
}
|
||||
for(j=0; j<4; j++)
|
||||
{
|
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
|
||||
}
|
||||
int itab[16];
|
||||
float tab1y[4], tab1x[4];
|
||||
float axx, ayy;
|
||||
@ -447,14 +450,14 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
|
||||
int diff = isum - INTER_REMAP_COEF_SCALE;
|
||||
int Mk1=2, Mk2=2, mk1=2, mk2=2;
|
||||
|
||||
for( k1 = 2; k1 < 4; k1++ )
|
||||
for( k1 = 2; k1 < 4; k1++ )
|
||||
for( k2 = 2; k2 < 4; k2++ )
|
||||
{
|
||||
|
||||
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
|
||||
mk1 = k1, mk2 = k2;
|
||||
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
|
||||
Mk1 = k1, Mk2 = k2;
|
||||
Mk1 = k1, Mk2 = k2;
|
||||
}
|
||||
|
||||
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
|
||||
@ -477,8 +480,8 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
|
||||
***********************************************************************************************/
|
||||
|
||||
__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -501,8 +504,8 @@ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int
|
||||
}
|
||||
|
||||
__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -548,12 +551,12 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
|
||||
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||
dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -617,8 +620,8 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
|
||||
***********************************************************************************************/
|
||||
|
||||
__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -636,13 +639,13 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i
|
||||
short sy0 = (short)(Y0 >> AB_BITS);
|
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0;
|
||||
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : (float4)0;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -670,10 +673,10 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
|
||||
|
||||
float4 v0, v1, v2, v3;
|
||||
|
||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
|
||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
|
||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
|
||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
|
||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
|
||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
|
||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
|
||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
|
||||
|
||||
float tab[4];
|
||||
float taby[2], tabx[2];
|
||||
@ -691,12 +694,12 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
|
||||
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||
dst[dst_offset+dy*dstStep+dx] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -726,7 +729,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
|
||||
int i;
|
||||
|
||||
for(i=0; i<16; i++)
|
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
|
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
|
||||
|
||||
float tab[16];
|
||||
float tab1y[4], tab1x[4];
|
||||
@ -754,5 +757,5 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
|
||||
dst[dst_offset+dy*dstStep+dx] = sum;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -47,8 +47,12 @@
|
||||
//wrapPerspective kernel
|
||||
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
|
||||
|
||||
#if defined DOUBLE_SUPPORT
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
typedef double F;
|
||||
typedef double4 F4;
|
||||
#define convert_F4 convert_double4
|
||||
@ -81,8 +85,8 @@ inline void interpolateCubic( float x, float* coeffs )
|
||||
/**********************************************8UC1*********************************************
|
||||
***********************************************************************************************/
|
||||
__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -112,14 +116,14 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
|
||||
sval.s1 = scon.s1 ? src[spos.s1] : 0;
|
||||
sval.s2 = scon.s2 ? src[spos.s2] : 0;
|
||||
sval.s3 = scon.s3 ? src[spos.s3] : 0;
|
||||
dval = convert_uchar4(dcon != 0) ? sval : dval;
|
||||
dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
|
||||
*d = dval;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -142,7 +146,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
|
||||
int i;
|
||||
#pragma unroll 4
|
||||
for(i=0; i<4; i++)
|
||||
v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0;
|
||||
v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0;
|
||||
|
||||
short itab[4];
|
||||
float tab1y[2], tab1x[2];
|
||||
@ -170,8 +174,8 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
|
||||
}
|
||||
|
||||
__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -190,15 +194,15 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
|
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1));
|
||||
short ax = (short)(X & (INTER_TAB_SIZE-1));
|
||||
|
||||
uchar v[16];
|
||||
uchar v[16];
|
||||
int i, j;
|
||||
|
||||
#pragma unroll 4
|
||||
for(i=0; i<4; i++)
|
||||
for(j=0; j<4; j++)
|
||||
{
|
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
|
||||
}
|
||||
for(j=0; j<4; j++)
|
||||
{
|
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0;
|
||||
}
|
||||
|
||||
short itab[16];
|
||||
float tab1y[4], tab1x[4];
|
||||
@ -227,7 +231,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
|
||||
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
|
||||
mk1 = k1, mk2 = k2;
|
||||
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
|
||||
Mk1 = k1, Mk2 = k2;
|
||||
Mk1 = k1, Mk2 = k2;
|
||||
}
|
||||
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
|
||||
}
|
||||
@ -249,8 +253,8 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
|
||||
***********************************************************************************************/
|
||||
|
||||
__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -273,8 +277,8 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl
|
||||
}
|
||||
|
||||
__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -299,10 +303,10 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
|
||||
|
||||
int4 v0, v1, v2, v3;
|
||||
|
||||
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0;
|
||||
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0;
|
||||
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0;
|
||||
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0;
|
||||
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0;
|
||||
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0;
|
||||
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0;
|
||||
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0;
|
||||
|
||||
int itab0, itab1, itab2, itab3;
|
||||
float taby, tabx;
|
||||
@ -323,8 +327,8 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
|
||||
}
|
||||
|
||||
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -352,10 +356,10 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
|
||||
int i,j;
|
||||
#pragma unroll 4
|
||||
for(i=0; i<4; i++)
|
||||
for(j=0; j<4; j++)
|
||||
{
|
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
|
||||
}
|
||||
for(j=0; j<4; j++)
|
||||
{
|
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
|
||||
}
|
||||
int itab[16];
|
||||
float tab1y[4], tab1x[4];
|
||||
float axx, ayy;
|
||||
@ -381,14 +385,14 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
|
||||
int diff = isum - INTER_REMAP_COEF_SCALE;
|
||||
int Mk1=2, Mk2=2, mk1=2, mk2=2;
|
||||
|
||||
for( k1 = 2; k1 < 4; k1++ )
|
||||
for( k1 = 2; k1 < 4; k1++ )
|
||||
for( k2 = 2; k2 < 4; k2++ )
|
||||
{
|
||||
|
||||
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
|
||||
mk1 = k1, mk2 = k2;
|
||||
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
|
||||
Mk1 = k1, Mk2 = k2;
|
||||
Mk1 = k1, Mk2 = k2;
|
||||
}
|
||||
|
||||
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
|
||||
@ -411,8 +415,8 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
|
||||
***********************************************************************************************/
|
||||
|
||||
__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -434,8 +438,8 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst
|
||||
}
|
||||
|
||||
__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -458,10 +462,10 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
|
||||
|
||||
float v0, v1, v2, v3;
|
||||
|
||||
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0;
|
||||
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0;
|
||||
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0;
|
||||
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0;
|
||||
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0;
|
||||
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0;
|
||||
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0;
|
||||
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0;
|
||||
|
||||
float tab[4];
|
||||
float taby[2], tabx[2];
|
||||
@ -483,8 +487,8 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
|
||||
}
|
||||
|
||||
__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -510,7 +514,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
|
||||
int i;
|
||||
|
||||
for(i=0; i<16; i++)
|
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
|
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0;
|
||||
|
||||
float tab[16];
|
||||
float tab1y[4], tab1x[4];
|
||||
@ -546,8 +550,8 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
|
||||
***********************************************************************************************/
|
||||
|
||||
__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -564,13 +568,13 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d
|
||||
short sy = (short)Y;
|
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0;
|
||||
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float)0;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int dst_cols, int dst_rows, int srcStep, int dstStep,
|
||||
int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -597,10 +601,10 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
|
||||
|
||||
float4 v0, v1, v2, v3;
|
||||
|
||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
|
||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
|
||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
|
||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
|
||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
|
||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
|
||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
|
||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
|
||||
|
||||
float tab[4];
|
||||
float taby[2], tabx[2];
|
||||
@ -622,8 +626,8 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
|
||||
}
|
||||
|
||||
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
|
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -652,7 +656,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
|
||||
int i;
|
||||
|
||||
for(i=0; i<16; i++)
|
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
|
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
|
||||
|
||||
float tab[16];
|
||||
float tab1y[4], tab1x[4];
|
||||
@ -680,5 +684,6 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
|
||||
dst[dst_offset+dy*dstStep+dx] = sum;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -447,10 +447,10 @@ void matchTemplate_Naive_CCORR_C1_D0
|
||||
__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
|
||||
for(j = 0; j < tpl_cols; j ++)
|
||||
{
|
||||
sum = mad24(img_ptr[j], tpl_ptr[j], sum);
|
||||
sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum);
|
||||
}
|
||||
}
|
||||
res[res_idx] = sum;
|
||||
res[res_idx] = (float)sum;
|
||||
}
|
||||
}
|
||||
|
||||
@ -548,7 +548,7 @@ void matchTemplate_Naive_CCORR_C4_D0
|
||||
sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
|
||||
}
|
||||
}
|
||||
res[res_idx] = sum.x + sum.y + sum.z + sum.w;
|
||||
res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w);
|
||||
}
|
||||
}
|
||||
|
||||
@ -633,9 +633,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0
|
||||
|
||||
if(gidx < res_cols && gidy < res_rows)
|
||||
{
|
||||
float sum = (float)(
|
||||
(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
|
||||
- (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
|
||||
float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
|
||||
-(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
|
||||
res[res_idx] -= sum * tpl_sum;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user