mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 14:13:15 +08:00
performance & bug fix for resize erode dilate sobel remap
This commit is contained in:
parent
a1d8091e10
commit
2e685dcf0a
@ -4,7 +4,7 @@ if(NOT HAVE_OPENCL)
|
||||
endif()
|
||||
|
||||
set(the_description "OpenCL-accelerated Computer Vision")
|
||||
ocv_add_module(ocl opencv_core opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree)
|
||||
ocv_add_module(ocl opencv_core opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_ts)
|
||||
|
||||
ocv_module_include_directories()
|
||||
|
||||
|
@ -325,7 +325,7 @@ PARAM_TEST_CASE(LaplacianTestBase, MatType, int)
|
||||
ksize = GET_PARAM(1);
|
||||
|
||||
cv::RNG& rng = TS::ptr()->get_rng();
|
||||
cv::Size size = cv::Size(2560, 2560);
|
||||
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
@ -468,7 +468,7 @@ PARAM_TEST_CASE(ErodeDilateBase, MatType, bool)
|
||||
// iterations = GET_PARAM(1);
|
||||
|
||||
cv::RNG& rng = TS::ptr()->get_rng();
|
||||
cv::Size size = cv::Size(2560, 2560);
|
||||
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
@ -679,7 +679,7 @@ PARAM_TEST_CASE(Sobel, MatType, int, int, int, int)
|
||||
dx = 2; dy=0;
|
||||
|
||||
cv::RNG& rng = TS::ptr()->get_rng();
|
||||
cv::Size size = cv::Size(2560, 2560);
|
||||
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
@ -817,7 +817,7 @@ PARAM_TEST_CASE(Scharr, MatType, int, int, int)
|
||||
dx = 1; dy=0;
|
||||
|
||||
cv::RNG& rng = TS::ptr()->get_rng();
|
||||
cv::Size size = cv::Size(2560, 2560);
|
||||
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
@ -956,7 +956,7 @@ PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int)
|
||||
bordertype = GET_PARAM(2);
|
||||
|
||||
cv::RNG& rng = TS::ptr()->get_rng();
|
||||
cv::Size size = cv::Size(2560, 2560);
|
||||
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
|
||||
|
||||
sigma1 = rng.uniform(0.1, 1.0);
|
||||
sigma2 = rng.uniform(0.1, 1.0);
|
||||
|
@ -177,9 +177,7 @@ namespace cv
|
||||
extern const char *filter_sep_row;
|
||||
extern const char *filter_sep_col;
|
||||
extern const char *filtering_laplacian;
|
||||
extern const char *filtering_erodeFilter;
|
||||
extern const char *filtering_dilateFilter;
|
||||
|
||||
extern const char *filtering_morph;
|
||||
}
|
||||
}
|
||||
|
||||
@ -334,28 +332,54 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c
|
||||
|
||||
int srcStep = src.step1() / src.channels();
|
||||
int dstStep = dst.step1() / dst.channels();
|
||||
int srcOffset = src.offset / src.channels() / src.elemSize1();
|
||||
int dstOffset = dst.offset / dst.channels() / dst.elemSize1();
|
||||
int minclos = -(srcOffset % srcStep);
|
||||
int maxclos = src.wholecols + minclos - 1;
|
||||
int minrows = -(srcOffset / srcStep);
|
||||
int maxrows = src.wholerows + minrows - 1;
|
||||
|
||||
//int D=src.depth();
|
||||
int srcOffset = src.offset / src.elemSize();
|
||||
int dstOffset = dst.offset / dst.elemSize();
|
||||
|
||||
int srcOffset_x=srcOffset%srcStep;
|
||||
int srcOffset_y=srcOffset/srcStep;
|
||||
Context *clCxt = src.clCxt;
|
||||
string kernelName;
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] * localThreads[0], (src.rows + localThreads[1]) / localThreads[1] * localThreads[1], 1};
|
||||
|
||||
string kernelName = "erode";
|
||||
|
||||
if(src.type()==CV_8UC1)
|
||||
{
|
||||
kernelName = "morph_C1_D0";
|
||||
globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
|
||||
CV_Assert( localThreads[0]*localThreads[1]*8 >= (localThreads[0]*4+ksize.width-1)*(localThreads[1]+ksize.height-1) );
|
||||
}
|
||||
else
|
||||
{
|
||||
kernelName = "morph";
|
||||
CV_Assert( localThreads[0]*localThreads[1]*2 >= (localThreads[0]+ksize.width-1)*(localThreads[1]+ksize.height-1) );
|
||||
}
|
||||
char s[64];
|
||||
switch(src.type())
|
||||
{
|
||||
case CV_8UC1:
|
||||
sprintf(s, "-D VAL=255");
|
||||
break;
|
||||
case CV_8UC3:
|
||||
case CV_8UC4:
|
||||
sprintf(s, "-D VAL=255 -D GENTYPE=uchar4");
|
||||
break;
|
||||
case CV_32FC1:
|
||||
sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float");
|
||||
break;
|
||||
case CV_32FC3:
|
||||
case CV_32FC4:
|
||||
sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float4");
|
||||
break;
|
||||
default:
|
||||
CV_Error(-217,"unsupported type");
|
||||
}
|
||||
char compile_option[128];
|
||||
sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s", anchor.x, anchor.y, localThreads[0], localThreads[1],s);
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&minclos));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&maxclos));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&minrows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&maxrows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
|
||||
@ -363,18 +387,8 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data));
|
||||
args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholecols));
|
||||
args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholerows));
|
||||
//args.push_back( make_pair( sizeof(cl_int),(void*)&ksize.width));
|
||||
//args.push_back( make_pair( sizeof(cl_int),(void*)&ksize.height));
|
||||
|
||||
size_t globalThreads[3] = {(src.cols + 15) / 16 * 16, (src.rows + 15) / 16 * 16, 1};
|
||||
if(src.channels() == 1)
|
||||
globalThreads[0] = ((src.cols + 9) / 4 + 15) / 16 * 16;
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
|
||||
char compile_option[128];
|
||||
sprintf(compile_option, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d", anchor.x, anchor.y, ksize.width, ksize.height);
|
||||
|
||||
openCLExecuteKernel(clCxt, &filtering_erodeFilter, kernelName, globalThreads, localThreads, args, src.channels(), src.depth(), compile_option);
|
||||
args.push_back( make_pair( sizeof(cl_int),(void*)&dstOffset));
|
||||
openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
|
||||
}
|
||||
|
||||
|
||||
@ -390,26 +404,54 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize,
|
||||
|
||||
int srcStep = src.step1() / src.channels();
|
||||
int dstStep = dst.step1() / dst.channels();
|
||||
int srcOffset = src.offset / src.channels() / src.elemSize1();
|
||||
int dstOffset = dst.offset / dst.channels() / dst.elemSize1();
|
||||
int minclos = -(srcOffset % srcStep);
|
||||
int maxclos = src.wholecols + minclos - 1;
|
||||
int minrows = -(srcOffset / srcStep);
|
||||
int maxrows = src.wholerows + minrows - 1;
|
||||
|
||||
int srcOffset = src.offset / src.elemSize();
|
||||
int dstOffset = dst.offset / dst.elemSize();
|
||||
|
||||
int srcOffset_x=srcOffset%srcStep;
|
||||
int srcOffset_y=srcOffset/srcStep;
|
||||
Context *clCxt = src.clCxt;
|
||||
string kernelName;
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] * localThreads[0], (src.rows + localThreads[1]) / localThreads[1] * localThreads[1], 1};
|
||||
|
||||
string kernelName = "dilate";
|
||||
if(src.type()==CV_8UC1)
|
||||
{
|
||||
kernelName = "morph_C1_D0";
|
||||
globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
|
||||
CV_Assert( localThreads[0]*localThreads[1]*8 >= (localThreads[0]*4+ksize.width-1)*(localThreads[1]+ksize.height-1) );
|
||||
}
|
||||
else
|
||||
{
|
||||
kernelName = "morph";
|
||||
CV_Assert( localThreads[0]*localThreads[1]*2 >= (localThreads[0]+ksize.width-1)*(localThreads[1]+ksize.height-1) );
|
||||
}
|
||||
char s[64];
|
||||
switch(src.type())
|
||||
{
|
||||
case CV_8UC1:
|
||||
sprintf(s, "-D VAL=0");
|
||||
break;
|
||||
case CV_8UC3:
|
||||
case CV_8UC4:
|
||||
sprintf(s, "-D VAL=0 -D GENTYPE=uchar4");
|
||||
break;
|
||||
case CV_32FC1:
|
||||
sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float");
|
||||
break;
|
||||
case CV_32FC3:
|
||||
case CV_32FC4:
|
||||
sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float4");
|
||||
break;
|
||||
default:
|
||||
CV_Error(-217,"unsupported type");
|
||||
}
|
||||
char compile_option[128];
|
||||
sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s", anchor.x, anchor.y, localThreads[0], localThreads[1],s);
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&minclos));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&maxclos));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&minrows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&maxrows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
|
||||
@ -417,15 +459,8 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize,
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data));
|
||||
args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholecols));
|
||||
args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholerows));
|
||||
|
||||
size_t globalThreads[3] = {(src.cols + 15) / 16 * 16, (src.rows + 15) / 16 * 16, 1};
|
||||
if(src.channels() == 1)
|
||||
globalThreads[0] = ((src.cols + 9) / 4 + 15) / 16 * 16;
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
char compile_option[128];
|
||||
sprintf(compile_option, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d", anchor.x, anchor.y, ksize.width, ksize.height);
|
||||
|
||||
openCLExecuteKernel(clCxt, &filtering_dilateFilter, kernelName, globalThreads, localThreads, args, src.channels(), src.depth(), compile_option);
|
||||
args.push_back( make_pair( sizeof(cl_int),(void*)&dstOffset));
|
||||
openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
|
||||
}
|
||||
|
||||
Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize, Point anchor)
|
||||
@ -739,7 +774,7 @@ namespace
|
||||
int src_type = src.type();
|
||||
|
||||
int cn = src.channels();
|
||||
dst.create(src_size, src_type);
|
||||
//dst.create(src_size, src_type);
|
||||
dst = Scalar(0.0);
|
||||
//dstBuf.create(src_size, src_type);
|
||||
dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn));
|
||||
@ -1265,8 +1300,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
|
||||
sprintf(btype, "BORDER_REFLECT_101");
|
||||
break;
|
||||
}
|
||||
char compile_option[128];
|
||||
sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s", anchor, localThreads[0], localThreads[1], channels, btype);
|
||||
char compile_option[256];
|
||||
|
||||
|
||||
size_t globalThreads[3];
|
||||
globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
|
||||
@ -1277,21 +1312,46 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
|
||||
{
|
||||
case 1:
|
||||
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
|
||||
sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
|
||||
anchor, localThreads[0], localThreads[1], channels, btype,"float","uchar","convert_uchar_sat");
|
||||
break;
|
||||
case 2:
|
||||
globalThreads[0] = ((dst.cols + 1) / 2 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
|
||||
sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
|
||||
anchor, localThreads[0], localThreads[1], channels, btype,"float2","uchar2","convert_uchar2_sat");
|
||||
break;
|
||||
case 3:
|
||||
globalThreads[0] = ((dst.cols * 3 + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
|
||||
break;
|
||||
case 4:
|
||||
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
|
||||
sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
|
||||
anchor, localThreads[0], localThreads[1], channels, btype,"float4","uchar4","convert_uchar4_sat");
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
|
||||
switch(dst.type())
|
||||
{
|
||||
case CV_32SC1:
|
||||
sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
|
||||
anchor, localThreads[0], localThreads[1], channels, btype,"float","int","convert_int_sat");
|
||||
break;
|
||||
case CV_32SC3:
|
||||
case CV_32SC4:
|
||||
sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
|
||||
anchor, localThreads[0], localThreads[1], channels, btype,"float4","int4","convert_int4_sat");
|
||||
break;
|
||||
case CV_32FC1:
|
||||
sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
|
||||
anchor, localThreads[0], localThreads[1], channels, btype,"float","float","");
|
||||
break;
|
||||
case CV_32FC3:
|
||||
case CV_32FC4:
|
||||
sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
|
||||
anchor, localThreads[0], localThreads[1], channels, btype,"float4","float4","");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//sanity checks
|
||||
@ -1321,7 +1381,7 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel));
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
|
||||
|
||||
openCLExecuteKernel(clCxt, &filter_sep_col, kernelName, globalThreads, localThreads, args, channels, dst.depth(), compile_option);
|
||||
openCLExecuteKernel(clCxt, &filter_sep_col, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
|
||||
}
|
||||
|
||||
Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel, int anchor, int bordertype, double delta)
|
||||
@ -1376,7 +1436,7 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
|
||||
{
|
||||
if( ddepth < 0 )
|
||||
ddepth = src.depth();
|
||||
CV_Assert(ddepth == src.depth());
|
||||
//CV_Assert(ddepth == src.depth());
|
||||
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
|
||||
|
||||
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
|
||||
|
@ -394,8 +394,15 @@ namespace cv
|
||||
args.push_back( make_pair(sizeof(cl_int),(void*)&map1.cols));
|
||||
args.push_back( make_pair(sizeof(cl_int),(void*)&map1.rows));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
|
||||
args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue));
|
||||
}
|
||||
if(src.clCxt -> impl -> double_support != 0)
|
||||
{
|
||||
args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back( make_pair(sizeof(cl_float4),(void*)&borderValue));
|
||||
}
|
||||
}
|
||||
openCLExecuteKernel(clCxt,&imgproc_remap,kernelName,globalThreads,localThreads,args,src.channels(),src.depth());
|
||||
}
|
||||
|
||||
|
@ -44,9 +44,9 @@
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "threadsafe.h"
|
||||
#include "Threadsafe.h"
|
||||
#include <iomanip>
|
||||
#include "binarycaching.hpp"
|
||||
#include "binaryCaching.hpp"
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
|
@ -90,9 +90,9 @@ Niko
|
||||
***********************************************************************************/
|
||||
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C1_D0
|
||||
(__global const float * restrict src,
|
||||
__global uchar * dst,
|
||||
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
|
||||
(__global const GENTYPE_SRC * restrict src,
|
||||
__global GENTYPE_DST * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
@ -111,10 +111,10 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_
|
||||
int start_addr = mad24(y,src_step_in_pixel,x);
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
int i;
|
||||
float sum;
|
||||
float temp[READ_TIMES_COL];
|
||||
GENTYPE_SRC sum;
|
||||
GENTYPE_SRC temp[READ_TIMES_COL];
|
||||
|
||||
__local float LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
|
||||
__local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
|
||||
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_COL;i++)
|
||||
@ -141,170 +141,6 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_
|
||||
if((x<dst_cols) & (y<dst_rows))
|
||||
{
|
||||
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
|
||||
dst[start_addr] = convert_uchar_sat(sum);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C4_D0
|
||||
(__global const float4 * restrict src,
|
||||
__global uchar4 * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
//const int src_offset_x,
|
||||
//const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int dst_offset_in_pixel,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int l_x = get_local_id(0);
|
||||
int l_y = get_local_id(1);
|
||||
int start_addr = mad24(y,src_step_in_pixel,x);
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
int i;
|
||||
float4 sum;
|
||||
float4 temp[READ_TIMES_COL];
|
||||
|
||||
__local float4 LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
|
||||
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_COL;i++)
|
||||
{
|
||||
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
|
||||
current_addr = current_addr < end_addr ? current_addr : 0;
|
||||
temp[i] = src[current_addr];
|
||||
}
|
||||
//save pixels to lds
|
||||
for(i = 0;i<READ_TIMES_COL;i++)
|
||||
{
|
||||
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//read pixels from lds and calculate the result
|
||||
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
|
||||
for(i=1;i<=RADIUSY;i++)
|
||||
{
|
||||
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
|
||||
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
|
||||
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
|
||||
}
|
||||
//write the result to dst
|
||||
if((x<dst_cols) & (y<dst_rows))
|
||||
{
|
||||
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
|
||||
dst[start_addr] = convert_uchar4_sat(sum);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C1_D5
|
||||
(__global const float * restrict src,
|
||||
__global float * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
//const int src_offset_x,
|
||||
//const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int dst_offset_in_pixel,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int l_x = get_local_id(0);
|
||||
int l_y = get_local_id(1);
|
||||
int start_addr = mad24(y,src_step_in_pixel,x);
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
int i;
|
||||
float sum;
|
||||
float temp[READ_TIMES_COL];
|
||||
|
||||
__local float LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
|
||||
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_COL;i++)
|
||||
{
|
||||
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
|
||||
current_addr = current_addr < end_addr ? current_addr : 0;
|
||||
temp[i] = src[current_addr];
|
||||
}
|
||||
//save pixels to lds
|
||||
for(i = 0;i<READ_TIMES_COL;i++)
|
||||
{
|
||||
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//read pixels from lds and calculate the result
|
||||
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
|
||||
for(i=1;i<=RADIUSY;i++)
|
||||
{
|
||||
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
|
||||
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
|
||||
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
|
||||
}
|
||||
//write the result to dst
|
||||
if((x<dst_cols) & (y<dst_rows))
|
||||
{
|
||||
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
|
||||
dst[start_addr] = sum;
|
||||
}
|
||||
}
|
||||
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C4_D5
|
||||
(__global const float4 * restrict src,
|
||||
__global float4 * dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_whole_cols,
|
||||
const int src_whole_rows,
|
||||
const int src_step_in_pixel,
|
||||
//const int src_offset_x,
|
||||
//const int src_offset_y,
|
||||
const int dst_step_in_pixel,
|
||||
const int dst_offset_in_pixel,
|
||||
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int l_x = get_local_id(0);
|
||||
int l_y = get_local_id(1);
|
||||
int start_addr = mad24(y,src_step_in_pixel,x);
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
int i;
|
||||
float4 sum;
|
||||
float4 temp[READ_TIMES_COL];
|
||||
|
||||
__local float4 LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
|
||||
|
||||
//read pixels from src
|
||||
for(i = 0;i<READ_TIMES_COL;i++)
|
||||
{
|
||||
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
|
||||
current_addr = current_addr < end_addr ? current_addr : 0;
|
||||
temp[i] = src[current_addr];
|
||||
}
|
||||
//save pixels to lds
|
||||
for(i = 0;i<READ_TIMES_COL;i++)
|
||||
{
|
||||
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//read pixels from lds and calculate the result
|
||||
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
|
||||
for(i=1;i<=RADIUSY;i++)
|
||||
{
|
||||
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
|
||||
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
|
||||
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
|
||||
}
|
||||
//write the result to dst
|
||||
if((x<dst_cols) & (y<dst_rows))
|
||||
{
|
||||
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
|
||||
dst[start_addr] = sum;
|
||||
dst[start_addr] = convert_to_DST(sum);
|
||||
}
|
||||
}
|
||||
|
@ -1,192 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Zhang Ying, zhangying913@gmail.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#pragma OPENCL FP_CONTRACT ON
|
||||
#define UCHAR_MIN 0
|
||||
__kernel void dilate_C4_D5(__global const float4 * restrict src, __global float4 *dst, int srcOffset, int dstOffset,
|
||||
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
|
||||
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
|
||||
{
|
||||
int mX = get_global_id(0);
|
||||
int mY = get_global_id(1);
|
||||
int kX = mX - anX, kY = mY - anY;
|
||||
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
|
||||
float4 maxVal = (float4)(-FLT_MAX);
|
||||
int k=0;
|
||||
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
|
||||
{
|
||||
for(int j=0;j<ksX; j++, kX++)
|
||||
{
|
||||
int current_addr = mad24(kY,srcStep,kX) + srcOffset;
|
||||
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
|
||||
float4 v = src[current_addr];
|
||||
uchar now = mat_kernel[k++];
|
||||
float4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(-FLT_MAX);
|
||||
maxVal = max(maxVal , flag);
|
||||
}
|
||||
}
|
||||
|
||||
if(mX < cols && mY < rows)
|
||||
dst[mY * dstStep + mX + dstOffset] = (maxVal);
|
||||
}
|
||||
|
||||
__kernel void dilate_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset,
|
||||
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
|
||||
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
|
||||
{
|
||||
int mX = (get_global_id(0)<<2) - (dstOffset&3);
|
||||
int mY = get_global_id(1);
|
||||
int kX = mX - anX, kY = mY - anY;
|
||||
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
|
||||
float4 maxVal = (float4)(-FLT_MAX);
|
||||
int k=0;
|
||||
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
|
||||
{
|
||||
for(int j=0;j<ksX;j++, kX++)
|
||||
{
|
||||
int start = mad24(kY,srcStep,kX) + srcOffset;
|
||||
start = ((start < end_addr) && (start > 0)) ? start : 0;
|
||||
int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
|
||||
float8 sVal = (float8)(src[start>>2], src[start2>>2]);
|
||||
|
||||
float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
|
||||
int det = start & 3;
|
||||
float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
|
||||
uchar now = mat_kernel[k++];
|
||||
float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : maxVal;
|
||||
flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : -FLT_MAX;
|
||||
flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : -FLT_MAX;
|
||||
flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : -FLT_MAX;
|
||||
flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : -FLT_MAX;
|
||||
|
||||
maxVal = max(maxVal , flag);
|
||||
}
|
||||
}
|
||||
if(mY < rows && mX < cols)
|
||||
{
|
||||
__global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset);
|
||||
float4 dVal = *d;
|
||||
maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x;
|
||||
maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y;
|
||||
maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z;
|
||||
maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w;
|
||||
|
||||
*d = (maxVal);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void dilate_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset,
|
||||
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
|
||||
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
|
||||
{
|
||||
int mX = (get_global_id(0)<<2) - (dstOffset&3);;
|
||||
int mY = get_global_id(1);
|
||||
int kX = mX - anX, kY = mY - anY;
|
||||
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
|
||||
uchar4 maxVal = (uchar4)(UCHAR_MIN);
|
||||
int k=0;
|
||||
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
|
||||
{
|
||||
for(int j=0;j<ksX;j++, kX++)
|
||||
{
|
||||
int start = mad24(kY,srcStep,kX) + srcOffset;
|
||||
start = ((start < end_addr) && (start > 0)) ? start : 0;
|
||||
int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
|
||||
uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]);
|
||||
|
||||
uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
|
||||
int det = start & 3;
|
||||
uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
|
||||
|
||||
uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : maxVal;
|
||||
flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : UCHAR_MIN;
|
||||
flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : UCHAR_MIN;
|
||||
flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : UCHAR_MIN;
|
||||
flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : UCHAR_MIN;
|
||||
|
||||
maxVal = max(maxVal , flag);
|
||||
}
|
||||
}
|
||||
if(mY < rows)
|
||||
{
|
||||
__global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset);
|
||||
uchar4 dVal = *d;
|
||||
|
||||
maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x;
|
||||
maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y;
|
||||
maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z;
|
||||
maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w;
|
||||
|
||||
*d = (maxVal);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void dilate_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset,
|
||||
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
|
||||
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
|
||||
{
|
||||
int mX = get_global_id(0);
|
||||
int mY = get_global_id(1);
|
||||
int kX = mX - anX, kY = mY - anY;
|
||||
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
|
||||
uchar4 maxVal = (uchar4)(UCHAR_MIN);
|
||||
int k=0;
|
||||
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
|
||||
{
|
||||
for(int j=0;j<ksX;j++, kX++)
|
||||
{
|
||||
int current_addr = mad24(kY,srcStep,kX) + srcOffset;
|
||||
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
|
||||
uchar4 v = src[current_addr];
|
||||
uchar now = mat_kernel[k++];
|
||||
uchar4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : maxVal;
|
||||
maxVal = max(maxVal , flag);
|
||||
}
|
||||
}
|
||||
|
||||
if(mX < cols && mY < rows)
|
||||
dst[mY * dstStep + mX + dstOffset] = (maxVal);
|
||||
}
|
||||
|
@ -1,183 +0,0 @@
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Niko Li, newlife20080214@gmail.com
|
||||
// Zero Lin, zero.lin@amd.com
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//
|
||||
|
||||
__kernel void erode_C4_D5(__global const float4 * restrict src, __global float4 *dst, int srcOffset, int dstOffset,
|
||||
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
|
||||
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
|
||||
{
|
||||
int mX = get_global_id(0);
|
||||
int mY = get_global_id(1);
|
||||
int kX = mX - anX, kY = mY - anY;
|
||||
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
|
||||
float4 minVal = (float4)(3.4e+38);
|
||||
int k=0;
|
||||
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
|
||||
{
|
||||
for(int j=0;j<ksX; j++, kX++)
|
||||
{
|
||||
int current_addr = mad24(kY,srcStep,kX) + srcOffset;
|
||||
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
|
||||
float4 v = src[current_addr];
|
||||
uchar now = mat_kernel[k++];
|
||||
float4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(3.4e+38);
|
||||
minVal = min(minVal , flag);
|
||||
}
|
||||
}
|
||||
|
||||
if(mX < cols && mY < rows)
|
||||
dst[mY * dstStep + mX + dstOffset] = (minVal);
|
||||
}
|
||||
|
||||
__kernel void erode_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset,
|
||||
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
|
||||
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
|
||||
{
|
||||
int mX = (get_global_id(0)<<2) - (dstOffset&3);
|
||||
int mY = get_global_id(1);
|
||||
int kX = mX - anX, kY = mY - anY;
|
||||
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
|
||||
float4 minVal = (float4)(3.4e+38);
|
||||
int k=0;
|
||||
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
|
||||
{
|
||||
for(int j=0;j<ksX;j++, kX++)
|
||||
{
|
||||
int start = mad24(kY,srcStep,kX) + srcOffset;
|
||||
start = ((start < end_addr) && (start > 0)) ? start : 0;
|
||||
int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
|
||||
float8 sVal = (float8)(src[start>>2], src[start2>>2]);
|
||||
|
||||
float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
|
||||
int det = start & 3;
|
||||
float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
|
||||
uchar now = mat_kernel[k++];
|
||||
float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(3.4e+38);
|
||||
flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : 3.4e+38;
|
||||
flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : 3.4e+38;
|
||||
flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : 3.4e+38;
|
||||
flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : 3.4e+38;
|
||||
|
||||
minVal = min(minVal , flag);
|
||||
}
|
||||
}
|
||||
|
||||
if(mY < rows && mX < cols)
|
||||
{
|
||||
__global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset);
|
||||
float4 dVal = *d;
|
||||
minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x;
|
||||
minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y;
|
||||
minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z;
|
||||
minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w;
|
||||
|
||||
*d = (minVal);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void erode_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset,
|
||||
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
|
||||
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
|
||||
{
|
||||
int mX = (get_global_id(0)<<2) - (dstOffset&3);
|
||||
int mY = get_global_id(1);
|
||||
int kX = mX - anX, kY = mY - anY;
|
||||
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
|
||||
uchar4 minVal = (uchar4)(0xff);
|
||||
int k=0;
|
||||
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
|
||||
{
|
||||
for(int j=0;j<ksX;j++, kX++)
|
||||
{
|
||||
int start = mad24(kY,srcStep,kX) + srcOffset;
|
||||
start = ((start < end_addr) && (start > 0)) ? start : 0;
|
||||
int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
|
||||
uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]);
|
||||
|
||||
uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
|
||||
int det = start & 3;
|
||||
uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
|
||||
|
||||
uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : (uchar4)(0xff);
|
||||
flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : 0xff;
|
||||
flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : 0xff;
|
||||
flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : 0xff;
|
||||
flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : 0xff;
|
||||
|
||||
minVal = min(minVal , flag);
|
||||
}
|
||||
}
|
||||
|
||||
if(mY < rows)
|
||||
{
|
||||
__global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset);
|
||||
uchar4 dVal = *d;
|
||||
|
||||
minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x;
|
||||
minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y;
|
||||
minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z;
|
||||
minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w;
|
||||
|
||||
*d = (minVal);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void erode_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset,
|
||||
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
|
||||
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
|
||||
{
|
||||
int mX = get_global_id(0);
|
||||
int mY = get_global_id(1);
|
||||
int kX = mX - anX, kY = mY - anY;
|
||||
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
|
||||
uchar4 minVal = (uchar4)(0xff);
|
||||
int k=0;
|
||||
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
|
||||
{
|
||||
for(int j=0;j<ksX;j++, kX++)
|
||||
{
|
||||
int current_addr = mad24(kY,srcStep,kX) + srcOffset;
|
||||
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
|
||||
uchar4 v = src[current_addr];
|
||||
uchar now = mat_kernel[k++];
|
||||
uchar4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (uchar4)(0xff);
|
||||
minVal = min(minVal , flag);
|
||||
}
|
||||
}
|
||||
|
||||
if(mX < cols && mY < rows)
|
||||
dst[mY * dstStep + mX + dstOffset] = (minVal);
|
||||
}
|
||||
|
204
modules/ocl/src/kernels/filtering_morph.cl
Normal file
204
modules/ocl/src/kernels/filtering_morph.cl
Normal file
@ -0,0 +1,204 @@
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Niko Li, newlife20080214@gmail.com
|
||||
// Zero Lin, zero.lin@amd.com
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//
|
||||
|
||||
|
||||
#ifdef ERODE
|
||||
#define MORPH_OP(A,B) min((A),(B))
|
||||
#endif
|
||||
#ifdef DILATE
|
||||
#define MORPH_OP(A,B) max((A),(B))
|
||||
#endif
|
||||
//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
|
||||
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
|
||||
#ifndef GENTYPE
|
||||
__kernel void morph_C1_D0(__global const uchar * restrict src,
|
||||
__global uchar *dst,
|
||||
int src_offset_x, int src_offset_y,
|
||||
int cols, int rows,
|
||||
int src_step_in_pixel, int dst_step_in_pixel,
|
||||
__constant uchar * mat_kernel,
|
||||
int src_whole_cols, int src_whole_rows,
|
||||
int dst_offset_in_pixel)
|
||||
{
|
||||
int l_x = get_local_id(0);
|
||||
int l_y = get_local_id(1);
|
||||
int x = get_group_id(0)*4*LSIZE0;
|
||||
int y = get_group_id(1)*LSIZE1;
|
||||
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
|
||||
int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc;
|
||||
int width = (end_x -start_x+4)>>2;
|
||||
int offset = src_offset_x-RADIUSX & 3;
|
||||
int start_y = y+src_offset_y-RADIUSY;
|
||||
int point1 = mad24(l_y,LSIZE0,l_x);
|
||||
int point2 = point1 + LSIZE0*LSIZE1;
|
||||
int tl_x = (point1 % width)<<2;
|
||||
int tl_y = point1 / width;
|
||||
int tl_x2 = (point2 % width)<<2;
|
||||
int tl_y2 = point2 / width;
|
||||
int cur_x = start_x + tl_x;
|
||||
int cur_y = start_y + tl_y;
|
||||
int cur_x2 = start_x + tl_x2;
|
||||
int cur_y2 = start_y + tl_y2;
|
||||
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
|
||||
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
|
||||
uchar4 temp0,temp1;
|
||||
__local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
|
||||
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
//read pixels from src
|
||||
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
|
||||
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
|
||||
temp0 = *(__global uchar4*)&src[start_addr];
|
||||
temp1 = *(__global uchar4*)&src[start_addr2];
|
||||
//judge if read out of boundary
|
||||
temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
|
||||
temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
|
||||
temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
|
||||
temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
|
||||
temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
|
||||
|
||||
temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
|
||||
temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
|
||||
temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
|
||||
temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
|
||||
temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
|
||||
|
||||
LDS_DAT[point1] = temp0;
|
||||
LDS_DAT[point2] = temp1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uchar4 res = (uchar4)VAL;
|
||||
for(int i=0;i<2*RADIUSY+1;i++)
|
||||
for(int j=0;j<2*RADIUSX+1;j++)
|
||||
{
|
||||
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)):res;
|
||||
}
|
||||
int gidx = get_global_id(0)<<2;
|
||||
int gidy = get_global_id(1);
|
||||
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
|
||||
if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3==0))
|
||||
{
|
||||
*(__global uchar4*)&dst[out_addr] = res;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(gidx+3<cols && gidy<rows)
|
||||
{
|
||||
dst[out_addr] = res.x;
|
||||
dst[out_addr+1] = res.y;
|
||||
dst[out_addr+2] = res.z;
|
||||
dst[out_addr+3] = res.w;
|
||||
}
|
||||
else if(gidx+2<cols && gidy<rows)
|
||||
{
|
||||
dst[out_addr] = res.x;
|
||||
dst[out_addr+1] = res.y;
|
||||
dst[out_addr+2] = res.z;
|
||||
}
|
||||
else if(gidx+1<cols && gidy<rows)
|
||||
{
|
||||
dst[out_addr] = res.x;
|
||||
dst[out_addr+1] = res.y;
|
||||
}
|
||||
else if(gidx<cols && gidy<rows)
|
||||
{
|
||||
dst[out_addr] = res.x;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
__kernel void morph(__global const GENTYPE * restrict src,
|
||||
__global GENTYPE *dst,
|
||||
int src_offset_x, int src_offset_y,
|
||||
int cols, int rows,
|
||||
int src_step_in_pixel, int dst_step_in_pixel,
|
||||
__constant uchar * mat_kernel,
|
||||
int src_whole_cols, int src_whole_rows,
|
||||
int dst_offset_in_pixel)
|
||||
{
|
||||
int l_x = get_local_id(0);
|
||||
int l_y = get_local_id(1);
|
||||
int x = get_group_id(0)*LSIZE0;
|
||||
int y = get_group_id(1)*LSIZE1;
|
||||
int start_x = x+src_offset_x-RADIUSX;
|
||||
int end_x = x + src_offset_x+LSIZE0+RADIUSX;
|
||||
int width = end_x -start_x+1;
|
||||
int start_y = y+src_offset_y-RADIUSY;
|
||||
int point1 = mad24(l_y,LSIZE0,l_x);
|
||||
int point2 = point1 + LSIZE0*LSIZE1;
|
||||
int tl_x = point1 % width;
|
||||
int tl_y = point1 / width;
|
||||
int tl_x2 = point2 % width;
|
||||
int tl_y2 = point2 / width;
|
||||
int cur_x = start_x + tl_x;
|
||||
int cur_y = start_y + tl_y;
|
||||
int cur_x2 = start_x + tl_x2;
|
||||
int cur_y2 = start_y + tl_y2;
|
||||
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
|
||||
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
|
||||
GENTYPE temp0,temp1;
|
||||
__local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
|
||||
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
|
||||
//read pixels from src
|
||||
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
|
||||
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
|
||||
temp0 = src[start_addr];
|
||||
temp1 = src[start_addr2];
|
||||
//judge if read out of boundary
|
||||
temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0);
|
||||
temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0);
|
||||
|
||||
temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1);
|
||||
temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1);
|
||||
|
||||
LDS_DAT[point1] = temp0;
|
||||
LDS_DAT[point2] = temp1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
GENTYPE res = (GENTYPE)VAL;
|
||||
for(int i=0;i<2*RADIUSY+1;i++)
|
||||
for(int j=0;j<2*RADIUSX+1;j++)
|
||||
{
|
||||
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)]):res;
|
||||
}
|
||||
int gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
|
||||
if(gidx<cols && gidy<rows)
|
||||
{
|
||||
dst[out_addr] = res;
|
||||
}
|
||||
}
|
||||
#endif
|
@ -94,7 +94,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
uchar4 dVal = *d;
|
||||
|
||||
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
|
||||
dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
|
||||
|
||||
*d = dst_data;
|
||||
|
||||
@ -139,8 +139,8 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
uchar4 dVal = *d;
|
||||
|
||||
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
|
||||
|
||||
dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
|
||||
*d = dst_data;
|
||||
|
||||
}
|
||||
@ -167,7 +167,7 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig
|
||||
short8 map1_data;
|
||||
|
||||
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset;
|
||||
uchar4 src_a, src_b, src_c, src_d;
|
||||
src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
|
||||
src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
|
||||
@ -187,7 +187,7 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig
|
||||
uchar16 dVal = *d;
|
||||
|
||||
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
|
||||
dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
|
||||
|
||||
*d = dst_data;
|
||||
}
|
||||
@ -216,7 +216,7 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig
|
||||
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
|
||||
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
|
||||
|
||||
int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<2) + src_offset;
|
||||
int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<((int4)(2))) + src_offset;
|
||||
uchar4 src_a, src_b, src_c, src_d;
|
||||
src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
|
||||
src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
|
||||
@ -236,7 +236,7 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig
|
||||
uchar16 dVal = *d;
|
||||
|
||||
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
|
||||
dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
|
||||
|
||||
*d = dst_data;
|
||||
|
||||
@ -269,7 +269,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
|
||||
|
||||
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
|
||||
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset;
|
||||
|
||||
float4 src_data;
|
||||
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
|
||||
@ -289,7 +289,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
|
||||
float4 dVal = *d;
|
||||
|
||||
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
|
||||
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
|
||||
|
||||
*d = dst_data;
|
||||
|
||||
@ -321,7 +321,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
|
||||
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
|
||||
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
|
||||
|
||||
int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<2) + src_offset;
|
||||
int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;
|
||||
|
||||
float4 src_data;
|
||||
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
|
||||
@ -341,7 +341,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
|
||||
float4 dVal = *d;
|
||||
|
||||
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
|
||||
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
|
||||
|
||||
*d = dst_data;
|
||||
|
||||
@ -418,21 +418,21 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
|
||||
float4 u = temp.even;
|
||||
float4 v = temp.odd;
|
||||
float4 ud = 1.f - u;
|
||||
float4 vd = 1.f - v;
|
||||
float4 ud = (float4)(1.0) - u;
|
||||
float4 vd = (float4)(1.0) - v;
|
||||
//float8 map1_dataU = map1_dataD + 1;
|
||||
|
||||
int4 map1_dataDx = map1_dataD.even;
|
||||
int4 map1_dataDy = map1_dataD.odd;
|
||||
int4 map1_dataDx1 = map1_dataDx + 1;
|
||||
int4 map1_dataDy1 = map1_dataDy + 1;
|
||||
int4 map1_dataDx1 = map1_dataDx + (int4)(1);
|
||||
int4 map1_dataDy1 = map1_dataDy + (int4)(1);
|
||||
|
||||
int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
|
||||
int4 src_StartD = src_StartU + src_step;
|
||||
/*
|
||||
//not using the vload
|
||||
int4 src_StartU1 = src_StartU + 1;
|
||||
int4 src_StartD1 = src_StartD + 1;
|
||||
int4 src_StartU1 = src_StartU + (int4)(1);
|
||||
int4 src_StartD1 = src_StartD + (int4)(1);
|
||||
|
||||
uchar4 a, b, c, d;
|
||||
a.x = *(src_StartU.x + src);
|
||||
@ -476,10 +476,10 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
|
||||
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
|
||||
int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
|
||||
a = (convert_uchar4(ac) == (uchar4)0)? a : val;
|
||||
b = (convert_uchar4(bc) == (uchar4)0)? b : val;
|
||||
c = (convert_uchar4(cc) == (uchar4)0)? c : val;
|
||||
d = (convert_uchar4(dc) == (uchar4)0)? d : val;
|
||||
a = (convert_uchar4(ac) == (uchar4)(0))? a : val;
|
||||
b = (convert_uchar4(bc) == (uchar4)(0))? b : val;
|
||||
c = (convert_uchar4(cc) == (uchar4)(0))? c : val;
|
||||
d = (convert_uchar4(dc) == (uchar4)(0))? d : val;
|
||||
|
||||
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
|
||||
|
||||
@ -487,7 +487,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
|
||||
uchar4 dVal = *D;
|
||||
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
|
||||
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
|
||||
|
||||
*D = dst_data;
|
||||
}
|
||||
@ -531,7 +531,7 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
uchar4 dVal = *d;
|
||||
|
||||
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
|
||||
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
|
||||
|
||||
*d = dst_data;
|
||||
|
||||
@ -567,17 +567,17 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
|
||||
|
||||
float4 u = temp.even;
|
||||
float4 v = temp.odd;
|
||||
float4 ud = 1.f - u;
|
||||
float4 vd = 1.f - v;
|
||||
float4 ud = (float4)(1.0) - u;
|
||||
float4 vd = (float4)(1.0) - v;
|
||||
|
||||
//float8 map1_dataU = map1_dataD + 1;
|
||||
|
||||
int4 map1_dataDx = map1_dataD.even;
|
||||
int4 map1_dataDy = map1_dataD.odd;
|
||||
int4 map1_dataDx1 = map1_dataDx + 1;
|
||||
int4 map1_dataDy1 = map1_dataDy + 1;
|
||||
int4 map1_dataDx1 = map1_dataDx + (int4)(1);
|
||||
int4 map1_dataDy1 = map1_dataDy + (int4)(1);
|
||||
|
||||
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset;
|
||||
int4 src_StartU = map1_dataDy * src_step + (convert_int4(map1_dataDx) << (int4)(2)) + src_offset;
|
||||
int4 src_StartD = src_StartU + src_step;
|
||||
|
||||
uchar8 aU, bU, cU, dU, aD, bD, cD, dD;
|
||||
@ -605,10 +605,10 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
|
||||
int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));
|
||||
|
||||
uchar16 val = (uchar16)(nval, nval, nval, nval);
|
||||
a = (convert_uchar16(acc) == (uchar16)0)? a : val;
|
||||
b = (convert_uchar16(bcc) == (uchar16)0)? b : val;
|
||||
c = (convert_uchar16(ccc) == (uchar16)0)? c : val;
|
||||
d = (convert_uchar16(dcc) == (uchar16)0)? d : val;
|
||||
a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
|
||||
b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;
|
||||
c = (convert_uchar16(ccc) == (uchar16)(0))? c : val;
|
||||
d = (convert_uchar16(dcc) == (uchar16)(0))? d : val;
|
||||
|
||||
float16 U = (float16)((float4)(u.x), (float4)(u.y), (float4)(u.z), (float4)(u.w));
|
||||
float16 V = (float16)((float4)(v.x), (float4)(v.y), (float4)(v.z), (float4)(v.w));
|
||||
@ -621,7 +621,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
|
||||
|
||||
uchar16 dVal = *D;
|
||||
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
|
||||
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
|
||||
|
||||
*D = dst_data;
|
||||
}
|
||||
@ -646,7 +646,7 @@ __kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsig
|
||||
short8 map1_data;
|
||||
|
||||
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<(int4)(2)) + src_offset;
|
||||
uchar4 src_a, src_b, src_c, src_d;
|
||||
src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
|
||||
src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
|
||||
@ -666,7 +666,7 @@ __kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsig
|
||||
uchar16 dVal = *d;
|
||||
|
||||
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
|
||||
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
|
||||
|
||||
*d = dst_data;
|
||||
|
||||
@ -700,21 +700,21 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
|
||||
|
||||
float4 u = temp.even;
|
||||
float4 v = temp.odd;
|
||||
float4 ud = 1.f - u;
|
||||
float4 vd = 1.f - v;
|
||||
float4 ud = (float4)(1.0) - u;
|
||||
float4 vd = (float4)(1.0) - v;
|
||||
//float8 map1_dataU = map1_dataD + 1;
|
||||
|
||||
int4 map1_dataDx = map1_dataD.even;
|
||||
int4 map1_dataDy = map1_dataD.odd;
|
||||
int4 map1_dataDx1 = map1_dataDx + 1;
|
||||
int4 map1_dataDy1 = map1_dataDy + 1;
|
||||
int4 map1_dataDx1 = map1_dataDx + (int4)(1);
|
||||
int4 map1_dataDy1 = map1_dataDy + (int4)(1);
|
||||
|
||||
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset;
|
||||
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
|
||||
int4 src_StartD = src_StartU + src_step;
|
||||
/*
|
||||
//not using the vload
|
||||
int4 src_StartU1 = src_StartU + 1;
|
||||
int4 src_StartD1 = src_StartD + 1;
|
||||
int4 src_StartU1 = src_StartU + (int4)(1);
|
||||
int4 src_StartD1 = src_StartD + (int4)(1);
|
||||
|
||||
float4 a, b, c, d;
|
||||
a.x = *(src_StartU.x + src);
|
||||
@ -754,14 +754,14 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
|
||||
c = (float4)(aD.x, bD.x, cD.x, dD.x);
|
||||
d = (float4)(aD.y, bD.y, cD.y, dD.y);
|
||||
|
||||
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
|
||||
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
|
||||
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
|
||||
int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
|
||||
a = (convert_float4(ac) == 0)? a : val;
|
||||
b = (convert_float4(bc) == 0)? b : val;
|
||||
c = (convert_float4(cc) == 0)? c : val;
|
||||
d = (convert_float4(dc) == 0)? d : val;
|
||||
int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0));
|
||||
int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));
|
||||
int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));
|
||||
int4 dc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDy1 < (int4)(0));
|
||||
a = (convert_float4(ac) == (float4)(0))? a : val;
|
||||
b = (convert_float4(bc) == (float4)(0))? b : val;
|
||||
c = (convert_float4(cc) == (float4)(0))? c : val;
|
||||
d = (convert_float4(dc) == (float4)(0))? d : val;
|
||||
|
||||
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
|
||||
|
||||
@ -769,7 +769,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
|
||||
|
||||
float4 dVal = *D;
|
||||
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
|
||||
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
|
||||
|
||||
*D = dst_data;
|
||||
}
|
||||
@ -798,7 +798,7 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *
|
||||
|
||||
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
|
||||
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) << (int4)(2)) + src_offset;
|
||||
|
||||
float4 src_data;
|
||||
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
|
||||
@ -818,7 +818,7 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *
|
||||
float4 dVal = *d;
|
||||
|
||||
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
|
||||
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
|
||||
|
||||
*d = dst_data;
|
||||
|
||||
@ -918,7 +918,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only im
|
||||
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
|
||||
uchar4 dVal = *d;
|
||||
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
|
||||
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
|
||||
|
||||
*d = dst_data;
|
||||
}
|
||||
|
@ -588,6 +588,13 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern
|
||||
sprintf(compile_option, "-D GENTYPE=int");
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
|
||||
break;
|
||||
case 2:
|
||||
sprintf(compile_option, "-D GENTYPE=int2");
|
||||
cl_int2 i2val;
|
||||
i2val.s[0] = val.ival.s[0];
|
||||
i2val.s[1] = val.ival.s[1];
|
||||
args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val ));
|
||||
break;
|
||||
case 4:
|
||||
sprintf(compile_option, "-D GENTYPE=int4");
|
||||
args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
|
||||
|
@ -44,7 +44,7 @@
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "threadsafe.h"
|
||||
#include "Threadsafe.h"
|
||||
|
||||
CriticalSection::CriticalSection()
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user