Merge pull request #1555 from ilya-lavrenov:ocl_convertC3C4

This commit is contained in:
Roman Donchenko 2013-10-03 16:52:10 +04:00 committed by OpenCV Buildbot
commit 8e75947a7d
3 changed files with 59 additions and 136 deletions

View File

@ -58,12 +58,13 @@ using namespace std;
//////////////////////////////// oclMat ////////////////////////////////
////////////////////////////////////////////////////////////////////////
//helper routines
// helper routines
namespace cv
{
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
/////////////////////////// OpenCL kernel strings ///////////////////////////
extern const char *operator_copyToM;
extern const char *operator_convertTo;
extern const char *operator_setTo;
@ -74,42 +75,18 @@ namespace cv
}
}
////////////////////////////////////////////////////////////////////////
// convert_C3C4
static void convert_C3C4(const cl_mem &src, oclMat &dst)
{
int dstStep_in_pixel = dst.step1() / dst.oclchannels();
int pixel_end = dst.wholecols * dst.wholerows - 1;
Context *clCxt = dst.clCxt;
string kernelName = "convertC3C4";
char compile_option[32];
switch(dst.depth())
{
case 0:
sprintf(compile_option, "-D GENTYPE4=uchar4");
break;
case 1:
sprintf(compile_option, "-D GENTYPE4=char4");
break;
case 2:
sprintf(compile_option, "-D GENTYPE4=ushort4");
break;
case 3:
sprintf(compile_option, "-D GENTYPE4=short4");
break;
case 4:
sprintf(compile_option, "-D GENTYPE4=int4");
break;
case 5:
sprintf(compile_option, "-D GENTYPE4=float4");
break;
case 6:
sprintf(compile_option, "-D GENTYPE4=double4");
break;
default:
CV_Error(CV_StsUnsupportedFormat, "unknown depth");
}
int pixel_end = dst.wholecols * dst.wholerows - 1;
int dstStep_in_pixel = dst.step1() / dst.oclchannels();
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[dst.depth()]);
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
@ -118,46 +95,24 @@ static void convert_C3C4(const cl_mem &src, oclMat &dst)
args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
size_t globalThreads[3] = {((dst.wholecols * dst.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
size_t localThreads[3] = {256, 1, 1};
size_t globalThreads[3] = { divUp(dst.wholecols * dst.wholerows, 4), 1, 1 };
size_t localThreads[3] = { 256, 1, 1 };
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
openCLExecuteKernel(clCxt, &convertC3C4, "convertC3C4", globalThreads, localThreads,
args, -1, -1, buildOptions.c_str());
}
////////////////////////////////////////////////////////////////////////
// convert_C4C3
static void convert_C4C3(const oclMat &src, cl_mem &dst)
{
int srcStep_in_pixel = src.step1() / src.oclchannels();
int pixel_end = src.wholecols * src.wholerows - 1;
Context *clCxt = src.clCxt;
string kernelName = "convertC4C3";
char compile_option[32];
switch(src.depth())
{
case 0:
sprintf(compile_option, "-D GENTYPE4=uchar4");
break;
case 1:
sprintf(compile_option, "-D GENTYPE4=char4");
break;
case 2:
sprintf(compile_option, "-D GENTYPE4=ushort4");
break;
case 3:
sprintf(compile_option, "-D GENTYPE4=short4");
break;
case 4:
sprintf(compile_option, "-D GENTYPE4=int4");
break;
case 5:
sprintf(compile_option, "-D GENTYPE4=float4");
break;
case 6:
sprintf(compile_option, "-D GENTYPE4=double4");
break;
default:
CV_Error(CV_StsUnsupportedFormat, "unknown depth");
}
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[src.depth()]);
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
@ -167,10 +122,10 @@ static void convert_C4C3(const oclMat &src, cl_mem &dst)
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
size_t globalThreads[3] = {((src.wholecols * src.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
size_t localThreads[3] = {256, 1, 1};
size_t globalThreads[3] = { divUp(src.wholecols * src.wholerows, 4), 1, 1};
size_t localThreads[3] = { 256, 1, 1 };
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
openCLExecuteKernel(clCxt, &convertC3C4, "convertC4C3", globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
}
void cv::ocl::oclMat::upload(const Mat &m)
@ -179,14 +134,10 @@ void cv::ocl::oclMat::upload(const Mat &m)
Size wholeSize;
Point ofs;
m.locateROI(wholeSize, ofs);
// int type = m.type();
// if(m.oclchannels() == 3)
//{
// type = CV_MAKETYPE(m.depth(), 4);
//}
create(wholeSize, m.type());
if(m.channels() == 3)
if (m.channels() == 3)
{
int pitch = wholeSize.width * 3 * m.elemSize1();
int tail_padding = m.elemSize1() * 3072;
@ -197,35 +148,15 @@ void cv::ocl::oclMat::upload(const Mat &m)
openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
convert_C3C4(temp, *this);
//int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
// 0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL));
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
//for(int i=0;i<wholeSize.height;i++)
//{
// int *a = cputemp+i*wholeSize.width * 3,*b = cpudata + i*this->step/sizeof(int);
// for(int j=0;j<wholeSize.width;j++)
// {
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
// }
//}
//delete []cputemp;
//delete []cpudata;
openCLSafeCall(clReleaseMemObject(temp));
}
else
{
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
}
rows = m.rows;
cols = m.cols;
offset = ofs.y * step + ofs.x * elemSize();
//download_channels = m.channels();
}
cv::ocl::oclMat::operator cv::_InputArray()
@ -259,11 +190,6 @@ cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
void cv::ocl::oclMat::download(cv::Mat &m) const
{
CV_DbgAssert(!this->empty());
// int t = type();
// if(download_channels == 3)
//{
// t = CV_MAKETYPE(depth(), 3);
//}
m.create(wholerows, wholecols, type());
if(m.channels() == 3)
@ -277,30 +203,14 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
convert_C4C3(*this, temp);
openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
//int* cputemp=new int[wholecols*wholerows * 3];
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
// 0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL));
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
//for(int i=0;i<wholerows;i++)
//{
// int *a = cputemp+i*wholecols * 3,*b = cpudata + i*this->step/sizeof(int);
// for(int j=0;j<wholecols;j++)
// {
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
// }
//}
//delete []cputemp;
//delete []cpudata;
openCLSafeCall(clReleaseMemObject(temp));
}
else
{
openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
}
Size wholesize;
Point ofs;
locateROI(wholesize, ofs);
@ -323,6 +233,7 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
{"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
{"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
};
char compile_option[32];
sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
size_t localThreads[3] = {16, 16, 1};
@ -366,6 +277,7 @@ void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
{
string kernelName = "convert_to";
@ -404,6 +316,7 @@ static void convert_run(const oclMat &src, oclMat &dst, double alpha, double bet
openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
localThreads, args, -1, -1, buildOptions);
}
void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
{
if (!clCxt->supportsFeature(Context::CL_DOUBLE) &&

View File

@ -32,23 +32,23 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
//#pragma OPENCL EXTENSION cl_amd_printf : enable
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int dstStep_in_piexl,int pixel_end)
{
int id = get_global_id(0);
//int pixel_end = mul24(cols -1 , rows -1);
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end);
GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
pixel0 = src[pixelid.x];
pixel1 = src[pixelid.y];
pixel2 = src[pixelid.z];
outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
@ -56,17 +56,19 @@ __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTY
int4 outy = (id<<2)/cols;
int4 outx = (id<<2)%cols;
outx.y++;
outx.z+=2;
outx.w+=3;
outy = select(outy,outy+1,outx>=cols);
outx = select(outx,outx-cols,outx>=cols);
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
//outx = select(outx,(int4)outx.x,outy>=rows);
//outy = select(outy,(int4)outy.x,outy>=rows);
outx += (int4)(0, 1, 2, 3);
outy = select(outy, outy+1, outx>=cols);
outx = select(outx, outx-cols, outx>=cols);
// when cols == 1
outy = select(outy, outy + 1, outx >= cols);
outx = select(outx, outx-cols, outx >= cols);
outy = select(outy, outy + 1, outx >= cols);
outx = select(outx, outx-cols, outx >= cols);
int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
if(outx.w<cols && outy.w<rows)
{
dst[addr.x] = outpix0;
@ -91,20 +93,26 @@ __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTY
}
}
__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int srcStep_in_pixel,int pixel_end)
{
int id = get_global_id(0)<<2;
int y = id / cols;
int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
// when cols == 1
y4 = select(y4, y4 + 1,x4>=(int4)cols);
x4 = select(x4, x4 - (int4)cols,x4>=(int4)cols);
y4 = select(y4, y4 + 1,x4>=(int4)cols);
x4 = select(x4, x4-(int4)cols,x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
int4 addr = mad24(y4, (int4)srcStep_in_pixel, x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
pixel0 = src[addr.x];
pixel1 = src[addr.y];
@ -120,9 +128,11 @@ __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTY
outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z;
int4 outaddr = mul24(id>>2 , 3);
outaddr.y++;
outaddr.z+=2;
if(outaddr.z <= pixel_end)
{
dst[outaddr.x] = pixel0;

View File

@ -402,7 +402,7 @@ PARAM_TEST_CASE(convertC3C4, MatType, bool)
int type = CV_MAKE_TYPE(depth, 3);
cv::RNG &rng = TS::ptr()->get_rng();
src = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 0, 40, false);
src = randomMat(rng, randomSize(1, MAX_VALUE), type, 0, 40, false);
}
void random_roi()