mirror of
https://github.com/opencv/opencv.git
synced 2024-11-28 13:10:12 +08:00
Merge pull request #1555 from ilya-lavrenov:ocl_convertC3C4
This commit is contained in:
commit
8e75947a7d
@ -58,12 +58,13 @@ using namespace std;
|
||||
//////////////////////////////// oclMat ////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//helper routines
|
||||
// helper routines
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
///////////////////////////OpenCL kernel strings///////////////////////////
|
||||
/////////////////////////// OpenCL kernel strings ///////////////////////////
|
||||
|
||||
extern const char *operator_copyToM;
|
||||
extern const char *operator_convertTo;
|
||||
extern const char *operator_setTo;
|
||||
@ -74,42 +75,18 @@ namespace cv
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// convert_C3C4
|
||||
|
||||
static void convert_C3C4(const cl_mem &src, oclMat &dst)
|
||||
{
|
||||
int dstStep_in_pixel = dst.step1() / dst.oclchannels();
|
||||
int pixel_end = dst.wholecols * dst.wholerows - 1;
|
||||
Context *clCxt = dst.clCxt;
|
||||
string kernelName = "convertC3C4";
|
||||
char compile_option[32];
|
||||
switch(dst.depth())
|
||||
{
|
||||
case 0:
|
||||
sprintf(compile_option, "-D GENTYPE4=uchar4");
|
||||
break;
|
||||
case 1:
|
||||
sprintf(compile_option, "-D GENTYPE4=char4");
|
||||
break;
|
||||
case 2:
|
||||
sprintf(compile_option, "-D GENTYPE4=ushort4");
|
||||
break;
|
||||
case 3:
|
||||
sprintf(compile_option, "-D GENTYPE4=short4");
|
||||
break;
|
||||
case 4:
|
||||
sprintf(compile_option, "-D GENTYPE4=int4");
|
||||
break;
|
||||
case 5:
|
||||
sprintf(compile_option, "-D GENTYPE4=float4");
|
||||
break;
|
||||
case 6:
|
||||
sprintf(compile_option, "-D GENTYPE4=double4");
|
||||
break;
|
||||
default:
|
||||
CV_Error(CV_StsUnsupportedFormat, "unknown depth");
|
||||
}
|
||||
int pixel_end = dst.wholecols * dst.wholerows - 1;
|
||||
int dstStep_in_pixel = dst.step1() / dst.oclchannels();
|
||||
|
||||
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
|
||||
std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[dst.depth()]);
|
||||
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
|
||||
@ -118,46 +95,24 @@ static void convert_C3C4(const cl_mem &src, oclMat &dst)
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
|
||||
|
||||
size_t globalThreads[3] = {((dst.wholecols * dst.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
|
||||
size_t localThreads[3] = {256, 1, 1};
|
||||
size_t globalThreads[3] = { divUp(dst.wholecols * dst.wholerows, 4), 1, 1 };
|
||||
size_t localThreads[3] = { 256, 1, 1 };
|
||||
|
||||
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
|
||||
openCLExecuteKernel(clCxt, &convertC3C4, "convertC3C4", globalThreads, localThreads,
|
||||
args, -1, -1, buildOptions.c_str());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// convert_C4C3
|
||||
|
||||
static void convert_C4C3(const oclMat &src, cl_mem &dst)
|
||||
{
|
||||
int srcStep_in_pixel = src.step1() / src.oclchannels();
|
||||
int pixel_end = src.wholecols * src.wholerows - 1;
|
||||
Context *clCxt = src.clCxt;
|
||||
string kernelName = "convertC4C3";
|
||||
char compile_option[32];
|
||||
switch(src.depth())
|
||||
{
|
||||
case 0:
|
||||
sprintf(compile_option, "-D GENTYPE4=uchar4");
|
||||
break;
|
||||
case 1:
|
||||
sprintf(compile_option, "-D GENTYPE4=char4");
|
||||
break;
|
||||
case 2:
|
||||
sprintf(compile_option, "-D GENTYPE4=ushort4");
|
||||
break;
|
||||
case 3:
|
||||
sprintf(compile_option, "-D GENTYPE4=short4");
|
||||
break;
|
||||
case 4:
|
||||
sprintf(compile_option, "-D GENTYPE4=int4");
|
||||
break;
|
||||
case 5:
|
||||
sprintf(compile_option, "-D GENTYPE4=float4");
|
||||
break;
|
||||
case 6:
|
||||
sprintf(compile_option, "-D GENTYPE4=double4");
|
||||
break;
|
||||
default:
|
||||
CV_Error(CV_StsUnsupportedFormat, "unknown depth");
|
||||
}
|
||||
|
||||
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
|
||||
std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[src.depth()]);
|
||||
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
|
||||
@ -167,10 +122,10 @@ static void convert_C4C3(const oclMat &src, cl_mem &dst)
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
|
||||
|
||||
size_t globalThreads[3] = {((src.wholecols * src.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
|
||||
size_t localThreads[3] = {256, 1, 1};
|
||||
size_t globalThreads[3] = { divUp(src.wholecols * src.wholerows, 4), 1, 1};
|
||||
size_t localThreads[3] = { 256, 1, 1 };
|
||||
|
||||
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
|
||||
openCLExecuteKernel(clCxt, &convertC3C4, "convertC4C3", globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
|
||||
}
|
||||
|
||||
void cv::ocl::oclMat::upload(const Mat &m)
|
||||
@ -179,14 +134,10 @@ void cv::ocl::oclMat::upload(const Mat &m)
|
||||
Size wholeSize;
|
||||
Point ofs;
|
||||
m.locateROI(wholeSize, ofs);
|
||||
// int type = m.type();
|
||||
// if(m.oclchannels() == 3)
|
||||
//{
|
||||
// type = CV_MAKETYPE(m.depth(), 4);
|
||||
//}
|
||||
|
||||
create(wholeSize, m.type());
|
||||
|
||||
if(m.channels() == 3)
|
||||
if (m.channels() == 3)
|
||||
{
|
||||
int pitch = wholeSize.width * 3 * m.elemSize1();
|
||||
int tail_padding = m.elemSize1() * 3072;
|
||||
@ -197,35 +148,15 @@ void cv::ocl::oclMat::upload(const Mat &m)
|
||||
|
||||
openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
|
||||
convert_C3C4(temp, *this);
|
||||
//int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
|
||||
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
|
||||
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
|
||||
// 0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL));
|
||||
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
|
||||
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
|
||||
//for(int i=0;i<wholeSize.height;i++)
|
||||
//{
|
||||
// int *a = cputemp+i*wholeSize.width * 3,*b = cpudata + i*this->step/sizeof(int);
|
||||
// for(int j=0;j<wholeSize.width;j++)
|
||||
// {
|
||||
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
|
||||
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
|
||||
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
|
||||
// }
|
||||
//}
|
||||
//delete []cputemp;
|
||||
//delete []cpudata;
|
||||
|
||||
openCLSafeCall(clReleaseMemObject(temp));
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
rows = m.rows;
|
||||
cols = m.cols;
|
||||
offset = ofs.y * step + ofs.x * elemSize();
|
||||
//download_channels = m.channels();
|
||||
}
|
||||
|
||||
cv::ocl::oclMat::operator cv::_InputArray()
|
||||
@ -259,11 +190,6 @@ cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
|
||||
void cv::ocl::oclMat::download(cv::Mat &m) const
|
||||
{
|
||||
CV_DbgAssert(!this->empty());
|
||||
// int t = type();
|
||||
// if(download_channels == 3)
|
||||
//{
|
||||
// t = CV_MAKETYPE(depth(), 3);
|
||||
//}
|
||||
m.create(wholerows, wholecols, type());
|
||||
|
||||
if(m.channels() == 3)
|
||||
@ -277,30 +203,14 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
|
||||
|
||||
convert_C4C3(*this, temp);
|
||||
openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
|
||||
//int* cputemp=new int[wholecols*wholerows * 3];
|
||||
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
|
||||
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
|
||||
// 0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL));
|
||||
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
|
||||
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
|
||||
//for(int i=0;i<wholerows;i++)
|
||||
//{
|
||||
// int *a = cputemp+i*wholecols * 3,*b = cpudata + i*this->step/sizeof(int);
|
||||
// for(int j=0;j<wholecols;j++)
|
||||
// {
|
||||
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
|
||||
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
|
||||
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
|
||||
// }
|
||||
//}
|
||||
//delete []cputemp;
|
||||
//delete []cpudata;
|
||||
|
||||
openCLSafeCall(clReleaseMemObject(temp));
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
Size wholesize;
|
||||
Point ofs;
|
||||
locateROI(wholesize, ofs);
|
||||
@ -323,6 +233,7 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
|
||||
{"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
|
||||
{"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
|
||||
};
|
||||
|
||||
char compile_option[32];
|
||||
sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
@ -366,6 +277,7 @@ void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////// ConvertTo ////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
|
||||
{
|
||||
string kernelName = "convert_to";
|
||||
@ -404,6 +316,7 @@ static void convert_run(const oclMat &src, oclMat &dst, double alpha, double bet
|
||||
openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1, buildOptions);
|
||||
}
|
||||
|
||||
void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
|
||||
{
|
||||
if (!clCxt->supportsFeature(Context::CL_DOUBLE) &&
|
||||
|
@ -32,23 +32,23 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//
|
||||
//#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
|
||||
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
|
||||
int dstStep_in_piexl,int pixel_end)
|
||||
{
|
||||
int id = get_global_id(0);
|
||||
//int pixel_end = mul24(cols -1 , rows -1);
|
||||
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
|
||||
pixelid = clamp(pixelid,0,pixel_end);
|
||||
GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
|
||||
|
||||
pixel0 = src[pixelid.x];
|
||||
pixel1 = src[pixelid.y];
|
||||
pixel2 = src[pixelid.z];
|
||||
|
||||
|
||||
outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
|
||||
outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
|
||||
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
|
||||
@ -56,17 +56,19 @@ __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTY
|
||||
|
||||
int4 outy = (id<<2)/cols;
|
||||
int4 outx = (id<<2)%cols;
|
||||
outx.y++;
|
||||
outx.z+=2;
|
||||
outx.w+=3;
|
||||
outy = select(outy,outy+1,outx>=cols);
|
||||
outx = select(outx,outx-cols,outx>=cols);
|
||||
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
|
||||
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
|
||||
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
|
||||
//outx = select(outx,(int4)outx.x,outy>=rows);
|
||||
//outy = select(outy,(int4)outy.x,outy>=rows);
|
||||
|
||||
outx += (int4)(0, 1, 2, 3);
|
||||
outy = select(outy, outy+1, outx>=cols);
|
||||
outx = select(outx, outx-cols, outx>=cols);
|
||||
|
||||
// when cols == 1
|
||||
outy = select(outy, outy + 1, outx >= cols);
|
||||
outx = select(outx, outx-cols, outx >= cols);
|
||||
outy = select(outy, outy + 1, outx >= cols);
|
||||
outx = select(outx, outx-cols, outx >= cols);
|
||||
|
||||
int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
|
||||
|
||||
if(outx.w<cols && outy.w<rows)
|
||||
{
|
||||
dst[addr.x] = outpix0;
|
||||
@ -91,20 +93,26 @@ __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTY
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
|
||||
int srcStep_in_pixel,int pixel_end)
|
||||
{
|
||||
int id = get_global_id(0)<<2;
|
||||
int y = id / cols;
|
||||
int x = id % cols;
|
||||
|
||||
int4 x4 = (int4)(x,x+1,x+2,x+3);
|
||||
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
|
||||
y4=clamp(y4,(int4)0,(int4)(rows-1));
|
||||
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
|
||||
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
|
||||
|
||||
// when cols == 1
|
||||
y4 = select(y4, y4 + 1,x4>=(int4)cols);
|
||||
x4 = select(x4, x4 - (int4)cols,x4>=(int4)cols);
|
||||
y4 = select(y4, y4 + 1,x4>=(int4)cols);
|
||||
x4 = select(x4, x4-(int4)cols,x4>=(int4)cols);
|
||||
|
||||
y4=clamp(y4,(int4)0,(int4)(rows-1));
|
||||
int4 addr = mad24(y4, (int4)srcStep_in_pixel, x4);
|
||||
|
||||
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
|
||||
pixel0 = src[addr.x];
|
||||
pixel1 = src[addr.y];
|
||||
@ -120,9 +128,11 @@ __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTY
|
||||
outpixel2.y = pixel3.x;
|
||||
outpixel2.z = pixel3.y;
|
||||
outpixel2.w = pixel3.z;
|
||||
|
||||
int4 outaddr = mul24(id>>2 , 3);
|
||||
outaddr.y++;
|
||||
outaddr.z+=2;
|
||||
|
||||
if(outaddr.z <= pixel_end)
|
||||
{
|
||||
dst[outaddr.x] = pixel0;
|
||||
|
@ -402,7 +402,7 @@ PARAM_TEST_CASE(convertC3C4, MatType, bool)
|
||||
int type = CV_MAKE_TYPE(depth, 3);
|
||||
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
src = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 0, 40, false);
|
||||
src = randomMat(rng, randomSize(1, MAX_VALUE), type, 0, 40, false);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
|
Loading…
Reference in New Issue
Block a user