Merge pull request #1718 from stweil/opencl

Format OpenCL code
This commit is contained in:
zdenop 2018-06-30 20:14:04 +02:00 committed by GitHub
commit 5b14121449
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 998 additions and 1022 deletions

View File

@ -17,8 +17,8 @@
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <cstdlib>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#ifdef __APPLE__

View File

@ -15,8 +15,8 @@
#endif
#include <float.h>
#include "openclwrapper.h"
#include "oclkernels.h"
#include "openclwrapper.h"
// for micro-benchmark
#include "otsuthr.h"
@ -627,8 +627,7 @@ static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) {
CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue");
}
int OpenclDevice::LoadOpencl()
{
int OpenclDevice::LoadOpencl() {
#ifdef WIN32
HINSTANCE HOpenclDll = nullptr;
void* OpenclDll = nullptr;
@ -644,8 +643,7 @@ int OpenclDevice::LoadOpencl()
#endif
return 1;
}
int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
{
int OpenclDevice::SetKernelEnv(KernelEnv* envInfo) {
envInfo->mpkContext = gpuEnv.mpContext;
envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
@ -653,9 +651,9 @@ int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
return 1;
}
static cl_mem allocateZeroCopyBuffer(const KernelEnv &rEnv, l_uint32 *hostbuffer,
size_t nElements, cl_mem_flags flags,
cl_int *pStatus) {
static cl_mem allocateZeroCopyBuffer(const KernelEnv& rEnv,
l_uint32* hostbuffer, size_t nElements,
cl_mem_flags flags, cl_int* pStatus) {
cl_mem membuffer =
clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
nElements * sizeof(l_uint32), hostbuffer, pStatus);
@ -697,8 +695,7 @@ static Pix *mapOutputCLBuffer(const KernelEnv &rEnv, cl_mem clbuffer, Pix *pixd,
return pixd;
}
void OpenclDevice::releaseMorphCLBuffers()
{
void OpenclDevice::releaseMorphCLBuffers() {
if (pixdCLIntermediate != nullptr) clReleaseMemObject(pixdCLIntermediate);
if (pixsCLBuffer != nullptr) clReleaseMemObject(pixsCLBuffer);
if (pixdCLBuffer != nullptr) clReleaseMemObject(pixdCLBuffer);
@ -736,15 +733,12 @@ int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix *pixs) {
return (int)clStatus;
}
int OpenclDevice::InitEnv()
{
int OpenclDevice::InitEnv() {
// PERF_COUNT_START("OD::InitEnv")
// printf("[OD] OpenclDevice::InitEnv()\n");
#ifdef SAL_WIN32
while( 1 )
{
if( 1 == LoadOpencl() )
break;
while (1) {
if (1 == LoadOpencl()) break;
}
PERF_COUNT_SUB("LoadOpencl")
#endif
@ -756,8 +750,7 @@ PERF_COUNT_SUB("LoadOpencl")
return 1;
}
int OpenclDevice::ReleaseOpenclRunEnv()
{
int OpenclDevice::ReleaseOpenclRunEnv() {
ReleaseOpenclEnv(&gpuEnv);
#ifdef SAL_WIN32
FreeOpenclDll();
@ -765,8 +758,7 @@ int OpenclDevice::ReleaseOpenclRunEnv()
return 1;
}
inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
{
inline int OpenclDevice::AddKernelConfig(int kCount, const char* kName) {
ASSERT_HOST(kCount > 0);
ASSERT_HOST(strlen(kName) < sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
@ -774,10 +766,8 @@ inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
return 0;
}
int OpenclDevice::RegistOpenclKernel()
{
if ( !gpuEnv.mnIsUserCreated )
memset( &gpuEnv, 0, sizeof(gpuEnv) );
int OpenclDevice::RegistOpenclKernel() {
if (!gpuEnv.mnIsUserCreated) memset(&gpuEnv, 0, sizeof(gpuEnv));
gpuEnv.mnFileCount = 0; // argc;
gpuEnv.mnKernelCount = 0UL;
@ -795,7 +785,8 @@ int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
// overwrite global static GPUEnv with new device
if (selectedDeviceIsOpenCL()) {
//printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
// printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice()
// for selected device\n");
populateGPUEnvFromDevice(&gpuEnv, bestDevice);
gpuEnv.mnFileCount = 0; // argc;
gpuEnv.mnKernelCount = 0UL;
@ -803,7 +794,8 @@ int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
CompileKernelFile(&gpuEnv, "");
// PERF_COUNT_SUB("CompileKernelFile")
} else {
//printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
// printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice()
// b/c native cpu selected\n");
}
isInited = 1;
}
@ -811,43 +803,34 @@ int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
return 0;
}
OpenclDevice::OpenclDevice()
{
OpenclDevice::OpenclDevice() {
// InitEnv();
}
OpenclDevice::~OpenclDevice()
{
OpenclDevice::~OpenclDevice() {
// ReleaseOpenclRunEnv();
}
int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
{
int OpenclDevice::ReleaseOpenclEnv(GPUEnv* gpuInfo) {
int i = 0;
int clStatus = 0;
if ( !isInited )
{
if (!isInited) {
return 1;
}
for ( i = 0; i < gpuEnv.mnFileCount; i++ )
{
if ( gpuEnv.mpArryPrograms[i] )
{
for (i = 0; i < gpuEnv.mnFileCount; i++) {
if (gpuEnv.mpArryPrograms[i]) {
clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
CHECK_OPENCL(clStatus, "clReleaseProgram");
gpuEnv.mpArryPrograms[i] = nullptr;
}
}
if ( gpuEnv.mpCmdQueue )
{
if (gpuEnv.mpCmdQueue) {
clReleaseCommandQueue(gpuEnv.mpCmdQueue);
gpuEnv.mpCmdQueue = nullptr;
}
if ( gpuEnv.mpContext )
{
if (gpuEnv.mpContext) {
clReleaseContext(gpuEnv.mpContext);
gpuEnv.mpContext = nullptr;
}
@ -856,8 +839,7 @@ int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
delete[] gpuInfo->mpArryDevsID;
return 1;
}
int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
{
int OpenclDevice::BinaryGenerated(const char* clFileName, FILE** fhandle) {
unsigned int i = 0;
cl_int clStatus;
int status = 0;
@ -879,15 +861,12 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
*fhandle = fd;
}
return status;
}
int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
{
int OpenclDevice::CachedOfKernerPrg(const GPUEnv* gpuEnvCached,
const char* clFileName) {
int i;
for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
{
if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
{
for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
if (gpuEnvCached->mpArryPrograms[i] != nullptr) {
return 1;
}
@ -896,8 +875,8 @@ int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * cl
return 0;
}
int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
{
int OpenclDevice::WriteBinaryToFile(const char* fileName, const char* birary,
size_t numBytes) {
FILE* output = nullptr;
output = fopen(fileName, "wb");
if (output == nullptr) {
@ -908,10 +887,9 @@ int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, s
fclose(output);
return 1;
}
int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
{
int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
const char* clFileName) {
unsigned int i = 0;
cl_int clStatus;
size_t* binarySizes;
@ -944,33 +922,27 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
/* copy over all of the generated binaries. */
std::vector<char*> binaries(numDevices);
for ( i = 0; i < numDevices; i++ )
{
if ( binarySizes[i] != 0 )
{
for (i = 0; i < numDevices; i++) {
if (binarySizes[i] != 0) {
binaries[i] = (char*)malloc(sizeof(char) * binarySizes[i]);
if (binaries[i] == nullptr) {
return 0;
}
}
else
{
} else {
binaries[i] = nullptr;
}
}
clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
sizeof(char *) * numDevices,
clStatus =
clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char*) * numDevices,
&binaries[0], nullptr);
CHECK_OPENCL(clStatus, "clGetProgramInfo");
/* dump out each binary into its own separate file. */
for ( i = 0; i < numDevices; i++ )
{
for (i = 0; i < numDevices; i++) {
char fileName[256] = {0}, cl_name[128] = {0};
if ( binarySizes[i] != 0 )
{
if (binarySizes[i] != 0) {
char deviceName[1024];
clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
sizeof(deviceName), deviceName, nullptr);
@ -981,8 +953,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
cl_name[str - clFileName] = '\0';
sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
legalizeFileName(fileName);
if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
{
if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
printf("[OD] write binary[%s] failed\n", fileName);
return 0;
} // else
@ -991,8 +962,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
}
// Release all resources and memory
for ( i = 0; i < numDevices; i++ )
{
for (i = 0; i < numDevices; i++) {
free(binaries[i]);
}
@ -1005,8 +975,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
return 1;
}
int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
{
int OpenclDevice::CompileKernelFile(GPUEnv* gpuInfo, const char* buildOption) {
// PERF_COUNT_START("CompileKernelFile")
cl_int clStatus = 0;
size_t length;
@ -1018,8 +987,7 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
FILE *fd, *fd1;
const char* filename = "kernel.cl";
// fprintf(stderr, "[OD] CompileKernelFile ... \n");
if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
{
if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
return 1;
}
@ -1029,10 +997,10 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
source_size[0] = strlen(source);
binaryExisted = 0;
binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
binaryExisted = BinaryGenerated(
filename, &fd); // don't check for binary during microbenchmark
// PERF_COUNT_SUB("BinaryGenerated")
if ( binaryExisted == 1 )
{
if (binaryExisted == 1) {
clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
sizeof(numDevices), &numDevices, nullptr);
CHECK_OPENCL(clStatus, "clGetContextInfo");
@ -1044,8 +1012,7 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
b_error |= fseek(fd, 0, SEEK_END) < 0;
b_error |= (length = ftell(fd)) <= 0;
b_error |= fseek(fd, 0, SEEK_SET) < 0;
if ( b_error )
{
if (b_error) {
return 0;
}
@ -1065,19 +1032,17 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
// PERF_COUNT_SUB("get devices")
// fprintf(stderr, "[OD] Create kernel from binary\n");
const uint8_t* c_binary = &binary[0];
gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
&mpArryDevsID[0], &length, &c_binary,
gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary(
gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length, &c_binary,
&binary_status, &clStatus);
CHECK_OPENCL(clStatus, "clCreateProgramWithBinary");
// PERF_COUNT_SUB("clCreateProgramWithBinary")
// PERF_COUNT_SUB("binaryExisted")
}
else
{
} else {
// create a CL program using the kernel source
// fprintf(stderr, "[OD] Create kernel from source\n");
gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
source_size, &clStatus);
gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource(
gpuInfo->mpContext, 1, &source, source_size, &clStatus);
CHECK_OPENCL(clStatus, "clCreateProgramWithSource");
// PERF_COUNT_SUB("!binaryExisted")
}
@ -1090,38 +1055,30 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
// create a cl program executable for all the devices specified
// printf("[OD] BuildProgram.\n");
PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
if (!gpuInfo->mnIsUserCreated)
{
if (!gpuInfo->mnIsUserCreated) {
clStatus =
clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
buildOption, nullptr, nullptr);
// PERF_COUNT_SUB("clBuildProgram notUserCreated")
}
else
{
} else {
clStatus =
clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
buildOption, nullptr, nullptr);
// PERF_COUNT_SUB("clBuildProgram isUserCreated")
}
PERF_COUNT_END
if ( clStatus != CL_SUCCESS )
{
if (clStatus != CL_SUCCESS) {
printf("BuildProgram error!\n");
if ( !gpuInfo->mnIsUserCreated )
{
if (!gpuInfo->mnIsUserCreated) {
clStatus = clGetProgramBuildInfo(
gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
}
else
{
clStatus = clGetProgramBuildInfo(
gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
} else {
clStatus =
clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
}
if ( clStatus != CL_SUCCESS )
{
if (clStatus != CL_SUCCESS) {
printf("opencl create build log fail\n");
return 0;
}
@ -1129,18 +1086,16 @@ PERF_COUNT_END
if (buildLog == (char*)nullptr) {
return 0;
}
if ( !gpuInfo->mnIsUserCreated )
{
clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
if (!gpuInfo->mnIsUserCreated) {
clStatus = clGetProgramBuildInfo(
gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
CL_PROGRAM_BUILD_LOG, length, buildLog, &length);
} else {
clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG,
length, buildLog, &length);
}
else
{
clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
}
if ( clStatus != CL_SUCCESS )
{
if (clStatus != CL_SUCCESS) {
printf("opencl program build info fail\n");
return 0;
}
@ -1168,8 +1123,9 @@ PERF_COUNT_END
return 1;
}
l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
{
l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32* tiffdata, l_int32 w,
l_int32 h, l_int32 wpl,
l_uint32* line) {
PERF_COUNT_START("pixReadFromTiffKernel")
cl_int clStatus;
KernelEnv rEnv;
@ -1190,12 +1146,15 @@ PERF_COUNT_START("pixReadFromTiffKernel")
SetKernelEnv(&rEnv);
l_uint32* pResult = (l_uint32*)malloc(w * h * sizeof(l_uint32));
rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
rEnv.mpkKernel =
clCreateKernel(rEnv.mpkProgram, "composeRGBPixel", &clStatus);
CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
// Allocate input and output OCL buffers
valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
valuesCl = allocateZeroCopyBuffer(
rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
outputCl = allocateZeroCopyBuffer(
rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
// Kernel arguments
clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
@ -1217,9 +1176,9 @@ clStatus =
CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
/* map results back from gpu */
void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ,
0, w * h * sizeof(l_uint32), 0, nullptr, nullptr,
&clStatus);
void* ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE,
CL_MAP_READ, 0, w * h * sizeof(l_uint32), 0,
nullptr, nullptr, &clStatus);
CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr);
@ -1230,7 +1189,8 @@ PERF_COUNT_END
return pResult;
}
//Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
// Morphology Dilate operation for 5x5 structuring element. Invokes the relevant
// OpenCL kernels
static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
size_t globalThreads[2];
cl_mem pixtemp;
@ -1286,7 +1246,8 @@ static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
return status;
}
//Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
// Morphology Erode operation for 5x5 structuring element. Invokes the relevant
// OpenCL kernels
static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
size_t globalThreads[2];
cl_mem pixtemp;
@ -1624,7 +1585,8 @@ static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
}
// OpenCL implementation of Get Lines from pix function
//Note: Assumes the source and dest opencl buffer are initialized. No check done
// Note: Assumes the source and dest opencl buffer are initialized. No check
// done
void OpenclDevice::pixGetLinesCL(Pix* pixd, Pix* pixs, Pix** pix_vline,
Pix** pix_hline, Pix** pixClosed,
bool getpixClosed, l_int32 close_hsize,
@ -1781,19 +1743,23 @@ int OpenclDevice::HistogramRectOCL(unsigned char *imageData,
CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannelReduction");
} else {
histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
histKern.mpkKernel = clCreateKernel(
histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus);
CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
histRedKern.mpkKernel =
clCreateKernel(histRedKern.mpkProgram,
"kernel_HistogramRectAllChannelsReduction", &clStatus);
CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectAllChannelsReduction");
}
void* ptr;
// Initialize tmpHistogramBuffer buffer
ptr = clEnqueueMapBuffer(
histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
tmpHistogramBins * sizeof(cl_uint), 0, nullptr, nullptr, &clStatus);
ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE,
CL_MAP_WRITE, 0, tmpHistogramBins * sizeof(cl_uint),
0, nullptr, nullptr, &clStatus);
CHECK_OPENCL(clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
memset(ptr, 0, tmpHistogramBins * sizeof(cl_uint));
@ -1805,8 +1771,7 @@ int OpenclDevice::HistogramRectOCL(unsigned char *imageData,
clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
cl_uint numPixels = width * height;
clStatus =
clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
clStatus = clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
CHECK_OPENCL(clStatus, "clSetKernelArg numPixels");
clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem),
&tmpHistogramBuffer);
@ -1826,8 +1791,8 @@ int OpenclDevice::HistogramRectOCL(unsigned char *imageData,
/* launch histogram */
PERF_COUNT_SUB("before")
clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
nullptr, global_work_size, local_work_size, 0,
nullptr, nullptr);
nullptr, global_work_size, local_work_size,
0, nullptr, nullptr);
CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
clFinish(histKern.mpkCmdQueue);
@ -1838,7 +1803,9 @@ if (clStatus != 0) {
clStatus = clEnqueueNDRangeKernel(
histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr,
red_global_work_size, local_work_size, 0, nullptr, nullptr);
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
CHECK_OPENCL(
clStatus,
"clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
clFinish(histRedKern.mpkCmdQueue);
if (clStatus != 0) {
retVal = -1;
@ -1982,8 +1949,6 @@ int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData,
return retVal;
}
/******************************************************************************
* Data Types for Device Selection
*****************************************************************************/
@ -2108,7 +2073,9 @@ static double composeRGBPixelMicroBench(GPUEnv *env,
timespec time_funct_start, time_funct_end;
#endif
// input data
l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
l_uint32* tiffdata =
(l_uint32*)input.imageData; // same size and random data; data doesn't
// change workload
// function call
if (type == DS_DEVICE_OPENCL_DEVICE) {
@ -2126,13 +2093,15 @@ static double composeRGBPixelMicroBench(GPUEnv *env,
wpl, nullptr);
#if ON_WINDOWS
QueryPerformanceCounter(&time_funct_end);
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
(double)(freq.QuadPart);
#elif ON_APPLE
stop = mach_absolute_time();
time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
#else
clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
(time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
#endif
} else {
@ -2160,18 +2129,19 @@ static double composeRGBPixelMicroBench(GPUEnv *env,
}
#if ON_WINDOWS
QueryPerformanceCounter(&time_funct_end);
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
(double)(freq.QuadPart);
#elif ON_APPLE
stop = mach_absolute_time();
time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
#else
clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
(time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
#endif
pixDestroy(&pix);
}
// cleanup
return time;
@ -2214,7 +2184,8 @@ static double histogramRectMicroBench(GPUEnv *env,
#if ON_WINDOWS
QueryPerformanceCounter(&time_funct_end);
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
(double)(freq.QuadPart);
#elif ON_APPLE
stop = mach_absolute_time();
if (retVal == 0) {
@ -2224,7 +2195,8 @@ static double histogramRectMicroBench(GPUEnv *env,
}
#else
clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
(time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
#endif
} else {
int* histogram = new int[kHistogramSize];
@ -2241,13 +2213,15 @@ static double histogramRectMicroBench(GPUEnv *env,
}
#if ON_WINDOWS
QueryPerformanceCounter(&time_funct_end);
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
(double)(freq.QuadPart);
#elif ON_APPLE
stop = mach_absolute_time();
time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
#else
clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
(time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
#endif
delete[] histogram;
}
@ -2270,8 +2244,8 @@ static void ThresholdRectToPix_Native(const unsigned char *imagedata,
*pix = pixCreate(width, height, 1);
uint32_t* pixdata = pixGetData(*pix);
int wpl = pixGetWpl(*pix);
const unsigned char* srcdata = imagedata + top * bytes_per_line +
left * bytes_per_pixel;
const unsigned char* srcdata =
imagedata + top * bytes_per_line + left * bytes_per_pixel;
for (int y = 0; y < height; ++y) {
const uint8_t* linedata = srcdata;
uint32_t* pixline = pixdata + y * wpl;
@ -2310,12 +2284,7 @@ static double thresholdRectToPixMicroBench(GPUEnv *env,
// input data
unsigned char pixelHi = (unsigned char)255;
int thresholds[4] = {
pixelHi,
pixelHi,
pixelHi,
pixelHi
};
int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
// Pix* pix = pixCreate(width, height, 1);
int top = 0;
@ -2340,7 +2309,8 @@ static double thresholdRectToPixMicroBench(GPUEnv *env,
#if ON_WINDOWS
QueryPerformanceCounter(&time_funct_end);
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
(double)(freq.QuadPart);
#elif ON_APPLE
stop = mach_absolute_time();
if (retVal == 0) {
@ -2352,11 +2322,10 @@ static double thresholdRectToPixMicroBench(GPUEnv *env,
#else
clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
(time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
#endif
} else {
tesseract::ImageThresholder thresholder;
thresholder.SetImage(input.pix);
#if ON_WINDOWS
@ -2367,18 +2336,21 @@ static double thresholdRectToPixMicroBench(GPUEnv *env,
clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
#endif
int hi_values[4] = {};
ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
thresholds, hi_values, &input.pix );
ThresholdRectToPix_Native(input.imageData, input.numChannels,
bytes_per_line, thresholds, hi_values,
&input.pix);
#if ON_WINDOWS
QueryPerformanceCounter(&time_funct_end);
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
(double)(freq.QuadPart);
#elif ON_APPLE
stop = mach_absolute_time();
time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
#else
clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
(time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
#endif
}
@ -2421,22 +2393,24 @@ static double getLineMasksMorphMicroBench(GPUEnv *env,
OpenclDevice::gpuEnv = *env;
OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
Pix *pix_vline = nullptr, *pix_hline = nullptr, *pix_closed = nullptr;
OpenclDevice::pixGetLinesCL(
nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed, true,
closing_brick, closing_brick, max_line_width, max_line_width,
min_line_length, min_line_length);
OpenclDevice::pixGetLinesCL(nullptr, input.pix, &pix_vline, &pix_hline,
&pix_closed, true, closing_brick, closing_brick,
max_line_width, max_line_width, min_line_length,
min_line_length);
OpenclDevice::releaseMorphCLBuffers();
#if ON_WINDOWS
QueryPerformanceCounter(&time_funct_end);
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
(double)(freq.QuadPart);
#elif ON_APPLE
stop = mach_absolute_time();
time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
#else
clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
(time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
#endif
} else {
#if ON_WINDOWS
@ -2461,21 +2435,21 @@ static double getLineMasksMorphMicroBench(GPUEnv *env,
#if ON_WINDOWS
QueryPerformanceCounter(&time_funct_end);
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
(double)(freq.QuadPart);
#elif ON_APPLE
stop = mach_absolute_time();
time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
#else
clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
(time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
#endif
}
return time;
}
/******************************************************************************
* Device Selection
*****************************************************************************/
@ -2693,7 +2667,6 @@ ds_device OpenclDevice::getDeviceSelection( ) {
return selectedDevice;
}
bool OpenclDevice::selectedDeviceIsOpenCL() {
ds_device device = getDeviceSelection();
return (device.type == DS_DEVICE_OPENCL_DEVICE);

View File

@ -72,19 +72,20 @@
* 0 - no reporting
* 1 - no reporting
* 2 - report total function call time for functions we're tracking
* 3 - optionally report breakdown of function calls (kernel launch, kernel time, data copies)
* 3 - optionally report breakdown of function calls (kernel launch, kernel
*time, data copies)
************************************************************************************/
#define PERF_COUNT_VERBOSE 1
#define PERF_COUNT_REPORT_STR "[%36s], %24s, %11.6f\n"
#if ON_WINDOWS
#if PERF_COUNT_VERBOSE >= 2
#define PERF_COUNT_START(FUNCT_NAME) \
char* funct_name = FUNCT_NAME; \
double elapsed_time_sec; \
LARGE_INTEGER freq, time_funct_start, time_funct_end, time_sub_start, time_sub_end; \
LARGE_INTEGER freq, time_funct_start, time_funct_end, time_sub_start, \
time_sub_end; \
QueryPerformanceFrequency(&freq); \
QueryPerformanceCounter(&time_funct_start); \
time_sub_start = time_funct_start; \
@ -111,7 +112,6 @@
#define PERF_COUNT_SUB(SUB)
#endif
// not on windows
#else
@ -169,8 +169,7 @@
#define GROUPSIZE_HMORX 256
#define GROUPSIZE_HMORY 1
typedef struct _KernelEnv
{
typedef struct _KernelEnv {
cl_context mpkContext;
cl_command_queue mpkCmdQueue;
cl_program mpkProgram;
@ -178,8 +177,7 @@ typedef struct _KernelEnv
char mckKernelName[150];
} KernelEnv;
typedef struct _OpenCLEnv
{
typedef struct _OpenCLEnv {
cl_platform_id mpOclPlatformID;
cl_context mpOclContext;
cl_device_id mpOclDevsID;
@ -188,14 +186,11 @@ typedef struct _OpenCLEnv
typedef int (*cl_kernel_function)(void** userdata, KernelEnv* kenv);
#define CHECK_OPENCL(status, name) \
if( status != CL_SUCCESS ) \
{ \
if (status != CL_SUCCESS) { \
printf("OpenCL error code is %d at when %s .\n", status, name); \
}
typedef struct _GPUEnv
{
typedef struct _GPUEnv {
// share vb in all modules in hb library
cl_platform_id mpPlatformID;
cl_device_type mDevType;
@ -204,39 +199,47 @@ typedef struct _GPUEnv
cl_device_id mpDevID;
cl_command_queue mpCmdQueue;
cl_kernel mpArryKernels[MAX_CLFILE_NUM];
cl_program mpArryPrograms[MAX_CLFILE_NUM]; //one program object maps one kernel source file
char mArryKnelSrcFile[MAX_CLFILE_NUM][256], //the max len of kernel file name is 256
cl_program mpArryPrograms[MAX_CLFILE_NUM]; // one program object maps one
// kernel source file
char mArryKnelSrcFile[MAX_CLFILE_NUM]
[256], // the max len of kernel file name is 256
mArrykernelNames[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN + 1];
cl_kernel_function mpArryKnelFuncs[MAX_CLKERNEL_NUM];
int mnKernelCount, mnFileCount, // only one kernel file
mnIsUserCreated; // 1: created , 0:no create and needed to create by opencl wrapper
mnIsUserCreated; // 1: created , 0:no create and needed to create by
// opencl wrapper
int mnKhrFp64Flag;
int mnAmdFp64Flag;
} GPUEnv;
class OpenclDevice
{
class OpenclDevice {
public:
static GPUEnv gpuEnv;
static int isInited;
OpenclDevice();
~OpenclDevice();
static int InitEnv(); // load dll, call InitOpenclRunEnv(0)
static int InitOpenclRunEnv( int argc ); // RegistOpenclKernel, double flags, compile kernels
static int InitOpenclRunEnv_DeviceSelection( int argc ); // RegistOpenclKernel, double flags, compile kernels
static int InitOpenclRunEnv(
int argc); // RegistOpenclKernel, double flags, compile kernels
static int InitOpenclRunEnv_DeviceSelection(
int argc); // RegistOpenclKernel, double flags, compile kernels
static int RegistOpenclKernel();
static int ReleaseOpenclRunEnv();
static int ReleaseOpenclEnv(GPUEnv* gpuInfo);
static int CompileKernelFile(GPUEnv* gpuInfo, const char* buildOption);
static int CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName );
static int GeneratBinFromKernelSource( cl_program program, const char * clFileName );
static int WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes );
static int CachedOfKernerPrg(const GPUEnv* gpuEnvCached,
const char* clFileName);
static int GeneratBinFromKernelSource(cl_program program,
const char* clFileName);
static int WriteBinaryToFile(const char* fileName, const char* birary,
size_t numBytes);
static int BinaryGenerated(const char* clFileName, FILE** fhandle);
//static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption );
static l_uint32* pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl, l_uint32 *line);
// static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const
// char *buildOption );
static l_uint32* pixReadFromTiffKernel(l_uint32* tiffdata, l_int32 w,
l_int32 h, l_int32 wpl,
l_uint32* line);
static int composeRGBPixelCl(int* tiffdata, int* line, int h, int w);
/* OpenCL implementations of Morphological operations*/
@ -246,11 +249,10 @@ public:
static void releaseMorphCLBuffers();
static void pixGetLinesCL(Pix* pixd, Pix* pixs, Pix** pix_vline,
Pix **pix_hline, Pix **pixClosed,
bool getpixClosed, l_int32 close_hsize,
l_int32 close_vsize, l_int32 open_hsize,
l_int32 open_vsize, l_int32 line_hsize,
l_int32 line_vsize);
Pix** pix_hline, Pix** pixClosed, bool getpixClosed,
l_int32 close_hsize, l_int32 close_vsize,
l_int32 open_hsize, l_int32 open_vsize,
l_int32 line_hsize, l_int32 line_vsize);
// int InitOpenclAttr( OpenCLEnv * env );
// int ReleaseKernel( KernelEnv * env );
@ -259,9 +261,10 @@ public:
// int RunKernel( const char *kernelName, void **userdata );
// int ConvertToString( const char *filename, char **source );
// int CheckKernelName( KernelEnv *envInfo, const char *kernelName );
//int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function );
//int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata );
//int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function );
// int RegisterKernelWrapper( const char *kernelName, cl_kernel_function
// function ); int RunKernelWrapper( cl_kernel_function function, const char *
// kernelName, void **usrdata ); int GetKernelEnvAndFunc( const char
// *kernelName, KernelEnv *env, cl_kernel_function *function );
static int LoadOpencl();
#ifdef WIN32
@ -273,8 +276,8 @@ public:
/* for binarization */
static int HistogramRectOCL(unsigned char* imagedata, int bytes_per_pixel,
int bytes_per_line, int left, int top,
int width, int height, int kHistogramSize,
int bytes_per_line, int left, int top, int width,
int height, int kHistogramSize,
int* histogramAllChannels);
static int ThresholdRectToPixOCL(unsigned char* imagedata,