diff --git a/src/opencl/oclkernels.h b/src/opencl/oclkernels.h index 1788baa9..cca7afc9 100644 --- a/src/opencl/oclkernels.h +++ b/src/opencl/oclkernels.h @@ -12,7 +12,7 @@ #define TESSERACT_OPENCL_OCLKERNELS_H_ #ifndef USE_EXTERNAL_KERNEL -#define KERNEL( ... )# __VA_ARGS__ "\n" +#define KERNEL(...) #__VA_ARGS__ "\n" // Double precision is a default of spreadsheets // cl_khr_fp64: Khronos extension // cl_amd_fp64: AMD extension diff --git a/src/opencl/opencl_device_selection.h b/src/opencl/opencl_device_selection.h index c44c2baf..17df446e 100644 --- a/src/opencl/opencl_device_selection.h +++ b/src/opencl/opencl_device_selection.h @@ -17,8 +17,8 @@ #define _CRT_SECURE_NO_WARNINGS #endif -#include #include +#include #include #ifdef __APPLE__ @@ -34,12 +34,12 @@ typedef enum { } ds_device_type; typedef struct { - ds_device_type type; - cl_device_id oclDeviceID; - char* oclDeviceName; - char* oclDriverVersion; + ds_device_type type; + cl_device_id oclDeviceID; + char* oclDeviceName; + char* oclDriverVersion; // a pointer to the score data, the content/format is application defined. - void* score; + void* score; } ds_device; #endif // USE_OPENCL diff --git a/src/opencl/openclwrapper.cpp b/src/opencl/openclwrapper.cpp index e95c4014..3ad07652 100644 --- a/src/opencl/openclwrapper.cpp +++ b/src/opencl/openclwrapper.cpp @@ -15,8 +15,8 @@ #endif #include -#include "openclwrapper.h" #include "oclkernels.h" +#include "openclwrapper.h" // for micro-benchmark #include "otsuthr.h" @@ -31,7 +31,7 @@ #include #include -#include "errcode.h" // for ASSERT_HOST +#include "errcode.h" // for ASSERT_HOST #include "opencl_device_selection.h" GPUEnv OpenclDevice::gpuEnv; @@ -86,8 +86,8 @@ typedef enum { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY } ds_evaluation_type; typedef struct { unsigned int numDevices; - ds_device *devices; - const char *version; + ds_device* devices; + const char* version; } ds_profile; typedef enum { @@ -108,11 +108,11 @@ typedef enum { // device->score) update the data size of score. The encoding and the format // of the score data is implementation defined. The function should return // DS_SUCCESS if there's no error to be reported. -typedef ds_status (*ds_perf_evaluator)(ds_device *device, void *data); +typedef ds_status (*ds_perf_evaluator)(ds_device* device, void* data); // deallocate memory used by score -typedef ds_status (*ds_score_release)(void *score); -static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) { +typedef ds_status (*ds_score_release)(void* score); +static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) { ds_status status = DS_SUCCESS; if (profile != nullptr) { if (profile->devices != nullptr && sr != nullptr) { @@ -130,18 +130,18 @@ static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) { return status; } -static ds_status initDSProfile(ds_profile **p, const char *version) { +static ds_status initDSProfile(ds_profile** p, const char* version) { int numDevices; cl_uint numPlatforms; - cl_platform_id *platforms = nullptr; - cl_device_id *devices = nullptr; + cl_platform_id* platforms = nullptr; + cl_device_id* devices = nullptr; ds_status status = DS_SUCCESS; unsigned int next; unsigned int i; if (p == nullptr) return DS_INVALID_PROFILE; - ds_profile *profile = (ds_profile *)malloc(sizeof(ds_profile)); + ds_profile* profile = (ds_profile*)malloc(sizeof(ds_profile)); if (profile == nullptr) return DS_MEMORY_ERROR; memset(profile, 0, sizeof(ds_profile)); @@ -149,7 +149,7 @@ static ds_status initDSProfile(ds_profile **p, const char *version) { clGetPlatformIDs(0, nullptr, &numPlatforms); if (numPlatforms > 0) { - platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id)); + platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id)); if (platforms == nullptr) { status = DS_MEMORY_ERROR; goto cleanup; @@ -165,7 +165,7 @@ static ds_status initDSProfile(ds_profile **p, const char *version) { } if (numDevices > 0) { - devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id)); + devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); if (devices == nullptr) { status = DS_MEMORY_ERROR; goto cleanup; @@ -175,7 +175,7 @@ static ds_status initDSProfile(ds_profile **p, const char *version) { profile->numDevices = numDevices + 1; // +1 to numDevices to include the native CPU profile->devices = - (ds_device *)malloc(profile->numDevices * sizeof(ds_device)); + (ds_device*)malloc(profile->numDevices * sizeof(ds_device)); if (profile->devices == nullptr) { profile->numDevices = 0; status = DS_MEMORY_ERROR; @@ -198,13 +198,13 @@ static ds_status initDSProfile(ds_profile **p, const char *version) { clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME, DS_DEVICE_NAME_LENGTH, &buffer, nullptr); length = strlen(buffer); - profile->devices[next].oclDeviceName = (char *)malloc(length + 1); + profile->devices[next].oclDeviceName = (char*)malloc(length + 1); memcpy(profile->devices[next].oclDeviceName, buffer, length + 1); clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION, DS_DEVICE_NAME_LENGTH, &buffer, nullptr); length = strlen(buffer); - profile->devices[next].oclDriverVersion = (char *)malloc(length + 1); + profile->devices[next].oclDriverVersion = (char*)malloc(length + 1); memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1); } } @@ -225,10 +225,10 @@ cleanup: return status; } -static ds_status profileDevices(ds_profile *profile, +static ds_status profileDevices(ds_profile* profile, const ds_evaluation_type type, ds_perf_evaluator evaluator, - void *evaluatorData, unsigned int *numUpdates) { + void* evaluatorData, unsigned int* numUpdates) { ds_status status = DS_SUCCESS; unsigned int i; unsigned int updates = 0; @@ -264,11 +264,11 @@ static ds_status profileDevices(ds_profile *profile, return status; } -static const char *findString(const char *contentStart, const char *contentEnd, - const char *string) { +static const char* findString(const char* contentStart, const char* contentEnd, + const char* string) { size_t stringLength; - const char *currentPosition; - const char *found = nullptr; + const char* currentPosition; + const char* found = nullptr; stringLength = strlen(string); currentPosition = contentStart; for (currentPosition = contentStart; currentPosition < contentEnd; @@ -285,19 +285,19 @@ static const char *findString(const char *contentStart, const char *contentEnd, return found; } -static ds_status readProFile(const char *fileName, char **content, - size_t *contentSize) { +static ds_status readProFile(const char* fileName, char** content, + size_t* contentSize) { *contentSize = 0; *content = nullptr; ds_status status = DS_SUCCESS; - FILE *input = fopen(fileName, "rb"); + FILE* input = fopen(fileName, "rb"); if (input == nullptr) { status = DS_FILE_ERROR; } else { fseek(input, 0L, SEEK_END); size_t size = ftell(input); rewind(input); - char *binary = new char[size]; + char* binary = new char[size]; if (fread(binary, sizeof(char), size, input) != size) { status = DS_FILE_ERROR; } else { @@ -309,26 +309,26 @@ static ds_status readProFile(const char *fileName, char **content, return status; } -typedef ds_status (*ds_score_deserializer)(ds_device *device, - const unsigned char *serializedScore, +typedef ds_status (*ds_score_deserializer)(ds_device* device, + const unsigned char* serializedScore, unsigned int serializedScoreSize); -static ds_status readProfileFromFile(ds_profile *profile, +static ds_status readProfileFromFile(ds_profile* profile, ds_score_deserializer deserializer, - const char *file) { + const char* file) { ds_status status = DS_SUCCESS; - char *contentStart; + char* contentStart; size_t contentSize; if (profile == nullptr) return DS_INVALID_PROFILE; status = readProFile(file, &contentStart, &contentSize); if (status == DS_SUCCESS) { - const char *currentPosition; - const char *dataStart; - const char *dataEnd; + const char* currentPosition; + const char* dataStart; + const char* dataEnd; - const char *contentEnd = contentStart + contentSize; + const char* contentEnd = contentStart + contentSize; currentPosition = contentStart; // parse the version string @@ -358,18 +358,18 @@ static ds_status readProfileFromFile(ds_profile *profile, while (1) { unsigned int i; - const char *deviceTypeStart; - const char *deviceTypeEnd; + const char* deviceTypeStart; + const char* deviceTypeEnd; ds_device_type deviceType; - const char *deviceNameStart; - const char *deviceNameEnd; + const char* deviceNameStart; + const char* deviceNameEnd; - const char *deviceScoreStart; - const char *deviceScoreEnd; + const char* deviceScoreStart; + const char* deviceScoreEnd; - const char *deviceDriverStart; - const char *deviceDriverEnd; + const char* deviceDriverStart; + const char* deviceDriverEnd; dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE); if (dataStart == nullptr) { @@ -447,7 +447,7 @@ static ds_status readProfileFromFile(ds_profile *profile, deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END); status = deserializer(profile->devices + i, - (const unsigned char *)deviceScoreStart, + (const unsigned char*)deviceScoreStart, deviceScoreEnd - deviceScoreStart); if (status != DS_SUCCESS) { goto cleanup; @@ -467,7 +467,7 @@ static ds_status readProfileFromFile(ds_profile *profile, deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END); status = deserializer(profile->devices + i, - (const unsigned char *)deviceScoreStart, + (const unsigned char*)deviceScoreStart, deviceScoreEnd - deviceScoreStart); if (status != DS_SUCCESS) { goto cleanup; @@ -485,17 +485,17 @@ cleanup: return status; } -typedef ds_status (*ds_score_serializer)(ds_device *device, - void **serializedScore, - unsigned int *serializedScoreSize); -static ds_status writeProfileToFile(ds_profile *profile, +typedef ds_status (*ds_score_serializer)(ds_device* device, + void** serializedScore, + unsigned int* serializedScoreSize); +static ds_status writeProfileToFile(ds_profile* profile, ds_score_serializer serializer, - const char *file) { + const char* file) { ds_status status = DS_SUCCESS; if (profile == nullptr) return DS_INVALID_PROFILE; - FILE *profileFile = fopen(file, "wb"); + FILE* profileFile = fopen(file, "wb"); if (profileFile == nullptr) { status = DS_FILE_ERROR; } else { @@ -510,7 +510,7 @@ static ds_status writeProfileToFile(ds_profile *profile, fwrite("\n", sizeof(char), 1, profileFile); for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) { - void *serializedScore; + void* serializedScore; unsigned int serializedScoreSize; fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile); @@ -573,9 +573,9 @@ static ds_status writeProfileToFile(ds_profile *profile, } // substitute invalid characters in device name with _ -static void legalizeFileName(char *fileName) { +static void legalizeFileName(char* fileName) { // printf("fileName: %s\n", fileName); - const char *invalidChars = + const char* invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches // for each invalid char for (unsigned i = 0; i < strlen(invalidChars); i++) { @@ -587,7 +587,7 @@ static void legalizeFileName(char *fileName) { // initial ./ is valid for present directory // if (*pos == '.') pos++; // if (*pos == '/') pos++; - for (char *pos = strstr(fileName, invalidStr); pos != nullptr; + for (char* pos = strstr(fileName, invalidStr); pos != nullptr; pos = strstr(pos + 1, invalidStr)) { // printf("\tfound: %s, ", pos); pos[0] = '_'; @@ -596,7 +596,7 @@ static void legalizeFileName(char *fileName) { } } -static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) { +static void populateGPUEnvFromDevice(GPUEnv* gpuInfo, cl_device_id device) { // printf("[DS] populateGPUEnvFromDevice\n"); size_t size; gpuInfo->mnIsUserCreated = 1; @@ -627,11 +627,10 @@ static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) { CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue"); } -int OpenclDevice::LoadOpencl() -{ +int OpenclDevice::LoadOpencl() { #ifdef WIN32 HINSTANCE HOpenclDll = nullptr; - void *OpenclDll = nullptr; + void* OpenclDll = nullptr; // fprintf(stderr, " LoadOpenclDllxx... \n"); OpenclDll = static_cast(HOpenclDll); OpenclDll = LoadLibrary("openCL.dll"); @@ -639,23 +638,22 @@ int OpenclDevice::LoadOpencl() fprintf(stderr, "[OD] Load opencl.dll failed!\n"); FreeLibrary(static_cast(OpenclDll)); return 0; - } - fprintf(stderr, "[OD] Load opencl.dll successful!\n"); + } + fprintf(stderr, "[OD] Load opencl.dll successful!\n"); #endif - return 1; + return 1; } -int OpenclDevice::SetKernelEnv( KernelEnv *envInfo ) -{ - envInfo->mpkContext = gpuEnv.mpContext; - envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue; - envInfo->mpkProgram = gpuEnv.mpArryPrograms[0]; +int OpenclDevice::SetKernelEnv(KernelEnv* envInfo) { + envInfo->mpkContext = gpuEnv.mpContext; + envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue; + envInfo->mpkProgram = gpuEnv.mpArryPrograms[0]; - return 1; + return 1; } -static cl_mem allocateZeroCopyBuffer(const KernelEnv &rEnv, l_uint32 *hostbuffer, - size_t nElements, cl_mem_flags flags, - cl_int *pStatus) { +static cl_mem allocateZeroCopyBuffer(const KernelEnv& rEnv, + l_uint32* hostbuffer, size_t nElements, + cl_mem_flags flags, cl_int* pStatus) { cl_mem membuffer = clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags), nElements * sizeof(l_uint32), hostbuffer, pStatus); @@ -663,8 +661,8 @@ static cl_mem allocateZeroCopyBuffer(const KernelEnv &rEnv, l_uint32 *hostbuffer return membuffer; } -static Pix *mapOutputCLBuffer(const KernelEnv &rEnv, cl_mem clbuffer, Pix *pixd, - Pix *pixs, int elements, cl_mem_flags flags, +static Pix* mapOutputCLBuffer(const KernelEnv& rEnv, cl_mem clbuffer, Pix* pixd, + Pix* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true) { PROCNAME("mapOutputCLBuffer"); if (!pixd) { @@ -677,7 +675,7 @@ static Pix *mapOutputCLBuffer(const KernelEnv &rEnv, cl_mem clbuffer, Pix *pixd, tprintf("pixd not made\n"); } } - l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer( + l_uint32* pValues = (l_uint32*)clEnqueueMapBuffer( rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0, elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr); @@ -697,8 +695,7 @@ static Pix *mapOutputCLBuffer(const KernelEnv &rEnv, cl_mem clbuffer, Pix *pixd, return pixd; } -void OpenclDevice::releaseMorphCLBuffers() -{ +void OpenclDevice::releaseMorphCLBuffers() { if (pixdCLIntermediate != nullptr) clReleaseMemObject(pixdCLIntermediate); if (pixsCLBuffer != nullptr) clReleaseMemObject(pixsCLBuffer); if (pixdCLBuffer != nullptr) clReleaseMemObject(pixdCLBuffer); @@ -706,7 +703,7 @@ void OpenclDevice::releaseMorphCLBuffers() pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr; } -int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix *pixs) { +int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs) { SetKernelEnv(&rEnv); if (pixThBuffer != nullptr) { @@ -719,8 +716,8 @@ int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix *pixs) { sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr); } else { // Get data from the source image - l_uint32 *srcdata = - reinterpret_cast(malloc(wpl * h * sizeof(l_uint32))); + l_uint32* srcdata = + reinterpret_cast(malloc(wpl * h * sizeof(l_uint32))); memcpy(srcdata, pixGetData(pixs), wpl * h * sizeof(l_uint32)); pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h, @@ -736,501 +733,464 @@ int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix *pixs) { return (int)clStatus; } -int OpenclDevice::InitEnv() -{ -//PERF_COUNT_START("OD::InitEnv") +int OpenclDevice::InitEnv() { +// PERF_COUNT_START("OD::InitEnv") // printf("[OD] OpenclDevice::InitEnv()\n"); #ifdef SAL_WIN32 - while( 1 ) - { - if( 1 == LoadOpencl() ) - break; - } -PERF_COUNT_SUB("LoadOpencl") + while (1) { + if (1 == LoadOpencl()) break; + } + PERF_COUNT_SUB("LoadOpencl") #endif - // sets up environment, compiles programs + // sets up environment, compiles programs - InitOpenclRunEnv_DeviceSelection( 0 ); -//PERF_COUNT_SUB("called InitOpenclRunEnv_DS") -//PERF_COUNT_END - return 1; + InitOpenclRunEnv_DeviceSelection(0); + // PERF_COUNT_SUB("called InitOpenclRunEnv_DS") + // PERF_COUNT_END + return 1; } -int OpenclDevice::ReleaseOpenclRunEnv() -{ - ReleaseOpenclEnv( &gpuEnv ); +int OpenclDevice::ReleaseOpenclRunEnv() { + ReleaseOpenclEnv(&gpuEnv); #ifdef SAL_WIN32 - FreeOpenclDll(); + FreeOpenclDll(); #endif - return 1; + return 1; } -inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName ) -{ - ASSERT_HOST(kCount > 0); - ASSERT_HOST(strlen(kName) < sizeof(gpuEnv.mArrykernelNames[kCount-1])); - strcpy( gpuEnv.mArrykernelNames[kCount-1], kName ); - gpuEnv.mnKernelCount++; - return 0; +inline int OpenclDevice::AddKernelConfig(int kCount, const char* kName) { + ASSERT_HOST(kCount > 0); + ASSERT_HOST(strlen(kName) < sizeof(gpuEnv.mArrykernelNames[kCount - 1])); + strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName); + gpuEnv.mnKernelCount++; + return 0; } -int OpenclDevice::RegistOpenclKernel() -{ - if ( !gpuEnv.mnIsUserCreated ) - memset( &gpuEnv, 0, sizeof(gpuEnv) ); +int OpenclDevice::RegistOpenclKernel() { + if (!gpuEnv.mnIsUserCreated) memset(&gpuEnv, 0, sizeof(gpuEnv)); - gpuEnv.mnFileCount = 0; //argc; - gpuEnv.mnKernelCount = 0UL; + gpuEnv.mnFileCount = 0; // argc; + gpuEnv.mnKernelCount = 0UL; - AddKernelConfig( 1, (const char*) "oclAverageSub1" ); - return 0; + AddKernelConfig(1, (const char*)"oclAverageSub1"); + return 0; } -int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) { -//PERF_COUNT_START("InitOpenclRunEnv_DS") - if (!isInited) { - // after programs compiled, selects best device - ds_device bestDevice_DS = getDeviceSelection( ); -//PERF_COUNT_SUB("called getDeviceSelection()") - cl_device_id bestDevice = bestDevice_DS.oclDeviceID; - // overwrite global static GPUEnv with new device - if (selectedDeviceIsOpenCL() ) { - //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n"); - populateGPUEnvFromDevice( &gpuEnv, bestDevice ); - gpuEnv.mnFileCount = 0; //argc; - gpuEnv.mnKernelCount = 0UL; -//PERF_COUNT_SUB("populate gpuEnv") - CompileKernelFile(&gpuEnv, ""); -//PERF_COUNT_SUB("CompileKernelFile") - } else { - //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n"); - } - isInited = 1; +int OpenclDevice::InitOpenclRunEnv_DeviceSelection(int argc) { + // PERF_COUNT_START("InitOpenclRunEnv_DS") + if (!isInited) { + // after programs compiled, selects best device + ds_device bestDevice_DS = getDeviceSelection(); + // PERF_COUNT_SUB("called getDeviceSelection()") + cl_device_id bestDevice = bestDevice_DS.oclDeviceID; + // overwrite global static GPUEnv with new device + if (selectedDeviceIsOpenCL()) { + // printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() + // for selected device\n"); + populateGPUEnvFromDevice(&gpuEnv, bestDevice); + gpuEnv.mnFileCount = 0; // argc; + gpuEnv.mnKernelCount = 0UL; + // PERF_COUNT_SUB("populate gpuEnv") + CompileKernelFile(&gpuEnv, ""); + // PERF_COUNT_SUB("CompileKernelFile") + } else { + // printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() + // b/c native cpu selected\n"); } -//PERF_COUNT_END - return 0; + isInited = 1; + } + // PERF_COUNT_END + return 0; } - -OpenclDevice::OpenclDevice() -{ - //InitEnv(); +OpenclDevice::OpenclDevice() { + // InitEnv(); } -OpenclDevice::~OpenclDevice() -{ - //ReleaseOpenclRunEnv(); +OpenclDevice::~OpenclDevice() { + // ReleaseOpenclRunEnv(); } -int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo ) -{ - int i = 0; - int clStatus = 0; +int OpenclDevice::ReleaseOpenclEnv(GPUEnv* gpuInfo) { + int i = 0; + int clStatus = 0; - if ( !isInited ) - { + if (!isInited) { + return 1; + } + + for (i = 0; i < gpuEnv.mnFileCount; i++) { + if (gpuEnv.mpArryPrograms[i]) { + clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]); + CHECK_OPENCL(clStatus, "clReleaseProgram"); + gpuEnv.mpArryPrograms[i] = nullptr; + } + } + if (gpuEnv.mpCmdQueue) { + clReleaseCommandQueue(gpuEnv.mpCmdQueue); + gpuEnv.mpCmdQueue = nullptr; + } + if (gpuEnv.mpContext) { + clReleaseContext(gpuEnv.mpContext); + gpuEnv.mpContext = nullptr; + } + isInited = 0; + gpuInfo->mnIsUserCreated = 0; + delete[] gpuInfo->mpArryDevsID; + return 1; +} +int OpenclDevice::BinaryGenerated(const char* clFileName, FILE** fhandle) { + unsigned int i = 0; + cl_int clStatus; + int status = 0; + char* str = nullptr; + FILE* fd = nullptr; + char fileName[256] = {0}, cl_name[128] = {0}; + char deviceName[1024]; + clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, + sizeof(deviceName), deviceName, nullptr); + CHECK_OPENCL(clStatus, "clGetDeviceInfo"); + str = (char*)strstr(clFileName, (char*)".cl"); + memcpy(cl_name, clFileName, str - clFileName); + cl_name[str - clFileName] = '\0'; + sprintf(fileName, "%s-%s.bin", cl_name, deviceName); + legalizeFileName(fileName); + fd = fopen(fileName, "rb"); + status = (fd != nullptr) ? 1 : 0; + if (fd != nullptr) { + *fhandle = fd; + } + return status; +} +int OpenclDevice::CachedOfKernerPrg(const GPUEnv* gpuEnvCached, + const char* clFileName) { + int i; + for (i = 0; i < gpuEnvCached->mnFileCount; i++) { + if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) { + if (gpuEnvCached->mpArryPrograms[i] != nullptr) { return 1; + } } + } - for ( i = 0; i < gpuEnv.mnFileCount; i++ ) - { - if ( gpuEnv.mpArryPrograms[i] ) - { - clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] ); - CHECK_OPENCL( clStatus, "clReleaseProgram" ); - gpuEnv.mpArryPrograms[i] = nullptr; - } - } - if ( gpuEnv.mpCmdQueue ) - { - clReleaseCommandQueue( gpuEnv.mpCmdQueue ); - gpuEnv.mpCmdQueue = nullptr; - } - if ( gpuEnv.mpContext ) - { - clReleaseContext( gpuEnv.mpContext ); - gpuEnv.mpContext = nullptr; - } - isInited = 0; - gpuInfo->mnIsUserCreated = 0; - delete[] gpuInfo->mpArryDevsID; - return 1; + return 0; } -int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle ) -{ - unsigned int i = 0; - cl_int clStatus; - int status = 0; - char *str = nullptr; - FILE *fd = nullptr; - char fileName[256] = {0}, cl_name[128] = {0}; - char deviceName[1024]; - clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, - sizeof(deviceName), deviceName, nullptr); - CHECK_OPENCL(clStatus, "clGetDeviceInfo"); - str = (char *)strstr(clFileName, (char *)".cl"); - memcpy(cl_name, clFileName, str - clFileName); - cl_name[str - clFileName] = '\0'; - sprintf(fileName, "%s-%s.bin", cl_name, deviceName); - legalizeFileName(fileName); - fd = fopen(fileName, "rb"); - status = (fd != nullptr) ? 1 : 0; - if (fd != nullptr) { - *fhandle = fd; - } - return status; - -} -int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ) -{ - int i; - for ( i = 0; i < gpuEnvCached->mnFileCount; i++ ) - { - if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 ) - { - if (gpuEnvCached->mpArryPrograms[i] != nullptr) { - return 1; - } - } - } - - return 0; -} -int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ) -{ - FILE *output = nullptr; +int OpenclDevice::WriteBinaryToFile(const char* fileName, const char* birary, + size_t numBytes) { + FILE* output = nullptr; output = fopen(fileName, "wb"); if (output == nullptr) { return 0; - } + } - fwrite( birary, sizeof(char), numBytes, output ); - fclose( output ); - - return 1; + fwrite(birary, sizeof(char), numBytes, output); + fclose(output); + return 1; } -int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName ) -{ - unsigned int i = 0; - cl_int clStatus; - size_t *binarySizes; - cl_uint numDevices; - cl_device_id *mpArryDevsID; - char *str = nullptr; +int OpenclDevice::GeneratBinFromKernelSource(cl_program program, + const char* clFileName) { + unsigned int i = 0; + cl_int clStatus; + size_t* binarySizes; + cl_uint numDevices; + cl_device_id* mpArryDevsID; + char* str = nullptr; - clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, - sizeof(numDevices), &numDevices, nullptr); - CHECK_OPENCL( clStatus, "clGetProgramInfo" ); + clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, + sizeof(numDevices), &numDevices, nullptr); + CHECK_OPENCL(clStatus, "clGetProgramInfo"); - mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); - if (mpArryDevsID == nullptr) { - return 0; - } - /* grab the handles to all of the devices in the program. */ - clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES, - sizeof(cl_device_id) * numDevices, mpArryDevsID, - nullptr); - CHECK_OPENCL( clStatus, "clGetProgramInfo" ); + mpArryDevsID = (cl_device_id*)malloc(sizeof(cl_device_id) * numDevices); + if (mpArryDevsID == nullptr) { + return 0; + } + /* grab the handles to all of the devices in the program. */ + clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES, + sizeof(cl_device_id) * numDevices, mpArryDevsID, + nullptr); + CHECK_OPENCL(clStatus, "clGetProgramInfo"); - /* figure out the sizes of each of the binaries. */ - binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices ); + /* figure out the sizes of each of the binaries. */ + binarySizes = (size_t*)malloc(sizeof(size_t) * numDevices); - clStatus = - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, - sizeof(size_t) * numDevices, binarySizes, nullptr); - CHECK_OPENCL( clStatus, "clGetProgramInfo" ); + clStatus = + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, + sizeof(size_t) * numDevices, binarySizes, nullptr); + CHECK_OPENCL(clStatus, "clGetProgramInfo"); - /* copy over all of the generated binaries. */ - std::vector binaries(numDevices); + /* copy over all of the generated binaries. */ + std::vector binaries(numDevices); - for ( i = 0; i < numDevices; i++ ) - { - if ( binarySizes[i] != 0 ) - { - binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] ); - if (binaries[i] == nullptr) { - return 0; - } - } - else - { - binaries[i] = nullptr; - } - } - - clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES, - sizeof(char *) * numDevices, - &binaries[0], nullptr); - CHECK_OPENCL(clStatus,"clGetProgramInfo"); - - /* dump out each binary into its own separate file. */ - for ( i = 0; i < numDevices; i++ ) - { - char fileName[256] = { 0 }, cl_name[128] = { 0 }; - - if ( binarySizes[i] != 0 ) - { - char deviceName[1024]; - clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, - sizeof(deviceName), deviceName, nullptr); - CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); - - str = (char*) strstr( clFileName, (char*) ".cl" ); - memcpy( cl_name, clFileName, str - clFileName ); - cl_name[str - clFileName] = '\0'; - sprintf( fileName, "%s-%s.bin", cl_name, deviceName ); - legalizeFileName(fileName); - if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) ) - { - printf("[OD] write binary[%s] failed\n", fileName); - return 0; - } //else - printf("[OD] write binary[%s] successfully\n", fileName); - } - } - - // Release all resources and memory - for ( i = 0; i < numDevices; i++ ) - { - free(binaries[i]); - } - - free(binarySizes); - binarySizes = nullptr; - - free(mpArryDevsID); - mpArryDevsID = nullptr; - - return 1; -} - -int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) -{ -//PERF_COUNT_START("CompileKernelFile") - cl_int clStatus = 0; - size_t length; - char *buildLog = nullptr; - const char *source; - size_t source_size[1]; - int b_error, binary_status, binaryExisted, idx; - cl_uint numDevices; - FILE *fd, *fd1; - const char* filename = "kernel.cl"; - //fprintf(stderr, "[OD] CompileKernelFile ... \n"); - if ( CachedOfKernerPrg(gpuInfo, filename) == 1 ) - { - return 1; - } - - idx = gpuInfo->mnFileCount; - - source = kernel_src; - - source_size[0] = strlen( source ); - binaryExisted = 0; - binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark -//PERF_COUNT_SUB("BinaryGenerated") - if ( binaryExisted == 1 ) - { - clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, - sizeof(numDevices), &numDevices, nullptr); - CHECK_OPENCL(clStatus, "clGetContextInfo"); - - std::vector mpArryDevsID(numDevices); -//PERF_COUNT_SUB("get numDevices") - b_error = 0; - length = 0; - b_error |= fseek( fd, 0, SEEK_END ) < 0; - b_error |= ( length = ftell(fd) ) <= 0; - b_error |= fseek( fd, 0, SEEK_SET ) < 0; - if ( b_error ) - { - return 0; - } - - std::vector binary(length + 2); - - memset(&binary[0], 0, length + 2); - b_error |= fread(&binary[0], 1, length, fd) != length; - - fclose( fd ); -//PERF_COUNT_SUB("read file") - fd = nullptr; - // grab the handles to all of the devices in the context. - clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES, - sizeof(cl_device_id) * numDevices, - &mpArryDevsID[0], nullptr); - CHECK_OPENCL( clStatus, "clGetContextInfo" ); -//PERF_COUNT_SUB("get devices") - //fprintf(stderr, "[OD] Create kernel from binary\n"); - const uint8_t *c_binary = &binary[0]; - gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices, - &mpArryDevsID[0], &length, &c_binary, - &binary_status, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" ); -//PERF_COUNT_SUB("clCreateProgramWithBinary") - // PERF_COUNT_SUB("binaryExisted") - } - else - { - // create a CL program using the kernel source - //fprintf(stderr, "[OD] Create kernel from source\n"); - gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source, - source_size, &clStatus); - CHECK_OPENCL( clStatus, "clCreateProgramWithSource" ); -//PERF_COUNT_SUB("!binaryExisted") - } - - if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) { - return 0; - } - - //char options[512]; - // create a cl program executable for all the devices specified - //printf("[OD] BuildProgram.\n"); -PERF_COUNT_START("OD::CompileKernel::clBuildProgram") - if (!gpuInfo->mnIsUserCreated) - { - clStatus = - clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, - buildOption, nullptr, nullptr); - // PERF_COUNT_SUB("clBuildProgram notUserCreated") - } - else - { - clStatus = - clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), - buildOption, nullptr, nullptr); - // PERF_COUNT_SUB("clBuildProgram isUserCreated") - } -PERF_COUNT_END - if ( clStatus != CL_SUCCESS ) - { - printf ("BuildProgram error!\n"); - if ( !gpuInfo->mnIsUserCreated ) - { - clStatus = clGetProgramBuildInfo( - gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], - CL_PROGRAM_BUILD_LOG, 0, nullptr, &length); - } - else - { - clStatus = clGetProgramBuildInfo( - gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, - CL_PROGRAM_BUILD_LOG, 0, nullptr, &length); - } - if ( clStatus != CL_SUCCESS ) - { - printf("opencl create build log fail\n"); - return 0; - } - buildLog = (char*) malloc( length ); - if (buildLog == (char *)nullptr) { - return 0; - } - if ( !gpuInfo->mnIsUserCreated ) - { - clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], - CL_PROGRAM_BUILD_LOG, length, buildLog, &length ); - } - else - { - clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, - CL_PROGRAM_BUILD_LOG, length, buildLog, &length ); - } - if ( clStatus != CL_SUCCESS ) - { - printf("opencl program build info fail\n"); - return 0; - } - - fd1 = fopen( "kernel-build.log", "w+" ); - if (fd1 != nullptr) { - fwrite(buildLog, sizeof(char), length, fd1); - fclose(fd1); - } - - free( buildLog ); -//PERF_COUNT_SUB("build error log") + for (i = 0; i < numDevices; i++) { + if (binarySizes[i] != 0) { + binaries[i] = (char*)malloc(sizeof(char) * binarySizes[i]); + if (binaries[i] == nullptr) { return 0; + } + } else { + binaries[i] = nullptr; } + } - strcpy( gpuInfo->mArryKnelSrcFile[idx], filename ); -//PERF_COUNT_SUB("strcpy") - if ( binaryExisted == 0 ) { - GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename ); - PERF_COUNT_SUB("GenerateBinFromKernelSource") + clStatus = + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char*) * numDevices, + &binaries[0], nullptr); + CHECK_OPENCL(clStatus, "clGetProgramInfo"); + + /* dump out each binary into its own separate file. */ + for (i = 0; i < numDevices; i++) { + char fileName[256] = {0}, cl_name[128] = {0}; + + if (binarySizes[i] != 0) { + char deviceName[1024]; + clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, + sizeof(deviceName), deviceName, nullptr); + CHECK_OPENCL(clStatus, "clGetDeviceInfo"); + + str = (char*)strstr(clFileName, (char*)".cl"); + memcpy(cl_name, clFileName, str - clFileName); + cl_name[str - clFileName] = '\0'; + sprintf(fileName, "%s-%s.bin", cl_name, deviceName); + legalizeFileName(fileName); + if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) { + printf("[OD] write binary[%s] failed\n", fileName); + return 0; + } // else + printf("[OD] write binary[%s] successfully\n", fileName); } + } - gpuInfo->mnFileCount += 1; -//PERF_COUNT_END + // Release all resources and memory + for (i = 0; i < numDevices; i++) { + free(binaries[i]); + } + + free(binarySizes); + binarySizes = nullptr; + + free(mpArryDevsID); + mpArryDevsID = nullptr; + + return 1; +} + +int OpenclDevice::CompileKernelFile(GPUEnv* gpuInfo, const char* buildOption) { + // PERF_COUNT_START("CompileKernelFile") + cl_int clStatus = 0; + size_t length; + char* buildLog = nullptr; + const char* source; + size_t source_size[1]; + int b_error, binary_status, binaryExisted, idx; + cl_uint numDevices; + FILE *fd, *fd1; + const char* filename = "kernel.cl"; + // fprintf(stderr, "[OD] CompileKernelFile ... \n"); + if (CachedOfKernerPrg(gpuInfo, filename) == 1) { return 1; + } + + idx = gpuInfo->mnFileCount; + + source = kernel_src; + + source_size[0] = strlen(source); + binaryExisted = 0; + binaryExisted = BinaryGenerated( + filename, &fd); // don't check for binary during microbenchmark + // PERF_COUNT_SUB("BinaryGenerated") + if (binaryExisted == 1) { + clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, + sizeof(numDevices), &numDevices, nullptr); + CHECK_OPENCL(clStatus, "clGetContextInfo"); + + std::vector mpArryDevsID(numDevices); + // PERF_COUNT_SUB("get numDevices") + b_error = 0; + length = 0; + b_error |= fseek(fd, 0, SEEK_END) < 0; + b_error |= (length = ftell(fd)) <= 0; + b_error |= fseek(fd, 0, SEEK_SET) < 0; + if (b_error) { + return 0; + } + + std::vector binary(length + 2); + + memset(&binary[0], 0, length + 2); + b_error |= fread(&binary[0], 1, length, fd) != length; + + fclose(fd); + // PERF_COUNT_SUB("read file") + fd = nullptr; + // grab the handles to all of the devices in the context. + clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES, + sizeof(cl_device_id) * numDevices, + &mpArryDevsID[0], nullptr); + CHECK_OPENCL(clStatus, "clGetContextInfo"); + // PERF_COUNT_SUB("get devices") + // fprintf(stderr, "[OD] Create kernel from binary\n"); + const uint8_t* c_binary = &binary[0]; + gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( + gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length, &c_binary, + &binary_status, &clStatus); + CHECK_OPENCL(clStatus, "clCreateProgramWithBinary"); + // PERF_COUNT_SUB("clCreateProgramWithBinary") + // PERF_COUNT_SUB("binaryExisted") + } else { + // create a CL program using the kernel source + // fprintf(stderr, "[OD] Create kernel from source\n"); + gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( + gpuInfo->mpContext, 1, &source, source_size, &clStatus); + CHECK_OPENCL(clStatus, "clCreateProgramWithSource"); + // PERF_COUNT_SUB("!binaryExisted") + } + + if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) { + return 0; + } + + // char options[512]; + // create a cl program executable for all the devices specified + // printf("[OD] BuildProgram.\n"); + PERF_COUNT_START("OD::CompileKernel::clBuildProgram") + if (!gpuInfo->mnIsUserCreated) { + clStatus = + clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, + buildOption, nullptr, nullptr); + // PERF_COUNT_SUB("clBuildProgram notUserCreated") + } else { + clStatus = + clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), + buildOption, nullptr, nullptr); + // PERF_COUNT_SUB("clBuildProgram isUserCreated") + } + PERF_COUNT_END + if (clStatus != CL_SUCCESS) { + printf("BuildProgram error!\n"); + if (!gpuInfo->mnIsUserCreated) { + clStatus = clGetProgramBuildInfo( + gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], + CL_PROGRAM_BUILD_LOG, 0, nullptr, &length); + } else { + clStatus = + clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, + CL_PROGRAM_BUILD_LOG, 0, nullptr, &length); + } + if (clStatus != CL_SUCCESS) { + printf("opencl create build log fail\n"); + return 0; + } + buildLog = (char*)malloc(length); + if (buildLog == (char*)nullptr) { + return 0; + } + if (!gpuInfo->mnIsUserCreated) { + clStatus = clGetProgramBuildInfo( + gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], + CL_PROGRAM_BUILD_LOG, length, buildLog, &length); + } else { + clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], + gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG, + length, buildLog, &length); + } + if (clStatus != CL_SUCCESS) { + printf("opencl program build info fail\n"); + return 0; + } + + fd1 = fopen("kernel-build.log", "w+"); + if (fd1 != nullptr) { + fwrite(buildLog, sizeof(char), length, fd1); + fclose(fd1); + } + + free(buildLog); + // PERF_COUNT_SUB("build error log") + return 0; + } + + strcpy(gpuInfo->mArryKnelSrcFile[idx], filename); + // PERF_COUNT_SUB("strcpy") + if (binaryExisted == 0) { + GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename); + PERF_COUNT_SUB("GenerateBinFromKernelSource") + } + + gpuInfo->mnFileCount += 1; + // PERF_COUNT_END + return 1; } -l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line) -{ -PERF_COUNT_START("pixReadFromTiffKernel") - cl_int clStatus; - KernelEnv rEnv; - size_t globalThreads[2]; - size_t localThreads[2]; - int gsize; - cl_mem valuesCl; - cl_mem outputCl; +l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32* tiffdata, l_int32 w, + l_int32 h, l_int32 wpl, + l_uint32* line) { + PERF_COUNT_START("pixReadFromTiffKernel") + cl_int clStatus; + KernelEnv rEnv; + size_t globalThreads[2]; + size_t localThreads[2]; + int gsize; + cl_mem valuesCl; + cl_mem outputCl; - //global and local work dimensions for Horizontal pass - gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; - globalThreads[0] = gsize; - gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; - globalThreads[1] = gsize; - localThreads[0] = GROUPSIZE_X; - localThreads[1] = GROUPSIZE_Y; + // global and local work dimensions for Horizontal pass + gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X; + globalThreads[0] = gsize; + gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y; + globalThreads[1] = gsize; + localThreads[0] = GROUPSIZE_X; + localThreads[1] = GROUPSIZE_Y; - SetKernelEnv( &rEnv ); + SetKernelEnv(&rEnv); - l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32)); - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus ); - CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel"); + l_uint32* pResult = (l_uint32*)malloc(w * h * sizeof(l_uint32)); + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "composeRGBPixel", &clStatus); + CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel"); - //Allocate input and output OCL buffers - valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); - outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); + // Allocate input and output OCL buffers + valuesCl = allocateZeroCopyBuffer( + rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); + outputCl = allocateZeroCopyBuffer( + rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); - //Kernel arguments - clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl); - CHECK_OPENCL( clStatus, "clSetKernelArg"); - clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w); - CHECK_OPENCL( clStatus, "clSetKernelArg" ); - clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h); - CHECK_OPENCL( clStatus, "clSetKernelArg" ); - clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); - CHECK_OPENCL( clStatus, "clSetKernelArg" ); - clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl); - CHECK_OPENCL( clStatus, "clSetKernelArg"); + // Kernel arguments + clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl); + CHECK_OPENCL(clStatus, "clSetKernelArg"); + clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w); + CHECK_OPENCL(clStatus, "clSetKernelArg"); + clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h); + CHECK_OPENCL(clStatus, "clSetKernelArg"); + clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); + CHECK_OPENCL(clStatus, "clSetKernelArg"); + clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl); + CHECK_OPENCL(clStatus, "clSetKernelArg"); - //Kernel enqueue -PERF_COUNT_SUB("before") -clStatus = - clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, - globalThreads, localThreads, 0, nullptr, nullptr); -CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel"); + // Kernel enqueue + PERF_COUNT_SUB("before") + clStatus = + clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, + globalThreads, localThreads, 0, nullptr, nullptr); + CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel"); -/* map results back from gpu */ -void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, - 0, w * h * sizeof(l_uint32), 0, nullptr, nullptr, - &clStatus); -CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl"); -clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr); + /* map results back from gpu */ + void* ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, + CL_MAP_READ, 0, w * h * sizeof(l_uint32), 0, + nullptr, nullptr, &clStatus); + CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl"); + clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr); -// Sync -clFinish(rEnv.mpkCmdQueue); -PERF_COUNT_SUB("kernel & map") -PERF_COUNT_END - return pResult; + // Sync + clFinish(rEnv.mpkCmdQueue); + PERF_COUNT_SUB("kernel & map") + PERF_COUNT_END + return pResult; } -//Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels +// Morphology Dilate operation for 5x5 structuring element. Invokes the relevant +// OpenCL kernels static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) { size_t globalThreads[2]; cl_mem pixtemp; @@ -1286,7 +1246,8 @@ static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) { return status; } -//Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels +// Morphology Erode operation for 5x5 structuring element. Invokes the relevant +// OpenCL kernels static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) { size_t globalThreads[2]; cl_mem pixtemp; @@ -1348,11 +1309,11 @@ static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) { return status; } -//Morphology Dilate operation. Invokes the relevant OpenCL kernels +// Morphology Dilate operation. Invokes the relevant OpenCL kernels static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { l_int32 xp, yp, xn, yn; - SEL *sel; + SEL* sel; size_t globalThreads[2]; cl_mem pixtemp; cl_int status = 0; @@ -1444,11 +1405,11 @@ static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, return status; } -//Morphology Erode operation. Invokes the relevant OpenCL kernels +// Morphology Erode operation. Invokes the relevant OpenCL kernels static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) { l_int32 xp, yp, xn, yn; - SEL *sel; + SEL* sel; size_t globalThreads[2]; size_t localThreads[2]; cl_mem pixtemp; @@ -1552,7 +1513,7 @@ static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, return status; } -//Morphology Open operation. Invokes the relevant OpenCL kernels +// Morphology Open operation. Invokes the relevant OpenCL kernels static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { cl_int status; cl_mem pixtemp; @@ -1569,7 +1530,7 @@ static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { return status; } -//Morphology Close operation. Invokes the relevant OpenCL kernels +// Morphology Close operation. Invokes the relevant OpenCL kernels static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { cl_int status; cl_mem pixtemp; @@ -1586,7 +1547,7 @@ static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { return status; } -//output = buffer1 & ~(buffer2) +// output = buffer1 & ~(buffer2) static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = nullptr) { cl_int status; @@ -1624,9 +1585,10 @@ static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, } // OpenCL implementation of Get Lines from pix function -//Note: Assumes the source and dest opencl buffer are initialized. No check done -void OpenclDevice::pixGetLinesCL(Pix *pixd, Pix *pixs, Pix **pix_vline, - Pix **pix_hline, Pix **pixClosed, +// Note: Assumes the source and dest opencl buffer are initialized. No check +// done +void OpenclDevice::pixGetLinesCL(Pix* pixd, Pix* pixs, Pix** pix_vline, + Pix** pix_hline, Pix** pixClosed, bool getpixClosed, l_int32 close_hsize, l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize, l_int32 line_hsize, @@ -1706,12 +1668,12 @@ void OpenclDevice::pixGetLinesCL(Pix *pixd, Pix *pixs, Pix **pix_vline, * histogramAllChannels is laid out as all channel 0, then all channel 1... * only supports 1 or 4 channels (bytes_per_pixel) ************************************************************************/ -int OpenclDevice::HistogramRectOCL(unsigned char *imageData, +int OpenclDevice::HistogramRectOCL(unsigned char* imageData, int bytes_per_pixel, int bytes_per_line, int left, // always 0 int top, // always 0 int width, int height, int kHistogramSize, - int *histogramAllChannels) { + int* histogramAllChannels) { PERF_COUNT_START("HistogramRectOCL") cl_int clStatus; int retVal = 0; @@ -1762,7 +1724,7 @@ int OpenclDevice::HistogramRectOCL(unsigned char *imageData, CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer"); /* atomic sync buffer */ - int *zeroBuffer = new int[1]; + int* zeroBuffer = new int[1]; zeroBuffer[0] = 0; cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, @@ -1781,87 +1743,92 @@ int OpenclDevice::HistogramRectOCL(unsigned char *imageData, CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction"); } else { - histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus ); - CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels"); + histKern.mpkKernel = clCreateKernel( + histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus); + CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannels"); - histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus ); - CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction"); - } + histRedKern.mpkKernel = + clCreateKernel(histRedKern.mpkProgram, + "kernel_HistogramRectAllChannelsReduction", &clStatus); + CHECK_OPENCL(clStatus, + "clCreateKernel kernel_HistogramRectAllChannelsReduction"); + } - void *ptr; + void* ptr; - //Initialize tmpHistogramBuffer buffer - ptr = clEnqueueMapBuffer( - histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, - tmpHistogramBins * sizeof(cl_uint), 0, nullptr, nullptr, &clStatus); - CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer"); + // Initialize tmpHistogramBuffer buffer + ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, + CL_MAP_WRITE, 0, tmpHistogramBins * sizeof(cl_uint), + 0, nullptr, nullptr, &clStatus); + CHECK_OPENCL(clStatus, "clEnqueueMapBuffer tmpHistogramBuffer"); - memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint)); - clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, - nullptr, nullptr); + memset(ptr, 0, tmpHistogramBins * sizeof(cl_uint)); + clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, + nullptr, nullptr); - /* set kernel 1 arguments */ - clStatus = - clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer); - CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); - cl_uint numPixels = width*height; - clStatus = - clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels); - CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" ); - clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem), - &tmpHistogramBuffer); - CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); + /* set kernel 1 arguments */ + clStatus = + clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer); + CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer"); + cl_uint numPixels = width * height; + clStatus = clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels); + CHECK_OPENCL(clStatus, "clSetKernelArg numPixels"); + clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem), + &tmpHistogramBuffer); + CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer"); - /* set kernel 2 arguments */ - int n = numThreads/bytes_per_pixel; - clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n); - CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); - clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem), - &tmpHistogramBuffer); - CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); - clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem), - &histogramBuffer); - CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer"); + /* set kernel 2 arguments */ + int n = numThreads / bytes_per_pixel; + clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n); + CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer"); + clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem), + &tmpHistogramBuffer); + CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer"); + clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem), + &histogramBuffer); + CHECK_OPENCL(clStatus, "clSetKernelArg histogramBuffer"); - /* launch histogram */ -PERF_COUNT_SUB("before") -clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1, - nullptr, global_work_size, local_work_size, 0, - nullptr, nullptr); -CHECK_OPENCL(clStatus, - "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels"); -clFinish(histKern.mpkCmdQueue); -if (clStatus != 0) { - retVal = -1; - } - /* launch histogram */ - clStatus = clEnqueueNDRangeKernel( - histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr, - red_global_work_size, local_work_size, 0, nullptr, nullptr); - CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" ); - clFinish( histRedKern.mpkCmdQueue ); - if (clStatus != 0) { - retVal = -1; - } - PERF_COUNT_SUB("redKernel") + /* launch histogram */ + PERF_COUNT_SUB("before") + clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1, + nullptr, global_work_size, local_work_size, + 0, nullptr, nullptr); + CHECK_OPENCL(clStatus, + "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels"); + clFinish(histKern.mpkCmdQueue); + if (clStatus != 0) { + retVal = -1; + } + /* launch histogram */ + clStatus = clEnqueueNDRangeKernel( + histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr, + red_global_work_size, local_work_size, 0, nullptr, nullptr); + CHECK_OPENCL( + clStatus, + "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction"); + clFinish(histRedKern.mpkCmdQueue); + if (clStatus != 0) { + retVal = -1; + } + PERF_COUNT_SUB("redKernel") - /* map results back from gpu */ - ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, - CL_MAP_READ, 0, - kHistogramSize * bytes_per_pixel * sizeof(int), 0, - nullptr, nullptr, &clStatus); - CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); - if (clStatus != 0) { - retVal = -1; - } - clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, - nullptr, nullptr); + /* map results back from gpu */ + ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, + CL_MAP_READ, 0, + kHistogramSize * bytes_per_pixel * sizeof(int), 0, + nullptr, nullptr, &clStatus); + CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer"); + if (clStatus != 0) { + retVal = -1; + } + clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, + nullptr, nullptr); - clReleaseMemObject(histogramBuffer); - clReleaseMemObject(imageBuffer); -PERF_COUNT_SUB("after") -PERF_COUNT_END -return retVal; + clReleaseMemObject(histogramBuffer); + clReleaseMemObject(imageBuffer); + PERF_COUNT_SUB("after") + PERF_COUNT_END + return retVal; } /************************************************************************* @@ -1869,16 +1836,16 @@ return retVal; * from the class, using thresholds/hi_values to the output IMAGE. * only supports 1 or 4 channels ************************************************************************/ -int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData, +int OpenclDevice::ThresholdRectToPixOCL(unsigned char* imageData, int bytes_per_pixel, int bytes_per_line, - int *thresholds, int *hi_values, - Pix **pix, int height, int width, + int* thresholds, int* hi_values, + Pix** pix, int height, int width, int top, int left) { PERF_COUNT_START("ThresholdRectToPixOCL") int retVal = 0; /* create pix result buffer */ *pix = pixCreate(width, height, 1); - uint32_t *pixData = pixGetData(*pix); + uint32_t* pixData = pixGetData(*pix); int wpl = pixGetWpl(*pix); int pixSize = wpl * height * sizeof(uint32_t); // number of pixels @@ -1966,7 +1933,7 @@ int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData, retVal = -1; } /* map results back from gpu */ - void *ptr = + void* ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, nullptr, nullptr, &clStatus); CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer"); @@ -1982,22 +1949,20 @@ int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData, return retVal; } - - /****************************************************************************** * Data Types for Device Selection *****************************************************************************/ typedef struct _TessScoreEvaluationInputData { - int height; - int width; - int numChannels; - unsigned char *imageData; - Pix *pix; + int height; + int width; + int numChannels; + unsigned char* imageData; + Pix* pix; } TessScoreEvaluationInputData; static void populateTessScoreEvaluationInputData( - TessScoreEvaluationInputData *input) { + TessScoreEvaluationInputData* input) { srand(1); // 8.5x11 inches @ 300dpi rounded to clean multiples int height = 3328; // %256 @@ -2009,7 +1974,7 @@ static void populateTessScoreEvaluationInputData( unsigned char(*imageData4)[4] = (unsigned char(*)[4])malloc( height * width * numChannels * sizeof(unsigned char)); // new unsigned char[4][height*width]; - input->imageData = (unsigned char *)&imageData4[0]; + input->imageData = (unsigned char*)&imageData4[0]; // zero out image unsigned char pixelWhite[4] = {0, 0, 0, 255}; @@ -2084,197 +2049,206 @@ static void populateTessScoreEvaluationInputData( } typedef struct _TessDeviceScore { - float time; // small time means faster device - bool clError; // were there any opencl errors - bool valid; // was the correct response generated + float time; // small time means faster device + bool clError; // were there any opencl errors + bool valid; // was the correct response generated } TessDeviceScore; /****************************************************************************** * Micro Benchmarks for Device Selection *****************************************************************************/ -static double composeRGBPixelMicroBench(GPUEnv *env, +static double composeRGBPixelMicroBench(GPUEnv* env, TessScoreEvaluationInputData input, ds_device_type type) { double time = 0; #if ON_WINDOWS - LARGE_INTEGER freq, time_funct_start, time_funct_end; - QueryPerformanceFrequency(&freq); + LARGE_INTEGER freq, time_funct_start, time_funct_end; + QueryPerformanceFrequency(&freq); #elif ON_APPLE - mach_timebase_info_data_t info = {0, 0}; - mach_timebase_info(&info); - long long start, stop; + mach_timebase_info_data_t info = {0, 0}; + mach_timebase_info(&info); + long long start, stop; #else - timespec time_funct_start, time_funct_end; + timespec time_funct_start, time_funct_end; #endif - // input data - l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload + // input data + l_uint32* tiffdata = + (l_uint32*)input.imageData; // same size and random data; data doesn't + // change workload - // function call - if (type == DS_DEVICE_OPENCL_DEVICE) { + // function call + if (type == DS_DEVICE_OPENCL_DEVICE) { #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_start); + QueryPerformanceCounter(&time_funct_start); #elif ON_APPLE - start = mach_absolute_time(); + start = mach_absolute_time(); #else - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); + clock_gettime(CLOCK_MONOTONIC, &time_funct_start); #endif - OpenclDevice::gpuEnv = *env; - int wpl = pixGetWpl(input.pix); - OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, - wpl, nullptr); + OpenclDevice::gpuEnv = *env; + int wpl = pixGetWpl(input.pix); + OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, + wpl, nullptr); #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_end); - time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); + QueryPerformanceCounter(&time_funct_end); + time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / + (double)(freq.QuadPart); #elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; #else - clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); - time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; + clock_gettime(CLOCK_MONOTONIC, &time_funct_end); + time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + + (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; #endif - } else { + } else { #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_start); + QueryPerformanceCounter(&time_funct_start); #elif ON_APPLE - start = mach_absolute_time(); + start = mach_absolute_time(); #else - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); + clock_gettime(CLOCK_MONOTONIC, &time_funct_start); #endif - Pix *pix = pixCreate(input.width, input.height, 32); - l_uint32 *pixData = pixGetData(pix); - int i, j; - int idx = 0; - for (i = 0; i < input.height ; i++) { - for (j = 0; j < input.width; j++) { - l_uint32 tiffword = tiffdata[i * input.width + j]; - l_int32 rval = ((tiffword) & 0xff); - l_int32 gval = (((tiffword) >> 8) & 0xff); - l_int32 bval = (((tiffword) >> 16) & 0xff); - l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8); - pixData[idx] = value; - idx++; - } - } -#if ON_WINDOWS - QueryPerformanceCounter(&time_funct_end); - time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; -#else - clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); - time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; -#endif - pixDestroy(&pix); + Pix* pix = pixCreate(input.width, input.height, 32); + l_uint32* pixData = pixGetData(pix); + int i, j; + int idx = 0; + for (i = 0; i < input.height; i++) { + for (j = 0; j < input.width; j++) { + l_uint32 tiffword = tiffdata[i * input.width + j]; + l_int32 rval = ((tiffword)&0xff); + l_int32 gval = (((tiffword) >> 8) & 0xff); + l_int32 bval = (((tiffword) >> 16) & 0xff); + l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8); + pixData[idx] = value; + idx++; + } } +#if ON_WINDOWS + QueryPerformanceCounter(&time_funct_end); + time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / + (double)(freq.QuadPart); +#elif ON_APPLE + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; +#else + clock_gettime(CLOCK_MONOTONIC, &time_funct_end); + time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + + (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; +#endif + pixDestroy(&pix); + } + // cleanup - // cleanup - - return time; + return time; } -static double histogramRectMicroBench(GPUEnv *env, +static double histogramRectMicroBench(GPUEnv* env, TessScoreEvaluationInputData input, ds_device_type type) { double time; #if ON_WINDOWS - LARGE_INTEGER freq, time_funct_start, time_funct_end; - QueryPerformanceFrequency(&freq); + LARGE_INTEGER freq, time_funct_start, time_funct_end; + QueryPerformanceFrequency(&freq); #elif ON_APPLE - mach_timebase_info_data_t info = {0, 0}; - mach_timebase_info(&info); - long long start, stop; + mach_timebase_info_data_t info = {0, 0}; + mach_timebase_info(&info); + long long start, stop; #else - timespec time_funct_start, time_funct_end; + timespec time_funct_start, time_funct_end; #endif - int left = 0; - int top = 0; - int kHistogramSize = 256; - int bytes_per_line = input.width*input.numChannels; - int *histogramAllChannels = new int[kHistogramSize*input.numChannels]; - // function call - if (type == DS_DEVICE_OPENCL_DEVICE) { + int left = 0; + int top = 0; + int kHistogramSize = 256; + int bytes_per_line = input.width * input.numChannels; + int* histogramAllChannels = new int[kHistogramSize * input.numChannels]; + // function call + if (type == DS_DEVICE_OPENCL_DEVICE) { #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_start); + QueryPerformanceCounter(&time_funct_start); #elif ON_APPLE - start = mach_absolute_time(); + start = mach_absolute_time(); #else - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); + clock_gettime(CLOCK_MONOTONIC, &time_funct_start); #endif - OpenclDevice::gpuEnv = *env; - int retVal = OpenclDevice::HistogramRectOCL( - input.imageData, input.numChannels, bytes_per_line, top, left, - input.width, input.height, kHistogramSize, histogramAllChannels); + OpenclDevice::gpuEnv = *env; + int retVal = OpenclDevice::HistogramRectOCL( + input.imageData, input.numChannels, bytes_per_line, top, left, + input.width, input.height, kHistogramSize, histogramAllChannels); #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_end); - time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); + QueryPerformanceCounter(&time_funct_end); + time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / + (double)(freq.QuadPart); #elif ON_APPLE - stop = mach_absolute_time(); - if (retVal == 0) { - time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; - } else { - time = FLT_MAX; - } -#else - clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); - time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; -#endif + stop = mach_absolute_time(); + if (retVal == 0) { + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; } else { - int *histogram = new int[kHistogramSize]; -#if ON_WINDOWS - QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); -#else - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); -#endif - for (int ch = 0; ch < input.numChannels; ++ch) { - tesseract::HistogramRect(input.pix, input.numChannels, left, top, - input.width, input.height, histogram); - } -#if ON_WINDOWS - QueryPerformanceCounter(&time_funct_end); - time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; -#else - clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); - time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; -#endif - delete[] histogram; + time = FLT_MAX; } +#else + clock_gettime(CLOCK_MONOTONIC, &time_funct_end); + time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + + (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; +#endif + } else { + int* histogram = new int[kHistogramSize]; +#if ON_WINDOWS + QueryPerformanceCounter(&time_funct_start); +#elif ON_APPLE + start = mach_absolute_time(); +#else + clock_gettime(CLOCK_MONOTONIC, &time_funct_start); +#endif + for (int ch = 0; ch < input.numChannels; ++ch) { + tesseract::HistogramRect(input.pix, input.numChannels, left, top, + input.width, input.height, histogram); + } +#if ON_WINDOWS + QueryPerformanceCounter(&time_funct_end); + time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / + (double)(freq.QuadPart); +#elif ON_APPLE + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; +#else + clock_gettime(CLOCK_MONOTONIC, &time_funct_end); + time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + + (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; +#endif + delete[] histogram; + } - // cleanup - delete[] histogramAllChannels; - return time; + // cleanup + delete[] histogramAllChannels; + return time; } -//Reproducing the ThresholdRectToPix native version -static void ThresholdRectToPix_Native(const unsigned char *imagedata, +// Reproducing the ThresholdRectToPix native version +static void ThresholdRectToPix_Native(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, - const int *thresholds, - const int *hi_values, Pix **pix) { + const int* thresholds, + const int* hi_values, Pix** pix) { int top = 0; int left = 0; int width = pixGetWidth(*pix); int height = pixGetHeight(*pix); *pix = pixCreate(width, height, 1); - uint32_t *pixdata = pixGetData(*pix); + uint32_t* pixdata = pixGetData(*pix); int wpl = pixGetWpl(*pix); - const unsigned char* srcdata = imagedata + top * bytes_per_line + - left * bytes_per_pixel; + const unsigned char* srcdata = + imagedata + top * bytes_per_line + left * bytes_per_pixel; for (int y = 0; y < height; ++y) { - const uint8_t *linedata = srcdata; - uint32_t *pixline = pixdata + y * wpl; + const uint8_t* linedata = srcdata; + uint32_t* pixline = pixdata + y * wpl; for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) { bool white_result = true; for (int ch = 0; ch < bytes_per_pixel; ++ch) { @@ -2293,7 +2267,7 @@ static void ThresholdRectToPix_Native(const unsigned char *imagedata, } } -static double thresholdRectToPixMicroBench(GPUEnv *env, +static double thresholdRectToPixMicroBench(GPUEnv* env, TessScoreEvaluationInputData input, ds_device_type type) { double time; @@ -2308,174 +2282,174 @@ static double thresholdRectToPixMicroBench(GPUEnv *env, timespec time_funct_start, time_funct_end; #endif - // input data - unsigned char pixelHi = (unsigned char)255; - int thresholds[4] = { - pixelHi, - pixelHi, - pixelHi, - pixelHi - }; + // input data + unsigned char pixelHi = (unsigned char)255; + int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi}; -//Pix* pix = pixCreate(width, height, 1); - int top = 0; - int left = 0; - int bytes_per_line = input.width*input.numChannels; + // Pix* pix = pixCreate(width, height, 1); + int top = 0; + int left = 0; + int bytes_per_line = input.width * input.numChannels; - // function call - if (type == DS_DEVICE_OPENCL_DEVICE) { + // function call + if (type == DS_DEVICE_OPENCL_DEVICE) { #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_start); + QueryPerformanceCounter(&time_funct_start); #elif ON_APPLE - start = mach_absolute_time(); + start = mach_absolute_time(); #else - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); + clock_gettime(CLOCK_MONOTONIC, &time_funct_start); #endif - OpenclDevice::gpuEnv = *env; - int hi_values[4]; - int retVal = OpenclDevice::ThresholdRectToPixOCL( - input.imageData, input.numChannels, bytes_per_line, thresholds, - hi_values, &input.pix, input.height, input.width, top, left); + OpenclDevice::gpuEnv = *env; + int hi_values[4]; + int retVal = OpenclDevice::ThresholdRectToPixOCL( + input.imageData, input.numChannels, bytes_per_line, thresholds, + hi_values, &input.pix, input.height, input.width, top, left); #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_end); - time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); + QueryPerformanceCounter(&time_funct_end); + time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / + (double)(freq.QuadPart); #elif ON_APPLE - stop = mach_absolute_time(); - if (retVal == 0) { - time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; - ; - } else { - time = FLT_MAX; - } - -#else - clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); - time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; -#endif + stop = mach_absolute_time(); + if (retVal == 0) { + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; + ; } else { - - - tesseract::ImageThresholder thresholder; - thresholder.SetImage( input.pix ); -#if ON_WINDOWS - QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); -#else - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); -#endif - int hi_values[4] = {}; - ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line, - thresholds, hi_values, &input.pix ); - -#if ON_WINDOWS - QueryPerformanceCounter(&time_funct_end); - time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; -#else - clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); - time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; -#endif + time = FLT_MAX; } - return time; +#else + clock_gettime(CLOCK_MONOTONIC, &time_funct_end); + time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + + (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; +#endif + } else { + tesseract::ImageThresholder thresholder; + thresholder.SetImage(input.pix); +#if ON_WINDOWS + QueryPerformanceCounter(&time_funct_start); +#elif ON_APPLE + start = mach_absolute_time(); +#else + clock_gettime(CLOCK_MONOTONIC, &time_funct_start); +#endif + int hi_values[4] = {}; + ThresholdRectToPix_Native(input.imageData, input.numChannels, + bytes_per_line, thresholds, hi_values, + &input.pix); + +#if ON_WINDOWS + QueryPerformanceCounter(&time_funct_end); + time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / + (double)(freq.QuadPart); +#elif ON_APPLE + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; +#else + clock_gettime(CLOCK_MONOTONIC, &time_funct_end); + time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + + (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; +#endif + } + + return time; } -static double getLineMasksMorphMicroBench(GPUEnv *env, +static double getLineMasksMorphMicroBench(GPUEnv* env, TessScoreEvaluationInputData input, ds_device_type type) { double time = 0; #if ON_WINDOWS - LARGE_INTEGER freq, time_funct_start, time_funct_end; - QueryPerformanceFrequency(&freq); + LARGE_INTEGER freq, time_funct_start, time_funct_end; + QueryPerformanceFrequency(&freq); #elif ON_APPLE - mach_timebase_info_data_t info = {0, 0}; - mach_timebase_info(&info); - long long start, stop; + mach_timebase_info_data_t info = {0, 0}; + mach_timebase_info(&info); + long long start, stop; #else - timespec time_funct_start, time_funct_end; + timespec time_funct_start, time_funct_end; #endif - // input data - int resolution = 300; - int wpl = pixGetWpl(input.pix); - int kThinLineFraction = 20; // tess constant - int kMinLineLengthFraction = 4; // tess constant - int max_line_width = resolution / kThinLineFraction; - int min_line_length = resolution / kMinLineLengthFraction; - int closing_brick = max_line_width / 3; + // input data + int resolution = 300; + int wpl = pixGetWpl(input.pix); + int kThinLineFraction = 20; // tess constant + int kMinLineLengthFraction = 4; // tess constant + int max_line_width = resolution / kThinLineFraction; + int min_line_length = resolution / kMinLineLengthFraction; + int closing_brick = max_line_width / 3; - // function call - if (type == DS_DEVICE_OPENCL_DEVICE) { + // function call + if (type == DS_DEVICE_OPENCL_DEVICE) { #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_start); + QueryPerformanceCounter(&time_funct_start); #elif ON_APPLE - start = mach_absolute_time(); + start = mach_absolute_time(); #else - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); + clock_gettime(CLOCK_MONOTONIC, &time_funct_start); #endif - OpenclDevice::gpuEnv = *env; - OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix); - Pix *pix_vline = nullptr, *pix_hline = nullptr, *pix_closed = nullptr; - OpenclDevice::pixGetLinesCL( - nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed, true, - closing_brick, closing_brick, max_line_width, max_line_width, - min_line_length, min_line_length); + OpenclDevice::gpuEnv = *env; + OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix); + Pix *pix_vline = nullptr, *pix_hline = nullptr, *pix_closed = nullptr; + OpenclDevice::pixGetLinesCL(nullptr, input.pix, &pix_vline, &pix_hline, + &pix_closed, true, closing_brick, closing_brick, + max_line_width, max_line_width, min_line_length, + min_line_length); - OpenclDevice::releaseMorphCLBuffers(); + OpenclDevice::releaseMorphCLBuffers(); #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_end); - time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); + QueryPerformanceCounter(&time_funct_end); + time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / + (double)(freq.QuadPart); #elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; #else - clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); - time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; + clock_gettime(CLOCK_MONOTONIC, &time_funct_end); + time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + + (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; #endif - } else { + } else { #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_start); + QueryPerformanceCounter(&time_funct_start); #elif ON_APPLE - start = mach_absolute_time(); + start = mach_absolute_time(); #else - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); + clock_gettime(CLOCK_MONOTONIC, &time_funct_start); #endif - // native serial code - Pix *src_pix = input.pix; - Pix *pix_closed = - pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick); - Pix *pix_solid = - pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width); - Pix *pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid); - pixDestroy(&pix_solid); - Pix *pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length); - Pix *pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1); - pixDestroy(&pix_hollow); + // native serial code + Pix* src_pix = input.pix; + Pix* pix_closed = + pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick); + Pix* pix_solid = + pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width); + Pix* pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid); + pixDestroy(&pix_solid); + Pix* pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length); + Pix* pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1); + pixDestroy(&pix_hollow); #if ON_WINDOWS - QueryPerformanceCounter(&time_funct_end); - time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); + QueryPerformanceCounter(&time_funct_end); + time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / + (double)(freq.QuadPart); #elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; #else - clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); - time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; + clock_gettime(CLOCK_MONOTONIC, &time_funct_end); + time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + + (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; #endif - } + } - return time; + return time; } - - /****************************************************************************** * Device Selection *****************************************************************************/ @@ -2483,8 +2457,8 @@ static double getLineMasksMorphMicroBench(GPUEnv *env, #include "stdlib.h" // encode score object as byte string -static ds_status serializeScore(ds_device *device, void **serializedScore, - unsigned int *serializedScoreSize) { +static ds_status serializeScore(ds_device* device, void** serializedScore, + unsigned int* serializedScoreSize) { *serializedScoreSize = sizeof(TessDeviceScore); *serializedScore = new unsigned char[*serializedScoreSize]; memcpy(*serializedScore, device->score, *serializedScoreSize); @@ -2492,8 +2466,8 @@ static ds_status serializeScore(ds_device *device, void **serializedScore, } // parses byte string and stores in score object -static ds_status deserializeScore(ds_device *device, - const unsigned char *serializedScore, +static ds_status deserializeScore(ds_device* device, + const unsigned char* serializedScore, unsigned int serializedScoreSize) { // check that serializedScoreSize == sizeof(TessDeviceScore); device->score = new TessDeviceScore; @@ -2501,18 +2475,18 @@ static ds_status deserializeScore(ds_device *device, return DS_SUCCESS; } -static ds_status releaseScore(void *score) { - delete (TessDeviceScore *)score; +static ds_status releaseScore(void* score) { + delete (TessDeviceScore*)score; return DS_SUCCESS; } // evaluate devices -static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) { +static ds_status evaluateScoreForDevice(ds_device* device, void* inputData) { // overwrite statuc gpuEnv w/ current device // so native opencl calls can be used; they use static gpuEnv printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); - GPUEnv *env = nullptr; + GPUEnv* env = nullptr; if (device->type == DS_DEVICE_OPENCL_DEVICE) { env = &OpenclDevice::gpuEnv; memset(env, 0, sizeof(*env)); @@ -2524,8 +2498,8 @@ static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) { OpenclDevice::CompileKernelFile(env, ""); } - TessScoreEvaluationInputData *input = - static_cast(inputData); + TessScoreEvaluationInputData* input = + static_cast(inputData); // pixReadTiff double composeRGBPixelTime = @@ -2554,7 +2528,7 @@ static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) { thresholdRectToPixWeight * thresholdRectToPixTime + getLineMasksMorphWeight * getLineMasksMorphTime; device->score = new TessDeviceScore; - ((TessDeviceScore *)device->score)->time = weightedTime; + ((TessDeviceScore*)device->score)->time = weightedTime; printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); @@ -2567,12 +2541,12 @@ static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) { printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight); printf("[DS]%25s: %f\n", "Score", - static_cast(device->score)->time); + static_cast(device->score)->time); return DS_SUCCESS; } // initial call to select device -ds_device OpenclDevice::getDeviceSelection( ) { +ds_device OpenclDevice::getDeviceSelection() { if (!deviceIsSelected) { PERF_COUNT_START("getDeviceSelection") // check if opencl is available at runtime @@ -2581,11 +2555,11 @@ ds_device OpenclDevice::getDeviceSelection( ) { // PERF_COUNT_SUB("LoadOpencl") // setup devices ds_status status; - ds_profile *profile; + ds_profile* profile; status = initDSProfile(&profile, "v0.1"); PERF_COUNT_SUB("initDSProfile") // try reading scores from file - const char *fileName = "tesseract_opencl_profile_devices.dat"; + const char* fileName = "tesseract_opencl_profile_devices.dat"; status = readProfileFromFile(profile, deserializeScore, fileName); if (status != DS_SUCCESS) { // need to run evaluation @@ -2629,7 +2603,7 @@ ds_device OpenclDevice::getDeviceSelection( ) { int bestDeviceIdx = -1; for (unsigned d = 0; d < profile->numDevices; d++) { ds_device device = profile->devices[d]; - TessDeviceScore score = *(TessDeviceScore *)device.score; + TessDeviceScore score = *(TessDeviceScore*)device.score; float time = score.time; printf("[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type, @@ -2648,7 +2622,7 @@ ds_device OpenclDevice::getDeviceSelection( ) { // TODO: call destructor for profile object? bool overridden = false; - char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE"); + char* overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE"); if (overrideDeviceStr != nullptr) { int overrideDeviceIdx = atoi(overrideDeviceStr); if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) { @@ -2693,7 +2667,6 @@ ds_device OpenclDevice::getDeviceSelection( ) { return selectedDevice; } - bool OpenclDevice::selectedDeviceIsOpenCL() { ds_device device = getDeviceSelection(); return (device.type == DS_DEVICE_OPENCL_DEVICE); diff --git a/src/opencl/openclwrapper.h b/src/opencl/openclwrapper.h index 4d2d52dc..7c7b5284 100644 --- a/src/opencl/openclwrapper.h +++ b/src/opencl/openclwrapper.h @@ -22,29 +22,29 @@ #if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || \ defined(__CYGWIN__) || defined(__MINGW32__) #define ON_WINDOWS 1 -#define ON_LINUX 0 -#define ON_APPLE 0 -#define ON_OTHER 0 +#define ON_LINUX 0 +#define ON_APPLE 0 +#define ON_OTHER 0 #define IF_WINDOWS(X) X #define IF_LINUX(X) #define IF_APPLE(X) #define IF_OTHER(X) #define NOT_WINDOWS(X) -#elif defined( __linux__ ) +#elif defined(__linux__) #define ON_WINDOWS 0 -#define ON_LINUX 1 -#define ON_APPLE 0 -#define ON_OTHER 0 +#define ON_LINUX 1 +#define ON_APPLE 0 +#define ON_OTHER 0 #define IF_WINDOWS(X) #define IF_LINUX(X) X #define IF_APPLE(X) #define IF_OTHER(X) #define NOT_WINDOWS(X) X -#elif defined( __APPLE__ ) +#elif defined(__APPLE__) #define ON_WINDOWS 0 -#define ON_LINUX 0 -#define ON_APPLE 1 -#define ON_OTHER 0 +#define ON_LINUX 0 +#define ON_APPLE 1 +#define ON_OTHER 0 #define IF_WINDOWS(X) #define IF_LINUX(X) #define IF_APPLE(X) X @@ -52,9 +52,9 @@ #define NOT_WINDOWS(X) X #else #define ON_WINDOWS 0 -#define ON_LINUX 0 -#define ON_APPLE 0 -#define ON_OTHER 1 +#define ON_LINUX 0 +#define ON_APPLE 0 +#define ON_OTHER 1 #define IF_WINDOWS(X) #define IF_LINUX(X) #define IF_APPLE(X) @@ -72,23 +72,24 @@ * 0 - no reporting * 1 - no reporting * 2 - report total function call time for functions we're tracking - * 3 - optionally report breakdown of function calls (kernel launch, kernel time, data copies) + * 3 - optionally report breakdown of function calls (kernel launch, kernel + *time, data copies) ************************************************************************************/ #define PERF_COUNT_VERBOSE 1 #define PERF_COUNT_REPORT_STR "[%36s], %24s, %11.6f\n" - #if ON_WINDOWS #if PERF_COUNT_VERBOSE >= 2 -#define PERF_COUNT_START(FUNCT_NAME) \ - char *funct_name = FUNCT_NAME; \ - double elapsed_time_sec; \ - LARGE_INTEGER freq, time_funct_start, time_funct_end, time_sub_start, time_sub_end; \ - QueryPerformanceFrequency(&freq); \ - QueryPerformanceCounter(&time_funct_start); \ - time_sub_start = time_funct_start; \ - time_sub_end = time_funct_start; +#define PERF_COUNT_START(FUNCT_NAME) \ + char* funct_name = FUNCT_NAME; \ + double elapsed_time_sec; \ + LARGE_INTEGER freq, time_funct_start, time_funct_end, time_sub_start, \ + time_sub_end; \ + QueryPerformanceFrequency(&freq); \ + QueryPerformanceCounter(&time_funct_start); \ + time_sub_start = time_funct_start; \ + time_sub_end = time_funct_start; #define PERF_COUNT_END \ QueryPerformanceCounter(&time_funct_end); \ @@ -111,18 +112,17 @@ #define PERF_COUNT_SUB(SUB) #endif - // not on windows #else #if PERF_COUNT_VERBOSE >= 2 -#define PERF_COUNT_START(FUNCT_NAME) \ - char *funct_name = FUNCT_NAME; \ - double elapsed_time_sec; \ - timespec time_funct_start, time_funct_end, time_sub_start, time_sub_end; \ - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); \ - time_sub_start = time_funct_start; \ - time_sub_end = time_funct_start; +#define PERF_COUNT_START(FUNCT_NAME) \ + char* funct_name = FUNCT_NAME; \ + double elapsed_time_sec; \ + timespec time_funct_start, time_funct_end, time_sub_start, time_sub_end; \ + clock_gettime(CLOCK_MONOTONIC, &time_funct_start); \ + time_sub_start = time_funct_start; \ + time_sub_end = time_funct_start; #define PERF_COUNT_END \ clock_gettime(CLOCK_MONOTONIC, &time_funct_end); \ @@ -169,124 +169,127 @@ #define GROUPSIZE_HMORX 256 #define GROUPSIZE_HMORY 1 -typedef struct _KernelEnv -{ - cl_context mpkContext; - cl_command_queue mpkCmdQueue; - cl_program mpkProgram; - cl_kernel mpkKernel; - char mckKernelName[150]; +typedef struct _KernelEnv { + cl_context mpkContext; + cl_command_queue mpkCmdQueue; + cl_program mpkProgram; + cl_kernel mpkKernel; + char mckKernelName[150]; } KernelEnv; -typedef struct _OpenCLEnv -{ - cl_platform_id mpOclPlatformID; - cl_context mpOclContext; - cl_device_id mpOclDevsID; - cl_command_queue mpOclCmdQueue; +typedef struct _OpenCLEnv { + cl_platform_id mpOclPlatformID; + cl_context mpOclContext; + cl_device_id mpOclDevsID; + cl_command_queue mpOclCmdQueue; } OpenCLEnv; -typedef int ( *cl_kernel_function )( void **userdata, KernelEnv *kenv ); +typedef int (*cl_kernel_function)(void** userdata, KernelEnv* kenv); -#define CHECK_OPENCL(status,name) \ -if( status != CL_SUCCESS ) \ -{ \ - printf ("OpenCL error code is %d at when %s .\n", status, name); \ -} +#define CHECK_OPENCL(status, name) \ + if (status != CL_SUCCESS) { \ + printf("OpenCL error code is %d at when %s .\n", status, name); \ + } - -typedef struct _GPUEnv -{ - //share vb in all modules in hb library - cl_platform_id mpPlatformID; - cl_device_type mDevType; - cl_context mpContext; - cl_device_id *mpArryDevsID; - cl_device_id mpDevID; - cl_command_queue mpCmdQueue; - cl_kernel mpArryKernels[MAX_CLFILE_NUM]; - cl_program mpArryPrograms[MAX_CLFILE_NUM]; //one program object maps one kernel source file - char mArryKnelSrcFile[MAX_CLFILE_NUM][256], //the max len of kernel file name is 256 - mArrykernelNames[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN + 1]; - cl_kernel_function mpArryKnelFuncs[MAX_CLKERNEL_NUM]; - int mnKernelCount, mnFileCount, // only one kernel file - mnIsUserCreated; // 1: created , 0:no create and needed to create by opencl wrapper - int mnKhrFp64Flag; - int mnAmdFp64Flag; +typedef struct _GPUEnv { + // share vb in all modules in hb library + cl_platform_id mpPlatformID; + cl_device_type mDevType; + cl_context mpContext; + cl_device_id* mpArryDevsID; + cl_device_id mpDevID; + cl_command_queue mpCmdQueue; + cl_kernel mpArryKernels[MAX_CLFILE_NUM]; + cl_program mpArryPrograms[MAX_CLFILE_NUM]; // one program object maps one + // kernel source file + char mArryKnelSrcFile[MAX_CLFILE_NUM] + [256], // the max len of kernel file name is 256 + mArrykernelNames[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN + 1]; + cl_kernel_function mpArryKnelFuncs[MAX_CLKERNEL_NUM]; + int mnKernelCount, mnFileCount, // only one kernel file + mnIsUserCreated; // 1: created , 0:no create and needed to create by + // opencl wrapper + int mnKhrFp64Flag; + int mnAmdFp64Flag; } GPUEnv; +class OpenclDevice { + public: + static GPUEnv gpuEnv; + static int isInited; + OpenclDevice(); + ~OpenclDevice(); + static int InitEnv(); // load dll, call InitOpenclRunEnv(0) + static int InitOpenclRunEnv( + int argc); // RegistOpenclKernel, double flags, compile kernels + static int InitOpenclRunEnv_DeviceSelection( + int argc); // RegistOpenclKernel, double flags, compile kernels + static int RegistOpenclKernel(); + static int ReleaseOpenclRunEnv(); + static int ReleaseOpenclEnv(GPUEnv* gpuInfo); + static int CompileKernelFile(GPUEnv* gpuInfo, const char* buildOption); + static int CachedOfKernerPrg(const GPUEnv* gpuEnvCached, + const char* clFileName); + static int GeneratBinFromKernelSource(cl_program program, + const char* clFileName); + static int WriteBinaryToFile(const char* fileName, const char* birary, + size_t numBytes); + static int BinaryGenerated(const char* clFileName, FILE** fhandle); + // static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const + // char *buildOption ); + static l_uint32* pixReadFromTiffKernel(l_uint32* tiffdata, l_int32 w, + l_int32 h, l_int32 wpl, + l_uint32* line); + static int composeRGBPixelCl(int* tiffdata, int* line, int h, int w); -class OpenclDevice -{ + /* OpenCL implementations of Morphological operations*/ -public: - static GPUEnv gpuEnv; - static int isInited; - OpenclDevice(); - ~OpenclDevice(); - static int InitEnv(); // load dll, call InitOpenclRunEnv(0) - static int InitOpenclRunEnv( int argc ); // RegistOpenclKernel, double flags, compile kernels - static int InitOpenclRunEnv_DeviceSelection( int argc ); // RegistOpenclKernel, double flags, compile kernels - static int RegistOpenclKernel(); - static int ReleaseOpenclRunEnv(); - static int ReleaseOpenclEnv( GPUEnv *gpuInfo ); - static int CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ); - static int CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ); - static int GeneratBinFromKernelSource( cl_program program, const char * clFileName ); - static int WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ); - static int BinaryGenerated( const char * clFileName, FILE ** fhandle ); - //static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption ); - static l_uint32* pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl, l_uint32 *line); - static int composeRGBPixelCl(int *tiffdata,int *line,int h,int w); + // Initialization of OCL buffers used in Morph operations + static int initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs); + static void releaseMorphCLBuffers(); -/* OpenCL implementations of Morphological operations*/ + static void pixGetLinesCL(Pix* pixd, Pix* pixs, Pix** pix_vline, + Pix** pix_hline, Pix** pixClosed, bool getpixClosed, + l_int32 close_hsize, l_int32 close_vsize, + l_int32 open_hsize, l_int32 open_vsize, + l_int32 line_hsize, l_int32 line_vsize); - //Initialization of OCL buffers used in Morph operations - static int initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix *pixs); - static void releaseMorphCLBuffers(); + // int InitOpenclAttr( OpenCLEnv * env ); + // int ReleaseKernel( KernelEnv * env ); + static int SetKernelEnv(KernelEnv* envInfo); + // int CreateKernel( char * kernelname, KernelEnv * env ); + // int RunKernel( const char *kernelName, void **userdata ); + // int ConvertToString( const char *filename, char **source ); + // int CheckKernelName( KernelEnv *envInfo, const char *kernelName ); + // int RegisterKernelWrapper( const char *kernelName, cl_kernel_function + // function ); int RunKernelWrapper( cl_kernel_function function, const char * + // kernelName, void **usrdata ); int GetKernelEnvAndFunc( const char + // *kernelName, KernelEnv *env, cl_kernel_function *function ); - static void pixGetLinesCL(Pix *pixd, Pix *pixs, Pix **pix_vline, - Pix **pix_hline, Pix **pixClosed, - bool getpixClosed, l_int32 close_hsize, - l_int32 close_vsize, l_int32 open_hsize, - l_int32 open_vsize, l_int32 line_hsize, - l_int32 line_vsize); - - //int InitOpenclAttr( OpenCLEnv * env ); - //int ReleaseKernel( KernelEnv * env ); - static int SetKernelEnv( KernelEnv *envInfo ); - //int CreateKernel( char * kernelname, KernelEnv * env ); - //int RunKernel( const char *kernelName, void **userdata ); - //int ConvertToString( const char *filename, char **source ); - //int CheckKernelName( KernelEnv *envInfo, const char *kernelName ); - //int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function ); - //int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata ); - //int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function ); - - static int LoadOpencl(); + static int LoadOpencl(); #ifdef WIN32 - //static int OpenclInite(); - static void FreeOpenclDll(); + // static int OpenclInite(); + static void FreeOpenclDll(); #endif - inline static int AddKernelConfig( int kCount, const char *kName ); + inline static int AddKernelConfig(int kCount, const char* kName); - /* for binarization */ - static int HistogramRectOCL(unsigned char *imagedata, int bytes_per_pixel, - int bytes_per_line, int left, int top, - int width, int height, int kHistogramSize, - int *histogramAllChannels); + /* for binarization */ + static int HistogramRectOCL(unsigned char* imagedata, int bytes_per_pixel, + int bytes_per_line, int left, int top, int width, + int height, int kHistogramSize, + int* histogramAllChannels); - static int ThresholdRectToPixOCL(unsigned char *imagedata, - int bytes_per_pixel, int bytes_per_line, - int *thresholds, int *hi_values, Pix **pix, - int rect_height, int rect_width, - int rect_top, int rect_left); + static int ThresholdRectToPixOCL(unsigned char* imagedata, + int bytes_per_pixel, int bytes_per_line, + int* thresholds, int* hi_values, Pix** pix, + int rect_height, int rect_width, + int rect_top, int rect_left); - static ds_device getDeviceSelection(); - static ds_device selectedDevice; - static bool deviceIsSelected; - static bool selectedDeviceIsOpenCL(); + static ds_device getDeviceSelection(); + static ds_device selectedDevice; + static bool deviceIsSelected; + static bool selectedDeviceIsOpenCL(); }; #endif // USE_OPENCL