mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-11 12:43:17 +08:00
Fixes for OpenCL issues reported on Apple Mac. Still get -54 on Apple Mac while running on OpenCL CPU, however it is ignored now.
This commit is contained in:
parent
2c8bc4a2ac
commit
8e9159b091
@ -1100,10 +1100,13 @@ void kernel_ThresholdRectToPix(
|
||||
}
|
||||
}
|
||||
|
||||
// only supports 1 channel
|
||||
\n#define CHAR_VEC_WIDTH 8 \n
|
||||
\n#define PIXELS_PER_WORD 32 \n
|
||||
\n#define PIXELS_PER_BURST 8 \n
|
||||
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
|
||||
typedef union {
|
||||
uchar s[PIXELS_PER_BURST];
|
||||
uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
|
||||
uchar s[PIXELS_PER_BURST*1];
|
||||
uchar8 v[(PIXELS_PER_BURST*1)/CHAR_VEC_WIDTH];
|
||||
} charVec1;
|
||||
|
||||
__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
@ -1112,7 +1115,7 @@ void kernel_ThresholdRectToPix_OneChan(
|
||||
__global const uchar8 *imageData,
|
||||
int height,
|
||||
int width,
|
||||
int wpl, // words per line
|
||||
int wpl, // words per line of output image
|
||||
__global int *thresholds,
|
||||
__global int *hi_values,
|
||||
__global int *pix) {
|
||||
@ -1134,96 +1137,71 @@ void kernel_ThresholdRectToPix_OneChan(
|
||||
|
||||
// load burst
|
||||
charVec1 pixels;
|
||||
for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
|
||||
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
|
||||
}
|
||||
// for each char8 in burst
|
||||
pixels.v[0] = imageData[
|
||||
w*BURSTS_PER_WORD
|
||||
+ b
|
||||
+ 0 ];
|
||||
|
||||
// for each pixel in burst
|
||||
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
|
||||
for ( int c = 0; c < 1; c++) {
|
||||
unsigned char pixChan = pixels.s[p + c];
|
||||
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
|
||||
|
||||
//int littleEndianIdx = p ^ 3;
|
||||
//int bigEndianIdx = p;
|
||||
int idx =
|
||||
\n#ifdef __ENDIAN_LITTLE__\n
|
||||
p ^ 3;
|
||||
\n#else\n
|
||||
p;
|
||||
\n#endif\n
|
||||
unsigned char pixChan = pixels.s[idx];
|
||||
if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
|
||||
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pix[w] = word;
|
||||
}
|
||||
}
|
||||
|
||||
)
|
||||
|
||||
|
||||
KERNEL(
|
||||
\n#define RED_SHIFT 24\n
|
||||
\n#define GREEN_SHIFT 16\n
|
||||
\n#define BLUE_SHIFT 8\n
|
||||
\n#define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))\n
|
||||
\n
|
||||
\n__attribute__((reqd_work_group_size(256, 1, 1)))\n
|
||||
\n__kernel\n
|
||||
\nvoid kernel_RGBToGray(
|
||||
__global const unsigned int *srcData,
|
||||
__global unsigned char *dstData,
|
||||
int srcWPL,
|
||||
int dstWPL,
|
||||
int height,
|
||||
int width,
|
||||
float rwt,
|
||||
float gwt,
|
||||
float bwt ) {
|
||||
|
||||
// pixel index
|
||||
int pixelIdx = get_global_id(0);
|
||||
if (pixelIdx >= height*width) return;
|
||||
|
||||
unsigned int word = srcData[pixelIdx];
|
||||
int output = (rwt * ((word >> RED_SHIFT) & 0xff) +
|
||||
gwt * ((word >> GREEN_SHIFT) & 0xff) +
|
||||
bwt * ((word >> BLUE_SHIFT) & 0xff) + 0.5);
|
||||
// SET_DATA_BYTE
|
||||
dstData[pixelIdx] = output;
|
||||
}
|
||||
)
|
||||
#endif
|
||||
|
||||
; // close char*
|
||||
|
||||
#endif // USE_EXTERNAL_KERNEL
|
||||
#endif //_OCL_KERNEL_H_
|
||||
//#endif //_OCL_KERNEL_H_
|
||||
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
||||
|
||||
// Alternative histogram kernel written to use uchar and different global memory scattered write
|
||||
// was a little better for intel platforms but still not faster then native serial code
|
||||
#if 0
|
||||
/* data layed out as
|
||||
bin0 bin1 bin2...
|
||||
r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
|
||||
*/
|
||||
\n__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
\n __kernel
|
||||
\n void kernel_HistogramRectAllChannels_uchar(
|
||||
\n volatile __global const uchar *data,
|
||||
\n uint numPixels,
|
||||
\n volatile __global uint *histBuffer) {
|
||||
\n
|
||||
\n // for each pixel/channel, accumulate in global memory
|
||||
\n for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
|
||||
\n uchar value = data[pc];
|
||||
\n int idx = value*get_global_size(0) + get_global_id(0);
|
||||
\n histBuffer[ idx ]++; // coalesced if same value
|
||||
\n }
|
||||
\n } // kernel_HistogramRectAllChannels
|
||||
\n
|
||||
\n __attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
\n __kernel
|
||||
\n void kernel_HistogramRectAllChannelsReduction_uchar(
|
||||
\n int n, // pixel redundancy that needs to be accumulated = nthreads/4
|
||||
\n __global uint4 *histBuffer,
|
||||
\n __global uint* histResult) { // each wg accumulates 1 bin (all channels within it
|
||||
\n
|
||||
\n // declare variables
|
||||
\n int binIdx = get_group_id(0);
|
||||
\n size_t groupId = get_group_id(0);
|
||||
\n size_t localId = get_local_id(0); // 0 -> 256-1
|
||||
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
|
||||
\n uint numThreads = get_global_size(0);
|
||||
\n uint4 hist = {0, 0, 0, 0};
|
||||
\n
|
||||
\n // accumulate in register
|
||||
\n for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
|
||||
\n hist += histBuffer[binIdx*n+p];
|
||||
\n }
|
||||
\n
|
||||
\n // reduction in local memory
|
||||
\n __local uint4 localHist[GROUP_SIZE];
|
||||
\n localHist[localId] = hist;
|
||||
\n barrier(CLK_LOCAL_MEM_FENCE);
|
||||
\n
|
||||
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
||||
\n if (localId < stride) {
|
||||
\n hist = localHist[ localId+stride];
|
||||
\n }
|
||||
\n barrier(CLK_LOCAL_MEM_FENCE);
|
||||
\n if (localId < stride) {
|
||||
\n localHist[ localId] += hist;
|
||||
\n }
|
||||
\n barrier(CLK_LOCAL_MEM_FENCE);
|
||||
\n }
|
||||
\n
|
||||
\n // write reduction to final result
|
||||
\n if (localId == 0) {
|
||||
\n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
|
||||
\n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
|
||||
\n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
|
||||
\n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
|
||||
\n }
|
||||
\n
|
||||
\n } // kernel_HistogramRectAllChannels
|
||||
#endif
|
||||
|
@ -63,6 +63,8 @@ static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
|
||||
if (profile->devices!=NULL && sr!=NULL) {
|
||||
unsigned int i;
|
||||
for (i = 0; i < profile->numDevices; i++) {
|
||||
if (profile->devices[i].oclDeviceName) free(profile->devices[i].oclDeviceName);
|
||||
if (profile->devices[i].oclDriverVersion) free(profile->devices[i].oclDriverVersion);
|
||||
status = sr(profile->devices[i].score);
|
||||
if (status != DS_SUCCESS)
|
||||
break;
|
||||
|
@ -1,6 +1,7 @@
|
||||
#ifdef _WIN32
|
||||
#include <Windows.h>
|
||||
#include <io.h>
|
||||
|
||||
#else
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
@ -14,21 +15,22 @@
|
||||
#include "otsuthr.h"
|
||||
#include "thresholder.h"
|
||||
|
||||
#if ON_APPLE
|
||||
#include <stdio.h>
|
||||
#include <mach/mach_time.h>
|
||||
#endif
|
||||
#ifdef USE_OPENCL
|
||||
|
||||
#if ON_APPLE
|
||||
#define TIMESPEC mach_timespec
|
||||
#else
|
||||
#define TIMESPEC timespec
|
||||
#endif
|
||||
|
||||
#include "opencl_device_selection.h"
|
||||
#ifdef _MSC_VER
|
||||
int LeptMsgSeverity = 3; // L_SEVERITY_INFO
|
||||
#endif // _MSC_VER
|
||||
GPUEnv OpenclDevice::gpuEnv;
|
||||
|
||||
#if USE_DEVICE_SELECTION
|
||||
|
||||
bool OpenclDevice::deviceIsSelected = false;
|
||||
ds_device OpenclDevice::selectedDevice;
|
||||
#endif
|
||||
|
||||
|
||||
int OpenclDevice::isInited = 0;
|
||||
|
||||
@ -202,175 +204,6 @@ PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, in
|
||||
return xValues;
|
||||
}
|
||||
|
||||
int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
|
||||
{
|
||||
size_t length;
|
||||
cl_int clStatus;
|
||||
cl_uint numPlatforms, numDevices;
|
||||
cl_platform_id *platforms;
|
||||
cl_context_properties cps[3];
|
||||
char platformName[256];
|
||||
unsigned int i;
|
||||
|
||||
|
||||
// Have a look at the available platforms.
|
||||
|
||||
if ( !gpuInfo->mnIsUserCreated )
|
||||
{
|
||||
clStatus = clGetPlatformIDs( 0, NULL, &numPlatforms );
|
||||
if ( clStatus != CL_SUCCESS )
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
gpuInfo->mpPlatformID = NULL;
|
||||
|
||||
if ( 0 < numPlatforms )
|
||||
{
|
||||
platforms = (cl_platform_id*) malloc( numPlatforms * sizeof( cl_platform_id ) );
|
||||
if ( platforms == (cl_platform_id*) NULL )
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
clStatus = clGetPlatformIDs( numPlatforms, platforms, NULL );
|
||||
|
||||
if ( clStatus != CL_SUCCESS )
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
for ( i = 0; i < numPlatforms; i++ )
|
||||
{
|
||||
clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
|
||||
sizeof( platformName ), platformName, NULL );
|
||||
|
||||
if ( clStatus != CL_SUCCESS )
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
gpuInfo->mpPlatformID = platforms[i];
|
||||
|
||||
//if (!strcmp(platformName, "Intel(R) Coporation"))
|
||||
//if( !strcmp( platformName, "Advanced Micro Devices, Inc." ))
|
||||
{
|
||||
gpuInfo->mpPlatformID = platforms[i];
|
||||
|
||||
if ( getenv("SC_OPENCLCPU") )
|
||||
{
|
||||
clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
|
||||
CL_DEVICE_TYPE_CPU, // device_type for CPU device
|
||||
0, // num_entries
|
||||
NULL, // devices
|
||||
&numDevices);
|
||||
printf("Selecting OpenCL device: CPU (a)\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
|
||||
CL_DEVICE_TYPE_GPU, // device_type for GPU device
|
||||
0, // num_entries
|
||||
NULL, // devices
|
||||
&numDevices);
|
||||
printf("Selecting OpenCL device: GPU (a)\n");
|
||||
}
|
||||
if ( clStatus != CL_SUCCESS )
|
||||
continue;
|
||||
|
||||
if ( numDevices )
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( clStatus != CL_SUCCESS )
|
||||
return 1;
|
||||
free( platforms );
|
||||
}
|
||||
if ( NULL == gpuInfo->mpPlatformID )
|
||||
return 1;
|
||||
|
||||
// Use available platform.
|
||||
cps[0] = CL_CONTEXT_PLATFORM;
|
||||
cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
|
||||
cps[2] = 0;
|
||||
// Set device type for OpenCL
|
||||
|
||||
if ( getenv("SC_OPENCLCPU") )
|
||||
{
|
||||
gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
|
||||
printf("Selecting OpenCL device: CPU (b)\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
|
||||
printf("Selecting OpenCL device: GPU (b)\n");
|
||||
}
|
||||
|
||||
gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
|
||||
|
||||
if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
|
||||
{
|
||||
gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
|
||||
gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
|
||||
printf("Selecting OpenCL device: CPU (c)\n");
|
||||
}
|
||||
if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
|
||||
{
|
||||
gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT;
|
||||
gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
|
||||
printf("Selecting OpenCL device: DEFAULT (c)\n");
|
||||
}
|
||||
if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
|
||||
return 1;
|
||||
// Detect OpenCL devices.
|
||||
// First, get the size of device list data
|
||||
clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0, NULL, &length );
|
||||
if ( ( clStatus != CL_SUCCESS ) || ( length == 0 ) )
|
||||
return 1;
|
||||
// Now allocate memory for device list based on the size we got earlier
|
||||
gpuInfo->mpArryDevsID = (cl_device_id*) malloc( length );
|
||||
if ( gpuInfo->mpArryDevsID == (cl_device_id*) NULL )
|
||||
return 1;
|
||||
// Now, get the device list data
|
||||
clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, length,
|
||||
gpuInfo->mpArryDevsID, NULL );
|
||||
if ( clStatus != CL_SUCCESS )
|
||||
return 1;
|
||||
|
||||
// Create OpenCL command queue.
|
||||
gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpArryDevsID[0], 0, &clStatus );
|
||||
|
||||
if ( clStatus != CL_SUCCESS )
|
||||
return 1;
|
||||
}
|
||||
|
||||
clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL );
|
||||
// Check device extensions for double type
|
||||
size_t aDevExtInfoSize = 0;
|
||||
|
||||
clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize );
|
||||
CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
|
||||
|
||||
char *aExtInfo = new char[aDevExtInfoSize];
|
||||
|
||||
clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS,
|
||||
sizeof(char) * aDevExtInfoSize, aExtInfo, NULL);
|
||||
CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
|
||||
|
||||
gpuInfo->mnKhrFp64Flag = 0;
|
||||
gpuInfo->mnAmdFp64Flag = 0;
|
||||
|
||||
if ( strstr( aExtInfo, "cl_khr_fp64" ) )
|
||||
{
|
||||
gpuInfo->mnKhrFp64Flag = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Check if cl_amd_fp64 extension is supported
|
||||
if ( strstr( aExtInfo, "cl_amd_fp64" ) )
|
||||
gpuInfo->mnAmdFp64Flag = 1;
|
||||
}
|
||||
delete []aExtInfo;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void OpenclDevice::releaseMorphCLBuffers()
|
||||
{
|
||||
@ -426,14 +259,8 @@ PERF_COUNT_SUB("LoadOpencl")
|
||||
// sets up environment, compiles programs
|
||||
|
||||
|
||||
#if USE_DEVICE_SELECTION
|
||||
|
||||
InitOpenclRunEnv_DeviceSelection( 0 );
|
||||
//PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
|
||||
#else
|
||||
// init according to device
|
||||
InitOpenclRunEnv( 0 );
|
||||
#endif
|
||||
//PERF_COUNT_END
|
||||
return 1;
|
||||
}
|
||||
@ -465,62 +292,9 @@ int OpenclDevice::RegistOpenclKernel()
|
||||
AddKernelConfig( 1, (const char*) "oclAverageSub1" );
|
||||
return 0;
|
||||
}
|
||||
int OpenclDevice::InitOpenclRunEnv( int argc )
|
||||
{
|
||||
int status = 0;
|
||||
if ( MAX_CLKERNEL_NUM <= 0 )
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
if ( ( argc > MAX_CLFILE_NUM ) || ( argc < 0 ) )
|
||||
return 1;
|
||||
|
||||
if ( !isInited )
|
||||
{
|
||||
RegistOpenclKernel();
|
||||
//initialize devices, context, comand_queue
|
||||
status = InitOpenclRunEnv( &gpuEnv );
|
||||
if ( status )
|
||||
{
|
||||
fprintf(stderr,"init_opencl_env failed.\n");
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr,"init_opencl_env successed.\n");
|
||||
//initialize program, kernelName, kernelCount
|
||||
if( getenv( "SC_FLOAT" ) )
|
||||
{
|
||||
gpuEnv.mnKhrFp64Flag = 0;
|
||||
gpuEnv.mnAmdFp64Flag = 0;
|
||||
}
|
||||
if( gpuEnv.mnKhrFp64Flag )
|
||||
{
|
||||
fprintf(stderr,"----use khr double type in kernel----\n");
|
||||
status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
|
||||
}
|
||||
else if( gpuEnv.mnAmdFp64Flag )
|
||||
{
|
||||
fprintf(stderr,"----use amd double type in kernel----\n");
|
||||
status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr,"----use float type in kernel----\n");
|
||||
status = CompileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" );
|
||||
}
|
||||
if ( status == 0 || gpuEnv.mnKernelCount == 0 )
|
||||
{
|
||||
fprintf(stderr,"CompileKernelFile failed.\n");
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr,"CompileKernelFile successed.\n");
|
||||
isInited = 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
|
||||
//PERF_COUNT_START("InitOpenclRunEnv_DS")
|
||||
#if USE_DEVICE_SELECTION
|
||||
if (!isInited) {
|
||||
// after programs compiled, selects best device
|
||||
//printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n");
|
||||
@ -541,7 +315,6 @@ int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
|
||||
}
|
||||
isInited = 1;
|
||||
}
|
||||
#endif
|
||||
//PERF_COUNT_END
|
||||
return 0;
|
||||
}
|
||||
@ -598,29 +371,7 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
|
||||
int status = 0;
|
||||
char *str = NULL;
|
||||
FILE *fd = NULL;
|
||||
cl_uint numDevices=0;
|
||||
if ( getenv("SC_OPENCLCPU") )
|
||||
{
|
||||
clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
|
||||
CL_DEVICE_TYPE_CPU, // device_type for CPU device
|
||||
0, // num_entries
|
||||
NULL, // devices ID
|
||||
&numDevices);
|
||||
}
|
||||
else
|
||||
{
|
||||
clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
|
||||
CL_DEVICE_TYPE_GPU, // device_type for GPU device
|
||||
0, // num_entries
|
||||
NULL, // devices ID
|
||||
&numDevices);
|
||||
}
|
||||
CHECK_OPENCL( clStatus, "clGetDeviceIDs" );
|
||||
for ( i = 0; i < numDevices; i++ )
|
||||
{
|
||||
char fileName[256] = { 0 }, cl_name[128] = { 0 };
|
||||
if ( gpuEnv.mpArryDevsID[i] != 0 )
|
||||
{
|
||||
char deviceName[1024];
|
||||
clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
|
||||
CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
|
||||
@ -631,8 +382,6 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
|
||||
legalizeFileName(fileName);
|
||||
fd = fopen( fileName, "rb" );
|
||||
status = ( fd != NULL ) ? 1 : 0;
|
||||
}
|
||||
}
|
||||
if ( fd != NULL )
|
||||
{
|
||||
*fhandle = fd;
|
||||
@ -675,7 +424,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
|
||||
{
|
||||
unsigned int i = 0;
|
||||
cl_int clStatus;
|
||||
size_t *binarySizes, numDevices;
|
||||
size_t *binarySizes, numDevices=0;
|
||||
cl_device_id *mpArryDevsID;
|
||||
char **binaries, *str = NULL;
|
||||
|
||||
@ -714,14 +463,6 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
|
||||
binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
|
||||
if ( binaries[i] == NULL )
|
||||
{
|
||||
// cleanup all memory allocated so far
|
||||
for(int cleanupIndex = 0; cleanupIndex < i; ++cleanupIndex)
|
||||
{
|
||||
free(binaries[cleanupIndex]);
|
||||
}
|
||||
// cleanup binary array
|
||||
free(binaries);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -1038,7 +779,6 @@ PERF_COUNT_END
|
||||
return pResult;
|
||||
}
|
||||
|
||||
|
||||
PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n )
|
||||
{
|
||||
PERF_COUNT_START("pixReadTiffCL")
|
||||
@ -1356,7 +1096,7 @@ OpenclDevice::pixReadMemTiffCl(const l_uint8 *data,size_t size,l_int32 n)
|
||||
l_int32 i, pagefound;
|
||||
PIX *pix;
|
||||
TIFF *tif;
|
||||
L_MEMSTREAM *memStream;
|
||||
//L_MEMSTREAM *memStream;
|
||||
PROCNAME("pixReadMemTiffCl");
|
||||
|
||||
if (!data)
|
||||
@ -1555,9 +1295,12 @@ PIXCMAP *cmap;
|
||||
|
||||
//Invoke the OpenCL kernel for pixReadFromTiff
|
||||
l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
|
||||
pixSetData(pix, output_gpu);
|
||||
|
||||
pixSetData(pix, output_gpu);
|
||||
// pix already has data allocated, it now points to output_gpu?
|
||||
FREE(tiffdata);
|
||||
FREE(line);
|
||||
//FREE(output_gpu);
|
||||
}
|
||||
|
||||
if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) {
|
||||
@ -1833,7 +1576,7 @@ pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
|
||||
sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
|
||||
|
||||
selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
|
||||
|
||||
selDestroy(&sel);
|
||||
//global and local work dimensions for Horizontal pass
|
||||
gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
|
||||
globalThreads[0] = gsize;
|
||||
@ -1998,7 +1741,7 @@ pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h)
|
||||
sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
|
||||
|
||||
selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
|
||||
|
||||
selDestroy(&sel);
|
||||
OpenclDevice::SetKernelEnv( &rEnv );
|
||||
|
||||
if (hsize == 5 && vsize == 5 && isAsymmetric)
|
||||
@ -2634,7 +2377,7 @@ OpenclDevice::pixGetLinesCL(PIX *pixd,
|
||||
* histogramAllChannels is layed out as all channel 0, then all channel 1...
|
||||
* only supports 1 or 4 channels (bytes_per_pixel)
|
||||
************************************************************************/
|
||||
void OpenclDevice::HistogramRectOCL(
|
||||
int OpenclDevice::HistogramRectOCL(
|
||||
const unsigned char* imageData,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
@ -2647,6 +2390,7 @@ void OpenclDevice::HistogramRectOCL(
|
||||
{
|
||||
PERF_COUNT_START("HistogramRectOCL")
|
||||
cl_int clStatus;
|
||||
int retVal= 0;
|
||||
KernelEnv histKern;
|
||||
SetKernelEnv( &histKern );
|
||||
KernelEnv histRedKern;
|
||||
@ -2667,10 +2411,9 @@ PERF_COUNT_START("HistogramRectOCL")
|
||||
int requestedOccupancy = 10;
|
||||
int numWorkGroups = numCUs * requestedOccupancy;
|
||||
int numThreads = block_size*numWorkGroups;
|
||||
size_t local_work_size[] = {static_cast<size_t>(block_size)};
|
||||
size_t global_work_size[] = {static_cast<size_t>(numThreads)};
|
||||
size_t red_global_work_size[] = {
|
||||
static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
|
||||
size_t local_work_size[] = {block_size};
|
||||
size_t global_work_size[] = {numThreads};
|
||||
size_t red_global_work_size[] = {block_size*kHistogramSize*bytes_per_pixel};
|
||||
|
||||
/* map histogramAllChannels as write only */
|
||||
int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups;
|
||||
@ -2690,7 +2433,7 @@ PERF_COUNT_START("HistogramRectOCL")
|
||||
zeroBuffer[0] = 0;
|
||||
cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus );
|
||||
CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer");
|
||||
|
||||
delete[] zeroBuffer;
|
||||
//Create kernel objects based on bytes_per_pixel
|
||||
if (bytes_per_pixel == 1)
|
||||
{
|
||||
@ -2743,7 +2486,10 @@ PERF_COUNT_SUB("before")
|
||||
0, NULL, NULL );
|
||||
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" );
|
||||
clFinish( histKern.mpkCmdQueue );
|
||||
|
||||
if(clStatus !=0)
|
||||
{
|
||||
retVal = -1;
|
||||
}
|
||||
/* launch histogram */
|
||||
clStatus = clEnqueueNDRangeKernel(
|
||||
histRedKern.mpkCmdQueue,
|
||||
@ -2752,19 +2498,26 @@ PERF_COUNT_SUB("before")
|
||||
0, NULL, NULL );
|
||||
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
|
||||
clFinish( histRedKern.mpkCmdQueue );
|
||||
|
||||
if(clStatus !=0)
|
||||
{
|
||||
retVal = -1;
|
||||
}
|
||||
PERF_COUNT_SUB("redKernel")
|
||||
|
||||
/* map results back from gpu */
|
||||
ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus);
|
||||
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
|
||||
|
||||
if(clStatus !=0)
|
||||
{
|
||||
retVal = -1;
|
||||
}
|
||||
clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL);
|
||||
|
||||
clReleaseMemObject(histogramBuffer);
|
||||
clReleaseMemObject(imageBuffer);
|
||||
PERF_COUNT_SUB("after")
|
||||
PERF_COUNT_END
|
||||
return retVal;
|
||||
|
||||
}
|
||||
|
||||
@ -2773,7 +2526,7 @@ PERF_COUNT_END
|
||||
* from the class, using thresholds/hi_values to the output IMAGE.
|
||||
* only supports 1 or 4 channels
|
||||
************************************************************************/
|
||||
void OpenclDevice::ThresholdRectToPixOCL(
|
||||
int OpenclDevice::ThresholdRectToPixOCL(
|
||||
const unsigned char* imageData,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
@ -2785,12 +2538,12 @@ void OpenclDevice::ThresholdRectToPixOCL(
|
||||
int top,
|
||||
int left) {
|
||||
PERF_COUNT_START("ThresholdRectToPixOCL")
|
||||
|
||||
int retVal =0;
|
||||
/* create pix result buffer */
|
||||
*pix = pixCreate(width, height, 1);
|
||||
uinT32* pixData = pixGetData(*pix);
|
||||
int wpl = pixGetWpl(*pix);
|
||||
int pixSize = wpl*height*sizeof(uinT32);
|
||||
int pixSize = wpl*height*sizeof(uinT32); // number of pixels
|
||||
|
||||
cl_int clStatus;
|
||||
KernelEnv rEnv;
|
||||
@ -2861,7 +2614,11 @@ PERF_COUNT_SUB("before")
|
||||
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" );
|
||||
clFinish( rEnv.mpkCmdQueue );
|
||||
PERF_COUNT_SUB("kernel")
|
||||
|
||||
if(clStatus !=0)
|
||||
{
|
||||
printf("Setting return value to -1\n");
|
||||
retVal = -1;
|
||||
}
|
||||
/* map results back from gpu */
|
||||
void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus);
|
||||
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
|
||||
@ -2873,10 +2630,10 @@ PERF_COUNT_SUB("kernel")
|
||||
|
||||
PERF_COUNT_SUB("after")
|
||||
PERF_COUNT_END
|
||||
return retVal;
|
||||
}
|
||||
|
||||
|
||||
#if USE_DEVICE_SELECTION
|
||||
|
||||
/******************************************************************************
|
||||
* Data Types for Device Selection
|
||||
@ -2987,9 +2744,11 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
|
||||
LARGE_INTEGER freq, time_funct_start, time_funct_end;
|
||||
QueryPerformanceFrequency(&freq);
|
||||
#elif ON_APPLE
|
||||
mach_timespec_t time_funct_start, time_funct_end;
|
||||
mach_timebase_info_data_t info = { 0, 0 };
|
||||
mach_timebase_info(&info);
|
||||
long long start,stop;
|
||||
#else
|
||||
TIMESPEC time_funct_start, time_funct_end;
|
||||
timespec time_funct_start, time_funct_end;
|
||||
#endif
|
||||
// input data
|
||||
l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
|
||||
@ -2998,6 +2757,8 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
|
||||
if (type == DS_DEVICE_OPENCL_DEVICE) {
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_start);
|
||||
#elif ON_APPLE
|
||||
start = mach_absolute_time();
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
|
||||
#endif
|
||||
@ -3008,6 +2769,9 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_end);
|
||||
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
|
||||
#elif ON_APPLE
|
||||
stop = mach_absolute_time();
|
||||
time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
|
||||
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
|
||||
@ -3016,6 +2780,8 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
|
||||
} else {
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_start);
|
||||
#elif ON_APPLE
|
||||
start = mach_absolute_time();
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
|
||||
#endif
|
||||
@ -3041,6 +2807,9 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_end);
|
||||
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
|
||||
#elif ON_APPLE
|
||||
stop = mach_absolute_time();
|
||||
time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
|
||||
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
|
||||
@ -3061,9 +2830,11 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
|
||||
LARGE_INTEGER freq, time_funct_start, time_funct_end;
|
||||
QueryPerformanceFrequency(&freq);
|
||||
#elif ON_APPLE
|
||||
mach_timespec_t time_funct_start, time_funct_end;
|
||||
mach_timebase_info_data_t info = { 0, 0 };
|
||||
mach_timebase_info(&info);
|
||||
long long start,stop;
|
||||
#else
|
||||
TIMESPEC time_funct_start, time_funct_end;
|
||||
timespec time_funct_start, time_funct_end;
|
||||
#endif
|
||||
|
||||
unsigned char pixelHi = (unsigned char)255;
|
||||
@ -3073,22 +2844,34 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
|
||||
int kHistogramSize = 256;
|
||||
int bytes_per_line = input.width*input.numChannels;
|
||||
int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
|
||||
|
||||
int retVal= 0;
|
||||
// function call
|
||||
if (type == DS_DEVICE_OPENCL_DEVICE) {
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_start);
|
||||
#elif ON_APPLE
|
||||
start = mach_absolute_time();
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
|
||||
#endif
|
||||
|
||||
OpenclDevice::gpuEnv = *env;
|
||||
int wpl = pixGetWpl(input.pix);
|
||||
OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);
|
||||
retVal= OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);
|
||||
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_end);
|
||||
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
|
||||
#elif ON_APPLE
|
||||
stop = mach_absolute_time();
|
||||
if(retVal ==0)
|
||||
{
|
||||
time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
|
||||
}
|
||||
else
|
||||
{
|
||||
time= FLT_MAX;
|
||||
}
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
|
||||
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
|
||||
@ -3098,6 +2881,8 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
|
||||
int *histogram = new int[kHistogramSize];
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_start);
|
||||
#elif ON_APPLE
|
||||
start = mach_absolute_time();
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
|
||||
#endif
|
||||
@ -3108,6 +2893,9 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_end);
|
||||
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
|
||||
#elif ON_APPLE
|
||||
stop = mach_absolute_time();
|
||||
time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
|
||||
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
|
||||
@ -3116,7 +2904,6 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
|
||||
}
|
||||
|
||||
// cleanup
|
||||
//delete[] imageData;
|
||||
delete[] histogramAllChannels;
|
||||
return time;
|
||||
}
|
||||
@ -3162,13 +2949,16 @@ void ThresholdRectToPix_Native(const unsigned char* imagedata,
|
||||
double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
|
||||
|
||||
double time;
|
||||
int retVal =0;
|
||||
#if ON_WINDOWS
|
||||
LARGE_INTEGER freq, time_funct_start, time_funct_end;
|
||||
QueryPerformanceFrequency(&freq);
|
||||
#elif ON_APPLE
|
||||
mach_timespec_t time_funct_start, time_funct_end;
|
||||
mach_timebase_info_data_t info = { 0, 0 };
|
||||
mach_timebase_info(&info);
|
||||
long long start,stop;
|
||||
#else
|
||||
TIMESPEC time_funct_start, time_funct_end;
|
||||
timespec time_funct_start, time_funct_end;
|
||||
#endif
|
||||
|
||||
// input data
|
||||
@ -3192,17 +2982,30 @@ double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData i
|
||||
if (type == DS_DEVICE_OPENCL_DEVICE) {
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_start);
|
||||
#elif ON_APPLE
|
||||
start = mach_absolute_time();
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
|
||||
#endif
|
||||
|
||||
OpenclDevice::gpuEnv = *env;
|
||||
int wpl = pixGetWpl(input.pix);
|
||||
OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);
|
||||
retVal= OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);
|
||||
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_end);
|
||||
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
|
||||
#elif ON_APPLE
|
||||
stop = mach_absolute_time();
|
||||
if(retVal ==0)
|
||||
{
|
||||
time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;;
|
||||
}
|
||||
else
|
||||
{
|
||||
time= FLT_MAX;
|
||||
}
|
||||
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
|
||||
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
|
||||
@ -3214,6 +3017,8 @@ double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData i
|
||||
thresholder.SetImage( input.pix );
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_start);
|
||||
#elif ON_APPLE
|
||||
start = mach_absolute_time();
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
|
||||
#endif
|
||||
@ -3223,6 +3028,9 @@ double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData i
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_end);
|
||||
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
|
||||
#elif ON_APPLE
|
||||
stop = mach_absolute_time();
|
||||
time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
|
||||
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
|
||||
@ -3242,9 +3050,11 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
|
||||
LARGE_INTEGER freq, time_funct_start, time_funct_end;
|
||||
QueryPerformanceFrequency(&freq);
|
||||
#elif ON_APPLE
|
||||
mach_timespec_t time_funct_start, time_funct_end;
|
||||
mach_timebase_info_data_t info = { 0, 0 };
|
||||
mach_timebase_info(&info);
|
||||
long long start,stop;
|
||||
#else
|
||||
TIMESPEC time_funct_start, time_funct_end;
|
||||
timespec time_funct_start, time_funct_end;
|
||||
#endif
|
||||
|
||||
// input data
|
||||
@ -3260,6 +3070,8 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
|
||||
if (type == DS_DEVICE_OPENCL_DEVICE) {
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_start);
|
||||
#elif ON_APPLE
|
||||
start = mach_absolute_time();
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
|
||||
#endif
|
||||
@ -3274,6 +3086,9 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_end);
|
||||
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
|
||||
#elif ON_APPLE
|
||||
stop = mach_absolute_time();
|
||||
time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
|
||||
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
|
||||
@ -3281,6 +3096,8 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
|
||||
} else {
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_start);
|
||||
#elif ON_APPLE
|
||||
start = mach_absolute_time();
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
|
||||
#endif
|
||||
@ -3298,6 +3115,9 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
|
||||
#if ON_WINDOWS
|
||||
QueryPerformanceCounter(&time_funct_end);
|
||||
time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
|
||||
#elif ON_APPLE
|
||||
stop = mach_absolute_time();
|
||||
time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
|
||||
#else
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
|
||||
time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
|
||||
@ -3332,7 +3152,10 @@ ds_status deserializeScore( ds_device* device, const unsigned char* serializedSc
|
||||
return DS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
ds_status releaseScore( void* score ) {
|
||||
delete[] score;
|
||||
return DS_SUCCESS;
|
||||
}
|
||||
|
||||
// evaluate devices
|
||||
ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
|
||||
@ -3352,7 +3175,6 @@ ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
|
||||
OpenclDevice::CompileKernelFile(env, "");
|
||||
}
|
||||
|
||||
|
||||
TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
|
||||
|
||||
// pixReadTiff
|
||||
@ -3395,7 +3217,6 @@ ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
|
||||
|
||||
// initial call to select device
|
||||
ds_device OpenclDevice::getDeviceSelection( ) {
|
||||
//PERF_COUNT_START("getDeviceSelection")
|
||||
if (!deviceIsSelected) {
|
||||
PERF_COUNT_START("getDeviceSelection")
|
||||
// check if opencl is available at runtime
|
||||
@ -3434,7 +3255,6 @@ PERF_COUNT_SUB("writeProfileToFile")
|
||||
} else {
|
||||
printf("[DS] Unable to evaluate performance; scores not written to file.\n");
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
PERF_COUNT_SUB("readProfileFromFile")
|
||||
@ -3446,7 +3266,6 @@ PERF_COUNT_SUB("readProfileFromFile")
|
||||
float bestTime = FLT_MAX; // begin search with worst possible time
|
||||
int bestDeviceIdx = -1;
|
||||
for (int d = 0; d < profile->numDevices; d++) {
|
||||
//((TessDeviceScore *)device->score)->time
|
||||
ds_device device = profile->devices[d];
|
||||
TessDeviceScore score = *(TessDeviceScore *)device.score;
|
||||
|
||||
@ -3478,7 +3297,8 @@ PERF_COUNT_SUB("readProfileFromFile")
|
||||
printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
|
||||
}
|
||||
selectedDevice = profile->devices[bestDeviceIdx];
|
||||
|
||||
// cleanup
|
||||
releaseDSProfile(profile, releaseScore);
|
||||
} else {
|
||||
// opencl isn't available at runtime, select native cpu device
|
||||
printf("[DS] OpenCL runtime not available.\n");
|
||||
@ -3496,26 +3316,178 @@ PERF_COUNT_END
|
||||
return selectedDevice;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
bool OpenclDevice::selectedDeviceIsOpenCL() {
|
||||
#if USE_DEVICE_SELECTION
|
||||
ds_device device = getDeviceSelection();
|
||||
return (device.type == DS_DEVICE_OPENCL_DEVICE);
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool OpenclDevice::selectedDeviceIsNativeCPU() {
|
||||
#if USE_DEVICE_SELECTION
|
||||
ds_device device = getDeviceSelection();
|
||||
return (device.type == DS_DEVICE_NATIVE_CPU);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
* pixConvertRGBToGray() from leptonica, converted to opencl kernel
|
||||
*
|
||||
* Input: pix (32 bpp RGB)
|
||||
* rwt, gwt, bwt (non-negative; these should add to 1.0,
|
||||
* or use 0.0 for default)
|
||||
* Return: 8 bpp pix, or null on error
|
||||
*
|
||||
* Notes:
|
||||
* (1) Use a weighted average of the RGB values.
|
||||
*/
|
||||
#define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))
|
||||
|
||||
Pix * OpenclDevice::pixConvertRGBToGrayOCL(
|
||||
Pix *srcPix, // 32-bit source
|
||||
float rwt,
|
||||
float gwt,
|
||||
float bwt )
|
||||
{
|
||||
PERF_COUNT_START("pixConvertRGBToGrayOCL")
|
||||
Pix *dstPix; // 8-bit destination
|
||||
|
||||
if (rwt < 0.0 || gwt < 0.0 || bwt < 0.0) return NULL;
|
||||
|
||||
if (rwt == 0.0 && gwt == 0.0 && bwt == 0.0) {
|
||||
// magic numbers from leptonica
|
||||
rwt = 0.3;
|
||||
gwt = 0.5;
|
||||
bwt = 0.2;
|
||||
}
|
||||
// normalize
|
||||
float sum = rwt + gwt + bwt;
|
||||
rwt /= sum;
|
||||
gwt /= sum;
|
||||
bwt /= sum;
|
||||
|
||||
// source pix
|
||||
int w, h;
|
||||
pixGetDimensions(srcPix, &w, &h, NULL);
|
||||
//printf("Image is %i x %i\n", w, h);
|
||||
unsigned int *srcData = pixGetData(srcPix);
|
||||
int srcWPL = pixGetWpl(srcPix);
|
||||
int srcSize = srcWPL * h * sizeof(unsigned int);
|
||||
|
||||
// destination pix
|
||||
if ((dstPix = pixCreate(w, h, 8)) == NULL)
|
||||
return NULL;
|
||||
pixCopyResolution(dstPix, srcPix);
|
||||
unsigned int *dstData = pixGetData(dstPix);
|
||||
int dstWPL = pixGetWpl(dstPix);
|
||||
int dstWords = dstWPL * h;
|
||||
int dstSize = dstWords * sizeof(unsigned int);
|
||||
//printf("dstSize = %i\n", dstSize);
|
||||
PERF_COUNT_SUB("pix setup")
|
||||
|
||||
// opencl objects
|
||||
cl_int clStatus;
|
||||
KernelEnv kEnv;
|
||||
SetKernelEnv( &kEnv );
|
||||
|
||||
// source buffer
|
||||
cl_mem srcBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, srcSize, (void *)srcData, &clStatus );
|
||||
CHECK_OPENCL( clStatus, "clCreateBuffer srcBuffer");
|
||||
|
||||
// destination buffer
|
||||
cl_mem dstBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, dstSize, (void *)dstData, &clStatus );
|
||||
CHECK_OPENCL( clStatus, "clCreateBuffer dstBuffer");
|
||||
|
||||
// setup work group size parameters
|
||||
int block_size = 256;
|
||||
int numWorkGroups = ((h*w+block_size-1) / block_size );
|
||||
int numThreads = block_size*numWorkGroups;
|
||||
size_t local_work_size[] = {block_size};
|
||||
size_t global_work_size[] = {numThreads};
|
||||
//printf("Enqueueing %i threads for %i output pixels\n", numThreads, w*h);
|
||||
|
||||
/* compile kernel */
|
||||
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, "kernel_RGBToGray", &clStatus );
|
||||
CHECK_OPENCL( clStatus, "clCreateKernel kernel_RGBToGray");
|
||||
|
||||
|
||||
/* set kernel arguments */
|
||||
clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&srcBuffer );
|
||||
CHECK_OPENCL( clStatus, "clSetKernelArg srcBuffer");
|
||||
clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&dstBuffer );
|
||||
CHECK_OPENCL( clStatus, "clSetKernelArg dstBuffer");
|
||||
clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(int), (void *)&srcWPL );
|
||||
CHECK_OPENCL( clStatus, "clSetKernelArg srcWPL" );
|
||||
clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(int), (void *)&dstWPL );
|
||||
CHECK_OPENCL( clStatus, "clSetKernelArg dstWPL" );
|
||||
clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(int), (void *)&h );
|
||||
CHECK_OPENCL( clStatus, "clSetKernelArg height" );
|
||||
clStatus = clSetKernelArg( kEnv.mpkKernel, 5, sizeof(int), (void *)&w );
|
||||
CHECK_OPENCL( clStatus, "clSetKernelArg width" );
|
||||
clStatus = clSetKernelArg( kEnv.mpkKernel, 6, sizeof(float), (void *)&rwt );
|
||||
CHECK_OPENCL( clStatus, "clSetKernelArg rwt" );
|
||||
clStatus = clSetKernelArg( kEnv.mpkKernel, 7, sizeof(float), (void *)&gwt );
|
||||
CHECK_OPENCL( clStatus, "clSetKernelArg gwt");
|
||||
clStatus = clSetKernelArg( kEnv.mpkKernel, 8, sizeof(float), (void *)&bwt );
|
||||
CHECK_OPENCL( clStatus, "clSetKernelArg bwt");
|
||||
|
||||
/* launch kernel & wait */
|
||||
PERF_COUNT_SUB("before")
|
||||
clStatus = clEnqueueNDRangeKernel(
|
||||
kEnv.mpkCmdQueue,
|
||||
kEnv.mpkKernel,
|
||||
1, NULL, global_work_size, local_work_size,
|
||||
0, NULL, NULL );
|
||||
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_RGBToGray" );
|
||||
clFinish( kEnv.mpkCmdQueue );
|
||||
PERF_COUNT_SUB("kernel")
|
||||
|
||||
/* map results back from gpu */
|
||||
void *ptr = clEnqueueMapBuffer(kEnv.mpkCmdQueue, dstBuffer, CL_TRUE, CL_MAP_READ, 0, dstSize, 0, NULL, NULL, &clStatus);
|
||||
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer dstBuffer");
|
||||
clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, dstBuffer, ptr, 0, NULL, NULL);
|
||||
|
||||
#if 0
|
||||
// validate: compute on cpu
|
||||
Pix *cpuPix = pixCreate(w, h, 8);
|
||||
pixCopyResolution(cpuPix, srcPix);
|
||||
unsigned int *cpuData = pixGetData(cpuPix);
|
||||
int cpuWPL = pixGetWpl(cpuPix);
|
||||
unsigned int *cpuLine, *srcLine;
|
||||
int i, j;
|
||||
for (i = 0, srcLine = srcData, cpuLine = cpuData; i < h; i++) {
|
||||
for (j = 0; j < w; j++) {
|
||||
unsigned int word = *(srcLine + j);
|
||||
int val = (l_int32)(rwt * ((word >> L_RED_SHIFT) & 0xff) +
|
||||
gwt * ((word >> L_GREEN_SHIFT) & 0xff) +
|
||||
bwt * ((word >> L_BLUE_SHIFT) & 0xff) + 0.5);
|
||||
SET_DATA_BYTE(cpuLine, j, val);
|
||||
}
|
||||
srcLine += srcWPL;
|
||||
cpuLine += cpuWPL;
|
||||
}
|
||||
|
||||
// validate: compare
|
||||
printf("converted 32-bit -> 8-bit image\n");
|
||||
for (int row = 0; row < h; row++) {
|
||||
for (int col = 0; col < w; col++) {
|
||||
int idx = row*w + col;
|
||||
unsigned int srcVal = srcData[idx];
|
||||
unsigned char cpuVal = ((unsigned char *)cpuData)[idx];
|
||||
unsigned char oclVal = ((unsigned char *)dstData)[idx];
|
||||
if (srcVal > 0) {
|
||||
printf("%4i,%4i: %u, %u, %u\n", row, col, srcVal, cpuVal, oclVal);
|
||||
}
|
||||
}
|
||||
//printf("\n");
|
||||
}
|
||||
#endif
|
||||
// release opencl objects
|
||||
clReleaseMemObject(srcBuffer);
|
||||
clReleaseMemObject(dstBuffer);
|
||||
|
||||
|
||||
PERF_COUNT_END
|
||||
// success
|
||||
return dstPix;
|
||||
}
|
||||
#endif
|
||||
|
@ -56,13 +56,6 @@
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#if ON_APPLE
|
||||
#include <mach/clock.h>
|
||||
#include <mach/mach.h>
|
||||
#define CLOCK_MONOTONIC SYSTEM_CLOCK
|
||||
#define clock_gettime clock_get_time
|
||||
#endif
|
||||
|
||||
/************************************************************************************
|
||||
* enable/disable reporting of performance
|
||||
* PERF_REPORT_LEVEL
|
||||
@ -74,13 +67,6 @@
|
||||
#define PERF_COUNT_VERBOSE 1
|
||||
#define PERF_COUNT_REPORT_STR "[%36s], %24s, %11.6f\n"
|
||||
|
||||
#if ON_APPLE
|
||||
#include <time.h>
|
||||
#include <mach/clock.h>
|
||||
#include <mach/mach.h>
|
||||
#define CLOCK_MONOTONIC SYSTEM_CLOCK
|
||||
#define clock_gettime clock_get_time
|
||||
#endif
|
||||
|
||||
#if ON_WINDOWS
|
||||
|
||||
@ -97,7 +83,7 @@
|
||||
#define PERF_COUNT_END \
|
||||
QueryPerformanceCounter(&time_funct_end); \
|
||||
elapsed_time_sec = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); \
|
||||
tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
|
||||
printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
|
||||
#else
|
||||
#define PERF_COUNT_START(FUNCT_NAME)
|
||||
#define PERF_COUNT_END
|
||||
@ -107,7 +93,7 @@
|
||||
#define PERF_COUNT_SUB(SUB) \
|
||||
QueryPerformanceCounter(&time_sub_end); \
|
||||
elapsed_time_sec = (time_sub_end.QuadPart-time_sub_start.QuadPart)/(double)(freq.QuadPart); \
|
||||
tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
|
||||
printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
|
||||
time_sub_start = time_sub_end;
|
||||
#else
|
||||
#define PERF_COUNT_SUB(SUB)
|
||||
@ -129,7 +115,7 @@
|
||||
#define PERF_COUNT_END \
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); \
|
||||
elapsed_time_sec = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; \
|
||||
tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
|
||||
printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
|
||||
#else
|
||||
#define PERF_COUNT_START(FUNCT_NAME)
|
||||
#define PERF_COUNT_END
|
||||
@ -139,7 +125,7 @@
|
||||
#define PERF_COUNT_SUB(SUB) \
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_sub_end ); \
|
||||
elapsed_time_sec = (time_sub_end.tv_sec - time_sub_start.tv_sec)*1.0 + (time_sub_end.tv_nsec - time_sub_start.tv_nsec)/1000000000.0; \
|
||||
tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
|
||||
printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
|
||||
time_sub_start = time_sub_end;
|
||||
#else
|
||||
#define PERF_COUNT_SUB(SUB)
|
||||
@ -151,9 +137,6 @@
|
||||
**************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
|
||||
#define USE_DEVICE_SELECTION 1
|
||||
|
||||
#include "opencl_device_selection.h"
|
||||
|
||||
#ifndef strcasecmp
|
||||
@ -251,7 +234,6 @@ public:
|
||||
static int InitEnv(); // load dll, call InitOpenclRunEnv(0)
|
||||
static int InitOpenclRunEnv( int argc ); // RegistOpenclKernel, double flags, compile kernels
|
||||
static int InitOpenclRunEnv_DeviceSelection( int argc ); // RegistOpenclKernel, double flags, compile kernels
|
||||
static int InitOpenclRunEnv( GPUEnv *gpu ); // select device by env_CPU or selector
|
||||
static int RegistOpenclKernel();
|
||||
static int ReleaseOpenclRunEnv();
|
||||
static int ReleaseOpenclEnv( GPUEnv *gpuInfo );
|
||||
@ -320,12 +302,11 @@ public:
|
||||
static void FreeOpenclDll();
|
||||
#endif
|
||||
|
||||
//int GetOpenclState();
|
||||
//void SetOpenclState( int state );
|
||||
|
||||
inline static int AddKernelConfig( int kCount, const char *kName );
|
||||
|
||||
/* for binarization */
|
||||
static void HistogramRectOCL(
|
||||
static int HistogramRectOCL(
|
||||
const unsigned char *imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
@ -335,7 +316,8 @@ public:
|
||||
int height,
|
||||
int kHistogramSize,
|
||||
int *histogramAllChannels);
|
||||
static void ThresholdRectToPixOCL(
|
||||
|
||||
static int ThresholdRectToPixOCL(
|
||||
const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
@ -346,11 +328,12 @@ public:
|
||||
int rect_width,
|
||||
int rect_top,
|
||||
int rect_left);
|
||||
#if USE_DEVICE_SELECTION
|
||||
|
||||
static Pix * pixConvertRGBToGrayOCL( Pix *pix, float weightRed = 0.3, float weightGreen = 0.5, float weightBlue = 0.2 );
|
||||
|
||||
static ds_device getDeviceSelection();
|
||||
static ds_device selectedDevice;
|
||||
static bool deviceIsSelected;
|
||||
#endif
|
||||
static bool selectedDeviceIsOpenCL();
|
||||
static bool selectedDeviceIsNativeCPU();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user