#include "openclwrapper.h" #include "oclkernels.h" #ifdef WIN32 #include #include #else #include #include #endif #include "opencl_device_selection.h" // for micro-benchmark #include "otsuthr.h" #include "thresholder.h" #ifdef USE_OPENCL cl_device_id performDeviceSelection( ); GPUEnv OpenclDevice::gpuEnv; int OpenclDevice::isInited =0; struct tiff_transform { int vflip; /* if non-zero, image needs a vertical fip */ int hflip; /* if non-zero, image needs a horizontal flip */ int rotate; /* -1 -> counterclockwise 90-degree rotation, 0 -> no rotation 1 -> clockwise 90-degree rotation */ }; static struct tiff_transform tiff_orientation_transforms[] = { {0, 0, 0}, {0, 1, 0}, {1, 1, 0}, {1, 0, 0}, {0, 1, -1}, {0, 0, 1}, {0, 1, 1}, {0, 0, -1} }; static const l_int32 MAX_PAGES_IN_TIFF_FILE = 3000; cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers cl_mem pixThBuffer; //output from thresholdtopix calculation cl_int clStatus; KernelEnv rEnv; void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) { printf("[DS] populateGPUEnvFromDevice\n"); size_t size; gpuInfo->mnIsUserCreated = 1; // device gpuInfo->mpDevID = device; gpuInfo->mpArryDevsID = new cl_device_id[1]; gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID; clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE , sizeof(cl_device_type), (void *) &gpuInfo->mDevType , &size); CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)"); // platform clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID , &size); CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)"); // context cl_context_properties props[3]; props[0] = CL_CONTEXT_PLATFORM; props[1] = (cl_context_properties) gpuInfo->mpPlatformID; props[2] = 0; gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus); CHECK_OPENCL( clStatus, "populateGPUEnv::createContext"); // queue cl_command_queue_properties queueProperties = 0; gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus ); CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue"); } #ifdef WIN32 int OpenclDevice::LoadOpencl() { HINSTANCE HOpenclDll = NULL; void * OpenclDll = NULL; //fprintf(stderr, " LoadOpenclDllxx... \n"); OpenclDll = static_cast( HOpenclDll ); OpenclDll = LoadLibrary( "openCL.dll" ); if ( !static_cast( OpenclDll ) ) { fprintf(stderr, " Load opencl.dll failed! \n"); FreeLibrary( static_cast( OpenclDll ) ); return 0; } fprintf(stderr, " Load opencl.dll successfully!\n"); return 1; } #endif int OpenclDevice::SetKernelEnv( KernelEnv *envInfo ) { envInfo->mpkContext = gpuEnv.mpContext; envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue; envInfo->mpkProgram = gpuEnv.mpArryPrograms[0]; return 1; } cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus) { cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags), nElements * sizeof(l_uint32), hostbuffer, pStatus); return membuffer; } PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true) { PROCNAME("mapOutputCLBuffer"); if (!pixd) { if (memcopy) { if ((pixd = pixCreateTemplate(pixs)) == NULL) (PIX *)ERROR_PTR("pixd not made", procName, NULL); } else { if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL) (PIX *)ERROR_PTR("pixd not made", procName, NULL); } } l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0, elements * sizeof(l_uint32), 0, NULL, NULL, NULL ); if (memcopy) { memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32)); } else { pixSetData(pixd, pValues); } clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL); if (sync) { clFinish( rEnv.mpkCmdQueue ); } return pixd; } cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false) { cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), nElements * sizeof(l_int32), NULL, pStatus); if (_pValues != NULL) { l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, nElements * sizeof(l_int32), 0, NULL, NULL, NULL ); memcpy(pValues, _pValues, nElements * sizeof(l_int32)); clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL); if (sync) clFinish( rEnv.mpkCmdQueue ); } return xValues; } int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo ) { size_t length; cl_int clStatus; cl_uint numPlatforms, numDevices; cl_platform_id *platforms; cl_context_properties cps[3]; char platformName[256]; unsigned int i; // Have a look at the available platforms. if ( !gpuInfo->mnIsUserCreated ) { clStatus = clGetPlatformIDs( 0, NULL, &numPlatforms ); if ( clStatus != CL_SUCCESS ) { return 1; } gpuInfo->mpPlatformID = NULL; if ( 0 < numPlatforms ) { platforms = (cl_platform_id*) malloc( numPlatforms * sizeof( cl_platform_id ) ); if ( platforms == (cl_platform_id*) NULL ) { return 1; } clStatus = clGetPlatformIDs( numPlatforms, platforms, NULL ); if ( clStatus != CL_SUCCESS ) { return 1; } for ( i = 0; i < numPlatforms; i++ ) { clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof( platformName ), platformName, NULL ); if ( clStatus != CL_SUCCESS ) { return 1; } gpuInfo->mpPlatformID = platforms[i]; //if (!strcmp(platformName, "Intel(R) Coporation")) //if( !strcmp( platformName, "Advanced Micro Devices, Inc." )) { gpuInfo->mpPlatformID = platforms[i]; if ( getenv("SC_OPENCLCPU") ) { clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform CL_DEVICE_TYPE_CPU, // device_type for CPU device 0, // num_entries NULL, // devices &numDevices); printf("Selecting OpenCL device: CPU (a)\n"); } else { clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform CL_DEVICE_TYPE_GPU, // device_type for GPU device 0, // num_entries NULL, // devices &numDevices); printf("Selecting OpenCL device: GPU (a)\n"); } if ( clStatus != CL_SUCCESS ) continue; if ( numDevices ) break; } } if ( clStatus != CL_SUCCESS ) return 1; free( platforms ); } if ( NULL == gpuInfo->mpPlatformID ) return 1; // Use available platform. cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties) gpuInfo->mpPlatformID; cps[2] = 0; // Set device type for OpenCL if ( getenv("SC_OPENCLCPU") ) { gpuInfo->mDevType = CL_DEVICE_TYPE_CPU; printf("Selecting OpenCL device: CPU (b)\n"); } else { gpuInfo->mDevType = CL_DEVICE_TYPE_GPU; printf("Selecting OpenCL device: GPU (b)\n"); } gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus ); if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) ) { gpuInfo->mDevType = CL_DEVICE_TYPE_CPU; gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus ); printf("Selecting OpenCL device: CPU (c)\n"); } if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) ) { gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT; gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus ); printf("Selecting OpenCL device: DEFAULT (c)\n"); } if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) ) return 1; // Detect OpenCL devices. // First, get the size of device list data clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0, NULL, &length ); if ( ( clStatus != CL_SUCCESS ) || ( length == 0 ) ) return 1; // Now allocate memory for device list based on the size we got earlier gpuInfo->mpArryDevsID = (cl_device_id*) malloc( length ); if ( gpuInfo->mpArryDevsID == (cl_device_id*) NULL ) return 1; // Now, get the device list data clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, length, gpuInfo->mpArryDevsID, NULL ); if ( clStatus != CL_SUCCESS ) return 1; // Create OpenCL command queue. gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpArryDevsID[0], 0, &clStatus ); if ( clStatus != CL_SUCCESS ) return 1; } clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL ); // Check device extensions for double type size_t aDevExtInfoSize = 0; clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize ); CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); char *aExtInfo = new char[aDevExtInfoSize]; clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, sizeof(char) * aDevExtInfoSize, aExtInfo, NULL); CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); gpuInfo->mnKhrFp64Flag = 0; gpuInfo->mnAmdFp64Flag = 0; if ( strstr( aExtInfo, "cl_khr_fp64" ) ) { gpuInfo->mnKhrFp64Flag = 1; } else { // Check if cl_amd_fp64 extension is supported if ( strstr( aExtInfo, "cl_amd_fp64" ) ) gpuInfo->mnAmdFp64Flag = 1; } delete []aExtInfo; return 0; } void OpenclDevice::releaseMorphCLBuffers() { if (pixdCLIntermediate != NULL) clReleaseMemObject(pixdCLIntermediate); if (pixsCLBuffer != NULL) clReleaseMemObject(pixsCLBuffer); if (pixdCLBuffer != NULL) clReleaseMemObject(pixdCLBuffer); if (pixThBuffer != NULL) clReleaseMemObject(pixThBuffer); } int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs) { SetKernelEnv( &rEnv ); if (pixThBuffer != NULL) { pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); //Get the output from ThresholdToPix operation clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL); } else { //Get data from the source image l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32)); memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32)); pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus); } pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); return (int)clStatus; } int OpenclDevice::InitEnv() { printf("[DS] OpenclDevice::InitEnv()\n"); #ifdef SAL_WIN32 while( 1 ) { if( 1 == LoadOpencl() ) break; } #endif // sets up environment, compiles programs #if USE_DEVICE_SELECTION InitOpenclRunEnv_DeviceSelection( 0 ); #if 0 // after programs compiled, selects best device printf("[DS] calling performDeviceSelection()\n"); cl_device_id bestDevice = performDeviceSelection( ); // overwrite global static GPUEnv with new device printf("[DS] calling populateGPUEnvFromDevice() bestDevice->global static GPUEnv\n"); populateGPUEnvFromDevice( &gpuEnv, bestDevice ); printf("[DS] done.\n"); gpuEnv.mnFileCount = 0; //argc; gpuEnv.mnKernelCount = 0UL; OpenclDevice::CompileKernelFile(&gpuEnv, ""); #endif #else // init according to device InitOpenclRunEnv( 0 ); #endif return 1; } int OpenclDevice::ReleaseOpenclRunEnv() { ReleaseOpenclEnv( &gpuEnv ); #ifdef SAL_WIN32 FreeOpenclDll(); #endif return 1; } inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName ) { if ( kCount < 1 ) fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" ); strcpy( gpuEnv.mArrykernelNames[kCount-1], kName ); gpuEnv.mnKernelCount++; return 0; } int OpenclDevice::RegistOpenclKernel() { if ( !gpuEnv.mnIsUserCreated ) memset( &gpuEnv, 0, sizeof(gpuEnv) ); gpuEnv.mnFileCount = 0; //argc; gpuEnv.mnKernelCount = 0UL; AddKernelConfig( 1, (const char*) "oclAverageSub1" ); return 0; } int OpenclDevice::InitOpenclRunEnv( int argc ) { int status = 0; if ( MAX_CLKERNEL_NUM <= 0 ) { return 1; } if ( ( argc > MAX_CLFILE_NUM ) || ( argc < 0 ) ) return 1; if ( !isInited ) { RegistOpenclKernel(); //initialize devices, context, comand_queue status = InitOpenclRunEnv( &gpuEnv ); if ( status ) { fprintf(stderr,"init_opencl_env failed.\n"); return 1; } fprintf(stderr,"init_opencl_env successed.\n"); //initialize program, kernelName, kernelCount if( getenv( "SC_FLOAT" ) ) { gpuEnv.mnKhrFp64Flag = 0; gpuEnv.mnAmdFp64Flag = 0; } if( gpuEnv.mnKhrFp64Flag ) { fprintf(stderr,"----use khr double type in kernel----\n"); status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" ); } else if( gpuEnv.mnAmdFp64Flag ) { fprintf(stderr,"----use amd double type in kernel----\n"); status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" ); } else { fprintf(stderr,"----use float type in kernel----\n"); status = CompileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" ); } if ( status == 0 || gpuEnv.mnKernelCount == 0 ) { fprintf(stderr,"CompileKernelFile failed.\n"); return 1; } fprintf(stderr,"CompileKernelFile successed.\n"); isInited = 1; } return 0; } int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) { #if USE_DEVICE_SELECTION if (!isInited) { // after programs compiled, selects best device printf("[DS] calling performDeviceSelection()\n"); cl_device_id bestDevice = performDeviceSelection( ); // overwrite global static GPUEnv with new device printf("[DS] calling populateGPUEnvFromDevice() bestDevice->global static GPUEnv\n"); populateGPUEnvFromDevice( &gpuEnv, bestDevice ); printf("[DS] done.\n"); gpuEnv.mnFileCount = 0; //argc; gpuEnv.mnKernelCount = 0UL; CompileKernelFile(&gpuEnv, ""); isInited = 1; } #endif return 0; } OpenclDevice::OpenclDevice() { //InitEnv(); } OpenclDevice::~OpenclDevice() { //ReleaseOpenclRunEnv(); } int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo ) { int i = 0; int clStatus = 0; if ( !isInited ) { return 1; } for ( i = 0; i < gpuEnv.mnFileCount; i++ ) { if ( gpuEnv.mpArryPrograms[i] ) { clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] ); CHECK_OPENCL( clStatus, "clReleaseProgram" ); gpuEnv.mpArryPrograms[i] = NULL; } } if ( gpuEnv.mpCmdQueue ) { clReleaseCommandQueue( gpuEnv.mpCmdQueue ); gpuEnv.mpCmdQueue = NULL; } if ( gpuEnv.mpContext ) { clReleaseContext( gpuEnv.mpContext ); gpuEnv.mpContext = NULL; } isInited = 0; gpuInfo->mnIsUserCreated = 0; free( gpuInfo->mpArryDevsID ); return 1; } int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle ) { unsigned int i = 0; cl_int clStatus; int status = 0; char *str = NULL; FILE *fd = NULL; cl_uint numDevices=0; if ( getenv("SC_OPENCLCPU") ) { clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform CL_DEVICE_TYPE_CPU, // device_type for CPU device 0, // num_entries NULL, // devices ID &numDevices); } else { clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform CL_DEVICE_TYPE_GPU, // device_type for GPU device 0, // num_entries NULL, // devices ID &numDevices); } CHECK_OPENCL( clStatus, "clGetDeviceIDs" ); for ( i = 0; i < numDevices; i++ ) { char fileName[256] = { 0 }, cl_name[128] = { 0 }; if ( gpuEnv.mpArryDevsID[i] != 0 ) { char deviceName[1024]; clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL ); CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); str = (char*) strstr( clFileName, (char*) ".cl" ); memcpy( cl_name, clFileName, str - clFileName ); cl_name[str - clFileName] = '\0'; sprintf( fileName, "./%s-%s.bin", cl_name, deviceName ); fd = fopen( fileName, "rb" ); status = ( fd != NULL ) ? 1 : 0; } } if ( fd != NULL ) { *fhandle = fd; } return status; } int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ) { int i; for ( i = 0; i < gpuEnvCached->mnFileCount; i++ ) { if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 ) { if ( gpuEnvCached->mpArryPrograms[i] != NULL ) { return 1; } } } return 0; } int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ) { FILE *output = NULL; output = fopen( fileName, "wb" ); if ( output == NULL ) { return 0; } fwrite( birary, sizeof(char), numBytes, output ); fclose( output ); return 1; } int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName ) { unsigned int i = 0; cl_int clStatus; size_t *binarySizes, numDevices; cl_device_id *mpArryDevsID; char **binaries, *str = NULL; clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof(numDevices), &numDevices, NULL ); CHECK_OPENCL( clStatus, "clGetProgramInfo" ); mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); if ( mpArryDevsID == NULL ) { return 0; } /* grab the handles to all of the devices in the program. */ clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL ); CHECK_OPENCL( clStatus, "clGetProgramInfo" ); /* figure out the sizes of each of the binaries. */ binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices ); clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * numDevices, binarySizes, NULL ); CHECK_OPENCL( clStatus, "clGetProgramInfo" ); /* copy over all of the generated binaries. */ binaries = (char**) malloc( sizeof(char *) * numDevices ); if ( binaries == NULL ) { return 0; } for ( i = 0; i < numDevices; i++ ) { if ( binarySizes[i] != 0 ) { binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] ); if ( binaries[i] == NULL ) { return 0; } } else { binaries[i] = NULL; } } clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char *) * numDevices, binaries, NULL ); CHECK_OPENCL(clStatus,"clGetProgramInfo"); /* dump out each binary into its own separate file. */ for ( i = 0; i < numDevices; i++ ) { char fileName[256] = { 0 }, cl_name[128] = { 0 }; if ( binarySizes[i] != 0 ) { char deviceName[1024]; clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); str = (char*) strstr( clFileName, (char*) ".cl" ); memcpy( cl_name, clFileName, str - clFileName ); cl_name[str - clFileName] = '\0'; sprintf( fileName, "./%s-%s.bin", cl_name, deviceName ); if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) ) { printf("opencl-wrapper: write binary[%s] failed\n", fileName); return 0; } //else printf("opencl-wrapper: write binary[%s] succesfully\n", fileName); } } // Release all resouces and memory for ( i = 0; i < numDevices; i++ ) { if ( binaries[i] != NULL ) { free( binaries[i] ); binaries[i] = NULL; } } if ( binaries != NULL ) { free( binaries ); binaries = NULL; } if ( binarySizes != NULL ) { free( binarySizes ); binarySizes = NULL; } if ( mpArryDevsID != NULL ) { free( mpArryDevsID ); mpArryDevsID = NULL; } return 1; } void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus ) { l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, nElements * sizeof(l_int32), 0, NULL, NULL, NULL ); clFinish( rEnv.mpkCmdQueue ); if (_pValues != NULL) { for ( int i = 0; i < (int)nElements; i++ ) pValues[i] = (l_int32)_pValues[i]; } clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL); //clFinish( rEnv.mpkCmdQueue ); return; } int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) { cl_int clStatus = 0; size_t length; char *buildLog = NULL, *binary; const char *source; size_t source_size[1]; int b_error, binary_status, binaryExisted, idx; size_t numDevices; cl_device_id *mpArryDevsID; FILE *fd, *fd1; const char* filename = "kernel.cl"; fprintf(stderr, "CompileKernelFile ... \n"); if ( CachedOfKernerPrg(gpuInfo, filename) == 1 ) { return 1; } idx = gpuInfo->mnFileCount; source = kernel_src; source_size[0] = strlen( source ); binaryExisted = 0; if (gpuInfo == &gpuEnv) binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark if ( binaryExisted == 1 ) { clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, sizeof(numDevices), &numDevices, NULL ); CHECK_OPENCL( clStatus, "clGetContextInfo" ); mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); if ( mpArryDevsID == NULL ) { return 0; } b_error = 0; length = 0; b_error |= fseek( fd, 0, SEEK_END ) < 0; b_error |= ( length = ftell(fd) ) <= 0; b_error |= fseek( fd, 0, SEEK_SET ) < 0; if ( b_error ) { return 0; } binary = (char*) malloc( length + 2 ); if ( !binary ) { return 0; } memset( binary, 0, length + 2 ); b_error |= fread( binary, 1, length, fd ) != length; fclose( fd ); fd = NULL; // grab the handles to all of the devices in the context. clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL ); CHECK_OPENCL( clStatus, "clGetContextInfo" ); fprintf(stderr, "Create kernel from binary\n"); gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices, mpArryDevsID, &length, (const unsigned char**) &binary, &binary_status, &clStatus ); CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" ); free( binary ); free( mpArryDevsID ); mpArryDevsID = NULL; } else { // create a CL program using the kernel source fprintf(stderr, "Create kernel from source\n"); gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source, source_size, &clStatus); CHECK_OPENCL( clStatus, "clCreateProgramWithSource" ); } if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL ) { return 0; } //char options[512]; // create a cl program executable for all the devices specified printf("BuildProgram.\n"); if (!gpuInfo->mnIsUserCreated) { clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, buildOption, NULL, NULL); } else { clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), buildOption, NULL, NULL); } if ( clStatus != CL_SUCCESS ) { printf ("BuildProgram error!\n"); if ( !gpuInfo->mnIsUserCreated ) { clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &length ); } else { clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG, 0, NULL, &length); } if ( clStatus != CL_SUCCESS ) { printf("opencl create build log fail\n"); return 0; } buildLog = (char*) malloc( length ); if ( buildLog == (char*) NULL ) { return 0; } if ( !gpuInfo->mnIsUserCreated ) { clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], CL_PROGRAM_BUILD_LOG, length, buildLog, &length ); } else { clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG, length, buildLog, &length ); } if ( clStatus != CL_SUCCESS ) { printf("opencl program build info fail\n"); return 0; } fd1 = fopen( "kernel-build.log", "w+" ); if ( fd1 != NULL ) { fwrite( buildLog, sizeof(char), length, fd1 ); fclose( fd1 ); } free( buildLog ); return 0; } strcpy( gpuEnv.mArryKnelSrcFile[idx], filename ); if ( binaryExisted == 0 && gpuInfo == &gpuEnv) GeneratBinFromKernelSource( gpuEnv.mpArryPrograms[idx], filename ); gpuInfo->mnFileCount += 1; return 1; } l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line) { cl_int clStatus; KernelEnv rEnv; size_t globalThreads[2]; size_t localThreads[2]; int gsize; cl_mem valuesCl; cl_mem outputCl; //global and local work dimensions for Horizontal pass gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; globalThreads[0] = gsize; gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; globalThreads[1] = gsize; localThreads[0] = GROUPSIZE_X; localThreads[1] = GROUPSIZE_Y; SetKernelEnv( &rEnv ); l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32)); rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel"); //Allocate input and output OCL buffers valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); //Kernel arguments clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl ); CHECK_OPENCL( clStatus, "clSetKernelArg"); clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w ); CHECK_OPENCL( clStatus, "clSetKernelArg" ); clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h ); CHECK_OPENCL( clStatus, "clSetKernelArg" ); clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl ); CHECK_OPENCL( clStatus, "clSetKernelArg" ); clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl ); CHECK_OPENCL( clStatus, "clSetKernelArg"); //Kernel enqueue clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); /* map results back from gpu */ void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl"); clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL); //Sync clFinish( rEnv.mpkCmdQueue ); return pResult; } PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n ) { FILE *fp; PIX *pix; //printf("pixReadTiffCl file"); PROCNAME("pixReadTiff"); if (!filename) return (PIX *)ERROR_PTR("filename not defined", procName, NULL); if ((fp = fopenReadStream(filename)) == NULL) return (PIX *)ERROR_PTR("image file not found", procName, NULL); if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) { fclose(fp); return (PIX *)ERROR_PTR("pix not read", procName, NULL); } fclose(fp); return pix; } TIFF * OpenclDevice::fopenTiffCl(FILE *fp, const char *modestring) { l_int32 fd; PROCNAME("fopenTiff"); if (!fp) return (TIFF *)ERROR_PTR("stream not opened", procName, NULL); if (!modestring) return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL); if ((fd = fileno(fp)) < 0) return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL); lseek(fd, 0, SEEK_SET); return TIFFFdOpen(fd, "TIFFstream", modestring); } l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF *tif, l_int32 *pxres, l_int32 *pyres) { l_uint16 resunit; l_int32 foundxres, foundyres; l_float32 fxres, fyres; PROCNAME("getTiffStreamResolution"); if (!tif) return ERROR_INT("tif not opened", procName, 1); if (!pxres || !pyres) return ERROR_INT("&xres and &yres not both defined", procName, 1); *pxres = *pyres = 0; TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit); foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres); foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres); if (!foundxres && !foundyres) return 1; if (!foundxres && foundyres) fxres = fyres; else if (foundxres && !foundyres) fyres = fxres; if (resunit == RESUNIT_CENTIMETER) { /* convert to ppi */ *pxres = (l_int32)(2.54 * fxres + 0.5); *pyres = (l_int32)(2.54 * fyres + 0.5); } else { *pxres = (l_int32)fxres; *pyres = (l_int32)fyres; } return 0; } PIX * OpenclDevice::pixReadStreamTiffCl(FILE *fp, l_int32 n) { l_int32 i, pagefound; PIX *pix; TIFF *tif; PROCNAME("pixReadStreamTiff"); if (!fp) return (PIX *)ERROR_PTR("stream not defined", procName, NULL); if ((tif = fopenTiffCl(fp, "rb")) == NULL) return (PIX *)ERROR_PTR("tif not opened", procName, NULL); pagefound = FALSE; pix = NULL; for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) { if (i == n) { pagefound = TRUE; if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) { TIFFCleanup(tif); return (PIX *)ERROR_PTR("pix not read", procName, NULL); } break; } if (TIFFReadDirectory(tif) == 0) break; } if (pagefound == FALSE) { L_WARNING_INT("tiff page %d not found", procName, n); TIFFCleanup(tif); return NULL; } TIFFCleanup(tif); return pix; } static l_int32 getTiffCompressedFormat(l_uint16 tiffcomp) { l_int32 comptype; switch (tiffcomp) { case COMPRESSION_CCITTFAX4: comptype = IFF_TIFF_G4; break; case COMPRESSION_CCITTFAX3: comptype = IFF_TIFF_G3; break; case COMPRESSION_CCITTRLE: comptype = IFF_TIFF_RLE; break; case COMPRESSION_PACKBITS: comptype = IFF_TIFF_PACKBITS; break; case COMPRESSION_LZW: comptype = IFF_TIFF_LZW; break; case COMPRESSION_ADOBE_DEFLATE: comptype = IFF_TIFF_ZIP; break; default: comptype = IFF_TIFF; break; } return comptype; } void compare(l_uint32 *cpu, l_uint32 *gpu,int size) { for(int i=0;i 32) return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL); if (spp == 1) d = bps; else if (spp == 3 || spp == 4) d = 32; else return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL); TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w); TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h); tiffbpl = TIFFScanlineSize(tif); if ((pix = pixCreate(w, h, d)) == NULL) return (PIX *)ERROR_PTR("pix not made", procName, NULL); data = (l_uint8 *)pixGetData(pix); wpl = pixGetWpl(pix); bpl = 4 * wpl; if (spp == 1) { if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL) return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL); for (i = 0 ; i < h ; i++) { if (TIFFReadScanline(tif, linebuf, i, 0) < 0) { FREE(linebuf); pixDestroy(&pix); return (PIX *)ERROR_PTR("line read fail", procName, NULL); } memcpy((char *)data, (char *)linebuf, tiffbpl); data += bpl; } if (bps <= 8) pixEndianByteSwap(pix); else pixEndianTwoByteSwap(pix); FREE(linebuf); } else { if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) { pixDestroy(&pix); return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL); } if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata, ORIENTATION_TOPLEFT, 0)) { FREE(tiffdata); pixDestroy(&pix); return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL); } line = pixGetData(pix); //Invoke the OpenCL kernel for pixReadFromTiff l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line); pixSetData(pix, output_gpu); FREE(tiffdata); } if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) { pixSetXRes(pix, xres); pixSetYRes(pix, yres); } TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp); comptype = getTiffCompressedFormat(tiffcomp); pixSetInputFormat(pix, comptype); if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) { if ((cmap = pixcmapCreate(bps)) == NULL) { pixDestroy(&pix); return (PIX *)ERROR_PTR("cmap not made", procName, NULL); } ncolors = 1 << bps; for (i = 0; i < ncolors; i++) pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8, bluemap[i] >> 8); pixSetColormap(pix, cmap); } else { if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) { if (tiffcomp == COMPRESSION_CCITTFAX3 || tiffcomp == COMPRESSION_CCITTFAX4 || tiffcomp == COMPRESSION_CCITTRLE || tiffcomp == COMPRESSION_CCITTRLEW) { photometry = PHOTOMETRIC_MINISWHITE; } else photometry = PHOTOMETRIC_MINISBLACK; } if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) || (d == 8 && photometry == PHOTOMETRIC_MINISWHITE)) pixInvert(pix, pix); } if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) { if (orientation >= 1 && orientation <= 8) { struct tiff_transform *transform = &tiff_orientation_transforms[orientation - 1]; if (transform->vflip) pixFlipTB(pix, pix); if (transform->hflip) pixFlipLR(pix, pix); if (transform->rotate) { PIX *oldpix = pix; pix = pixRotate90(oldpix, transform->rotate); pixDestroy(&oldpix); } } } return pix; } //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) { size_t globalThreads[2]; cl_mem pixtemp; cl_int status; int gsize; size_t localThreads[2]; //Horizontal pass gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX; globalThreads[0] = gsize; globalThreads[1] = GROUPSIZE_HMORY; localThreads[0] = GROUPSIZE_HMORX; localThreads[1] = GROUPSIZE_HMORY; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status ); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), (const void *)&h); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); //Swap source and dest buffers pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; //Vertical gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; globalThreads[0] = gsize; gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; globalThreads[1] = gsize; localThreads[0] = GROUPSIZE_X; localThreads[1] = GROUPSIZE_Y; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status ); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), (const void *)&h); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); return status; } //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) { size_t globalThreads[2]; cl_mem pixtemp; cl_int status; int gsize; l_uint32 fwmask, lwmask; size_t localThreads[2]; lwmask = lmask32[32 - 2]; fwmask = rmask32[32 - 2]; //Horizontal pass gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX; globalThreads[0] = gsize; globalThreads[1] = GROUPSIZE_HMORY; localThreads[0] = GROUPSIZE_HMORX; localThreads[1] = GROUPSIZE_HMORY; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status ); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), (const void *)&h); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); //Swap source and dest buffers pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; //Vertical gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; globalThreads[0] = gsize; gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; globalThreads[1] = gsize; localThreads[0] = GROUPSIZE_X; localThreads[1] = GROUPSIZE_Y; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status ); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), (const void *)&h); status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), (const void *)&fwmask); status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), (const void *)&lwmask); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); return status; } //Morphology Dilate operation. Invokes the relevant OpenCL kernels cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { l_int32 xp, yp, xn, yn; SEL* sel; size_t globalThreads[2]; cl_mem pixtemp; cl_int status; int gsize; size_t localThreads[2]; char isEven; OpenclDevice::SetKernelEnv( &rEnv ); if (hsize == 5 && vsize == 5) { //Specific case for 5x5 status = pixDilateCL_55(wpl, h); return status; } sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); //global and local work dimensions for Horizontal pass gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; globalThreads[0] = gsize; gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; globalThreads[1] = gsize; localThreads[0] = GROUPSIZE_X; localThreads[1] = GROUPSIZE_Y; if (xp > 31 || xn > 31) { //Generic case. rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status ); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), (const void *)&xp); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), (const void *)&xn); status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), (const void *)&h); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); if (yp > 0 || yn > 0) { pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; } } else if (xp > 0 || xn > 0 ) { //Specfic Horizontal pass kernel for half width < 32 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status ); isEven = (xp != xn); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), (const void *)&xp); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), (const void *)&h); status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), (const void *)&isEven); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); if (yp > 0 || yn > 0) { pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; } } if (yp > 0 || yn > 0) { rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status ); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), (const void *)&yp); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), (const void *)&h); status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), (const void *)&yn); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); } return status; } //Morphology Erode operation. Invokes the relevant OpenCL kernels cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) { l_int32 xp, yp, xn, yn; SEL* sel; size_t globalThreads[2]; size_t localThreads[2]; cl_mem pixtemp; cl_int status; int gsize; char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC); l_uint32 rwmask, lwmask; char isEven; sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); OpenclDevice::SetKernelEnv( &rEnv ); if (hsize == 5 && vsize == 5 && isAsymmetric) { //Specific kernel for 5x5 status = pixErodeCL_55(wpl, h); return status; } rwmask = rmask32[32 - (xp & 31)]; lwmask = lmask32[32 - (xn & 31)]; //global and local work dimensions for Horizontal pass gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; globalThreads[0] = gsize; gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; globalThreads[1] = gsize; localThreads[0] = GROUPSIZE_X; localThreads[1] = GROUPSIZE_Y; //Horizontal Pass if (xp > 31 || xn > 31 ) { //Generic case. rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status ); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), (const void *)&xp); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), (const void *)&xn); status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), (const void *)&h); status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), (const void *)&isAsymmetric); status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), (const void *)&rwmask); status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), (const void *)&lwmask); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); if (yp > 0 || yn > 0) { pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; } } else if (xp > 0 || xn > 0) { rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status ); isEven = (xp != xn); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), (const void *)&xp); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), (const void *)&h); status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), (const void *)&isAsymmetric); status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), (const void *)&rwmask); status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), (const void *)&lwmask); status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), (const void *)&isEven); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); if (yp > 0 || yn > 0) { pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; } } //Vertical Pass if (yp > 0 || yn > 0) { rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status ); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), (const void *)&yp); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), (const void *)&h); status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), (const void *)&isAsymmetric); status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), (const void *)&yn); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); } return status; } // OpenCL implementation of Morphology Dilate //Note: Assumes the source and dest opencl buffer are initialized. No check done PIX* OpenclDevice::pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) { l_uint32 wpl, h; wpl = pixGetWpl(pixs); h = pixGetHeight(pixs); clStatus = pixDilateCL(hsize, vsize, wpl, h); if (reqDataCopy) { pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false); } return pixd; } // OpenCL implementation of Morphology Erode //Note: Assumes the source and dest opencl buffer are initialized. No check done PIX* OpenclDevice::pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) { l_uint32 wpl, h; wpl = pixGetWpl(pixs); h = pixGetHeight(pixs); clStatus = pixErodeCL(hsize, vsize, wpl, h); if (reqDataCopy) { pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); } return pixd; } //Morphology Open operation. Invokes the relevant OpenCL kernels cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { cl_int status; cl_mem pixtemp; //Erode followed by Dilate status = pixErodeCL(hsize, vsize, wpl, h); pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; status = pixDilateCL(hsize, vsize, wpl, h); return status; } //Morphology Close operation. Invokes the relevant OpenCL kernels cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { cl_int status; cl_mem pixtemp; //Dilate followed by Erode status = pixDilateCL(hsize, vsize, wpl, h); pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; status = pixErodeCL(hsize, vsize, wpl, h); return status; } // OpenCL implementation of Morphology Close //Note: Assumes the source and dest opencl buffer are initialized. No check done PIX* OpenclDevice::pixCloseBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) { l_uint32 wpl, h; wpl = pixGetWpl(pixs); h = pixGetHeight(pixs); clStatus = pixCloseCL(hsize, vsize, wpl, h); if (reqDataCopy) { pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); } return pixd; } // OpenCL implementation of Morphology Open //Note: Assumes the source and dest opencl buffer are initialized. No check done PIX* OpenclDevice::pixOpenBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) { l_uint32 wpl, h; wpl = pixGetWpl(pixs); h = pixGetHeight(pixs); clStatus = pixOpenCL(hsize, vsize, wpl, h); if (reqDataCopy) { pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); } return pixd; } //pix OR operation: outbuffer = buffer1 | buffer2 cl_int pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer) { cl_int status; size_t globalThreads[2]; int gsize; size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; globalThreads[0] = gsize; gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; globalThreads[1] = gsize; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status ); status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(cl_mem), &outbuffer); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), (const void *)&h); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); return status; } //pix AND operation: outbuffer = buffer1 & buffer2 cl_int pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer) { cl_int status; size_t globalThreads[2]; int gsize; size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; globalThreads[0] = gsize; gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; globalThreads[1] = gsize; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status ); // Enqueue a kernel run call. status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(cl_mem), &outbuffer); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), (const void *)&h); status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); return status; } //output = buffer1 & ~(buffer2) cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL) { cl_int status; size_t globalThreads[2]; int gsize; size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; globalThreads[0] = gsize; gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; globalThreads[1] = gsize; if (outBuffer != NULL) { rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status ); } else { rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status ); } // Enqueue a kernel run call. status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1); status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2); status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), (const void *)&wpl); status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), (const void *)&h); if (outBuffer != NULL) { status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), (const void *)&outBuffer); } status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); return status; } // OpenCL implementation of Subtract pix //Note: Assumes the source and dest opencl buffer are initialized. No check done PIX* OpenclDevice::pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy = false) { l_uint32 wpl, h; PROCNAME("pixSubtractCL"); if (!pixs1) return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd); if (!pixs2) return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd); if (pixGetDepth(pixs1) != pixGetDepth(pixs2)) return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd); #if EQUAL_SIZE_WARNING if (!pixSizesEqual(pixs1, pixs2)) L_WARNING("pixs1 and pixs2 not equal sizes", procName); #endif /* EQUAL_SIZE_WARNING */ wpl = pixGetWpl(pixs1); h = pixGetHeight(pixs1); clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); if (reqDataCopy) { //Read back output data from OCL buffer to cpu pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ); } return pixd; } // OpenCL implementation of Hollow pix //Note: Assumes the source and dest opencl buffer are initialized. No check done PIX* OpenclDevice::pixHollowCL(PIX *pixd, PIX *pixs, l_int32 close_hsize, l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize, bool reqDataCopy = false) { l_uint32 wpl, h; cl_mem pixtemp; wpl = pixGetWpl(pixs); h = pixGetHeight(pixs); //First step : Close Morph operation: Dilate followed by Erode clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h); //Store the output of close operation in an intermediate buffer //this will be later used for pixsubtract clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); //Second step: Open Operation - Erode followed by Dilate pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h); //Third step: Subtract : (Close - Open) pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixdCLIntermediate; pixdCLIntermediate = pixtemp; clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); if (reqDataCopy) { //Read back output data from OCL buffer to cpu pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); } return pixd; } // OpenCL implementation of Get Lines from pix function //Note: Assumes the source and dest opencl buffer are initialized. No check done void OpenclDevice::pixGetLinesCL(PIX *pixd, PIX *pixs, PIX** pix_vline, PIX** pix_hline, PIX** pixClosed, bool getpixClosed, l_int32 close_hsize, l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize, l_int32 line_hsize, l_int32 line_vsize) { l_uint32 wpl, h; cl_mem pixtemp; wpl = pixGetWpl(pixs); h = pixGetHeight(pixs); //First step : Close Morph operation: Dilate followed by Erode clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h); //Copy the Close output to CPU buffer if (getpixClosed) { *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false); } //Store the output of close operation in an intermediate buffer //this will be later used for pixsubtract clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); //Second step: Open Operation - Erode followed by Dilate pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h); //Third step: Subtract : (Close - Open) pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixdCLIntermediate; pixdCLIntermediate = pixtemp; clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); //Store the output of Hollow operation in an intermediate buffer //this will be later used clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLBuffer; pixdCLBuffer = pixtemp; //Fourth step: Get vertical line //pixOpenBrick(NULL, pix_hollow, 1, min_line_length); clStatus = pixOpenCL(1, line_vsize, wpl, h); //Copy the vertical line output to CPU buffer *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false); pixtemp = pixsCLBuffer; pixsCLBuffer = pixdCLIntermediate; pixdCLIntermediate = pixtemp; //Fifth step: Get horizontal line //pixOpenBrick(NULL, pix_hollow, min_line_length, 1); clStatus = pixOpenCL(line_hsize, 1, wpl, h); //Copy the horizontal line output to CPU buffer *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true); return; } /************************************************************************* * HistogramRect * Otsu Thresholding Operations * histogramAllChannels is layed out as all channel 0, then all channel 1... ************************************************************************/ void OpenclDevice::HistogramRectOCL( const unsigned char* imageData, int bytes_per_pixel, int bytes_per_line, int left, // TODO use this int top, // TODO use this int width, int height, int kHistogramSize, int* histogramAllChannels) { cl_int clStatus; KernelEnv histKern; SetKernelEnv( &histKern ); KernelEnv histRedKern; SetKernelEnv( &histRedKern ); /* map imagedata to device as read only */ // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need. // faster option would be to allocate initial image buffer // using a garlic bus memory type cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); /* setup work group size parameters */ int block_size = 256; int numCUs = 6; // for KV, TODO read from device info int requestedOccupancy = 10; // will be limited by local memory; TODO calculate ideal? int numWorkGroups = numCUs * requestedOccupancy; int numThreads = block_size*numWorkGroups; size_t local_work_size[] = {block_size}; size_t global_work_size[] = {numThreads}; size_t red_global_work_size[] = {block_size*kHistogramSize*bytes_per_pixel}; // {block_size*kHistogramSize}; /*uchar*/ /* map histogramAllChannels as write only */ int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups; cl_uint *histogramBufferHost = new cl_uint[numBins]; for (int i = 0; i < numBins; i++) histogramBufferHost[i] = 123456789; cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer"); /* intermediate histogram buffer */ int histRed = 256; int tmpHistogramBins = kHistogramSize*bytes_per_pixel*histRed; // numThreads*kHistogramSize; /*uchar*/ cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer"); /* atomic sync buffer */ int *zeroBuffer = new int[1]; zeroBuffer[0] = 0; cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer"); /* compile kernels */ histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels"); histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction"); /* set kernel 1 arguments */ clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); cl_uint numPixels = width*height; clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels ); CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" ); clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); /* set kernel 2 arguments */ int n = numThreads/bytes_per_pixel; clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n ); CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer"); /* launch histogram */ //QueryPerformanceCounter(&t2); clStatus = clEnqueueNDRangeKernel( histKern.mpkCmdQueue, histKern.mpkKernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" ); clFinish( histKern.mpkCmdQueue ); /* launch histogram */ clStatus = clEnqueueNDRangeKernel( histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, NULL, red_global_work_size, local_work_size, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" ); clFinish( histRedKern.mpkCmdQueue ); /* map results back from gpu */ void *ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); clEnqueueUnmapMemObject(histKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL); clReleaseMemObject(histogramBuffer); clReleaseMemObject(imageBuffer); delete[] histogramBufferHost; } // Threshold the rectangle, taking everything except the image buffer pointer // from the class, using thresholds/hi_values to the output IMAGE. void OpenclDevice::ThresholdRectToPixOCL( const unsigned char* imageData, int bytes_per_pixel, int bytes_per_line, const int* thresholds, const int* hi_values, Pix** pix, int height, int width, int top, int left) { /* create pix result buffer */ *pix = pixCreate(width, height, 1); uinT32* pixData = pixGetData(*pix); int wpl = pixGetWpl(*pix); int pixSize = wpl*height*sizeof(uinT32); cl_int clStatus; KernelEnv rEnv; SetKernelEnv( &rEnv ); /* setup work group size parameters */ int block_size = 256; int numCUs = 8; // for KV, TODO read from device info int requestedOccupancy = 10; // will be limited by local memory; TODO calculate ideal? int numWorkGroups = numCUs * requestedOccupancy; int numThreads = block_size*numWorkGroups; size_t local_work_size[] = {block_size}; size_t global_work_size[] = {numThreads}; /* map imagedata to device as read only */ // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need. // faster option would be to allocate initial image buffer // using a garlic bus memory type cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); /* map pix as write only */ pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer pix"); /* map thresholds and hi_values */ cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer"); cl_mem hiValuesBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer"); /* compile kernel */ rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix"); /* set kernel arguments */ clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); cl_uint numPixels = width*height; clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height ); CHECK_OPENCL( clStatus, "clSetKernelArg height" ); clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width ); CHECK_OPENCL( clStatus, "clSetKernelArg width" ); clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl ); CHECK_OPENCL( clStatus, "clSetKernelArg wpl" ); clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" ); clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" ); clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer"); /* launch kernel & wait */ clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" ); clFinish( rEnv.mpkCmdQueue ); /* map results back from gpu */ void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL); clReleaseMemObject(imageBuffer); clReleaseMemObject(thresholdsBuffer); clReleaseMemObject(hiValuesBuffer); } /****************************************************************************** * Micro Benchmarks for Device Selection *****************************************************************************/ #if USE_DEVICE_SELECTION typedef struct _TessScoreEvaluationInputData { int imageHeight; int imageWidth; int imageNumChannels; } TessScoreEvaluationInputData; typedef struct _TessDeviceScore { float time; // small time means faster device } TessDeviceScore; bool pixReadTiffMicroBench(TessScoreEvaluationInputData input, ds_device_type type ) { // input data // function call // cleanup return true; } bool histogramRectMicroBench( TessScoreEvaluationInputData input, ds_device_type type ) { // input data int height = input.imageHeight; int width = input.imageWidth; int bytes_per_pixel = input.imageNumChannels; unsigned char *imageData = new unsigned char[height*width]; int bytes_per_line = width*bytes_per_pixel / 8; int left = 0; int top = 0; int kHistogramSize = 256; int *histogramAllChannels = new int[kHistogramSize*bytes_per_pixel]; // function call if (type == DS_DEVICE_OPENCL_DEVICE) { OpenclDevice od; od.HistogramRectOCL( imageData, bytes_per_pixel, bytes_per_line, left, top, width, height, kHistogramSize, histogramAllChannels); } else { int *histogram = new int[kHistogramSize]; for (int ch = 0; ch < bytes_per_pixel; ++ch) { tesseract::HistogramRect(imageData + ch, bytes_per_pixel, bytes_per_line, left, top, width, height, histogram); } delete[] histogram; } // cleanup delete[] imageData; delete[] histogramAllChannels; return true; } double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { double time; LARGE_INTEGER freq, time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); // input data int height = input.imageHeight; int width = input.imageWidth; int bytes_per_pixel = input.imageNumChannels; unsigned char pixelHi = (unsigned char)255; unsigned char* imageData = new unsigned char[height*width*bytes_per_pixel]; for (int i = 0; i < height*width*bytes_per_pixel; i++) { imageData[i] = (unsigned char) rand()%pixelHi; } int bytes_per_line = width/8; int* thresholds = new int[4]; thresholds[0] = pixelHi/2; thresholds[1] = pixelHi/2; thresholds[2] = pixelHi/2; thresholds[3] = pixelHi/2; int *hi_values = new int[4]; thresholds[0] = pixelHi; thresholds[1] = pixelHi; thresholds[2] = pixelHi; thresholds[3] = pixelHi; Pix* pix = pixCreate(width, height, 1); int top = 0; int left = 0; // function call if (type == DS_DEVICE_OPENCL_DEVICE) { /* create pix result buffer */ //*pix = pixCreate(width, height, 1); uinT32* pixData = pixGetData(pix); int wpl = pixGetWpl(pix); int pixSize = wpl*height*sizeof(uinT32); cl_int clStatus; /* setup work group size parameters */ int block_size = 256; int numCUs = 8; // for KV, TODO read from device info int requestedOccupancy = 10; // will be limited by local memory; TODO calculate ideal? int numWorkGroups = numCUs * requestedOccupancy; int numThreads = block_size*numWorkGroups; size_t local_work_size[] = {block_size}; size_t global_work_size[] = {numThreads}; /* map imagedata to device as read only */ cl_mem imageBuffer = clCreateBuffer( env->mpContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); /* map pix as write only */ cl_mem pixBuffer = clCreateBuffer( env->mpContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer pix"); /* map thresholds and hi_values */ cl_mem thresholdsBuffer = clCreateBuffer( env->mpContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer"); cl_mem hiValuesBuffer = clCreateBuffer( env->mpContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer"); /* compile kernel */ cl_kernel kernel = clCreateKernel( env->mpArryPrograms[0], "kernel_ThresholdRectToPix", &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix"); /* set kernel arguments */ clStatus = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&imageBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); cl_uint numPixels = width*height; clStatus = clSetKernelArg( kernel, 1, sizeof(int), (void *)&height ); CHECK_OPENCL( clStatus, "clSetKernelArg height" ); clStatus = clSetKernelArg( kernel, 2, sizeof(int), (void *)&width ); CHECK_OPENCL( clStatus, "clSetKernelArg width" ); clStatus = clSetKernelArg( kernel, 3, sizeof(int), (void *)&wpl ); CHECK_OPENCL( clStatus, "clSetKernelArg wpl" ); clStatus = clSetKernelArg( kernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" ); clStatus = clSetKernelArg( kernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" ); clStatus = clSetKernelArg( kernel, 6, sizeof(cl_mem), (void *)&pixBuffer ); CHECK_OPENCL( clStatus, "clSetKernelArg pixBuffer"); /* launch kernel & wait */ QueryPerformanceCounter(&time_funct_start); clStatus = clEnqueueNDRangeKernel( env->mpCmdQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" ); clFinish( env->mpCmdQueue ); QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); /* map results back from gpu */ void *ptr = clEnqueueMapBuffer(env->mpCmdQueue, pixBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); clEnqueueUnmapMemObject(env->mpCmdQueue, pixBuffer, ptr, 0, NULL, NULL); clReleaseMemObject(pixBuffer); clReleaseMemObject(imageBuffer); clReleaseMemObject(thresholdsBuffer); clReleaseMemObject(hiValuesBuffer); } else { tesseract::ImageThresholder thresholder; thresholder.SetImage( pix ); QueryPerformanceCounter(&time_funct_start); thresholder.ThresholdRectToPix( imageData, bytes_per_pixel, bytes_per_line, thresholds, hi_values, &pix ); QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); } // cleanup delete[] imageData; delete[] thresholds; delete[] hi_values; pixDestroy(&pix); return time; } /****************************************************************************** * Device Selection *****************************************************************************/ #include "stdlib.h" // encode score object as byte string ds_status serializeScore( ds_device* device, // input void **serializedScore, // output unsigned int* serializedScoreSize // output ) { *serializedScoreSize = sizeof(TessDeviceScore); *serializedScore = (void *) new unsigned char[*serializedScoreSize]; memcpy(*serializedScore, device->score, *serializedScoreSize); return DS_SUCCESS; } // parses byte string and stores in score object ds_status deserializeScore( ds_device* device, // output const unsigned char* serializedScore, // input unsigned int serializedScoreSize // input ) { // check that serializedScoreSize == sizeof(TessDeviceScore); device->score = new TessDeviceScore; memcpy(device->score, serializedScore, serializedScoreSize); return DS_SUCCESS; } // evaluate devices ds_status evaluateScoreForDevice( ds_device *device, void *inputData) { // overwrite statuc gpuEnv w/ current device // so native opencl calls can be used; they use static gpuEnv printf("[DS] evaluateScoreForDevice [%i:%s] calling populateGPUEnvFromDevice\n", device->type, device->oclDeviceName); GPUEnv *env = NULL; if (device->type == DS_DEVICE_OPENCL_DEVICE) { env = new GPUEnv; printf("[DS] populating tmp GPUEnv from device\n"); populateGPUEnvFromDevice( env, device->oclDeviceID); env->mnFileCount = 0; //argc; env->mnKernelCount = 0UL; printf("[DS] compiling kernels for tmp GPUEnv\n"); OpenclDevice::CompileKernelFile(env, ""); } TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData; LARGE_INTEGER freq, time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); // pixReadTiff //QueryPerformanceCounter(&time_funct_start); //bool pixReadTiffCorrect = pixReadTiffMicroBench( *input, device->type ); //QueryPerformanceCounter(&time_funct_end); double pixReadTiffTime = 0.f; // (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); // HistogramRect QueryPerformanceCounter(&time_funct_start); //bool histogramRectCorrect = histogramRectMicroBench( *env, *input, device->type ); QueryPerformanceCounter(&time_funct_end); double histogramRectTime = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); // ThresholdRectToPix //QueryPerformanceCounter(&time_funct_start); double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type ); //QueryPerformanceCounter(&time_funct_end); //double thresholdRectToPixTime = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); // others QueryPerformanceCounter(&time_funct_start); // function call QueryPerformanceCounter(&time_funct_end); double otherTime = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); // weigh times float weightedTime = 1.00 * pixReadTiffTime + 1.00 * histogramRectTime + 1.00 * thresholdRectToPixTime + 1.00 * otherTime; device->score = (void *)new TessDeviceScore; ((TessDeviceScore *)device->score)->time = weightedTime; //TessDeviceScore *score = new TessDeviceScore; //score->time = weightedTime; //*(device->score) = (void *)score; printf("[DS] Device %i:%s evaluated. Score: %f\n", device->type, device->oclDeviceName, ((TessDeviceScore *)device->score)->time ); return DS_SUCCESS; } // initial call to select device cl_device_id performDeviceSelection( ) { printf("[DS] Performing Device Selection\n"); // setup devices ds_status status; ds_profile *profile; status = initDSProfile( &profile, "v0.1" ); // try reading scores from file char *fileName = "tesseract_opencl_profile_devices.dat"; status = readProfileFromFile( profile, deserializeScore, fileName); if (status != DS_SUCCESS) { printf("[DS] Performing Device Profiling\n"); // need to run evaluation // create input data TessScoreEvaluationInputData input; input.imageHeight = 2048; input.imageWidth = 2048; input.imageNumChannels = 4; // perform evaluations unsigned int numUpdates; status = profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates ); #if 1 // write scores to file if ( status == DS_SUCCESS ) { status = writeProfileToFile( profile, serializeScore, fileName); if ( status == DS_SUCCESS ) { printf("[DS] Scores written to file (%s).", fileName); } else { printf("[DS] Error saving scores to file (%s); scores not written to file.", fileName); } } else { printf("[DS] Unable to evaluate performance; scores not written to file."); } #endif } else { printf("[DS] Scores read from file.\n"); } // we now have device scores either from file or evaluation // select fastest using custom Tesseract selection algorithm float bestTime = FLT_MAX; // begin search with worst possible time int bestDeviceIdx = -1; for (int d = 0; d < profile->numDevices; d++) { //((TessDeviceScore *)device->score)->time ds_device device = profile->devices[d]; TessDeviceScore score = *(TessDeviceScore *)device.score; float time = score.time; printf("[DS] Device[%i] %i:%s score is %f\n", d, device.type, device.oclDeviceName, time); if (time < bestTime) { bestDeviceIdx = d; } } bestDeviceIdx = 0; // TODO remove me cl_device_id bestDevice = profile->devices[bestDeviceIdx].oclDeviceID; printf("[DS] Selected Device[%i]: %s\n", bestDeviceIdx, profile->devices[bestDeviceIdx].oclDeviceName); // cleanup // call destructor for profile object return bestDevice; } #endif #endif