Fixes for OpenCL issues reported on Apple Mac. Still get -54 on Apple Mac while running on OpenCL CPU, however it is ignored now.

2025-06-11 12:43:17 +08:00 · 2015-08-24 15:57:18 +05:30 · 2015-08-24 15:57:18 +05:30 · 8e9159b091
commit 8e9159b091
parent 2c8bc4a2ac
4 changed files with 563 additions and 628 deletions
--- a/opencl/oclkernels.h
+++ b/opencl/oclkernels.h
@ -1100,10 +1100,13 @@ void kernel_ThresholdRectToPix(
    }
 }

-// only supports 1 channel
+\n#define CHAR_VEC_WIDTH 8 \n
+\n#define PIXELS_PER_WORD 32 \n
+\n#define PIXELS_PER_BURST 8 \n
+\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
 typedef union {
-  uchar s[PIXELS_PER_BURST];
-  uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
+  uchar s[PIXELS_PER_BURST*1];
+  uchar8 v[(PIXELS_PER_BURST*1)/CHAR_VEC_WIDTH];
 } charVec1;

 __attribute__((reqd_work_group_size(256, 1, 1)))
@ -1112,7 +1115,7 @@ void kernel_ThresholdRectToPix_OneChan(
    __global const uchar8 *imageData,
    int height,
    int width,
-    int wpl, // words per line
+    int wpl, // words per line of output image
    __global int *thresholds,
    __global int *hi_values,
    __global int *pix) {
@ -1134,96 +1137,71 @@ void kernel_ThresholdRectToPix_OneChan(

            // load burst
            charVec1 pixels;
-            for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
-                pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH)  + i];
-            }
+            // for each char8 in burst
+            pixels.v[0] = imageData[
+                    w*BURSTS_PER_WORD
+                  + b
+                  + 0 ];

            // for each pixel in burst
            for ( int p = 0; p < PIXELS_PER_BURST; p++) {
-                for ( int c = 0; c < 1; c++) {
-                    unsigned char pixChan = pixels.s[p + c];
-                    if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
+                
+                  //int littleEndianIdx = p ^ 3;
+                  //int bigEndianIdx = p;
+                  int idx = 
+\n#ifdef __ENDIAN_LITTLE__\n
+                  p ^ 3;
+\n#else\n
+                  p;
+\n#endif\n
+                unsigned char pixChan = pixels.s[idx];
+                if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
                    word |=  (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
                }
            }
        }
-        }
        pix[w] = word;
    }
 }
+
 )

+
+KERNEL(
+\n#define RED_SHIFT		24\n
+\n#define GREEN_SHIFT		16\n
+\n#define BLUE_SHIFT		8\n
+\n#define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))\n
+\n
+\n__attribute__((reqd_work_group_size(256, 1, 1)))\n
+\n__kernel\n
+\nvoid kernel_RGBToGray(
+    __global const unsigned int *srcData,
+	__global unsigned char *dstData,
+    int srcWPL,
+    int dstWPL,
+    int height,
+    int width,
+	float rwt,
+	float gwt,
+	float bwt ) {
+    
+    // pixel index
+    int pixelIdx = get_global_id(0);
+    if (pixelIdx >= height*width) return;
+
+	unsigned int word = srcData[pixelIdx];
+	int output =	(rwt * ((word >> RED_SHIFT)	  & 0xff) +
+                     gwt * ((word >> GREEN_SHIFT) & 0xff) +
+                     bwt * ((word >> BLUE_SHIFT)  & 0xff) + 0.5);
+    // SET_DATA_BYTE
+    dstData[pixelIdx] = output;
+}
+)
+#endif
+
 ; // close char*

 #endif // USE_EXTERNAL_KERNEL
-#endif //_OCL_KERNEL_H_
+//#endif //_OCL_KERNEL_H_
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
-
-// Alternative histogram kernel written to use uchar and different global memory scattered write
-// was a little better for intel platforms but still not faster then native serial code
-#if 0
-/*  data layed out as
-    bin0                                        bin1                            bin2...
-    r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
-*/
-\n__attribute__((reqd_work_group_size(256, 1, 1)))
-\n  __kernel
-\n  void kernel_HistogramRectAllChannels_uchar(
-\n      volatile __global const uchar  *data,
-\n                              uint   numPixels,
-\n      volatile __global       uint   *histBuffer) {
-\n      
-\n      // for each pixel/channel, accumulate in global memory
-\n      for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
-\n          uchar value = data[pc];
-\n          int idx = value*get_global_size(0) + get_global_id(0);
-\n          histBuffer[ idx ]++; // coalesced if same value
-\n      }
-\n  } // kernel_HistogramRectAllChannels
-\n
-\n  __attribute__((reqd_work_group_size(256, 1, 1)))
-\n  __kernel
-\n  void kernel_HistogramRectAllChannelsReduction_uchar(
-\n      int n, // pixel redundancy that needs to be accumulated = nthreads/4
-\n      __global uint4 *histBuffer,
-\n      __global uint* histResult) { // each wg accumulates 1 bin (all channels within it
-\n  
-\n      // declare variables
-\n      int binIdx     = get_group_id(0);
-\n      size_t groupId = get_group_id(0);
-\n      size_t localId = get_local_id(0); // 0 -> 256-1
-\n      size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
-\n      uint numThreads = get_global_size(0);
-\n      uint4 hist = {0, 0, 0, 0};
-\n
-\n      // accumulate in register
-\n      for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
-\n          hist += histBuffer[binIdx*n+p];
-\n      }
-\n  
-\n      // reduction in local memory
-\n      __local uint4 localHist[GROUP_SIZE];
-\n      localHist[localId] = hist;
-\n      barrier(CLK_LOCAL_MEM_FENCE);
-\n  
-\n      for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
-\n          if (localId < stride) {
-\n              hist = localHist[ localId+stride];
-\n          }
-\n          barrier(CLK_LOCAL_MEM_FENCE);
-\n          if (localId < stride) {
-\n              localHist[ localId] += hist;
-\n          }
-\n          barrier(CLK_LOCAL_MEM_FENCE);
-\n      }
-\n
-\n      // write reduction to final result
-\n      if (localId == 0) {
-\n          histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
-\n          histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
-\n          histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
-\n          histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
-\n      }
-\n  
-\n  } // kernel_HistogramRectAllChannels
-#endif
--- a/opencl/opencl_device_selection.h
+++ b/opencl/opencl_device_selection.h
@ -63,6 +63,8 @@ static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
    if (profile->devices!=NULL && sr!=NULL) {
      unsigned int i;
      for (i = 0; i < profile->numDevices; i++) {
+        if (profile->devices[i].oclDeviceName) free(profile->devices[i].oclDeviceName);
+        if (profile->devices[i].oclDriverVersion) free(profile->devices[i].oclDriverVersion);
        status = sr(profile->devices[i].score);
        if (status != DS_SUCCESS)
          break;
--- a/opencl/openclwrapper.cpp
+++ b/opencl/openclwrapper.cpp
@ -1,6 +1,7 @@
 #ifdef _WIN32
 #include <Windows.h>
 #include <io.h>
+
 #else
 #include <sys/types.h>
 #include <unistd.h>
@ -14,21 +15,22 @@
 #include "otsuthr.h"
 #include "thresholder.h"

+#if ON_APPLE
+#include <stdio.h>
+#include <mach/mach_time.h>
+#endif
 #ifdef USE_OPENCL

-#if ON_APPLE
-#define TIMESPEC mach_timespec
-#else
-#define TIMESPEC timespec
-#endif
-
 #include "opencl_device_selection.h"
+#ifdef _MSC_VER
+int LeptMsgSeverity = 3;  // L_SEVERITY_INFO
+#endif  // _MSC_VER
 GPUEnv OpenclDevice::gpuEnv;

-#if USE_DEVICE_SELECTION
+
 bool OpenclDevice::deviceIsSelected = false;
 ds_device OpenclDevice::selectedDevice;
-#endif
+

 int OpenclDevice::isInited = 0;

@ -202,175 +204,6 @@ PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, in
    return xValues;
 }

-int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
-{
-    size_t length;
-    cl_int clStatus;
-    cl_uint numPlatforms, numDevices;
-    cl_platform_id *platforms;
-    cl_context_properties cps[3];
-    char platformName[256];
-    unsigned int i;
-
-
-    // Have a look at the available platforms.
-
-    if ( !gpuInfo->mnIsUserCreated )
-    {
-        clStatus = clGetPlatformIDs( 0, NULL, &numPlatforms );
-        if ( clStatus != CL_SUCCESS )
-        {
-            return 1;
-        }
-        gpuInfo->mpPlatformID = NULL;
-
-        if ( 0 < numPlatforms )
-        {
-            platforms = (cl_platform_id*) malloc( numPlatforms * sizeof( cl_platform_id ) );
-            if ( platforms == (cl_platform_id*) NULL )
-            {
-                return 1;
-            }
-            clStatus = clGetPlatformIDs( numPlatforms, platforms, NULL );
-
-            if ( clStatus != CL_SUCCESS )
-            {
-                return 1;
-            }
-
-            for ( i = 0; i < numPlatforms; i++ )
-            {
-                clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
-                    sizeof( platformName ), platformName, NULL );
-
-                if ( clStatus != CL_SUCCESS )
-                {
-                    return 1;
-                }
-                gpuInfo->mpPlatformID = platforms[i];
-
-                //if (!strcmp(platformName, "Intel(R) Coporation"))
-                //if( !strcmp( platformName, "Advanced Micro Devices, Inc." ))
-                {
-                    gpuInfo->mpPlatformID = platforms[i];
-                    
-                    if ( getenv("SC_OPENCLCPU") )
-                    {
-                        clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
-                                                  CL_DEVICE_TYPE_CPU,    // device_type for CPU device
-                                                  0,                     // num_entries
-                                                  NULL,                  // devices
-                                                  &numDevices);
-                        printf("Selecting OpenCL device: CPU (a)\n");
-                    }
-                    else
-                    {
-                          clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
-                                                  CL_DEVICE_TYPE_GPU,      // device_type for GPU device
-                                                  0,                       // num_entries
-                                                  NULL,                    // devices
-                                                  &numDevices);
-                          printf("Selecting OpenCL device: GPU (a)\n");
-                    }
-                    if ( clStatus != CL_SUCCESS )
-                        continue;
-
-                    if ( numDevices )
-                        break;
-                }
-            }
-            if ( clStatus != CL_SUCCESS )
-                return 1;
-            free( platforms );
-        }
-        if ( NULL == gpuInfo->mpPlatformID )
-            return 1;
-
-        // Use available platform.
-        cps[0] = CL_CONTEXT_PLATFORM;
-        cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
-        cps[2] = 0;
-        // Set device type for OpenCL
-        
-        if ( getenv("SC_OPENCLCPU") )
-        {
-            gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
-            printf("Selecting OpenCL device: CPU (b)\n");
-        }
-        else
-        {
-            gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
-            printf("Selecting OpenCL device: GPU (b)\n");
-        }
-
-        gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
-
-        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
-        {
-            gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
-            gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
-            printf("Selecting OpenCL device: CPU (c)\n");
-        }
-        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
-        {
-            gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT;
-            gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
-            printf("Selecting OpenCL device: DEFAULT (c)\n");
-        }
-        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
-            return 1;
-        // Detect OpenCL devices.
-        // First, get the size of device list data
-        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0, NULL, &length );
-        if ( ( clStatus != CL_SUCCESS ) || ( length == 0 ) )
-            return 1;
-        // Now allocate memory for device list based on the size we got earlier
-        gpuInfo->mpArryDevsID = (cl_device_id*) malloc( length );
-        if ( gpuInfo->mpArryDevsID == (cl_device_id*) NULL )
-            return 1;
-        // Now, get the device list data
-        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, length,
-                       gpuInfo->mpArryDevsID, NULL );
-        if ( clStatus != CL_SUCCESS )
-            return 1;
-
-        // Create OpenCL command queue.
-        gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpArryDevsID[0], 0, &clStatus );
-
-        if ( clStatus != CL_SUCCESS )
-            return 1;
-    }
-
-    clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL );
-    // Check device extensions for double type
-    size_t aDevExtInfoSize = 0;
-
-    clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize );
-    CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
-
-    char *aExtInfo = new char[aDevExtInfoSize];
-
-    clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS,
-                   sizeof(char) * aDevExtInfoSize, aExtInfo, NULL);
-    CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
-
-    gpuInfo->mnKhrFp64Flag = 0;
-    gpuInfo->mnAmdFp64Flag = 0;
-
-    if ( strstr( aExtInfo, "cl_khr_fp64" ) )
-    {
-        gpuInfo->mnKhrFp64Flag = 1;
-    }
-    else
-    {
-        // Check if cl_amd_fp64 extension is supported
-        if ( strstr( aExtInfo, "cl_amd_fp64" ) )
-            gpuInfo->mnAmdFp64Flag = 1;
-    }
-    delete []aExtInfo;
-
-    return 0;
-}

 void OpenclDevice::releaseMorphCLBuffers()
 {
@ -426,14 +259,8 @@ PERF_COUNT_SUB("LoadOpencl")
    // sets up environment, compiles programs


-#if USE_DEVICE_SELECTION
-    
    InitOpenclRunEnv_DeviceSelection( 0 );
 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
-#else
-    // init according to device
-    InitOpenclRunEnv( 0 );
-#endif
 //PERF_COUNT_END
    return 1;
 }
@ -465,62 +292,9 @@ int OpenclDevice::RegistOpenclKernel()
    AddKernelConfig( 1, (const char*) "oclAverageSub1" );
    return 0;
 }
-int OpenclDevice::InitOpenclRunEnv( int argc )
-{
-    int status = 0;
-    if ( MAX_CLKERNEL_NUM <= 0 )
-    {
-        return 1;
-    }
-    if ( ( argc > MAX_CLFILE_NUM ) || ( argc < 0 ) )
-        return 1;
-
-    if ( !isInited )
-    {
-        RegistOpenclKernel();
-        //initialize devices, context, comand_queue
-        status = InitOpenclRunEnv( &gpuEnv );
-        if ( status )
-        {
-            fprintf(stderr,"init_opencl_env failed.\n");
-            return 1;
-        }
-        fprintf(stderr,"init_opencl_env successed.\n");
-        //initialize program, kernelName, kernelCount
-        if( getenv( "SC_FLOAT" ) )
-        {
-            gpuEnv.mnKhrFp64Flag = 0;
-            gpuEnv.mnAmdFp64Flag = 0;
-        }
-        if( gpuEnv.mnKhrFp64Flag )
-        {
-            fprintf(stderr,"----use khr double type in kernel----\n");
-            status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
-        }
-        else if( gpuEnv.mnAmdFp64Flag )
-        {
-            fprintf(stderr,"----use amd double type in kernel----\n");
-            status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
-        }
-        else
-        {
-            fprintf(stderr,"----use float type in kernel----\n");
-            status = CompileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" );
-        }
-        if ( status == 0 || gpuEnv.mnKernelCount == 0 )
-        {
-            fprintf(stderr,"CompileKernelFile failed.\n");
-            return 1;
-        }
-        fprintf(stderr,"CompileKernelFile successed.\n");
-        isInited = 1;
-    }
-    return 0;
-}

 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
 //PERF_COUNT_START("InitOpenclRunEnv_DS")
-#if USE_DEVICE_SELECTION
    if (!isInited) {
        // after programs compiled, selects best device
        //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n");
@ -541,7 +315,6 @@ int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
        }
        isInited = 1;
    }
-#endif
 //PERF_COUNT_END
    return 0;
 }
@ -598,29 +371,7 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
    int status = 0;
    char *str = NULL;
    FILE *fd = NULL;
-    cl_uint numDevices=0;
-    if ( getenv("SC_OPENCLCPU") )
-    {
-        clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
-                                  CL_DEVICE_TYPE_CPU,  // device_type for CPU device
-                                  0,                   // num_entries
-                                  NULL,                // devices ID
-                                  &numDevices);
-    }
-    else
-    {
-        clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
-                                  CL_DEVICE_TYPE_GPU,  // device_type for GPU device
-                                  0,                   // num_entries
-                                  NULL,                // devices ID
-                                  &numDevices);
-    }
-    CHECK_OPENCL( clStatus, "clGetDeviceIDs" );
-    for ( i = 0; i < numDevices; i++ )
-    {
    char fileName[256] = { 0 }, cl_name[128] = { 0 };
-        if ( gpuEnv.mpArryDevsID[i] != 0 )
-        {
    char deviceName[1024];
    clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
    CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
@ -631,8 +382,6 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
    legalizeFileName(fileName);
    fd = fopen( fileName, "rb" );
    status = ( fd != NULL ) ? 1 : 0;
-        }
-    }
    if ( fd != NULL )
    {
        *fhandle = fd;
@ -675,7 +424,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
 {
    unsigned int i = 0;
    cl_int clStatus;
-    size_t *binarySizes, numDevices;
+    size_t *binarySizes, numDevices=0;
    cl_device_id *mpArryDevsID;
    char **binaries, *str = NULL;

@ -714,14 +463,6 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
            binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
            if ( binaries[i] == NULL )
            {
-                // cleanup all memory allocated so far
-                for(int cleanupIndex = 0; cleanupIndex < i; ++cleanupIndex)
-                {
-                    free(binaries[cleanupIndex]);
-                }
-                // cleanup binary array
-                free(binaries);
-
                return 0;
            }
        }
@ -1038,7 +779,6 @@ PERF_COUNT_END
    return pResult;
 }

-
 PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n )
 {
 PERF_COUNT_START("pixReadTiffCL")
@ -1356,7 +1096,7 @@ OpenclDevice::pixReadMemTiffCl(const l_uint8 *data,size_t size,l_int32  n)
 	l_int32  i, pagefound;
 	PIX     *pix;
 	TIFF    *tif;
-	L_MEMSTREAM *memStream;
+	//L_MEMSTREAM *memStream;
 	PROCNAME("pixReadMemTiffCl");

 	if (!data)
@ -1555,9 +1295,12 @@ PIXCMAP   *cmap;

        //Invoke the OpenCL kernel for pixReadFromTiff
        l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
-        pixSetData(pix, output_gpu);

+        pixSetData(pix, output_gpu);
+        // pix already has data allocated, it now points to output_gpu?
        FREE(tiffdata);
+        FREE(line);
+        //FREE(output_gpu);
    }

    if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) {
@ -1833,7 +1576,7 @@ pixDilateCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
    sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);

    selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
-
+    selDestroy(&sel);
    //global and local work dimensions for Horizontal pass
    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
    globalThreads[0] = gsize;
@ -1998,7 +1741,7 @@ pixErodeCL(l_int32  hsize, l_int32  vsize, l_uint32 wpl, l_uint32 h)
    sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);

    selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
-    
+    selDestroy(&sel);
    OpenclDevice::SetKernelEnv( &rEnv );

    if (hsize == 5 && vsize == 5 && isAsymmetric)
@ -2634,7 +2377,7 @@ OpenclDevice::pixGetLinesCL(PIX  *pixd,
 *  histogramAllChannels is layed out as all channel 0, then all channel 1...
 *  only supports 1 or 4 channels (bytes_per_pixel)
 ************************************************************************/
-void OpenclDevice::HistogramRectOCL(
+int OpenclDevice::HistogramRectOCL(
    const unsigned char* imageData,
    int bytes_per_pixel,
    int bytes_per_line,
@ -2647,6 +2390,7 @@ void OpenclDevice::HistogramRectOCL(
 {
 PERF_COUNT_START("HistogramRectOCL")
    cl_int clStatus;
+    int retVal= 0;
    KernelEnv histKern;
    SetKernelEnv( &histKern );
    KernelEnv histRedKern;
@ -2667,10 +2411,9 @@ PERF_COUNT_START("HistogramRectOCL")
    int requestedOccupancy = 10;
    int numWorkGroups = numCUs * requestedOccupancy;
    int numThreads = block_size*numWorkGroups;
-    size_t local_work_size[] = {static_cast<size_t>(block_size)};
-    size_t global_work_size[] = {static_cast<size_t>(numThreads)};
-    size_t red_global_work_size[] = {
-        static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
+    size_t local_work_size[] = {block_size};
+    size_t global_work_size[] = {numThreads};
+    size_t red_global_work_size[] = {block_size*kHistogramSize*bytes_per_pixel};

    /* map histogramAllChannels as write only */
    int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups;
@ -2690,7 +2433,7 @@ PERF_COUNT_START("HistogramRectOCL")
    zeroBuffer[0] = 0;
    cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus );
    CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer");
-
+    delete[] zeroBuffer;
    //Create kernel objects based on bytes_per_pixel
    if (bytes_per_pixel == 1)
    {
@ -2743,7 +2486,10 @@ PERF_COUNT_SUB("before")
        0, NULL, NULL );
    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" );
    clFinish( histKern.mpkCmdQueue );
-
+    if(clStatus !=0)
+    {
+		retVal = -1;
+	}
    /* launch histogram */
    clStatus = clEnqueueNDRangeKernel(
        histRedKern.mpkCmdQueue,
@ -2752,19 +2498,26 @@ PERF_COUNT_SUB("before")
        0, NULL, NULL );
    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
    clFinish( histRedKern.mpkCmdQueue );
-
+	if(clStatus !=0)
+	{
+			retVal = -1;
+	}
 PERF_COUNT_SUB("redKernel")

    /* map results back from gpu */
    ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus);
    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
-    
+    if(clStatus !=0)
+    {
+				retVal = -1;
+	}
    clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL);

    clReleaseMemObject(histogramBuffer);
    clReleaseMemObject(imageBuffer);
 PERF_COUNT_SUB("after")
 PERF_COUNT_END
+   return retVal;

 }

@ -2773,7 +2526,7 @@ PERF_COUNT_END
 * from the class, using thresholds/hi_values to the output IMAGE.
 * only supports 1 or 4 channels
 ************************************************************************/
-void OpenclDevice::ThresholdRectToPixOCL(
+int OpenclDevice::ThresholdRectToPixOCL(
    const unsigned char* imageData,
    int bytes_per_pixel,
    int bytes_per_line,
@ -2785,12 +2538,12 @@ void OpenclDevice::ThresholdRectToPixOCL(
    int top,
    int left) {
 PERF_COUNT_START("ThresholdRectToPixOCL")
-
+    int retVal =0;
    /* create pix result buffer */
    *pix = pixCreate(width, height, 1);
    uinT32* pixData = pixGetData(*pix);
    int wpl = pixGetWpl(*pix);
-    int pixSize = wpl*height*sizeof(uinT32);
+    int pixSize = wpl*height*sizeof(uinT32); // number of pixels

    cl_int clStatus;
    KernelEnv rEnv;
@ -2861,7 +2614,11 @@ PERF_COUNT_SUB("before")
    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" );
    clFinish( rEnv.mpkCmdQueue );
 PERF_COUNT_SUB("kernel")
-    
+	if(clStatus !=0)
+		{
+				printf("Setting return value to -1\n");
+				retVal = -1;
+	}
    /* map results back from gpu */
    void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus);
    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
@ -2873,10 +2630,10 @@ PERF_COUNT_SUB("kernel")

 PERF_COUNT_SUB("after")
 PERF_COUNT_END
+return retVal;
 }


-#if USE_DEVICE_SELECTION

 /******************************************************************************
 * Data Types for Device Selection
@ -2987,9 +2744,11 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
    LARGE_INTEGER freq, time_funct_start, time_funct_end;
    QueryPerformanceFrequency(&freq);
 #elif ON_APPLE
-    mach_timespec_t time_funct_start, time_funct_end;
+	mach_timebase_info_data_t info = { 0, 0 };
+    mach_timebase_info(&info);
+	long long start,stop;
 #else
-    TIMESPEC time_funct_start, time_funct_end;
+    timespec time_funct_start, time_funct_end;
 #endif
    // input data
    l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
@ -2998,6 +2757,8 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
    if (type == DS_DEVICE_OPENCL_DEVICE) {
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_start);
+#elif  ON_APPLE
+	start = mach_absolute_time();
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
 #endif
@ -3008,6 +2769,9 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_end);
        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#elif  ON_APPLE
+		stop = mach_absolute_time();
+		time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
@ -3016,6 +2780,8 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
    } else {
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_start);
+#elif  ON_APPLE
+		start = mach_absolute_time();
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
 #endif
@ -3041,6 +2807,9 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_end);
        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#elif  ON_APPLE
+		stop = mach_absolute_time();
+		time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
@ -3061,9 +2830,11 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
    LARGE_INTEGER freq, time_funct_start, time_funct_end;
    QueryPerformanceFrequency(&freq);
 #elif  ON_APPLE
-    mach_timespec_t time_funct_start, time_funct_end;
+	mach_timebase_info_data_t info = { 0, 0 };
+    mach_timebase_info(&info);
+	long long start,stop;
 #else
-    TIMESPEC time_funct_start, time_funct_end;
+    timespec time_funct_start, time_funct_end;
 #endif

    unsigned char pixelHi = (unsigned char)255;
@ -3073,22 +2844,34 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
    int kHistogramSize = 256;
    int bytes_per_line = input.width*input.numChannels;
    int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
-
+    int retVal= 0;
    // function call
    if (type == DS_DEVICE_OPENCL_DEVICE) {
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_start);
+#elif  ON_APPLE
+		start = mach_absolute_time();
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
 #endif

        OpenclDevice::gpuEnv = *env;
        int wpl = pixGetWpl(input.pix);
-        OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);
+        retVal= OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);

 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_end);
        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#elif  ON_APPLE
+		stop = mach_absolute_time();
+		if(retVal ==0)
+		{
+			time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
+		}
+		else
+		{
+			time= FLT_MAX;
+		}
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
@ -3098,6 +2881,8 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
        int *histogram = new int[kHistogramSize];
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_start);
+#elif  ON_APPLE
+		start = mach_absolute_time();
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
 #endif
@ -3108,6 +2893,9 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_end);
        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#elif  ON_APPLE
+		stop = mach_absolute_time();
+		time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
@ -3116,7 +2904,6 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input,
    }

    // cleanup
-    //delete[] imageData;
    delete[] histogramAllChannels;
    return time;
 }
@ -3162,13 +2949,16 @@ void ThresholdRectToPix_Native(const unsigned char* imagedata,
 double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {

    double time;
+    int retVal =0;
 #if ON_WINDOWS
    LARGE_INTEGER freq, time_funct_start, time_funct_end;
    QueryPerformanceFrequency(&freq);
 #elif  ON_APPLE
-    mach_timespec_t time_funct_start, time_funct_end;
+	mach_timebase_info_data_t info = { 0, 0 };
+    mach_timebase_info(&info);
+	long long start,stop;
 #else
-    TIMESPEC time_funct_start, time_funct_end;
+    timespec time_funct_start, time_funct_end;
 #endif

    // input data
@ -3192,17 +2982,30 @@ double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData i
    if (type == DS_DEVICE_OPENCL_DEVICE) {
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_start);
+#elif  ON_APPLE
+		start = mach_absolute_time();
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
 #endif

        OpenclDevice::gpuEnv = *env;
        int wpl = pixGetWpl(input.pix);
-        OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);
+        retVal= OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);

 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_end);
        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#elif  ON_APPLE
+		stop = mach_absolute_time();
+		if(retVal ==0)
+		{
+			time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;;
+		}
+		else
+		{
+			time= FLT_MAX;
+		}
+
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
@ -3214,6 +3017,8 @@ double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData i
        thresholder.SetImage( input.pix );
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_start);
+#elif  ON_APPLE
+		start = mach_absolute_time();
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
 #endif
@ -3223,6 +3028,9 @@ double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData i
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_end);
        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#elif  ON_APPLE
+		stop = mach_absolute_time();
+		time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
@ -3242,9 +3050,11 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
    LARGE_INTEGER freq, time_funct_start, time_funct_end;
    QueryPerformanceFrequency(&freq);
 #elif  ON_APPLE
-    mach_timespec_t time_funct_start, time_funct_end;
+	mach_timebase_info_data_t info = { 0, 0 };
+    mach_timebase_info(&info);
+	long long start,stop;
 #else
-    TIMESPEC time_funct_start, time_funct_end;
+    timespec time_funct_start, time_funct_end;
 #endif

    // input data
@ -3260,6 +3070,8 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
    if (type == DS_DEVICE_OPENCL_DEVICE) {
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_start);
+#elif  ON_APPLE
+		start = mach_absolute_time();
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
 #endif
@ -3274,6 +3086,9 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_end);
        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#elif  ON_APPLE
+		stop = mach_absolute_time();
+		time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
@ -3281,6 +3096,8 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
    } else {
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_start);
+#elif  ON_APPLE
+		start = mach_absolute_time();
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
 #endif
@ -3298,6 +3115,9 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in
 #if ON_WINDOWS
        QueryPerformanceCounter(&time_funct_end);
        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#elif  ON_APPLE
+		stop = mach_absolute_time();
+		time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
 #else
        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
@ -3332,7 +3152,10 @@ ds_status deserializeScore( ds_device* device, const unsigned char* serializedSc
    return DS_SUCCESS;
 }

-
+ds_status releaseScore( void* score ) {
+  delete[] score;
+  return DS_SUCCESS;
+}

 // evaluate devices
 ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
@ -3352,7 +3175,6 @@ ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
        OpenclDevice::CompileKernelFile(env, "");
    }

-
    TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;

    // pixReadTiff
@ -3395,7 +3217,6 @@ ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {

 // initial call to select device
 ds_device OpenclDevice::getDeviceSelection( ) {
-//PERF_COUNT_START("getDeviceSelection")
  if (!deviceIsSelected) {
 PERF_COUNT_START("getDeviceSelection")
  // check if opencl is available at runtime
@ -3434,7 +3255,6 @@ PERF_COUNT_SUB("writeProfileToFile")
      } else {
        printf("[DS] Unable to evaluate performance; scores not written to file.\n");
      }
-
    } else {

 PERF_COUNT_SUB("readProfileFromFile")
@ -3446,7 +3266,6 @@ PERF_COUNT_SUB("readProfileFromFile")
    float bestTime = FLT_MAX; // begin search with worst possible time
    int bestDeviceIdx = -1;
    for (int d = 0; d < profile->numDevices; d++) {
-        //((TessDeviceScore *)device->score)->time
      ds_device device = profile->devices[d];
      TessDeviceScore score = *(TessDeviceScore *)device.score;

@ -3478,7 +3297,8 @@ PERF_COUNT_SUB("readProfileFromFile")
        printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
      }
      selectedDevice = profile->devices[bestDeviceIdx];
-
+      // cleanup
+      releaseDSProfile(profile, releaseScore);
    } else {
      // opencl isn't available at runtime, select native cpu device
      printf("[DS] OpenCL runtime not available.\n");
@ -3496,26 +3316,178 @@ PERF_COUNT_END
  return selectedDevice;
 }

-#endif

 bool OpenclDevice::selectedDeviceIsOpenCL() {
-#if USE_DEVICE_SELECTION
  ds_device device = getDeviceSelection();
  return (device.type == DS_DEVICE_OPENCL_DEVICE);
-#else
-    return true;
-#endif
 }

 bool OpenclDevice::selectedDeviceIsNativeCPU() {
-#if USE_DEVICE_SELECTION
  ds_device device = getDeviceSelection();
  return (device.type == DS_DEVICE_NATIVE_CPU);
-#else
-    return false;
-#endif
 }



+/*!
+ *  pixConvertRGBToGray() from leptonica, converted to opencl kernel
+ *
+ *      Input:  pix (32 bpp RGB)
+ *              rwt, gwt, bwt  (non-negative; these should add to 1.0,
+ *                              or use 0.0 for default)
+ *      Return: 8 bpp pix, or null on error
+ *
+ *  Notes:
+ *      (1) Use a weighted average of the RGB values.
+ */
+#define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))
+
+Pix * OpenclDevice::pixConvertRGBToGrayOCL(
+	Pix *srcPix, // 32-bit source
+	float rwt,
+	float gwt,
+	float bwt )
+{
+PERF_COUNT_START("pixConvertRGBToGrayOCL")
+	Pix *dstPix; // 8-bit destination
+
+	if (rwt < 0.0 || gwt < 0.0 || bwt < 0.0) return NULL;
+
+	if (rwt == 0.0 && gwt == 0.0 && bwt == 0.0) {
+		// magic numbers from leptonica
+	    rwt = 0.3;
+	    gwt = 0.5;
+	    bwt = 0.2;
+	}
+	// normalize
+	float sum = rwt + gwt + bwt;
+	rwt /= sum;
+	gwt /= sum;
+	bwt /= sum;
+
+	// source pix
+	int w, h;
+	pixGetDimensions(srcPix, &w, &h, NULL);
+    //printf("Image is %i x %i\n", w, h);
+    unsigned int *srcData = pixGetData(srcPix);
+    int srcWPL = pixGetWpl(srcPix);
+	int srcSize = srcWPL * h * sizeof(unsigned int);
+
+	// destination pix
+    if ((dstPix = pixCreate(w, h, 8)) == NULL)
+        return NULL;
+    pixCopyResolution(dstPix, srcPix);
+    unsigned int *dstData = pixGetData(dstPix);
+    int dstWPL = pixGetWpl(dstPix);
+	int dstWords = dstWPL * h;
+	int dstSize = dstWords * sizeof(unsigned int);
+    //printf("dstSize = %i\n", dstSize);
+PERF_COUNT_SUB("pix setup")
+
+	// opencl objects
+	cl_int clStatus;
+    KernelEnv kEnv;
+    SetKernelEnv( &kEnv );
+
+	// source buffer
+	cl_mem srcBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, srcSize, (void *)srcData, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer srcBuffer");
+
+    // destination buffer
+    cl_mem dstBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, dstSize, (void *)dstData, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer dstBuffer");
+
+    // setup work group size parameters
+    int block_size = 256;
+    int numWorkGroups = ((h*w+block_size-1) / block_size );
+    int numThreads = block_size*numWorkGroups;
+    size_t local_work_size[] = {block_size};
+    size_t global_work_size[] = {numThreads};
+    //printf("Enqueueing %i threads for %i output pixels\n", numThreads, w*h);
+
+    /* compile kernel */
+    kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, "kernel_RGBToGray", &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateKernel kernel_RGBToGray");
+
+
+    /* set kernel arguments */
+    clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&srcBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg srcBuffer");
+	clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&dstBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg dstBuffer");
+    clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(int), (void *)&srcWPL );
+    CHECK_OPENCL( clStatus, "clSetKernelArg srcWPL" );
+    clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(int), (void *)&dstWPL );
+    CHECK_OPENCL( clStatus, "clSetKernelArg dstWPL" );
+    clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(int), (void *)&h );
+    CHECK_OPENCL( clStatus, "clSetKernelArg height" );
+    clStatus = clSetKernelArg( kEnv.mpkKernel, 5, sizeof(int), (void *)&w );
+    CHECK_OPENCL( clStatus, "clSetKernelArg width" );
+    clStatus = clSetKernelArg( kEnv.mpkKernel, 6, sizeof(float), (void *)&rwt );
+    CHECK_OPENCL( clStatus, "clSetKernelArg rwt" );
+    clStatus = clSetKernelArg( kEnv.mpkKernel, 7, sizeof(float), (void *)&gwt );
+    CHECK_OPENCL( clStatus, "clSetKernelArg gwt");
+    clStatus = clSetKernelArg( kEnv.mpkKernel, 8, sizeof(float), (void *)&bwt );
+    CHECK_OPENCL( clStatus, "clSetKernelArg bwt");
+
+    /* launch kernel & wait */
+PERF_COUNT_SUB("before")
+    clStatus = clEnqueueNDRangeKernel(
+        kEnv.mpkCmdQueue,
+        kEnv.mpkKernel,
+        1, NULL, global_work_size, local_work_size,
+        0, NULL, NULL );
+    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_RGBToGray" );
+    clFinish( kEnv.mpkCmdQueue );
+PERF_COUNT_SUB("kernel")
+
+    /* map results back from gpu */
+    void *ptr = clEnqueueMapBuffer(kEnv.mpkCmdQueue, dstBuffer, CL_TRUE, CL_MAP_READ, 0, dstSize, 0, NULL, NULL, &clStatus);
+    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer dstBuffer");
+    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, dstBuffer, ptr, 0, NULL, NULL);
+
+#if 0
+    // validate: compute on cpu
+    Pix *cpuPix = pixCreate(w, h, 8);
+    pixCopyResolution(cpuPix, srcPix);
+    unsigned int *cpuData = pixGetData(cpuPix);
+    int cpuWPL = pixGetWpl(cpuPix);
+    unsigned int *cpuLine, *srcLine;
+    int i, j;
+    for (i = 0, srcLine = srcData, cpuLine = cpuData; i < h; i++) {
+        for (j = 0; j < w; j++) {
+            unsigned int word = *(srcLine + j);
+            int val = (l_int32)(rwt * ((word >> L_RED_SHIFT) & 0xff) +
+                            gwt * ((word >> L_GREEN_SHIFT) & 0xff) +
+                            bwt * ((word >> L_BLUE_SHIFT) & 0xff) + 0.5);
+            SET_DATA_BYTE(cpuLine, j, val);
+        }
+        srcLine += srcWPL;
+        cpuLine += cpuWPL;
+    }
+
+    // validate: compare
+    printf("converted 32-bit -> 8-bit image\n");
+    for (int row = 0; row < h; row++) {
+        for (int col = 0; col < w; col++) {
+            int idx = row*w + col;
+            unsigned int srcVal = srcData[idx];
+            unsigned char cpuVal = ((unsigned char *)cpuData)[idx];
+            unsigned char oclVal = ((unsigned char *)dstData)[idx];
+            if (srcVal > 0) {
+                printf("%4i,%4i: %u, %u, %u\n", row, col, srcVal, cpuVal, oclVal);
+            }
+        }
+        //printf("\n");
+    }
+#endif
+    // release opencl objects
+    clReleaseMemObject(srcBuffer);
+    clReleaseMemObject(dstBuffer);
+
+
+PERF_COUNT_END
+	// success
+	return dstPix;
+}
 #endif
--- a/opencl/openclwrapper.h
+++ b/opencl/openclwrapper.h
@ -56,13 +56,6 @@
 #include <time.h>
 #endif

-#if ON_APPLE
-#include <mach/clock.h>
-#include <mach/mach.h>
-#define CLOCK_MONOTONIC SYSTEM_CLOCK
-#define clock_gettime clock_get_time
-#endif
-
 /************************************************************************************
 * enable/disable reporting of performance
 * PERF_REPORT_LEVEL
@ -74,13 +67,6 @@
 #define PERF_COUNT_VERBOSE 1
 #define PERF_COUNT_REPORT_STR "[%36s], %24s, %11.6f\n"

-#if ON_APPLE
-#include <time.h>
-#include <mach/clock.h>
-#include <mach/mach.h>
-#define CLOCK_MONOTONIC SYSTEM_CLOCK
-#define clock_gettime clock_get_time
-#endif

 #if ON_WINDOWS

@ -97,7 +83,7 @@
 #define PERF_COUNT_END \
    QueryPerformanceCounter(&time_funct_end); \
    elapsed_time_sec = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); \
-    tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
+    printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
 #else
 #define PERF_COUNT_START(FUNCT_NAME)
 #define PERF_COUNT_END
@ -107,7 +93,7 @@
 #define PERF_COUNT_SUB(SUB) \
    QueryPerformanceCounter(&time_sub_end); \
    elapsed_time_sec = (time_sub_end.QuadPart-time_sub_start.QuadPart)/(double)(freq.QuadPart); \
-    tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
+    printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
    time_sub_start = time_sub_end;
 #else
 #define PERF_COUNT_SUB(SUB)
@ -129,7 +115,7 @@
 #define PERF_COUNT_END \
    clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); \
    elapsed_time_sec = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; \
-    tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
+    printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
 #else
 #define PERF_COUNT_START(FUNCT_NAME)
 #define PERF_COUNT_END
@ -139,7 +125,7 @@
 #define PERF_COUNT_SUB(SUB) \
    clock_gettime( CLOCK_MONOTONIC, &time_sub_end ); \
    elapsed_time_sec = (time_sub_end.tv_sec - time_sub_start.tv_sec)*1.0 + (time_sub_end.tv_nsec - time_sub_start.tv_nsec)/1000000000.0; \
-    tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
+    printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
    time_sub_start = time_sub_end;
 #else
 #define PERF_COUNT_SUB(SUB)
@ -151,9 +137,6 @@
 **************************************************************************/

 #ifdef USE_OPENCL
-
-#define USE_DEVICE_SELECTION 1
-
 #include "opencl_device_selection.h"

 #ifndef strcasecmp
@ -251,7 +234,6 @@ public:
    static int InitEnv(); // load dll, call InitOpenclRunEnv(0)
    static int InitOpenclRunEnv( int argc ); // RegistOpenclKernel, double flags, compile kernels
    static int InitOpenclRunEnv_DeviceSelection( int argc ); // RegistOpenclKernel, double flags, compile kernels
-    static int InitOpenclRunEnv( GPUEnv *gpu ); // select device by env_CPU or selector
    static int RegistOpenclKernel();
    static int ReleaseOpenclRunEnv();
    static int ReleaseOpenclEnv( GPUEnv *gpuInfo );
@ -320,12 +302,11 @@ public:
    static void FreeOpenclDll();
 #endif

-    //int GetOpenclState();
-    //void SetOpenclState( int state );
+
    inline static int AddKernelConfig( int kCount, const char *kName );

    /* for binarization */
-    static void HistogramRectOCL(
+    static int HistogramRectOCL(
        const unsigned char *imagedata,
        int bytes_per_pixel,
        int bytes_per_line,
@ -335,7 +316,8 @@ public:
        int height,
        int kHistogramSize,
        int *histogramAllChannels);
-    static void ThresholdRectToPixOCL(
+
+    static int ThresholdRectToPixOCL(
        const unsigned char* imagedata,
        int bytes_per_pixel,
        int bytes_per_line,
@ -346,11 +328,12 @@ public:
        int rect_width,
        int rect_top,
        int rect_left);
-#if USE_DEVICE_SELECTION
+
+    static Pix * pixConvertRGBToGrayOCL( Pix *pix, float weightRed = 0.3, float weightGreen = 0.5, float weightBlue = 0.2 );
+
    static ds_device getDeviceSelection();
    static ds_device selectedDevice;
    static bool deviceIsSelected;
-#endif
    static bool selectedDeviceIsOpenCL();
    static bool selectedDeviceIsNativeCPU();