From 644c817f8582bc1f9d2c43c718df69bd674e8c32 Mon Sep 17 00:00:00 2001
From: zdenop <zdenop@d0cd1f9f-072b-0410-8dd7-cf729c803f20>
Date: Sun, 26 Jan 2014 17:38:37 +0000
Subject: [PATCH] fix opencl endl; fix leptonica warning

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1016 d0cd1f9f-072b-0410-8dd7-cf729c803f20
---
 opencl/opencl_device_selection.h | 1188 +++---
 opencl/openclwrapper.cpp         | 6452 +++++++++++++++---------------
 opencl/openclwrapper.h           |  690 ++--
 3 files changed, 4165 insertions(+), 4165 deletions(-)

diff --git a/opencl/opencl_device_selection.h b/opencl/opencl_device_selection.h
index d6eb28a0..ba45b5de 100644
--- a/opencl/opencl_device_selection.h
+++ b/opencl/opencl_device_selection.h
@@ -1,594 +1,594 @@
-#ifdef USE_OPENCL
-#ifndef DEVICE_SELECTION_H
-#define DEVICE_SELECTION_H
-
-
-#ifdef _MSC_VER
-#define _CRT_SECURE_NO_WARNINGS
-#endif
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#define DS_DEVICE_NAME_LENGTH 256
-
-typedef enum {
-  DS_SUCCESS = 0,
-  DS_INVALID_PROFILE = 1000,
-  DS_MEMORY_ERROR,
-  DS_INVALID_PERF_EVALUATOR_TYPE,
-  DS_INVALID_PERF_EVALUATOR,
-  DS_PERF_EVALUATOR_ERROR,
-  DS_FILE_ERROR,
-  DS_UNKNOWN_DEVICE_TYPE,
-  DS_PROFILE_FILE_ERROR,
-  DS_SCORE_SERIALIZER_ERROR,
-  DS_SCORE_DESERIALIZER_ERROR
-} ds_status;
-
-// device type
-typedef enum {
-  DS_DEVICE_NATIVE_CPU = 0,
-  DS_DEVICE_OPENCL_DEVICE 
-} ds_device_type;
-
-
-typedef struct {
-  ds_device_type  type;
-  cl_device_id    oclDeviceID;
-  char*           oclDeviceName;
-  char*           oclDriverVersion;
-  // a pointer to the score data, the content/format is application defined.
-  void*           score;
-} ds_device;
-
-typedef struct {
-  unsigned int  numDevices;
-  ds_device*    devices;
-  const char*   version;
-} ds_profile;
-
-// deallocate memory used by score
-typedef ds_status (*ds_score_release)(void* score);
-static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
-  ds_status status = DS_SUCCESS;
-  if (profile!=NULL) {
-    if (profile->devices!=NULL && sr!=NULL) {
-      unsigned int i;
-      for (i = 0; i < profile->numDevices; i++) {
-        status = sr(profile->devices[i].score);
-        if (status != DS_SUCCESS)
-          break;
-      }
-      free(profile->devices);
-    }
-    free(profile);
-  }
-  return status;
-}
-
-
-static ds_status initDSProfile(ds_profile** p, const char* version) {
-  int numDevices;
-  cl_uint numPlatforms;
-  cl_platform_id* platforms = NULL;
-  cl_device_id*   devices = NULL;
-  ds_status status = DS_SUCCESS;
-  ds_profile* profile = NULL;
-  unsigned int next;
-  unsigned int i;
-
-  if (p == NULL)
-    return DS_INVALID_PROFILE;
-
-  profile = (ds_profile*)malloc(sizeof(ds_profile));
-  if (profile == NULL)
-    return DS_MEMORY_ERROR;
-  
-  memset(profile, 0, sizeof(ds_profile));
-
-  clGetPlatformIDs(0, NULL, &numPlatforms);
-  if (numPlatforms == 0)
-    goto cleanup;
-
-  platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
-  if (platforms == NULL) {
-    status = DS_MEMORY_ERROR;
-    goto cleanup;
-  }
-  clGetPlatformIDs(numPlatforms, platforms, NULL);
-
-  numDevices = 0;
-  for (i = 0; i < (unsigned int)numPlatforms; i++) {
-    cl_uint num;
-    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num);
-    numDevices+=num;
-  }
-  if (numDevices == 0)
-    goto cleanup;
-  
-  devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
-  if (devices == NULL) {
-    status = DS_MEMORY_ERROR;
-    goto cleanup;
-  }
-
-  profile->numDevices = numDevices+1;     // +1 to numDevices to include the native CPU
-  profile->devices = (ds_device*)malloc(profile->numDevices*sizeof(ds_device));    
-  if (profile->devices == NULL) {
-    profile->numDevices = 0;
-    status = DS_MEMORY_ERROR;
-    goto cleanup;    
-  }
-  memset(profile->devices, 0, profile->numDevices*sizeof(ds_device));
-
-  next = 0;
-  for (i = 0; i < (unsigned int)numPlatforms; i++) {
-    cl_uint num;
-    unsigned j;
-    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, devices, &num);
-    for (j = 0; j < num; j++, next++) {
-      char buffer[DS_DEVICE_NAME_LENGTH];
-      size_t length;
-
-      profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
-      profile->devices[next].oclDeviceID = devices[j];
-
-      clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME
-        , DS_DEVICE_NAME_LENGTH, &buffer, NULL);
-      length = strlen(buffer);
-      profile->devices[next].oclDeviceName = (char*)malloc(length+1);
-      memcpy(profile->devices[next].oclDeviceName, buffer, length+1);
-
-      clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION
-        , DS_DEVICE_NAME_LENGTH, &buffer, NULL);
-      length = strlen(buffer);
-      profile->devices[next].oclDriverVersion = (char*)malloc(length+1);
-      memcpy(profile->devices[next].oclDriverVersion, buffer, length+1);
-    }
-  }
-  profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
-  profile->version = version;
-
-cleanup:
-  if (platforms)  free(platforms);
-  if (devices)    free(devices);
-  if (status == DS_SUCCESS) {
-    *p = profile;
-  }
-  else {
-    if (profile) {
-      if (profile->devices)
-        free(profile->devices);
-      free(profile);
-    }
-  }
-  return status;
-}
-
-// Pointer to a function that calculates the score of a device (ex:
-// device->score) update the data size of score. The encoding and the format
-// of the score data is implementation defined. The function should return
-// DS_SUCCESS if there's no error to be reported.
-typedef ds_status (*ds_perf_evaluator)(ds_device* device, void* data);
-
-typedef enum {
-  DS_EVALUATE_ALL
-  ,DS_EVALUATE_NEW_ONLY
-} ds_evaluation_type;
-
-static ds_status profileDevices(ds_profile* profile,
-                                const ds_evaluation_type type,
-                                ds_perf_evaluator evaluator,
-                                void* evaluatorData, unsigned int* numUpdates) {
-  ds_status status = DS_SUCCESS;
-  unsigned int i;
-  unsigned int updates = 0;
-
-  if (profile == NULL) {
-    return DS_INVALID_PROFILE;
-  }
-  if (evaluator == NULL) {
-    return DS_INVALID_PERF_EVALUATOR;
-  }
-
-  for (i = 0; i < profile->numDevices; i++) {
-    ds_status evaluatorStatus;
-    
-    switch (type) {
-    case DS_EVALUATE_NEW_ONLY:
-      if (profile->devices[i].score != NULL)
-        break;
-      //  else fall through
-    case DS_EVALUATE_ALL:
-      evaluatorStatus = evaluator(profile->devices+i, evaluatorData);
-      if (evaluatorStatus != DS_SUCCESS) {
-        status = evaluatorStatus;
-        return status;
-      }
-      updates++;
-      break;
-    default:
-      return DS_INVALID_PERF_EVALUATOR_TYPE;
-      break;
-    };
-  }
-  if (numUpdates)
-    *numUpdates = updates;
-  return status;
-}
-
-
-#define DS_TAG_VERSION                      "<version>"
-#define DS_TAG_VERSION_END                  "</version>"
-#define DS_TAG_DEVICE                       "<device>"
-#define DS_TAG_DEVICE_END                   "</device>"
-#define DS_TAG_SCORE                        "<score>"
-#define DS_TAG_SCORE_END                    "</score>"
-#define DS_TAG_DEVICE_TYPE                  "<type>"
-#define DS_TAG_DEVICE_TYPE_END              "</type>"
-#define DS_TAG_DEVICE_NAME                  "<name>"
-#define DS_TAG_DEVICE_NAME_END              "</name>"
-#define DS_TAG_DEVICE_DRIVER_VERSION        "<driver>"
-#define DS_TAG_DEVICE_DRIVER_VERSION_END    "</driver>"
-
-#define DS_DEVICE_NATIVE_CPU_STRING  "native_cpu"
-
-
-
-typedef ds_status (*ds_score_serializer)(ds_device* device,
-                                         void** serializedScore,
-                                         unsigned int* serializedScoreSize);
-static ds_status writeProfileToFile(ds_profile* profile,
-                                    ds_score_serializer serializer,
-                                    const char* file) {
-  ds_status status = DS_SUCCESS;
-  FILE* profileFile = NULL;
-
-
-  if (profile == NULL)
-    return DS_INVALID_PROFILE;
-
-  profileFile = fopen(file, "wb");
-  if (profileFile==NULL) {
-    status = DS_FILE_ERROR;
-  }
-  else {
-    unsigned int i;
-
-    // write version string
-    fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
-    fwrite(profile->version, sizeof(char), strlen(profile->version), profileFile);
-    fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END), profileFile);
-    fwrite("\n", sizeof(char), 1, profileFile);
-
-    for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
-      void* serializedScore;
-      unsigned int serializedScoreSize;
-
-      fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
-
-      fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE),
-             profileFile);
-      fwrite(&profile->devices[i].type,sizeof(ds_device_type),1, profileFile);
-      fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char),
-             strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
-
-      switch(profile->devices[i].type) {
-      case DS_DEVICE_NATIVE_CPU:
-        { 
-          // There's no need to emit a device name for the native CPU device.
-          /*
-          fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
-                 profileFile);
-          fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
-                 strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
-          fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
-                 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
-          */
-        }
-        break;
-      case DS_DEVICE_OPENCL_DEVICE: 
-        {
-          fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
-                 profileFile);
-          fwrite(profile->devices[i].oclDeviceName,
-                 sizeof(char),strlen(profile->devices[i].oclDeviceName), profileFile);
-          fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
-                 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
-
-          fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char),
-                 strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
-          fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
-                 strlen(profile->devices[i].oclDriverVersion), profileFile);
-          fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
-                 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
-        }
-        break;
-      default:
-        status = DS_UNKNOWN_DEVICE_TYPE;
-        break;
-      };
-
-      fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
-      status = serializer(profile->devices+i, &serializedScore,
-                          &serializedScoreSize);
-      if (status == DS_SUCCESS && serializedScore!=NULL && serializedScoreSize > 0) {
-        fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
-        free(serializedScore);
-      }
-      fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END), profileFile);
-      fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END), profileFile);
-      fwrite("\n",sizeof(char),1,profileFile);
-    }
-    fclose(profileFile);
-  }
-  return status;
-}
-
-
-static ds_status readProFile(const char* fileName, char** content,
-                             size_t* contentSize) {
-  FILE * input = NULL;
-  size_t size = 0;
-  char* binary = NULL;
-
-  *contentSize = 0;
-  *content = NULL;
-
-  input = fopen(fileName, "rb");
-  if(input == NULL) {
-    return DS_FILE_ERROR;
-  }
-
-  fseek(input, 0L, SEEK_END); 
-  size = ftell(input);
-  rewind(input);
-  binary = (char*)malloc(size);
-  if(binary == NULL) {
-    fclose(input);
-    return DS_FILE_ERROR;
-  }
-  fread(binary, sizeof(char), size, input);
-  fclose(input);
-
-  *contentSize = size;
-  *content = binary;
-  return DS_SUCCESS;
-}
-
-
-static const char* findString(const char* contentStart, const char* contentEnd,
-                              const char* string) {
-  size_t stringLength;
-  const char* currentPosition;
-  const char* found;
-  found = NULL;
-  stringLength = strlen(string);
-  currentPosition = contentStart;
-  for(currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) {
-    if (*currentPosition == string[0]) {
-      if (currentPosition+stringLength < contentEnd) {
-        if (strncmp(currentPosition, string, stringLength) == 0) {
-          found = currentPosition;
-          break;
-        }
-      }
-    }
-  }
-  return found;
-}
-
-
-typedef ds_status (*ds_score_deserializer)(ds_device* device,
-                                           const unsigned char* serializedScore,
-                                           unsigned int serializedScoreSize); 
-static ds_status readProfileFromFile(ds_profile* profile,
-                                     ds_score_deserializer deserializer,
-                                     const char* file) {
-
-  ds_status status = DS_SUCCESS;
-  char* contentStart = NULL;
-  const char* contentEnd = NULL;
-  size_t contentSize;
-
-  if (profile==NULL)
-    return DS_INVALID_PROFILE;
-
-  status = readProFile(file, &contentStart, &contentSize);
-  if (status == DS_SUCCESS) {
-    const char* currentPosition;
-    const char* dataStart;
-    const char* dataEnd;
-    size_t versionStringLength;
-
-    contentEnd = contentStart + contentSize;
-    currentPosition = contentStart;
-
-
-    // parse the version string
-    dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
-    if (dataStart == NULL) {
-      status = DS_PROFILE_FILE_ERROR;
-      goto cleanup;
-    }
-    dataStart += strlen(DS_TAG_VERSION);
-
-    dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
-    if (dataEnd==NULL) {
-      status = DS_PROFILE_FILE_ERROR;
-      goto cleanup;
-    }
-
-    versionStringLength = strlen(profile->version);
-    if (versionStringLength!=(dataEnd-dataStart)   
-        || strncmp(profile->version, dataStart, versionStringLength)!=0) {
-      // version mismatch
-      status = DS_PROFILE_FILE_ERROR;
-      goto cleanup;
-    }
-    currentPosition = dataEnd+strlen(DS_TAG_VERSION_END);
-
-    // parse the device information
-    while (1) {
-      unsigned int i;
-
-      const char* deviceTypeStart;
-      const char* deviceTypeEnd;
-      ds_device_type deviceType;
-
-      const char* deviceNameStart;
-      const char* deviceNameEnd;
-
-      const char* deviceScoreStart;
-      const char* deviceScoreEnd;
-
-      const char* deviceDriverStart;
-      const char* deviceDriverEnd;
-
-      dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
-      if (dataStart==NULL) {
-        // nothing useful remain, quit...
-        break;
-      }
-      dataStart+=strlen(DS_TAG_DEVICE);
-      dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
-      if (dataEnd==NULL) {
-        status = DS_PROFILE_FILE_ERROR;
-        goto cleanup;
-      }
-
-      // parse the device type
-      deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
-      if (deviceTypeStart==NULL) {
-        status = DS_PROFILE_FILE_ERROR;
-        goto cleanup;       
-      }
-      deviceTypeStart+=strlen(DS_TAG_DEVICE_TYPE);
-      deviceTypeEnd = findString(deviceTypeStart, contentEnd,
-                                 DS_TAG_DEVICE_TYPE_END);
-      if (deviceTypeEnd==NULL) {
-        status = DS_PROFILE_FILE_ERROR;
-        goto cleanup;
-      }
-      memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
-
-
-      // parse the device name
-      if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
-
-        deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
-        if (deviceNameStart==NULL) {
-          status = DS_PROFILE_FILE_ERROR;
-          goto cleanup;       
-        }
-        deviceNameStart+=strlen(DS_TAG_DEVICE_NAME);
-        deviceNameEnd = findString(deviceNameStart, contentEnd,
-                                   DS_TAG_DEVICE_NAME_END);
-        if (deviceNameEnd==NULL) {
-          status = DS_PROFILE_FILE_ERROR;
-          goto cleanup;       
-        }
-
-
-        deviceDriverStart = findString(dataStart, contentEnd,
-                                       DS_TAG_DEVICE_DRIVER_VERSION);
-        if (deviceDriverStart==NULL) {
-          status = DS_PROFILE_FILE_ERROR;
-          goto cleanup;       
-        }
-        deviceDriverStart+=strlen(DS_TAG_DEVICE_DRIVER_VERSION);
-        deviceDriverEnd = findString(deviceDriverStart, contentEnd,
-                                     DS_TAG_DEVICE_DRIVER_VERSION_END);
-        if (deviceDriverEnd ==NULL) {
-          status = DS_PROFILE_FILE_ERROR;
-          goto cleanup;       
-        }
-
-
-        // check if this device is on the system
-        for (i = 0; i < profile->numDevices; i++) {
-          if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
-            size_t actualDeviceNameLength;
-            size_t driverVersionLength;
-            
-            actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
-            driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
-            if (actualDeviceNameLength == (deviceNameEnd - deviceNameStart)
-               && driverVersionLength == (deviceDriverEnd - deviceDriverStart)
-               && strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
-                          actualDeviceNameLength)==0
-               && strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
-                          driverVersionLength)==0) {
-              deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
-              if (deviceNameStart==NULL) {
-                status = DS_PROFILE_FILE_ERROR;
-                goto cleanup;       
-              }
-              deviceScoreStart+=strlen(DS_TAG_SCORE);
-              deviceScoreEnd = findString(deviceScoreStart, contentEnd,
-                                          DS_TAG_SCORE_END);
-              status = deserializer(profile->devices+i,
-                                    (const unsigned char*)deviceScoreStart,
-                                    deviceScoreEnd-deviceScoreStart);
-              if (status != DS_SUCCESS) {
-                goto cleanup;
-              }
-            }
-          }
-        }
-
-      }
-      else if (deviceType == DS_DEVICE_NATIVE_CPU) {
-        for (i = 0; i < profile->numDevices; i++) {
-          if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
-            deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
-            if (deviceScoreStart==NULL) {
-              status = DS_PROFILE_FILE_ERROR;
-              goto cleanup;       
-            }
-            deviceScoreStart+=strlen(DS_TAG_SCORE);
-            deviceScoreEnd = findString(deviceScoreStart, contentEnd,
-                                        DS_TAG_SCORE_END);
-            status = deserializer(profile->devices+i,
-                                  (const unsigned char*)deviceScoreStart,
-                                  deviceScoreEnd-deviceScoreStart);
-            if (status != DS_SUCCESS) {
-              goto cleanup;
-            }
-          }
-        }
-      }
-
-      // skip over the current one to find the next device
-      currentPosition = dataEnd+strlen(DS_TAG_DEVICE_END);
-    }
-  }
-cleanup:
-  if (contentStart!=NULL) free(contentStart);
-  return status;
-}
-
-static ds_status getNumDeviceWithEmptyScore(ds_profile* profile,
-                                            unsigned int* num) {
-  unsigned int i;
-  if (profile == NULL || num==NULL)
-    return DS_MEMORY_ERROR;
-  *num=0;
-  for (i = 0; i < profile->numDevices; i++) {
-    if (profile->devices[i].score == NULL) {
-      *num++;
-    }
-  }
-  return DS_SUCCESS;
-}
-
-#endif
-#endif
+#ifdef USE_OPENCL
+#ifndef DEVICE_SELECTION_H
+#define DEVICE_SELECTION_H
+
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#define DS_DEVICE_NAME_LENGTH 256
+
+typedef enum {
+  DS_SUCCESS = 0,
+  DS_INVALID_PROFILE = 1000,
+  DS_MEMORY_ERROR,
+  DS_INVALID_PERF_EVALUATOR_TYPE,
+  DS_INVALID_PERF_EVALUATOR,
+  DS_PERF_EVALUATOR_ERROR,
+  DS_FILE_ERROR,
+  DS_UNKNOWN_DEVICE_TYPE,
+  DS_PROFILE_FILE_ERROR,
+  DS_SCORE_SERIALIZER_ERROR,
+  DS_SCORE_DESERIALIZER_ERROR
+} ds_status;
+
+// device type
+typedef enum {
+  DS_DEVICE_NATIVE_CPU = 0,
+  DS_DEVICE_OPENCL_DEVICE 
+} ds_device_type;
+
+
+typedef struct {
+  ds_device_type  type;
+  cl_device_id    oclDeviceID;
+  char*           oclDeviceName;
+  char*           oclDriverVersion;
+  // a pointer to the score data, the content/format is application defined.
+  void*           score;
+} ds_device;
+
+typedef struct {
+  unsigned int  numDevices;
+  ds_device*    devices;
+  const char*   version;
+} ds_profile;
+
+// deallocate memory used by score
+typedef ds_status (*ds_score_release)(void* score);
+static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
+  ds_status status = DS_SUCCESS;
+  if (profile!=NULL) {
+    if (profile->devices!=NULL && sr!=NULL) {
+      unsigned int i;
+      for (i = 0; i < profile->numDevices; i++) {
+        status = sr(profile->devices[i].score);
+        if (status != DS_SUCCESS)
+          break;
+      }
+      free(profile->devices);
+    }
+    free(profile);
+  }
+  return status;
+}
+
+
+static ds_status initDSProfile(ds_profile** p, const char* version) {
+  int numDevices;
+  cl_uint numPlatforms;
+  cl_platform_id* platforms = NULL;
+  cl_device_id*   devices = NULL;
+  ds_status status = DS_SUCCESS;
+  ds_profile* profile = NULL;
+  unsigned int next;
+  unsigned int i;
+
+  if (p == NULL)
+    return DS_INVALID_PROFILE;
+
+  profile = (ds_profile*)malloc(sizeof(ds_profile));
+  if (profile == NULL)
+    return DS_MEMORY_ERROR;
+  
+  memset(profile, 0, sizeof(ds_profile));
+
+  clGetPlatformIDs(0, NULL, &numPlatforms);
+  if (numPlatforms == 0)
+    goto cleanup;
+
+  platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
+  if (platforms == NULL) {
+    status = DS_MEMORY_ERROR;
+    goto cleanup;
+  }
+  clGetPlatformIDs(numPlatforms, platforms, NULL);
+
+  numDevices = 0;
+  for (i = 0; i < (unsigned int)numPlatforms; i++) {
+    cl_uint num;
+    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num);
+    numDevices+=num;
+  }
+  if (numDevices == 0)
+    goto cleanup;
+  
+  devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
+  if (devices == NULL) {
+    status = DS_MEMORY_ERROR;
+    goto cleanup;
+  }
+
+  profile->numDevices = numDevices+1;     // +1 to numDevices to include the native CPU
+  profile->devices = (ds_device*)malloc(profile->numDevices*sizeof(ds_device));    
+  if (profile->devices == NULL) {
+    profile->numDevices = 0;
+    status = DS_MEMORY_ERROR;
+    goto cleanup;    
+  }
+  memset(profile->devices, 0, profile->numDevices*sizeof(ds_device));
+
+  next = 0;
+  for (i = 0; i < (unsigned int)numPlatforms; i++) {
+    cl_uint num;
+    unsigned j;
+    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, devices, &num);
+    for (j = 0; j < num; j++, next++) {
+      char buffer[DS_DEVICE_NAME_LENGTH];
+      size_t length;
+
+      profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
+      profile->devices[next].oclDeviceID = devices[j];
+
+      clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME
+        , DS_DEVICE_NAME_LENGTH, &buffer, NULL);
+      length = strlen(buffer);
+      profile->devices[next].oclDeviceName = (char*)malloc(length+1);
+      memcpy(profile->devices[next].oclDeviceName, buffer, length+1);
+
+      clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION
+        , DS_DEVICE_NAME_LENGTH, &buffer, NULL);
+      length = strlen(buffer);
+      profile->devices[next].oclDriverVersion = (char*)malloc(length+1);
+      memcpy(profile->devices[next].oclDriverVersion, buffer, length+1);
+    }
+  }
+  profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
+  profile->version = version;
+
+cleanup:
+  if (platforms)  free(platforms);
+  if (devices)    free(devices);
+  if (status == DS_SUCCESS) {
+    *p = profile;
+  }
+  else {
+    if (profile) {
+      if (profile->devices)
+        free(profile->devices);
+      free(profile);
+    }
+  }
+  return status;
+}
+
+// Pointer to a function that calculates the score of a device (ex:
+// device->score) update the data size of score. The encoding and the format
+// of the score data is implementation defined. The function should return
+// DS_SUCCESS if there's no error to be reported.
+typedef ds_status (*ds_perf_evaluator)(ds_device* device, void* data);
+
+typedef enum {
+  DS_EVALUATE_ALL
+  ,DS_EVALUATE_NEW_ONLY
+} ds_evaluation_type;
+
+static ds_status profileDevices(ds_profile* profile,
+                                const ds_evaluation_type type,
+                                ds_perf_evaluator evaluator,
+                                void* evaluatorData, unsigned int* numUpdates) {
+  ds_status status = DS_SUCCESS;
+  unsigned int i;
+  unsigned int updates = 0;
+
+  if (profile == NULL) {
+    return DS_INVALID_PROFILE;
+  }
+  if (evaluator == NULL) {
+    return DS_INVALID_PERF_EVALUATOR;
+  }
+
+  for (i = 0; i < profile->numDevices; i++) {
+    ds_status evaluatorStatus;
+    
+    switch (type) {
+    case DS_EVALUATE_NEW_ONLY:
+      if (profile->devices[i].score != NULL)
+        break;
+      //  else fall through
+    case DS_EVALUATE_ALL:
+      evaluatorStatus = evaluator(profile->devices+i, evaluatorData);
+      if (evaluatorStatus != DS_SUCCESS) {
+        status = evaluatorStatus;
+        return status;
+      }
+      updates++;
+      break;
+    default:
+      return DS_INVALID_PERF_EVALUATOR_TYPE;
+      break;
+    };
+  }
+  if (numUpdates)
+    *numUpdates = updates;
+  return status;
+}
+
+
+#define DS_TAG_VERSION                      "<version>"
+#define DS_TAG_VERSION_END                  "</version>"
+#define DS_TAG_DEVICE                       "<device>"
+#define DS_TAG_DEVICE_END                   "</device>"
+#define DS_TAG_SCORE                        "<score>"
+#define DS_TAG_SCORE_END                    "</score>"
+#define DS_TAG_DEVICE_TYPE                  "<type>"
+#define DS_TAG_DEVICE_TYPE_END              "</type>"
+#define DS_TAG_DEVICE_NAME                  "<name>"
+#define DS_TAG_DEVICE_NAME_END              "</name>"
+#define DS_TAG_DEVICE_DRIVER_VERSION        "<driver>"
+#define DS_TAG_DEVICE_DRIVER_VERSION_END    "</driver>"
+
+#define DS_DEVICE_NATIVE_CPU_STRING  "native_cpu"
+
+
+
+typedef ds_status (*ds_score_serializer)(ds_device* device,
+                                         void** serializedScore,
+                                         unsigned int* serializedScoreSize);
+static ds_status writeProfileToFile(ds_profile* profile,
+                                    ds_score_serializer serializer,
+                                    const char* file) {
+  ds_status status = DS_SUCCESS;
+  FILE* profileFile = NULL;
+
+
+  if (profile == NULL)
+    return DS_INVALID_PROFILE;
+
+  profileFile = fopen(file, "wb");
+  if (profileFile==NULL) {
+    status = DS_FILE_ERROR;
+  }
+  else {
+    unsigned int i;
+
+    // write version string
+    fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
+    fwrite(profile->version, sizeof(char), strlen(profile->version), profileFile);
+    fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END), profileFile);
+    fwrite("\n", sizeof(char), 1, profileFile);
+
+    for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
+      void* serializedScore;
+      unsigned int serializedScoreSize;
+
+      fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
+
+      fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE),
+             profileFile);
+      fwrite(&profile->devices[i].type,sizeof(ds_device_type),1, profileFile);
+      fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char),
+             strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
+
+      switch(profile->devices[i].type) {
+      case DS_DEVICE_NATIVE_CPU:
+        { 
+          // There's no need to emit a device name for the native CPU device.
+          /*
+          fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
+                 profileFile);
+          fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
+                 strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
+          fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
+                 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
+          */
+        }
+        break;
+      case DS_DEVICE_OPENCL_DEVICE: 
+        {
+          fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
+                 profileFile);
+          fwrite(profile->devices[i].oclDeviceName,
+                 sizeof(char),strlen(profile->devices[i].oclDeviceName), profileFile);
+          fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
+                 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
+
+          fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char),
+                 strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
+          fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
+                 strlen(profile->devices[i].oclDriverVersion), profileFile);
+          fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
+                 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
+        }
+        break;
+      default:
+        status = DS_UNKNOWN_DEVICE_TYPE;
+        break;
+      };
+
+      fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
+      status = serializer(profile->devices+i, &serializedScore,
+                          &serializedScoreSize);
+      if (status == DS_SUCCESS && serializedScore!=NULL && serializedScoreSize > 0) {
+        fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
+        free(serializedScore);
+      }
+      fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END), profileFile);
+      fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END), profileFile);
+      fwrite("\n",sizeof(char),1,profileFile);
+    }
+    fclose(profileFile);
+  }
+  return status;
+}
+
+
+static ds_status readProFile(const char* fileName, char** content,
+                             size_t* contentSize) {
+  FILE * input = NULL;
+  size_t size = 0;
+  char* binary = NULL;
+
+  *contentSize = 0;
+  *content = NULL;
+
+  input = fopen(fileName, "rb");
+  if(input == NULL) {
+    return DS_FILE_ERROR;
+  }
+
+  fseek(input, 0L, SEEK_END); 
+  size = ftell(input);
+  rewind(input);
+  binary = (char*)malloc(size);
+  if(binary == NULL) {
+    fclose(input);
+    return DS_FILE_ERROR;
+  }
+  fread(binary, sizeof(char), size, input);
+  fclose(input);
+
+  *contentSize = size;
+  *content = binary;
+  return DS_SUCCESS;
+}
+
+
+static const char* findString(const char* contentStart, const char* contentEnd,
+                              const char* string) {
+  size_t stringLength;
+  const char* currentPosition;
+  const char* found;
+  found = NULL;
+  stringLength = strlen(string);
+  currentPosition = contentStart;
+  for(currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) {
+    if (*currentPosition == string[0]) {
+      if (currentPosition+stringLength < contentEnd) {
+        if (strncmp(currentPosition, string, stringLength) == 0) {
+          found = currentPosition;
+          break;
+        }
+      }
+    }
+  }
+  return found;
+}
+
+
+typedef ds_status (*ds_score_deserializer)(ds_device* device,
+                                           const unsigned char* serializedScore,
+                                           unsigned int serializedScoreSize); 
+static ds_status readProfileFromFile(ds_profile* profile,
+                                     ds_score_deserializer deserializer,
+                                     const char* file) {
+
+  ds_status status = DS_SUCCESS;
+  char* contentStart = NULL;
+  const char* contentEnd = NULL;
+  size_t contentSize;
+
+  if (profile==NULL)
+    return DS_INVALID_PROFILE;
+
+  status = readProFile(file, &contentStart, &contentSize);
+  if (status == DS_SUCCESS) {
+    const char* currentPosition;
+    const char* dataStart;
+    const char* dataEnd;
+    size_t versionStringLength;
+
+    contentEnd = contentStart + contentSize;
+    currentPosition = contentStart;
+
+
+    // parse the version string
+    dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
+    if (dataStart == NULL) {
+      status = DS_PROFILE_FILE_ERROR;
+      goto cleanup;
+    }
+    dataStart += strlen(DS_TAG_VERSION);
+
+    dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
+    if (dataEnd==NULL) {
+      status = DS_PROFILE_FILE_ERROR;
+      goto cleanup;
+    }
+
+    versionStringLength = strlen(profile->version);
+    if (versionStringLength!=(dataEnd-dataStart)   
+        || strncmp(profile->version, dataStart, versionStringLength)!=0) {
+      // version mismatch
+      status = DS_PROFILE_FILE_ERROR;
+      goto cleanup;
+    }
+    currentPosition = dataEnd+strlen(DS_TAG_VERSION_END);
+
+    // parse the device information
+    while (1) {
+      unsigned int i;
+
+      const char* deviceTypeStart;
+      const char* deviceTypeEnd;
+      ds_device_type deviceType;
+
+      const char* deviceNameStart;
+      const char* deviceNameEnd;
+
+      const char* deviceScoreStart;
+      const char* deviceScoreEnd;
+
+      const char* deviceDriverStart;
+      const char* deviceDriverEnd;
+
+      dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
+      if (dataStart==NULL) {
+        // nothing useful remain, quit...
+        break;
+      }
+      dataStart+=strlen(DS_TAG_DEVICE);
+      dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
+      if (dataEnd==NULL) {
+        status = DS_PROFILE_FILE_ERROR;
+        goto cleanup;
+      }
+
+      // parse the device type
+      deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
+      if (deviceTypeStart==NULL) {
+        status = DS_PROFILE_FILE_ERROR;
+        goto cleanup;       
+      }
+      deviceTypeStart+=strlen(DS_TAG_DEVICE_TYPE);
+      deviceTypeEnd = findString(deviceTypeStart, contentEnd,
+                                 DS_TAG_DEVICE_TYPE_END);
+      if (deviceTypeEnd==NULL) {
+        status = DS_PROFILE_FILE_ERROR;
+        goto cleanup;
+      }
+      memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
+
+
+      // parse the device name
+      if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
+
+        deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
+        if (deviceNameStart==NULL) {
+          status = DS_PROFILE_FILE_ERROR;
+          goto cleanup;       
+        }
+        deviceNameStart+=strlen(DS_TAG_DEVICE_NAME);
+        deviceNameEnd = findString(deviceNameStart, contentEnd,
+                                   DS_TAG_DEVICE_NAME_END);
+        if (deviceNameEnd==NULL) {
+          status = DS_PROFILE_FILE_ERROR;
+          goto cleanup;       
+        }
+
+
+        deviceDriverStart = findString(dataStart, contentEnd,
+                                       DS_TAG_DEVICE_DRIVER_VERSION);
+        if (deviceDriverStart==NULL) {
+          status = DS_PROFILE_FILE_ERROR;
+          goto cleanup;       
+        }
+        deviceDriverStart+=strlen(DS_TAG_DEVICE_DRIVER_VERSION);
+        deviceDriverEnd = findString(deviceDriverStart, contentEnd,
+                                     DS_TAG_DEVICE_DRIVER_VERSION_END);
+        if (deviceDriverEnd ==NULL) {
+          status = DS_PROFILE_FILE_ERROR;
+          goto cleanup;       
+        }
+
+
+        // check if this device is on the system
+        for (i = 0; i < profile->numDevices; i++) {
+          if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
+            size_t actualDeviceNameLength;
+            size_t driverVersionLength;
+            
+            actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
+            driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
+            if (actualDeviceNameLength == (deviceNameEnd - deviceNameStart)
+               && driverVersionLength == (deviceDriverEnd - deviceDriverStart)
+               && strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
+                          actualDeviceNameLength)==0
+               && strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
+                          driverVersionLength)==0) {
+              deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
+              if (deviceNameStart==NULL) {
+                status = DS_PROFILE_FILE_ERROR;
+                goto cleanup;       
+              }
+              deviceScoreStart+=strlen(DS_TAG_SCORE);
+              deviceScoreEnd = findString(deviceScoreStart, contentEnd,
+                                          DS_TAG_SCORE_END);
+              status = deserializer(profile->devices+i,
+                                    (const unsigned char*)deviceScoreStart,
+                                    deviceScoreEnd-deviceScoreStart);
+              if (status != DS_SUCCESS) {
+                goto cleanup;
+              }
+            }
+          }
+        }
+
+      }
+      else if (deviceType == DS_DEVICE_NATIVE_CPU) {
+        for (i = 0; i < profile->numDevices; i++) {
+          if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
+            deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
+            if (deviceScoreStart==NULL) {
+              status = DS_PROFILE_FILE_ERROR;
+              goto cleanup;       
+            }
+            deviceScoreStart+=strlen(DS_TAG_SCORE);
+            deviceScoreEnd = findString(deviceScoreStart, contentEnd,
+                                        DS_TAG_SCORE_END);
+            status = deserializer(profile->devices+i,
+                                  (const unsigned char*)deviceScoreStart,
+                                  deviceScoreEnd-deviceScoreStart);
+            if (status != DS_SUCCESS) {
+              goto cleanup;
+            }
+          }
+        }
+      }
+
+      // skip over the current one to find the next device
+      currentPosition = dataEnd+strlen(DS_TAG_DEVICE_END);
+    }
+  }
+cleanup:
+  if (contentStart!=NULL) free(contentStart);
+  return status;
+}
+
+static ds_status getNumDeviceWithEmptyScore(ds_profile* profile,
+                                            unsigned int* num) {
+  unsigned int i;
+  if (profile == NULL || num==NULL)
+    return DS_MEMORY_ERROR;
+  *num=0;
+  for (i = 0; i < profile->numDevices; i++) {
+    if (profile->devices[i].score == NULL) {
+      *num++;
+    }
+  }
+  return DS_SUCCESS;
+}
+
+#endif
+#endif
diff --git a/opencl/openclwrapper.cpp b/opencl/openclwrapper.cpp
index ac1812e3..f2770da9 100644
--- a/opencl/openclwrapper.cpp
+++ b/opencl/openclwrapper.cpp
@@ -1,3226 +1,3226 @@
-#ifdef _WIN32
-#include <Windows.h>
-#include <io.h>
-#else
-#include <sys/types.h>
-#include <unistd.h>
-#endif
-#include <float.h>
-
-#include "openclwrapper.h"
-#include "oclkernels.h"
-
-// for micro-benchmark
-#include "otsuthr.h"
-#include "thresholder.h"
-
-#ifdef USE_OPENCL
-
-#include "opencl_device_selection.h"
-
-GPUEnv OpenclDevice::gpuEnv;
-
-#if USE_DEVICE_SELECTION
-bool OpenclDevice::deviceIsSelected = false;
-ds_device OpenclDevice::selectedDevice;
-#endif
-
-int OpenclDevice::isInited =0;
-
-struct tiff_transform {
-    int vflip;    /* if non-zero, image needs a vertical fip */
-    int hflip;    /* if non-zero, image needs a horizontal flip */
-    int rotate;   /* -1 -> counterclockwise 90-degree rotation,
-                      0 -> no rotation
-                      1 -> clockwise 90-degree rotation */
-};
-
-static struct tiff_transform tiff_orientation_transforms[] = {
-    {0, 0, 0},
-    {0, 1, 0},
-    {1, 1, 0},
-    {1, 0, 0},
-    {0, 1, -1},
-    {0, 0, 1},
-    {0, 1, 1},
-    {0, 0, -1}
-};
-
-static const l_int32  MAX_PAGES_IN_TIFF_FILE = 3000; 
-
-cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers
-cl_mem pixThBuffer; //output from thresholdtopix calculation
-cl_int clStatus;
-KernelEnv rEnv;
-
-// substitute invalid characters in device name with _
-void legalizeFileName( char *fileName) {
-    //printf("fileName: %s\n", fileName);
-    char *invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches
-    // for each invalid char
-    for (int i = 0; i < strlen(invalidChars); i++) {
-        char invalidStr[4];
-        invalidStr[0] = invalidChars[i];
-        invalidStr[1] = NULL;
-        //printf("eliminating %s\n", invalidStr);
-        //char *pos = strstr(fileName, invalidStr);
-        // initial ./ is valid for present directory
-        //if (*pos == '.') pos++;
-        //if (*pos == '/') pos++;
-        for ( char *pos = strstr(fileName, invalidStr); pos != NULL; pos = strstr(pos+1, invalidStr)) {
-            //printf("\tfound: %s, ", pos);
-            pos[0] = '_';
-            //printf("fileName: %s\n", fileName);
-        }
-    }
-}
-
-void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
-    //printf("[DS] populateGPUEnvFromDevice\n");
-    size_t size;
-    gpuInfo->mnIsUserCreated = 1;
-    // device
-    gpuInfo->mpDevID = device;
-    gpuInfo->mpArryDevsID = new cl_device_id[1];
-    gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
-    clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE       , sizeof(cl_device_type), (void *) &gpuInfo->mDevType       , &size);
-    CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
-    // platform
-    clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM   , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID   , &size);
-    CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
-    // context
-    cl_context_properties props[3];
-    props[0] = CL_CONTEXT_PLATFORM;
-    props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
-    props[2] = 0;
-    gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus);
-    CHECK_OPENCL( clStatus, "populateGPUEnv::createContext");
-    // queue
-    cl_command_queue_properties queueProperties = 0;
-    gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
-    CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue");
-    
-}
-
-int OpenclDevice::LoadOpencl()
-{
-#ifdef WIN32
-    HINSTANCE HOpenclDll = NULL;
-  void * OpenclDll = NULL;
-    //fprintf(stderr, " LoadOpenclDllxx... \n");
-    OpenclDll = static_cast<HINSTANCE>( HOpenclDll );
-    OpenclDll = LoadLibrary( "openCL.dll" );
-    if ( !static_cast<HINSTANCE>( OpenclDll ) )
-    {
-        fprintf(stderr, "[OD] Load opencl.dll failed!\n");
-        FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) );
-        return 0;
-        
-    }
-    fprintf(stderr, "[OD] Load opencl.dll successful!\n");
-#endif
-    return 1;
-}
-int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
-{
-    envInfo->mpkContext = gpuEnv.mpContext;
-    envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
-    envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
-
-    return 1;
-}
-
-cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus)
-{
-    cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
-                                        nElements * sizeof(l_uint32), hostbuffer, pStatus);
-
-    return membuffer;
-}
-
-PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true)
-{   
-    PROCNAME("mapOutputCLBuffer");
-    if (!pixd)
-    {
-        if (memcopy)
-        {
-            if ((pixd = pixCreateTemplate(pixs)) == NULL)
-                (PIX *)ERROR_PTR("pixd not made", procName, NULL);
-        }
-        else
-        {
-            if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL)
-                (PIX *)ERROR_PTR("pixd not made", procName, NULL);
-        }
-    }
-    l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
-                                                    elements * sizeof(l_uint32), 0, NULL, NULL, NULL );
-    
-    if (memcopy)
-    {
-        memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
-    }
-    else
-    {
-        pixSetData(pixd, pValues);
-    }
-
-    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL);
-    
-    if (sync)
-    {
-        clFinish( rEnv.mpkCmdQueue );
-    }
-
-    return pixd;
-}
-
- cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false)
-{
-    cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
-        nElements * sizeof(l_int32), NULL, pStatus);
-
-    if (_pValues != NULL)
-    {
-        l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
-            nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
-
-        memcpy(pValues, _pValues, nElements * sizeof(l_int32));
-
-        clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
-
-        if (sync)
-            clFinish( rEnv.mpkCmdQueue );
-    }
-
-    return xValues;
-}
-
-int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
-{
-    size_t length;
-    cl_int clStatus;
-    cl_uint numPlatforms, numDevices;
-    cl_platform_id *platforms;
-    cl_context_properties cps[3];
-    char platformName[256];
-    unsigned int i;
-
-
-    // Have a look at the available platforms.
-
-    if ( !gpuInfo->mnIsUserCreated )
-    {
-        clStatus = clGetPlatformIDs( 0, NULL, &numPlatforms );
-        if ( clStatus != CL_SUCCESS )
-        {
-            return 1;
-        }
-        gpuInfo->mpPlatformID = NULL;
-
-        if ( 0 < numPlatforms )
-        {
-            platforms = (cl_platform_id*) malloc( numPlatforms * sizeof( cl_platform_id ) );
-            if ( platforms == (cl_platform_id*) NULL )
-            {
-                return 1;
-            }
-            clStatus = clGetPlatformIDs( numPlatforms, platforms, NULL );
-
-            if ( clStatus != CL_SUCCESS )
-            {
-                return 1;
-            }
-
-            for ( i = 0; i < numPlatforms; i++ )
-            {
-                clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
-                    sizeof( platformName ), platformName, NULL );
-
-                if ( clStatus != CL_SUCCESS )
-                {
-                    return 1;
-                }
-                gpuInfo->mpPlatformID = platforms[i];
-
-                //if (!strcmp(platformName, "Intel(R) Coporation"))
-                //if( !strcmp( platformName, "Advanced Micro Devices, Inc." ))
-                {
-                    gpuInfo->mpPlatformID = platforms[i];
-                    
-                    if ( getenv("SC_OPENCLCPU") )
-                    {
-                        clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
-                                                  CL_DEVICE_TYPE_CPU,    // device_type for CPU device
-                                                  0,                     // num_entries
-                                                  NULL,                  // devices
-                                                  &numDevices);
-                        printf("Selecting OpenCL device: CPU (a)\n");
-                    }
-                    else
-                    {
-                          clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
-                                                  CL_DEVICE_TYPE_GPU,      // device_type for GPU device
-                                                  0,                       // num_entries
-                                                  NULL,                    // devices
-                                                  &numDevices);
-                          printf("Selecting OpenCL device: GPU (a)\n");
-                    }
-                    if ( clStatus != CL_SUCCESS )
-                        continue;
-
-                    if ( numDevices )
-                        break;
-                }
-            }
-            if ( clStatus != CL_SUCCESS )
-                return 1;
-            free( platforms );
-        }
-        if ( NULL == gpuInfo->mpPlatformID )
-            return 1;
-
-        // Use available platform.
-        cps[0] = CL_CONTEXT_PLATFORM;
-        cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
-        cps[2] = 0;
-        // Set device type for OpenCL
-        
-        if ( getenv("SC_OPENCLCPU") )
-        {
-            gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
-            printf("Selecting OpenCL device: CPU (b)\n");
-        }
-        else
-        {
-            gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
-            printf("Selecting OpenCL device: GPU (b)\n");
-        }
-
-        gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
-
-        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
-        {
-            gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
-            gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
-            printf("Selecting OpenCL device: CPU (c)\n");
-        }
-        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
-        {
-            gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT;
-            gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
-            printf("Selecting OpenCL device: DEFAULT (c)\n");
-        }
-        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
-            return 1;
-        // Detect OpenCL devices.
-        // First, get the size of device list data
-        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0, NULL, &length );
-        if ( ( clStatus != CL_SUCCESS ) || ( length == 0 ) )
-            return 1;
-        // Now allocate memory for device list based on the size we got earlier
-        gpuInfo->mpArryDevsID = (cl_device_id*) malloc( length );
-        if ( gpuInfo->mpArryDevsID == (cl_device_id*) NULL )
-            return 1;
-        // Now, get the device list data
-        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, length,
-                       gpuInfo->mpArryDevsID, NULL );
-        if ( clStatus != CL_SUCCESS )
-            return 1;
-
-        // Create OpenCL command queue.
-        gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpArryDevsID[0], 0, &clStatus );
-
-        if ( clStatus != CL_SUCCESS )
-            return 1;
-    }
-
-    clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL );
-    // Check device extensions for double type
-    size_t aDevExtInfoSize = 0;
-
-    clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize );
-    CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
-
-    char *aExtInfo = new char[aDevExtInfoSize];
-
-    clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS,
-                   sizeof(char) * aDevExtInfoSize, aExtInfo, NULL);
-    CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
-
-    gpuInfo->mnKhrFp64Flag = 0;
-    gpuInfo->mnAmdFp64Flag = 0;
-
-    if ( strstr( aExtInfo, "cl_khr_fp64" ) )
-    {
-        gpuInfo->mnKhrFp64Flag = 1;
-    }
-    else
-    {
-        // Check if cl_amd_fp64 extension is supported
-        if ( strstr( aExtInfo, "cl_amd_fp64" ) )
-            gpuInfo->mnAmdFp64Flag = 1;
-    }
-    delete []aExtInfo;
-
-    return 0;
-}
-
-void OpenclDevice::releaseMorphCLBuffers()
-{
-    if (pixdCLIntermediate != NULL)
-        clReleaseMemObject(pixdCLIntermediate);
-    if (pixsCLBuffer != NULL)
-        clReleaseMemObject(pixsCLBuffer);
-    if (pixdCLBuffer != NULL)
-        clReleaseMemObject(pixdCLBuffer);
-    if (pixThBuffer != NULL)
-        clReleaseMemObject(pixThBuffer);
-}
-
-int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs)
-{
-    SetKernelEnv( &rEnv );
-    
-    if (pixThBuffer != NULL)
-    {
-        pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
-        
-        //Get the output from ThresholdToPix operation
-        clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL);
-    }
-    else
-    {
-        //Get data from the source image
-        l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32));
-        memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32));
-    
-        pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
-    }
-    
-    pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
-
-    pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
-
-    return (int)clStatus;
-}
-
-int OpenclDevice::InitEnv()
-{
-//PERF_COUNT_START("OD::InitEnv")
-//    printf("[OD] OpenclDevice::InitEnv()\n");
-#ifdef SAL_WIN32
-    while( 1 )
-    {
-        if( 1 == LoadOpencl() )
-            break;
-    }
-PERF_COUNT_SUB("LoadOpencl")
-#endif
-    // sets up environment, compiles programs
-    
-
-#if USE_DEVICE_SELECTION
-    
-    InitOpenclRunEnv_DeviceSelection( 0 );
-//PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
-#else
-    // init according to device
-    InitOpenclRunEnv( 0 );
-#endif
-//PERF_COUNT_END
-    return 1;
-}
-
-int OpenclDevice::ReleaseOpenclRunEnv()
-{
-    ReleaseOpenclEnv( &gpuEnv );
-#ifdef SAL_WIN32
-    FreeOpenclDll();
-#endif
-    return 1;
-}
-inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
-{
-    if ( kCount < 1 )
-        fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
-    strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
-    gpuEnv.mnKernelCount++;
-    return 0;
-}
-int OpenclDevice::RegistOpenclKernel()
-{
-    if ( !gpuEnv.mnIsUserCreated )
-        memset( &gpuEnv, 0, sizeof(gpuEnv) );
-
-    gpuEnv.mnFileCount = 0; //argc;
-    gpuEnv.mnKernelCount = 0UL;
-
-    AddKernelConfig( 1, (const char*) "oclAverageSub1" );
-    return 0;
-}
-int OpenclDevice::InitOpenclRunEnv( int argc )
-{
-    int status = 0;
-    if ( MAX_CLKERNEL_NUM <= 0 )
-    {
-        return 1;
-    }
-    if ( ( argc > MAX_CLFILE_NUM ) || ( argc < 0 ) )
-        return 1;
-
-    if ( !isInited )
-    {
-        RegistOpenclKernel();
-        //initialize devices, context, comand_queue
-        status = InitOpenclRunEnv( &gpuEnv );
-        if ( status )
-        {
-            fprintf(stderr,"init_opencl_env failed.\n");
-            return 1;
-        }
-        fprintf(stderr,"init_opencl_env successed.\n");
-        //initialize program, kernelName, kernelCount
-        if( getenv( "SC_FLOAT" ) )
-        {
-            gpuEnv.mnKhrFp64Flag = 0;
-            gpuEnv.mnAmdFp64Flag = 0;
-        }
-        if( gpuEnv.mnKhrFp64Flag )
-        {
-            fprintf(stderr,"----use khr double type in kernel----\n");
-            status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
-        }
-        else if( gpuEnv.mnAmdFp64Flag )
-        {
-            fprintf(stderr,"----use amd double type in kernel----\n");
-            status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
-        }
-        else
-        {
-            fprintf(stderr,"----use float type in kernel----\n");
-            status = CompileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" );
-        }
-        if ( status == 0 || gpuEnv.mnKernelCount == 0 )
-        {
-            fprintf(stderr,"CompileKernelFile failed.\n");
-            return 1;
-        }
-        fprintf(stderr,"CompileKernelFile successed.\n");
-        isInited = 1;
-    }
-    return 0;
-}
-
-int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
-//PERF_COUNT_START("InitOpenclRunEnv_DS")
-#if USE_DEVICE_SELECTION
-    if (!isInited) {
-        // after programs compiled, selects best device
-        //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n");
-        ds_device bestDevice_DS = getDeviceSelection( );
-//PERF_COUNT_SUB("called getDeviceSelection()")
-        cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
-        // overwrite global static GPUEnv with new device
-        if (selectedDeviceIsOpenCL() ) {
-            //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
-        populateGPUEnvFromDevice( &gpuEnv, bestDevice );
-        gpuEnv.mnFileCount = 0; //argc;
-        gpuEnv.mnKernelCount = 0UL;
-//PERF_COUNT_SUB("populate gpuEnv")
-        CompileKernelFile(&gpuEnv, "");
-//PERF_COUNT_SUB("CompileKernelFile")
-        } else {
-            //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
-        }
-        isInited = 1;
-    }
-#endif
-//PERF_COUNT_END
-    return 0;
-}
-
-
-OpenclDevice::OpenclDevice()
-{
-    //InitEnv();
-}
-
-OpenclDevice::~OpenclDevice()
-{
-    //ReleaseOpenclRunEnv();
-}
-
-int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
-{
-    int i = 0;
-    int clStatus = 0;
-
-    if ( !isInited )
-    {
-        return 1;
-    }
-
-    for ( i = 0; i < gpuEnv.mnFileCount; i++ )
-    {
-        if ( gpuEnv.mpArryPrograms[i] )
-        {
-            clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
-            CHECK_OPENCL( clStatus, "clReleaseProgram" );
-            gpuEnv.mpArryPrograms[i] = NULL;
-        }
-    }
-    if ( gpuEnv.mpCmdQueue )
-    {
-        clReleaseCommandQueue( gpuEnv.mpCmdQueue );
-        gpuEnv.mpCmdQueue = NULL;
-    }
-    if ( gpuEnv.mpContext )
-    {
-        clReleaseContext( gpuEnv.mpContext );
-        gpuEnv.mpContext = NULL;
-    }
-    isInited = 0;
-    gpuInfo->mnIsUserCreated = 0;
-    free( gpuInfo->mpArryDevsID );
-    return 1;
-}
-int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
-{
-    unsigned int i = 0;
-    cl_int clStatus;
-    int status = 0;
-    char *str = NULL;
-    FILE *fd = NULL;
-    cl_uint numDevices=0;
-    if ( getenv("SC_OPENCLCPU") )
-    {
-        clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
-                                  CL_DEVICE_TYPE_CPU,  // device_type for CPU device
-                                  0,                   // num_entries
-                                  NULL,                // devices ID
-                                  &numDevices);
-    }
-    else
-    {
-        clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
-                                  CL_DEVICE_TYPE_GPU,  // device_type for GPU device
-                                  0,                   // num_entries
-                                  NULL,                // devices ID
-                                  &numDevices);
-    }
-    CHECK_OPENCL( clStatus, "clGetDeviceIDs" );
-    for ( i = 0; i < numDevices; i++ )
-    {
-        char fileName[256] = { 0 }, cl_name[128] = { 0 };
-        if ( gpuEnv.mpArryDevsID[i] != 0 )
-        {
-            char deviceName[1024];
-            clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
-            CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
-            str = (char*) strstr( clFileName, (char*) ".cl" );
-            memcpy( cl_name, clFileName, str - clFileName );
-            cl_name[str - clFileName] = '\0';
-            sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
-            legalizeFileName(fileName);
-            fd = fopen( fileName, "rb" );
-            status = ( fd != NULL ) ? 1 : 0;
-        }
-    }
-    if ( fd != NULL )
-    {
-        *fhandle = fd;
-    }
-    return status;
-
-}
-int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
-{
-    int i;
-    for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
-    {
-        if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
-        {
-            if ( gpuEnvCached->mpArryPrograms[i] != NULL )
-            {
-                return 1;
-            }
-        }
-    }
-
-    return 0;
-}
-int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
-{
-    FILE *output = NULL;
-    output = fopen( fileName, "wb" );
-    if ( output == NULL )
-    {
-        return 0;
-    }
-
-    fwrite( birary, sizeof(char), numBytes, output );
-    fclose( output );
-
-    return 1;
-
-}
-int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
-{
-    unsigned int i = 0;
-    cl_int clStatus;
-    size_t *binarySizes, numDevices;
-    cl_device_id *mpArryDevsID;
-    char **binaries, *str = NULL;
-
-    clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES,
-                   sizeof(numDevices), &numDevices, NULL );
-    CHECK_OPENCL( clStatus, "clGetProgramInfo" );
-
-    mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
-    if ( mpArryDevsID == NULL )
-    {
-        return 0;
-    }
-    /* grab the handles to all of the devices in the program. */
-    clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES,
-                   sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL );
-    CHECK_OPENCL( clStatus, "clGetProgramInfo" );
-
-    /* figure out the sizes of each of the binaries. */
-    binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
-
-    clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES,
-                   sizeof(size_t) * numDevices, binarySizes, NULL );
-    CHECK_OPENCL( clStatus, "clGetProgramInfo" );
-
-    /* copy over all of the generated binaries. */
-    binaries = (char**) malloc( sizeof(char *) * numDevices );
-    if ( binaries == NULL )
-    {
-        return 0;
-    }
-
-    for ( i = 0; i < numDevices; i++ )
-    {
-        if ( binarySizes[i] != 0 )
-        {
-            binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
-            if ( binaries[i] == NULL )
-            {
-                return 0;
-            }
-        }
-        else
-        {
-            binaries[i] = NULL;
-        }
-    }
-
-    clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES,
-                   sizeof(char *) * numDevices, binaries, NULL );
-    CHECK_OPENCL(clStatus,"clGetProgramInfo");
-
-    /* dump out each binary into its own separate file. */
-    for ( i = 0; i < numDevices; i++ )
-    {
-        char fileName[256] = { 0 }, cl_name[128] = { 0 };
-
-        if ( binarySizes[i] != 0 )
-        {
-            char deviceName[1024];
-            clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
-                           sizeof(deviceName), deviceName, NULL);
-            CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
-
-            str = (char*) strstr( clFileName, (char*) ".cl" );
-            memcpy( cl_name, clFileName, str - clFileName );
-            cl_name[str - clFileName] = '\0';
-            sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
-            legalizeFileName(fileName);
-            if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
-            {
-                printf("[OD] write binary[%s] failed\n", fileName);
-                return 0;
-            } //else
-            printf("[OD] write binary[%s] succesfully\n", fileName);
-        }
-    }
-
-    // Release all resouces and memory
-    for ( i = 0; i < numDevices; i++ )
-    {
-        if ( binaries[i] != NULL )
-        {
-            free( binaries[i] );
-            binaries[i] = NULL;
-        }
-    }
-
-    if ( binaries != NULL )
-    {
-        free( binaries );
-        binaries = NULL;
-    }
-
-    if ( binarySizes != NULL )
-    {
-        free( binarySizes );
-        binarySizes = NULL;
-    }
-
-    if ( mpArryDevsID != NULL )
-    {
-        free( mpArryDevsID );
-        mpArryDevsID = NULL;
-    }
-    return 1;
-}
-
-void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus )
-{
-    l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
-        nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
-    clFinish( rEnv.mpkCmdQueue );
-    if (_pValues != NULL)
-    {
-        for ( int i = 0; i < (int)nElements; i++ )
-            pValues[i] = (l_int32)_pValues[i];
-    }
-
-    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
-    //clFinish( rEnv.mpkCmdQueue );
-    return;
-}
-
-int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
-{
-//PERF_COUNT_START("CompileKernelFile")
-    cl_int clStatus = 0;
-    size_t length;
-    char *buildLog = NULL, *binary;
-    const char *source;
-    size_t source_size[1];
-    int b_error, binary_status, binaryExisted, idx;
-    size_t numDevices;
-    cl_device_id *mpArryDevsID;
-    FILE *fd, *fd1;
-    const char* filename = "kernel.cl";
-    //fprintf(stderr, "[OD] CompileKernelFile ... \n");
-    if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
-    {
-        return 1;
-    }
-
-    idx = gpuInfo->mnFileCount;
-
-    source = kernel_src;
-
-    source_size[0] = strlen( source );
-    binaryExisted = 0;
-        binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
-//PERF_COUNT_SUB("BinaryGenerated")
-    if ( binaryExisted == 1 )
-    {
-        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
-                       sizeof(numDevices), &numDevices, NULL );
-        CHECK_OPENCL( clStatus, "clGetContextInfo" );
-
-        mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
-        if ( mpArryDevsID == NULL )
-        {
-            return 0;
-        }
-//PERF_COUNT_SUB("get numDevices")
-        b_error = 0;
-        length = 0;
-        b_error |= fseek( fd, 0, SEEK_END ) < 0;
-        b_error |= ( length = ftell(fd) ) <= 0;
-        b_error |= fseek( fd, 0, SEEK_SET ) < 0;
-        if ( b_error )
-        {
-            return 0;
-        }
-
-        binary = (char*) malloc( length + 2 );
-        if ( !binary )
-        {
-            return 0;
-        }
-
-        memset( binary, 0, length + 2 );
-        b_error |= fread( binary, 1, length, fd ) != length;
-
-
-        fclose( fd );
-//PERF_COUNT_SUB("read file")
-        fd = NULL;
-        // grab the handles to all of the devices in the context.
-        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES,
-                       sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL );
-        CHECK_OPENCL( clStatus, "clGetContextInfo" );
-//PERF_COUNT_SUB("get devices")
-        //fprintf(stderr, "[OD] Create kernel from binary\n");
-        gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
-                                           mpArryDevsID, &length, (const unsigned char**) &binary,
-                                           &binary_status, &clStatus );
-        CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
-//PERF_COUNT_SUB("clCreateProgramWithBinary")
-        free( binary );
-        free( mpArryDevsID );
-        mpArryDevsID = NULL;
-//PERF_COUNT_SUB("binaryExisted")
-    }
-    else
-    {
-        // create a CL program using the kernel source
-        //fprintf(stderr, "[OD] Create kernel from source\n");
-        gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
-                                         source_size, &clStatus);
-        CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
-//PERF_COUNT_SUB("!binaryExisted")
-    }
-
-    if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL )
-    {
-        return 0;
-    }
-
-    //char options[512];
-    // create a cl program executable for all the devices specified
-    //printf("[OD] BuildProgram.\n");
-PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
-    if (!gpuInfo->mnIsUserCreated)
-    {
-        clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
-                       buildOption, NULL, NULL);
-//PERF_COUNT_SUB("clBuildProgram notUserCreated")
-    }
-    else
-    {
-        clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
-                       buildOption, NULL, NULL);
-//PERF_COUNT_SUB("clBuildProgram isUserCreated")
-    }
-PERF_COUNT_END
-    if ( clStatus != CL_SUCCESS )
-    {
-        printf ("BuildProgram error!\n");
-        if ( !gpuInfo->mnIsUserCreated )
-        {
-            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
-                           CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
-        }
-        else
-        {
-            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
-                           CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
-        }
-        if ( clStatus != CL_SUCCESS )
-        {
-            printf("opencl create build log fail\n");
-            return 0;
-        }
-        buildLog = (char*) malloc( length );
-        if ( buildLog == (char*) NULL )
-        {
-            return 0;
-        }
-        if ( !gpuInfo->mnIsUserCreated )
-        {
-            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
-                           CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
-        }
-        else
-        {
-            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
-                           CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
-        }
-        if ( clStatus != CL_SUCCESS )
-        {
-            printf("opencl program build info fail\n");
-            return 0;
-        }
-
-        fd1 = fopen( "kernel-build.log", "w+" );
-        if ( fd1 != NULL )
-        {
-            fwrite( buildLog, sizeof(char), length, fd1 );
-            fclose( fd1 );
-        }
-
-        free( buildLog );
-//PERF_COUNT_SUB("build error log")
-        return 0;
-    }
-
-    strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
-//PERF_COUNT_SUB("strcpy")
-    if ( binaryExisted == 0 ) {
-        GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
-        PERF_COUNT_SUB("GenerateBinFromKernelSource")
-    }
-
-    gpuInfo->mnFileCount += 1;
-//PERF_COUNT_END
-    return 1;
-}
-
-l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
-{
-PERF_COUNT_START("pixReadFromTiffKernel")
-    cl_int clStatus;
-    KernelEnv rEnv;
-    size_t globalThreads[2];
-    size_t localThreads[2];
-    int gsize;
-    cl_mem valuesCl; 
-    cl_mem outputCl;
-
-    //global and local work dimensions for Horizontal pass
-    gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
-    globalThreads[0] = gsize;
-    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
-    globalThreads[1] = gsize;
-    localThreads[0] = GROUPSIZE_X;
-    localThreads[1] = GROUPSIZE_Y;
-
-    SetKernelEnv( &rEnv );
-    
-    l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
-    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateKernel");
-    
-    //Allocate input and output OCL buffers
-    valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
-    outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
-
-    //Kernel arguments
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl );
-    CHECK_OPENCL( clStatus, "clSetKernelArg");
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w );
-    CHECK_OPENCL( clStatus, "clSetKernelArg" );
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h );
-    CHECK_OPENCL( clStatus, "clSetKernelArg" );
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl );
-    CHECK_OPENCL( clStatus, "clSetKernelArg" );
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl );
-    CHECK_OPENCL( clStatus, "clSetKernelArg");
-    
-    //Kernel enqueue
-PERF_COUNT_SUB("before")
-    clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL );
-    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
-    
-     /* map results back from gpu */
-    void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus);
-    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl");
-    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL);
-
-    //Sync
-    clFinish( rEnv.mpkCmdQueue );
-PERF_COUNT_SUB("kernel & map")
-PERF_COUNT_END
-    return pResult;
-}
-
-
-PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n )
-{
-PERF_COUNT_START("pixReadTiffCL")
-    FILE  *fp;
-PIX   *pix;
-
-    //printf("pixReadTiffCl file");
-    PROCNAME("pixReadTiff");
-
-    if (!filename)
-        return (PIX *)ERROR_PTR("filename not defined", procName, NULL);
-
-    if ((fp = fopenReadStream(filename)) == NULL)
-        return (PIX *)ERROR_PTR("image file not found", procName, NULL);
-    if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) {
-        fclose(fp);
-        return (PIX *)ERROR_PTR("pix not read", procName, NULL);
-    }
-    fclose(fp);
-PERF_COUNT_END
-    return pix;
-    
-}
-TIFF *
-OpenclDevice::fopenTiffCl(FILE        *fp,
-          const char  *modestring)
-{
-l_int32  fd;
-
-    PROCNAME("fopenTiff");
-
-    if (!fp)
-        return (TIFF *)ERROR_PTR("stream not opened", procName, NULL);
-    if (!modestring)
-        return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL);
-
-    if ((fd = fileno(fp)) < 0)
-        return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL);
-    lseek(fd, 0, SEEK_SET);
-
-    return TIFFFdOpen(fd, "TIFFstream", modestring);
-}
-l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF     *tif,
-                        l_int32  *pxres,
-                        l_int32  *pyres)
-{
-l_uint16   resunit;
-l_int32    foundxres, foundyres;
-l_float32  fxres, fyres;
-
-    PROCNAME("getTiffStreamResolution");
-
-    if (!tif)
-        return ERROR_INT("tif not opened", procName, 1);
-    if (!pxres || !pyres)
-        return ERROR_INT("&xres and &yres not both defined", procName, 1);
-    *pxres = *pyres = 0;
-
-    TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit);
-    foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres);
-    foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres);
-    if (!foundxres && !foundyres) return 1;
-    if (!foundxres && foundyres)
-        fxres = fyres;
-    else if (foundxres && !foundyres)
-        fyres = fxres;
-
-    if (resunit == RESUNIT_CENTIMETER) {  /* convert to ppi */
-        *pxres = (l_int32)(2.54 * fxres + 0.5);
-        *pyres = (l_int32)(2.54 * fyres + 0.5);
-    }
-    else {
-        *pxres = (l_int32)fxres;
-        *pyres = (l_int32)fyres;
-    }
-
-    return 0;
-}
-PIX *
-OpenclDevice::pixReadStreamTiffCl(FILE    *fp,
-                  l_int32  n)
-{
-l_int32  i, pagefound;
-PIX     *pix;
-TIFF    *tif;
-
-    PROCNAME("pixReadStreamTiff");
-
-    if (!fp)
-        return (PIX *)ERROR_PTR("stream not defined", procName, NULL);
-
-    if ((tif = fopenTiffCl(fp, "rb")) == NULL)
-        return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
-
-    pagefound = FALSE;
-    pix = NULL;
-    for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
-        if (i == n) {
-            pagefound = TRUE;
-            if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
-                TIFFCleanup(tif);
-                return (PIX *)ERROR_PTR("pix not read", procName, NULL);
-            }
-            break;
-        }
-        if (TIFFReadDirectory(tif) == 0)
-            break;
-    }
-
-    if (pagefound == FALSE) {
-        L_WARNING_INT("tiff page %d not found", procName, n);
-        TIFFCleanup(tif);
-        return NULL;
-    }
-
-    TIFFCleanup(tif);
-    return pix;
-}
-
-static l_int32
-getTiffCompressedFormat(l_uint16  tiffcomp)
-{
-l_int32  comptype;
-
-    switch (tiffcomp)
-    {
-    case COMPRESSION_CCITTFAX4:
-        comptype = IFF_TIFF_G4;
-        break;
-    case COMPRESSION_CCITTFAX3:
-        comptype = IFF_TIFF_G3;
-        break;
-    case COMPRESSION_CCITTRLE:
-        comptype = IFF_TIFF_RLE;
-        break;
-    case COMPRESSION_PACKBITS:
-        comptype = IFF_TIFF_PACKBITS;
-        break;
-    case COMPRESSION_LZW:
-        comptype = IFF_TIFF_LZW;
-        break;
-    case COMPRESSION_ADOBE_DEFLATE:
-        comptype = IFF_TIFF_ZIP;
-        break;
-    default:
-        comptype = IFF_TIFF;
-        break;
-    }
-    return comptype;
-}
-
-void compare(l_uint32  *cpu, l_uint32  *gpu,int size)
-{
-    for(int i=0;i<size;i++)
-    {
-        if(cpu[i]!=gpu[i])
-        {
-            printf("\ndoesnot match\n");
-            return;
-        }
-    }
-    printf("\nit matches\n");
-    
-}
-
-//OpenCL implementation of pixReadFromTiffStream.
-//Similar to the CPU implentation of pixReadFromTiffStream
-PIX *
-OpenclDevice::pixReadFromTiffStreamCl(TIFF  *tif)
-{
-l_uint8   *linebuf, *data;
-l_uint16   spp, bps, bpp, tiffbpl, photometry, tiffcomp, orientation;
-l_uint16  *redmap, *greenmap, *bluemap;
-l_int32    d, wpl, bpl, comptype, i, ncolors;
-l_int32    xres, yres;
-l_uint32   w, h;
-l_uint32  *line, *tiffdata;
-PIX       *pix;
-PIXCMAP   *cmap;
-
-    PROCNAME("pixReadFromTiffStream");
-
-    if (!tif)
-        return (PIX *)ERROR_PTR("tif not defined", procName, NULL);
-
-    
-    TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bps);
-    TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
-    bpp = bps * spp;
-    if (bpp > 32)
-        return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL);
-    if (spp == 1)
-        d = bps;
-    else if (spp == 3 || spp == 4)
-        d = 32;
-    else
-        return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL);
-
-    TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w);
-    TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h);
-    tiffbpl = TIFFScanlineSize(tif);
-
-    if ((pix = pixCreate(w, h, d)) == NULL)
-        return (PIX *)ERROR_PTR("pix not made", procName, NULL);
-    data = (l_uint8 *)pixGetData(pix);
-    wpl = pixGetWpl(pix);
-    bpl = 4 * wpl;
-
-   
-    if (spp == 1) {
-        if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL)
-            return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL);
-        
-        for (i = 0 ; i < h ; i++) {
-            if (TIFFReadScanline(tif, linebuf, i, 0) < 0) {
-                FREE(linebuf);
-                pixDestroy(&pix);
-                return (PIX *)ERROR_PTR("line read fail", procName, NULL);
-            }
-            memcpy((char *)data, (char *)linebuf, tiffbpl);
-            data += bpl;
-        }
-        if (bps <= 8)
-            pixEndianByteSwap(pix);
-        else   
-            pixEndianTwoByteSwap(pix);
-        FREE(linebuf);
-    }
-    else {  
-        if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) {
-            pixDestroy(&pix);
-            return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL);
-        }
-        if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata,
-                                       ORIENTATION_TOPLEFT, 0)) {
-            FREE(tiffdata);
-            pixDestroy(&pix);
-            return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL);
-        }
-        line = pixGetData(pix);
-
-        //Invoke the OpenCL kernel for pixReadFromTiff
-        l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
-        pixSetData(pix, output_gpu);
-        
-        FREE(tiffdata);
-    }
-
-    if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) {
-        pixSetXRes(pix, xres);
-        pixSetYRes(pix, yres);
-    }
-
-
-    TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp);
-    comptype = getTiffCompressedFormat(tiffcomp);
-    pixSetInputFormat(pix, comptype);
-
-    if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) {
-           
-        if ((cmap = pixcmapCreate(bps)) == NULL) {
-            pixDestroy(&pix);
-            return (PIX *)ERROR_PTR("cmap not made", procName, NULL);
-        }
-        ncolors = 1 << bps;
-        for (i = 0; i < ncolors; i++)
-            pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8,
-                            bluemap[i] >> 8);
-        pixSetColormap(pix, cmap);
-    }
-    else {  
-        if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) {
-       
-            if (tiffcomp == COMPRESSION_CCITTFAX3 ||
-                tiffcomp == COMPRESSION_CCITTFAX4 ||
-                tiffcomp == COMPRESSION_CCITTRLE ||
-                tiffcomp == COMPRESSION_CCITTRLEW) {
-                photometry = PHOTOMETRIC_MINISWHITE;
-            }
-            else
-                photometry = PHOTOMETRIC_MINISBLACK;
-        }
-        if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) ||
-            (d == 8 && photometry == PHOTOMETRIC_MINISWHITE))
-            pixInvert(pix, pix);
-    }
-
-    if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) {
-        if (orientation >= 1 && orientation <= 8) {
-            struct tiff_transform *transform =
-              &tiff_orientation_transforms[orientation - 1];
-            if (transform->vflip) pixFlipTB(pix, pix);
-            if (transform->hflip) pixFlipLR(pix, pix);
-            if (transform->rotate) {
-                PIX *oldpix = pix;
-                pix = pixRotate90(oldpix, transform->rotate);
-                pixDestroy(&oldpix);
-           }
-        }
-    }
-
-    return pix;
-}
-
-//Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
-cl_int
-pixDilateCL_55(l_int32  wpl, l_int32  h)
-{
-    size_t globalThreads[2];
-    cl_mem pixtemp;
-    cl_int status;
-    int gsize;
-    size_t localThreads[2];
-
-    //Horizontal pass
-    gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
-    globalThreads[0] = gsize;
-    globalThreads[1] = GROUPSIZE_HMORY;
-    localThreads[0] = GROUPSIZE_HMORX;
-    localThreads[1] = GROUPSIZE_HMORY;
-
-    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status );
-    
-    status = clSetKernelArg(rEnv.mpkKernel,
-        0,
-        sizeof(cl_mem),
-        &pixsCLBuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        1,
-        sizeof(cl_mem),
-        &pixdCLBuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        2,
-        sizeof(wpl),
-        (const void *)&wpl);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        3,
-        sizeof(h),
-        (const void *)&h);
-
-    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                            rEnv.mpkKernel,
-                            2,
-                            NULL,
-                            globalThreads,
-                            localThreads,
-                            0,
-                            NULL,
-                            NULL);
-    
-    //Swap source and dest buffers
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLBuffer;
-    pixdCLBuffer = pixtemp;
-
-    //Vertical
-    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
-    globalThreads[0] = gsize;
-    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
-    globalThreads[1] = gsize;
-    localThreads[0] = GROUPSIZE_X;
-    localThreads[1] = GROUPSIZE_Y;
-
-    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status );
-    
-    status = clSetKernelArg(rEnv.mpkKernel,
-        0,
-        sizeof(cl_mem),
-        &pixsCLBuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        1,
-        sizeof(cl_mem),
-        &pixdCLBuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        2,
-        sizeof(wpl),
-        (const void *)&wpl);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        3,
-        sizeof(h),
-        (const void *)&h);
-    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                            rEnv.mpkKernel,
-                            2,
-                            NULL,
-                            globalThreads,
-                            localThreads,
-                            0,
-                            NULL,
-                            NULL);
-
-    return status;
-}
-
-//Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
-cl_int
-pixErodeCL_55(l_int32  wpl, l_int32  h)
-{
-    size_t globalThreads[2];
-    cl_mem pixtemp;
-    cl_int status;
-    int gsize;
-    l_uint32 fwmask, lwmask;
-    size_t localThreads[2];
-
-    lwmask = lmask32[32 - 2];
-    fwmask = rmask32[32 - 2];
-
-    //Horizontal pass
-    gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
-    globalThreads[0] = gsize;
-    globalThreads[1] = GROUPSIZE_HMORY;
-    localThreads[0] = GROUPSIZE_HMORX;
-    localThreads[1] = GROUPSIZE_HMORY;
-
-    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status );
-    
-    status = clSetKernelArg(rEnv.mpkKernel,
-        0,
-        sizeof(cl_mem),
-        &pixsCLBuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        1,
-        sizeof(cl_mem),
-        &pixdCLBuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        2,
-        sizeof(wpl),
-        (const void *)&wpl);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        3,
-        sizeof(h),
-        (const void *)&h);
-
-    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                            rEnv.mpkKernel,
-                            2,
-                            NULL,
-                            globalThreads,
-                            localThreads,
-                            0,
-                            NULL,
-                            NULL);
-    
-    //Swap source and dest buffers
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLBuffer;
-    pixdCLBuffer = pixtemp;
-
-    //Vertical
-    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
-    globalThreads[0] = gsize;
-    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
-    globalThreads[1] = gsize;
-    localThreads[0] = GROUPSIZE_X;
-    localThreads[1] = GROUPSIZE_Y;
-
-    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status );
-    
-    status = clSetKernelArg(rEnv.mpkKernel,
-        0,
-        sizeof(cl_mem),
-        &pixsCLBuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        1,
-        sizeof(cl_mem),
-        &pixdCLBuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        2,
-        sizeof(wpl),
-        (const void *)&wpl);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        3,
-        sizeof(h),
-        (const void *)&h);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        4,
-        sizeof(fwmask),
-        (const void *)&fwmask);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        5,
-        sizeof(lwmask),
-        (const void *)&lwmask);
-    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                            rEnv.mpkKernel,
-                            2,
-                            NULL,
-                            globalThreads,
-                            localThreads,
-                            0,
-                            NULL,
-                            NULL);
-
-    return status;
-}
-
-//Morphology Dilate operation. Invokes the relevant OpenCL kernels
-cl_int
-pixDilateCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
-{
-    l_int32  xp, yp, xn, yn;
-    SEL* sel;
-    size_t globalThreads[2];
-    cl_mem pixtemp;
-    cl_int status;
-    int gsize;
-    size_t localThreads[2];
-    char isEven;
-
-    OpenclDevice::SetKernelEnv( &rEnv );
-    
-    if (hsize == 5 && vsize == 5)
-    {
-        //Specific case for 5x5
-        status = pixDilateCL_55(wpl, h);
-        return status;
-    }
-    
-    sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
-    
-    selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
-
-    //global and local work dimensions for Horizontal pass
-    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
-    globalThreads[0] = gsize;
-    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
-    globalThreads[1] = gsize;
-    localThreads[0] = GROUPSIZE_X;
-    localThreads[1] = GROUPSIZE_Y;
-
-    if (xp > 31 || xn > 31)
-    {
-        //Generic case. 
-        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status );
-    
-        status = clSetKernelArg(rEnv.mpkKernel,
-            0,
-            sizeof(cl_mem),
-            &pixsCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-            1,
-            sizeof(cl_mem),
-            &pixdCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                2,
-                sizeof(xp),
-                (const void *)&xp);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                3,
-                sizeof(xn),
-                (const void *)&xn);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                4,
-                sizeof(wpl),
-                (const void *)&wpl);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                5,
-                sizeof(h),
-                (const void *)&h);
-        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                                rEnv.mpkKernel,
-                                2,
-                                NULL,
-                                globalThreads,
-                                localThreads,
-                                0,
-                                NULL,
-                                NULL);
-
-        if (yp > 0 || yn > 0)
-        {
-            pixtemp = pixsCLBuffer;
-            pixsCLBuffer = pixdCLBuffer;
-            pixdCLBuffer = pixtemp;
-        }
-    }
-    else if (xp > 0 || xn > 0 )
-    {
-        //Specfic Horizontal pass kernel for half width < 32
-        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status );
-        isEven = (xp != xn);
-        
-        status = clSetKernelArg(rEnv.mpkKernel,
-            0,
-            sizeof(cl_mem),
-            &pixsCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-            1,
-            sizeof(cl_mem),
-            &pixdCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                2,
-                sizeof(xp),
-                (const void *)&xp);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                3,
-                sizeof(wpl),
-                (const void *)&wpl);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                4,
-                sizeof(h),
-                (const void *)&h);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                5,
-                sizeof(isEven),
-                (const void *)&isEven);
-        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                                rEnv.mpkKernel,
-                                2,
-                                NULL,
-                                globalThreads,
-                                localThreads,
-                                0,
-                                NULL,
-                                NULL);
-
-        if (yp > 0 || yn > 0)
-        {
-            pixtemp = pixsCLBuffer;
-            pixsCLBuffer = pixdCLBuffer;
-            pixdCLBuffer = pixtemp;
-        }
-    } 
-
-    if (yp > 0 || yn > 0)
-    {
-        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status );
-        
-        status = clSetKernelArg(rEnv.mpkKernel,
-            0,
-            sizeof(cl_mem),
-            &pixsCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-            1,
-            sizeof(cl_mem),
-            &pixdCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                2,
-                sizeof(yp),
-                (const void *)&yp);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                3,
-                sizeof(wpl),
-                (const void *)&wpl);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                4,
-                sizeof(h),
-                (const void *)&h);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                5,
-                sizeof(yn),
-                (const void *)&yn);
-        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                                rEnv.mpkKernel,
-                                2,
-                                NULL,
-                                globalThreads,
-                                localThreads,
-                                0,
-                                NULL,
-                                NULL);
-    }
-    
-
-    return status;
-}
-
-//Morphology Erode operation. Invokes the relevant OpenCL kernels
-cl_int 
-pixErodeCL(l_int32  hsize, l_int32  vsize, l_uint32 wpl, l_uint32 h)
-{
-
-    l_int32  xp, yp, xn, yn;
-    SEL* sel;
-    size_t globalThreads[2];
-    size_t localThreads[2];
-    cl_mem pixtemp;
-    cl_int status;
-    int gsize;
-    char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
-    l_uint32 rwmask, lwmask;
-    char isEven;
-
-    sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
-    
-    selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
-    
-    OpenclDevice::SetKernelEnv( &rEnv );
-
-    if (hsize == 5 && vsize == 5 && isAsymmetric)
-    {
-        //Specific kernel for 5x5
-        status = pixErodeCL_55(wpl, h);
-        return status;
-    }
-
-    rwmask = rmask32[32 - (xp & 31)];
-    lwmask = lmask32[32 - (xn & 31)];
-
-    //global and local work dimensions for Horizontal pass
-    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
-    globalThreads[0] = gsize;
-    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
-    globalThreads[1] = gsize;
-    localThreads[0] = GROUPSIZE_X;
-    localThreads[1] = GROUPSIZE_Y;
-    
-    //Horizontal Pass
-    if (xp > 31 || xn > 31 )
-    {
-        //Generic case. 
-        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status );
-        
-        status = clSetKernelArg(rEnv.mpkKernel,
-            0,
-            sizeof(cl_mem),
-            &pixsCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-            1,
-            sizeof(cl_mem),
-            &pixdCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                2,
-                sizeof(xp),
-                (const void *)&xp);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                3,
-                sizeof(xn),
-                (const void *)&xn);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                4,
-                sizeof(wpl),
-                (const void *)&wpl);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                5,
-                sizeof(h),
-                (const void *)&h);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                6,
-                sizeof(isAsymmetric),
-                (const void *)&isAsymmetric);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                7,
-                sizeof(rwmask),
-                (const void *)&rwmask);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                8,
-                sizeof(lwmask),
-                (const void *)&lwmask);
-        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                                rEnv.mpkKernel,
-                                2,
-                                NULL,
-                                globalThreads,
-                                localThreads,
-                                0,
-                                NULL,
-                                NULL);
-
-        if (yp > 0 || yn > 0)
-        {
-            pixtemp = pixsCLBuffer;
-            pixsCLBuffer = pixdCLBuffer;
-            pixdCLBuffer = pixtemp;
-        }
-    }
-    else if (xp > 0 || xn > 0)
-    {
-        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status );
-        isEven = (xp != xn);
-
-        status = clSetKernelArg(rEnv.mpkKernel,
-            0,
-            sizeof(cl_mem),
-            &pixsCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-            1,
-            sizeof(cl_mem),
-            &pixdCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                2,
-                sizeof(xp),
-                (const void *)&xp);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                3,
-                sizeof(wpl),
-                (const void *)&wpl);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                4,
-                sizeof(h),
-                (const void *)&h);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                5,
-                sizeof(isAsymmetric),
-                (const void *)&isAsymmetric);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                6,
-                sizeof(rwmask),
-                (const void *)&rwmask);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                7,
-                sizeof(lwmask),
-                (const void *)&lwmask);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                8,
-                sizeof(isEven),
-                (const void *)&isEven);
-        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                                rEnv.mpkKernel,
-                                2,
-                                NULL,
-                                globalThreads,
-                                localThreads,
-                                0,
-                                NULL,
-                                NULL);
-    
-        if (yp > 0 || yn > 0)
-        {
-            pixtemp = pixsCLBuffer;
-            pixsCLBuffer = pixdCLBuffer;
-            pixdCLBuffer = pixtemp;
-        }
-    }
-
-    //Vertical Pass
-    if (yp > 0 || yn > 0)
-    {
-        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status );
-        
-        status = clSetKernelArg(rEnv.mpkKernel,
-            0,
-            sizeof(cl_mem),
-            &pixsCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-            1,
-            sizeof(cl_mem),
-            &pixdCLBuffer);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                2,
-                sizeof(yp),
-                (const void *)&yp);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                3,
-                sizeof(wpl),
-                (const void *)&wpl);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                4,
-                sizeof(h),
-                (const void *)&h);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                5,
-                sizeof(isAsymmetric),
-                (const void *)&isAsymmetric);
-        status = clSetKernelArg(rEnv.mpkKernel,
-                6,
-                sizeof(yn),
-                (const void *)&yn);
-        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                                rEnv.mpkKernel,
-                                2,
-                                NULL,
-                                globalThreads,
-                                localThreads,
-                                0,
-                                NULL,
-                                NULL);
-    }
-
-    return status;
-}
-
-// OpenCL implementation of Morphology Dilate
-//Note: Assumes the source and dest opencl buffer are initialized. No check done
-PIX* 
-OpenclDevice::pixDilateBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy = false)
-{
-    l_uint32 wpl, h;
-
-    wpl = pixGetWpl(pixs);
-    h = pixGetHeight(pixs);
-    
-    clStatus = pixDilateCL(hsize, vsize, wpl, h);
-
-    if (reqDataCopy)
-    {
-        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false);
-    }
-
-    return pixd;
-}
-
-// OpenCL implementation of Morphology Erode
-//Note: Assumes the source and dest opencl buffer are initialized. No check done
-PIX* 
-OpenclDevice::pixErodeBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy = false)
-{
-    l_uint32 wpl, h;
-    
-    wpl = pixGetWpl(pixs);
-    h = pixGetHeight(pixs);
-
-    clStatus = pixErodeCL(hsize, vsize, wpl, h);
-    
-    if (reqDataCopy)
-    {
-        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
-    }
-
-    return pixd;
-}
-
-//Morphology Open operation. Invokes the relevant OpenCL kernels
-cl_int
-pixOpenCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
-{
-    cl_int status;
-    cl_mem pixtemp;
-    
-    //Erode followed by Dilate
-    status = pixErodeCL(hsize, vsize, wpl, h);
-    
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLBuffer;
-    pixdCLBuffer = pixtemp;
-
-    status = pixDilateCL(hsize, vsize, wpl, h);
-
-    return status;
-}
-
-//Morphology Close operation. Invokes the relevant OpenCL kernels
-cl_int
-pixCloseCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
-{
-    cl_int status;
-    cl_mem pixtemp;
-    
-    //Dilate followed by Erode
-    status = pixDilateCL(hsize, vsize, wpl, h);
-    
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLBuffer;
-    pixdCLBuffer = pixtemp;
-
-    status = pixErodeCL(hsize, vsize, wpl, h);
-
-    return status;
-}
-
-// OpenCL implementation of Morphology Close
-//Note: Assumes the source and dest opencl buffer are initialized. No check done
-PIX* 
-OpenclDevice::pixCloseBrickCL(PIX  *pixd, 
-                              PIX  *pixs, 
-                              l_int32  hsize, 
-                              l_int32  vsize, 
-                              bool reqDataCopy = false)
-{
-    l_uint32 wpl, h;
-    
-    wpl = pixGetWpl(pixs);
-    h = pixGetHeight(pixs);
-
-    clStatus = pixCloseCL(hsize, vsize, wpl, h);
-
-    if (reqDataCopy)
-    {
-        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
-    }
-
-    return pixd;
-}
-
-// OpenCL implementation of Morphology Open
-//Note: Assumes the source and dest opencl buffer are initialized. No check done
-PIX* 
-OpenclDevice::pixOpenBrickCL(PIX  *pixd, 
-                              PIX  *pixs, 
-                              l_int32  hsize, 
-                              l_int32  vsize, 
-                              bool reqDataCopy = false)
-{
-    l_uint32 wpl, h;
-    
-    wpl = pixGetWpl(pixs);
-    h = pixGetHeight(pixs);
-
-    clStatus = pixOpenCL(hsize, vsize, wpl, h);
-
-    if (reqDataCopy)
-    {
-        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
-    }
-
-    return pixd;
-}
-
-//pix OR operation: outbuffer = buffer1 | buffer2
-cl_int
-pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
-{
-    cl_int status;
-    size_t globalThreads[2];
-    int gsize;
-    size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
-
-    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
-    globalThreads[0] = gsize;
-    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
-    globalThreads[1] = gsize;
-
-    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status );
-
-    status = clSetKernelArg(rEnv.mpkKernel,
-        0,
-        sizeof(cl_mem),
-        &buffer1);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        1,
-        sizeof(cl_mem),
-        &buffer2);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        2,
-        sizeof(cl_mem),
-        &outbuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-            3,
-            sizeof(wpl),
-            (const void *)&wpl);
-    status = clSetKernelArg(rEnv.mpkKernel,
-            4,
-            sizeof(h),
-            (const void *)&h);
-    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                            rEnv.mpkKernel,
-                            2,
-                            NULL,
-                            globalThreads,
-                            localThreads,
-                            0,
-                            NULL,
-                            NULL);
-
-    return status;
-}
-
-//pix AND operation: outbuffer = buffer1 & buffer2
-cl_int
-pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
-{
-    cl_int status;
-    size_t globalThreads[2];
-    int gsize;
-    size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
-
-    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
-    globalThreads[0] = gsize;
-    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
-    globalThreads[1] = gsize;
-
-    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status );
-                
-    // Enqueue a kernel run call.
-    status = clSetKernelArg(rEnv.mpkKernel,
-        0,
-        sizeof(cl_mem),
-        &buffer1);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        1,
-        sizeof(cl_mem),
-        &buffer2);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        2,
-        sizeof(cl_mem),
-        &outbuffer);
-    status = clSetKernelArg(rEnv.mpkKernel,
-            3,
-            sizeof(wpl),
-            (const void *)&wpl);
-    status = clSetKernelArg(rEnv.mpkKernel,
-            4,
-            sizeof(h),
-            (const void *)&h);
-    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                            rEnv.mpkKernel,
-                            2,
-                            NULL,
-                            globalThreads,
-                            localThreads,
-                            0,
-                            NULL,
-                            NULL);
-
-    return status;
-}
-
-//output = buffer1 & ~(buffer2)
-cl_int
-pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL)
-{
-    cl_int status;
-    size_t globalThreads[2];
-    int gsize;
-    size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
-
-    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
-    globalThreads[0] = gsize;
-    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
-    globalThreads[1] = gsize;
-
-    if (outBuffer != NULL)
-    {
-        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status );
-    }
-    else
-    {
-        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status );
-    }
-
-    // Enqueue a kernel run call.
-    status = clSetKernelArg(rEnv.mpkKernel,
-        0,
-        sizeof(cl_mem),
-        &buffer1);
-    status = clSetKernelArg(rEnv.mpkKernel,
-        1,
-        sizeof(cl_mem),
-        &buffer2);
-    status = clSetKernelArg(rEnv.mpkKernel,
-            2,
-            sizeof(wpl),
-            (const void *)&wpl);
-    status = clSetKernelArg(rEnv.mpkKernel,
-            3,
-            sizeof(h),
-            (const void *)&h);
-    if (outBuffer != NULL)
-    {
-        status = clSetKernelArg(rEnv.mpkKernel,
-            4,
-            sizeof(cl_mem),
-            (const void *)&outBuffer);
-    }
-    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
-                            rEnv.mpkKernel,
-                            2,
-                            NULL,
-                            globalThreads,
-                            localThreads,
-                            0,
-                            NULL,
-                            NULL);
-
-    return status;
-}
-
-// OpenCL implementation of Subtract pix
-//Note: Assumes the source and dest opencl buffer are initialized. No check done
-PIX* 
-OpenclDevice::pixSubtractCL(PIX  *pixd, PIX  *pixs1, PIX  *pixs2, bool reqDataCopy = false)
-{
-    l_uint32 wpl, h;
-    
-    PROCNAME("pixSubtractCL");
-
-    if (!pixs1)
-        return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd);
-    if (!pixs2)
-        return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd);
-    if (pixGetDepth(pixs1) != pixGetDepth(pixs2))
-        return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd);
-
-#if  EQUAL_SIZE_WARNING
-    if (!pixSizesEqual(pixs1, pixs2))
-        L_WARNING("pixs1 and pixs2 not equal sizes", procName);
-#endif  /* EQUAL_SIZE_WARNING */
-
-    wpl = pixGetWpl(pixs1);
-    h = pixGetHeight(pixs1);
-
-    clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
-
-    if (reqDataCopy)
-    {
-        //Read back output data from OCL buffer to cpu
-        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ);
-    }
-
-    return pixd;
-}
-
-// OpenCL implementation of Hollow pix
-//Note: Assumes the source and dest opencl buffer are initialized. No check done
-PIX* 
-OpenclDevice::pixHollowCL(PIX  *pixd, 
-                        PIX  *pixs, 
-                        l_int32  close_hsize, 
-                        l_int32  close_vsize, 
-                        l_int32  open_hsize, 
-                        l_int32  open_vsize,
-                        bool reqDataCopy = false)
-{
-    l_uint32 wpl, h;
-    cl_mem pixtemp;
-
-    wpl = pixGetWpl(pixs);
-    h = pixGetHeight(pixs);
-
-    //First step : Close Morph operation: Dilate followed by Erode
-    clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
-
-    //Store the output of close operation in an intermediate buffer
-    //this will be later used for pixsubtract
-    clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
-
-    //Second step: Open Operation - Erode followed by Dilate
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLBuffer;
-    pixdCLBuffer = pixtemp;
-
-    clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
-
-    //Third step: Subtract : (Close - Open)
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLBuffer;
-    pixdCLBuffer = pixdCLIntermediate;
-    pixdCLIntermediate = pixtemp;
-
-    clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
-
-    if (reqDataCopy)
-    {
-        //Read back output data from OCL buffer to cpu
-        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
-    }
-    return pixd;
-}
-
-// OpenCL implementation of Get Lines from pix function
-//Note: Assumes the source and dest opencl buffer are initialized. No check done
-void 
-OpenclDevice::pixGetLinesCL(PIX  *pixd, 
-                            PIX  *pixs, 
-                            PIX** pix_vline, 
-                            PIX** pix_hline, 
-                            PIX** pixClosed,
-                            bool  getpixClosed,
-                            l_int32  close_hsize, l_int32  close_vsize, 
-                            l_int32  open_hsize, l_int32  open_vsize,
-                            l_int32  line_hsize, l_int32  line_vsize)
-{
-    l_uint32 wpl, h;
-    cl_mem pixtemp;
-
-    wpl = pixGetWpl(pixs);
-    h = pixGetHeight(pixs);
-
-    //First step : Close Morph operation: Dilate followed by Erode
-    clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
-
-    //Copy the Close output to CPU buffer
-    if (getpixClosed)
-    {
-        *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false);
-    }
-
-    //Store the output of close operation in an intermediate buffer
-    //this will be later used for pixsubtract
-    clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
-    
-    //Second step: Open Operation - Erode followed by Dilate
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLBuffer;
-    pixdCLBuffer = pixtemp;
-
-    clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
-
-    //Third step: Subtract : (Close - Open)
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLBuffer;
-    pixdCLBuffer = pixdCLIntermediate;
-    pixdCLIntermediate = pixtemp;
-
-    clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
-
-    //Store the output of Hollow operation in an intermediate buffer
-    //this will be later used 
-    clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
-
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLBuffer;
-    pixdCLBuffer = pixtemp;
-
-    //Fourth step: Get vertical line
-    //pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
-    clStatus = pixOpenCL(1, line_vsize, wpl, h);
-
-    //Copy the vertical line output to CPU buffer
-    *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false);
-    
-    pixtemp = pixsCLBuffer;
-    pixsCLBuffer = pixdCLIntermediate;
-    pixdCLIntermediate = pixtemp;
-
-    //Fifth step: Get horizontal line
-    //pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
-    clStatus = pixOpenCL(line_hsize, 1, wpl, h);
-        
-    //Copy the horizontal line output to CPU buffer
-    *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true);
-
-    return;
-}
-
-
-/*************************************************************************
- *  HistogramRect
- *  Otsu Thresholding Operations
- *  histogramAllChannels is layed out as all channel 0, then all channel 1...
- *  only supports 1 or 4 channels (bytes_per_pixel)
- ************************************************************************/
-void OpenclDevice::HistogramRectOCL(
-    const unsigned char* imageData,
-    int bytes_per_pixel,
-    int bytes_per_line,
-    int left, // always 0
-    int top, // always 0
-    int width,
-    int height,
-    int kHistogramSize,
-    int* histogramAllChannels)
-{
-PERF_COUNT_START("HistogramRectOCL")
-    cl_int clStatus;
-    KernelEnv histKern;
-    SetKernelEnv( &histKern );
-    KernelEnv histRedKern;
-    SetKernelEnv( &histRedKern );
-    /* map imagedata to device as read only */
-    // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
-    // faster option would be to allocate initial image buffer
-    // using a garlic bus memory type
-    cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
-
-    /* setup work group size parameters */
-    int block_size = 256;
-    cl_uint numCUs;
-    clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
-    CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
-
-    int requestedOccupancy = 10;
-    int numWorkGroups = numCUs * requestedOccupancy;
-    int numThreads = block_size*numWorkGroups;
-    size_t local_work_size[] = {block_size};
-    size_t global_work_size[] = {numThreads};
-    size_t red_global_work_size[] = {block_size*kHistogramSize*bytes_per_pixel}; 
-
-    /* map histogramAllChannels as write only */
-    int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups;
-    
-    cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer");
-
-    /* intermediate histogram buffer */
-    int histRed = 256;
-    int tmpHistogramBins =  kHistogramSize*bytes_per_pixel*histRed; 
-
-    cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer");
-
-    /* atomic sync buffer */
-    int *zeroBuffer = new int[1];
-    zeroBuffer[0] = 0;
-    cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer");
-
-    //Create kernel objects based on bytes_per_pixel
-    if (bytes_per_pixel == 1)
-    {
-        histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus );
-        CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
-
-        histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectOneChannelReduction", &clStatus );
-        CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
-    } else {
-    histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
-
-    histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
-    }
-
-    void *ptr;
-    
-    //Initialize tmpHistogramBuffer buffer
-    ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, tmpHistogramBins*sizeof(cl_uint), 0, NULL, NULL, &clStatus);
-    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
-    
-    memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
-    clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, NULL, NULL);
-
-    /* set kernel 1 arguments */
-    clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
-    CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
-    cl_uint numPixels = width*height;
-    clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels );
-    CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
-    clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
-    CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
-
-    /* set kernel 2 arguments */
-    int n = numThreads/bytes_per_pixel;
-    clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n );
-    CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
-    clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
-    CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
-    clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer );
-    CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
-
-    /* launch histogram */
-PERF_COUNT_SUB("before")
-    clStatus = clEnqueueNDRangeKernel(
-        histKern.mpkCmdQueue,
-        histKern.mpkKernel,
-        1, NULL, global_work_size, local_work_size,
-        0, NULL, NULL );
-    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" );
-    clFinish( histKern.mpkCmdQueue );
-
-    /* launch histogram */
-    clStatus = clEnqueueNDRangeKernel(
-        histRedKern.mpkCmdQueue,
-        histRedKern.mpkKernel,
-        1, NULL, red_global_work_size, local_work_size,
-        0, NULL, NULL );
-    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
-    clFinish( histRedKern.mpkCmdQueue );
-
-PERF_COUNT_SUB("redKernel")
-
-    /* map results back from gpu */
-    ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus);
-    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
-    
-    clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL);
-   
-    clReleaseMemObject(histogramBuffer);
-    clReleaseMemObject(imageBuffer);
-PERF_COUNT_SUB("after")
-PERF_COUNT_END
-
-}
-
-/*************************************************************************
- * Threshold the rectangle, taking everything except the image buffer pointer
- * from the class, using thresholds/hi_values to the output IMAGE.
- * only supports 1 or 4 channels
- ************************************************************************/
-void OpenclDevice::ThresholdRectToPixOCL(
-    const unsigned char* imageData,
-    int bytes_per_pixel,
-    int bytes_per_line,
-    const int* thresholds,
-    const int* hi_values,
-    Pix** pix,
-    int height,
-    int width,
-    int top,
-    int left) {
-PERF_COUNT_START("ThresholdRectToPixOCL")
-
-    /* create pix result buffer */                                 
-    *pix = pixCreate(width, height, 1);
-    uinT32* pixData = pixGetData(*pix);
-    int wpl = pixGetWpl(*pix);
-    int pixSize = wpl*height*sizeof(uinT32);
-
-    cl_int clStatus;
-    KernelEnv rEnv;
-    SetKernelEnv( &rEnv );
-
-    /* setup work group size parameters */
-    int block_size = 256;
-    cl_uint numCUs = 6;
-     clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
-    CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
-
-    int requestedOccupancy = 10;
-    int numWorkGroups = numCUs * requestedOccupancy;
-    int numThreads = block_size*numWorkGroups;
-    size_t local_work_size[] = {(size_t) block_size};
-    size_t global_work_size[] = {(size_t) numThreads};
-
-    /* map imagedata to device as read only */
-    // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
-    // faster option would be to allocate initial image buffer
-    // using a garlic bus memory type
-    cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
-
-    /* map pix as write only */
-    pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateBuffer pix");
-
-    /* map thresholds and hi_values */
-    cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer");
-    cl_mem hiValuesBuffer   = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer");
-
-    /* compile kernel */
-    if (bytes_per_pixel == 4) {
-    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus );
-    CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix");
-    } else {
-        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus );
-        CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
-    }
-
-    /* set kernel arguments */
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
-    CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
-    cl_uint numPixels = width*height;
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height );
-    CHECK_OPENCL( clStatus, "clSetKernelArg height" );
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width );
-    CHECK_OPENCL( clStatus, "clSetKernelArg width" );
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl );
-    CHECK_OPENCL( clStatus, "clSetKernelArg wpl" );
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer );
-    CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" );
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer );
-    CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" );
-    clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer );
-    CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer");
-
-    /* launch kernel & wait */
-PERF_COUNT_SUB("before")
-    clStatus = clEnqueueNDRangeKernel(
-        rEnv.mpkCmdQueue,
-        rEnv.mpkKernel,
-        1, NULL, global_work_size, local_work_size,
-        0, NULL, NULL );
-    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" );
-    clFinish( rEnv.mpkCmdQueue );
-PERF_COUNT_SUB("kernel")
-    
-    /* map results back from gpu */
-    void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus);
-    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
-    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL);
-    
-    clReleaseMemObject(imageBuffer);
-    clReleaseMemObject(thresholdsBuffer);
-    clReleaseMemObject(hiValuesBuffer);
-
-PERF_COUNT_SUB("after")
-PERF_COUNT_END
-}
-
-
-#if USE_DEVICE_SELECTION
-
-/******************************************************************************
- * Data Types for Device Selection
- *****************************************************************************/
-
-typedef struct _TessScoreEvaluationInputData {
-    int height;
-    int width;
-    int numChannels;
-    unsigned char *imageData;
-    Pix *pix;
-} TessScoreEvaluationInputData;
-
-void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) {
-    srand(1);
-    // 8.5x11 inches @ 300dpi rounded to clean multiples
-    int height = 3328; // %256
-    int width = 2560; // %512
-    int numChannels = 4;
-    input->height = height;
-    input->width = width;
-    input->numChannels = numChannels;
-    unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width];
-    input->imageData = (unsigned char *) &imageData4[0];
-    
-    // zero out image
-    unsigned char pixelWhite[4] = {  0,   0,   0, 255};
-    unsigned char pixelBlack[4] = {255, 255, 255, 255};
-    for (int p = 0; p < height*width; p++) {
-        //unsigned char tmp[4] = imageData4[0];
-        imageData4[p][0] = pixelWhite[0];
-        imageData4[p][1] = pixelWhite[1];
-        imageData4[p][2] = pixelWhite[2];
-        imageData4[p][3] = pixelWhite[3];
-    }
-    // random lines to be eliminated
-    int maxLineWidth = 64; // pixels wide
-    int numLines = 10;
-    // vertical lines
-    for (int i = 0; i < numLines; i++) {
-        int lineWidth = rand()%maxLineWidth;
-        int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
-        //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
-        for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
-            for (int col = 0; col < height; col++) {
-                //imageData4[row*width+col] = pixelBlack;
-                imageData4[row*width+col][0] = pixelBlack[0];
-                imageData4[row*width+col][1] = pixelBlack[1];
-                imageData4[row*width+col][2] = pixelBlack[2];
-                imageData4[row*width+col][3] = pixelBlack[3];
-            }
-        }
-    }
-    // horizontal lines
-    for (int i = 0; i < numLines; i++) {
-        int lineWidth = rand()%maxLineWidth;
-        int horLinePos = lineWidth + rand()%(height-2*lineWidth);
-        //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
-        for (int row = 0; row < width; row++) {
-            for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
-                //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
-                //imageData4[row*width+col] = pixelBlack;
-                imageData4[row*width+col][0] = pixelBlack[0];
-                imageData4[row*width+col][1] = pixelBlack[1];
-                imageData4[row*width+col][2] = pixelBlack[2];
-                imageData4[row*width+col][3] = pixelBlack[3];
-            }
-        }
-    }
-    // spots (noise, squares)
-    float fractionBlack = 0.1; // how much of the image should be blackened
-    int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
-    for (int i = 0; i < numSpots; i++) {
-        
-        int lineWidth = rand()%maxLineWidth;
-        int col = lineWidth + rand()%(width-2*lineWidth);
-        int row = lineWidth + rand()%(height-2*lineWidth);
-        //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
-        for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
-            for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
-                //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
-                //imageData4[row*width+col] = pixelBlack;
-                imageData4[r*width+c][0] = pixelBlack[0];
-                imageData4[r*width+c][1] = pixelBlack[1];
-                imageData4[r*width+c][2] = pixelBlack[2];
-                imageData4[r*width+c][3] = pixelBlack[3];
-            }
-        }
-    }
-
-    input->pix = pixCreate(input->width, input->height, 1);
-}
-
-typedef struct _TessDeviceScore {
-    float time; // small time means faster device
-    bool clError; // were there any opencl errors
-    bool valid; // was the correct response generated
-} TessDeviceScore;
-
-/******************************************************************************
- * Micro Benchmarks for Device Selection
- *****************************************************************************/
-
-double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
-    
-    double time = 0;
-#if ON_WINDOWS
-    LARGE_INTEGER freq, time_funct_start, time_funct_end;
-    QueryPerformanceFrequency(&freq);
-#else
-    timespec time_funct_start, time_funct_end;
-#endif
-    // input data
-    l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
-
-    // function call
-    if (type == DS_DEVICE_OPENCL_DEVICE) {
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_start);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
-#endif
-
-        OpenclDevice::gpuEnv = *env;
-        int wpl = pixGetWpl(input.pix);
-        OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, NULL);
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_end);
-        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
-        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
-#endif
-
-    } else {
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_start);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
-#endif
-        Pix *pix = pixCreate(input.width, input.height, 32);
-        l_uint32 *pixData = pixGetData(pix);
-        int wpl = pixGetWpl(pix);
-        //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
-        //pixSetData(pix, output_gpu);
-        int i, j;
-        int idx = 0;
-        for (i = 0; i < input.height ; i++) {
-            for (j = 0; j < input.width; j++) {
-                
-                l_uint32 tiffword = tiffdata[i * input.width + j];
-                l_int32 rval = ((tiffword) & 0xff);
-                l_int32 gval = (((tiffword) >> 8) & 0xff);
-                l_int32 bval = (((tiffword) >> 16) & 0xff);
-                l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
-                pixData[idx] = value;
-                idx++;
-            }
-        }
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_end);
-        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
-        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
-#endif
-        pixDestroy(&pix);
-    }
-
-
-    // cleanup
-
-    return time;
-}
-
-double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
-    
-    double time;
-#if ON_WINDOWS
-    LARGE_INTEGER freq, time_funct_start, time_funct_end;
-    QueryPerformanceFrequency(&freq);
-#else
-    timespec time_funct_start, time_funct_end;
-#endif
-    
-    unsigned char pixelHi = (unsigned char)255;
-    
-    int left = 0;
-    int top = 0;
-    int kHistogramSize = 256;
-    int bytes_per_line = input.width*input.numChannels;
-    int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
-
-    // function call
-    if (type == DS_DEVICE_OPENCL_DEVICE) {
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_start);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
-#endif
-
-        OpenclDevice::gpuEnv = *env;
-        int wpl = pixGetWpl(input.pix);
-        OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);
-
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_end);
-        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
-        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
-#endif
-    } else {
-    
-        int *histogram = new int[kHistogramSize];
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_start);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
-#endif
-        for (int ch = 0; ch < input.numChannels; ++ch) { 
-            tesseract::HistogramRect(input.pix, input.numChannels,
-                  left, top, input.width, input.height, histogram);
-        }
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_end);
-        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
-        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
-#endif
-        delete[] histogram;
-    }
-
-    // cleanup
-    //delete[] imageData;
-    delete[] histogramAllChannels;
-    return time;
-}
-
-//Reproducing the ThresholdRectToPix native version
-void ThresholdRectToPix_Native(const unsigned char* imagedata,
-                                          int bytes_per_pixel,
-                                          int bytes_per_line,
-                                          const int* thresholds,
-                                          const int* hi_values,
-                                          Pix** pix) {
-    int top = 0;
-    int left = 0;
-    int width = pixGetWidth(*pix);
-    int height = pixGetHeight(*pix);
-
-  *pix = pixCreate(width, height, 1);
-  uinT32* pixdata = pixGetData(*pix);
-  int wpl = pixGetWpl(*pix);
-  const unsigned char* srcdata = imagedata + top * bytes_per_line +
-                                 left * bytes_per_pixel;
-  for (int y = 0; y < height; ++y) {
-    const uinT8* linedata = srcdata;
-    uinT32* pixline = pixdata + y * wpl;
-    for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
-      bool white_result = true;
-      for (int ch = 0; ch < bytes_per_pixel; ++ch) {
-        if (hi_values[ch] >= 0 &&
-            (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
-          white_result = false;
-          break;
-        }
-      }
-      if (white_result)
-        CLEAR_DATA_BIT(pixline, x);
-      else
-        SET_DATA_BIT(pixline, x);
-    }
-    srcdata += bytes_per_line;
-  }
-}
-
-double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
-    
-    double time;
-#if ON_WINDOWS
-    LARGE_INTEGER freq, time_funct_start, time_funct_end;
-    QueryPerformanceFrequency(&freq);
-#else
-    timespec time_funct_start, time_funct_end;
-#endif
-    
-    // input data
-    unsigned char pixelHi = (unsigned char)255;
-    int* thresholds = new int[4];
-    thresholds[0] = pixelHi/2;
-    thresholds[1] = pixelHi/2;
-    thresholds[2] = pixelHi/2;
-    thresholds[3] = pixelHi/2;
-    int *hi_values = new int[4];
-    thresholds[0] = pixelHi;
-    thresholds[1] = pixelHi;
-    thresholds[2] = pixelHi;
-    thresholds[3] = pixelHi;
-    //Pix* pix = pixCreate(width, height, 1);
-    int top = 0;
-    int left = 0;
-    int bytes_per_line = input.width*input.numChannels;
-
-    // function call
-    if (type == DS_DEVICE_OPENCL_DEVICE) {
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_start);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
-#endif
-
-        OpenclDevice::gpuEnv = *env;
-        int wpl = pixGetWpl(input.pix);
-        OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);
-
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_end);
-        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
-        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
-#endif
-    } else {
-
-
-        tesseract::ImageThresholder thresholder;
-        thresholder.SetImage( input.pix );
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_start);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
-#endif
-        ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
-            thresholds, hi_values, &input.pix );
-
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_end);
-        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
-        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
-#endif
-    }
-
-    // cleanup
-    delete[] thresholds;
-    delete[] hi_values;
-    return time;
-}
-
-double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
-
-    double time = 0;
-#if ON_WINDOWS
-    LARGE_INTEGER freq, time_funct_start, time_funct_end;
-    QueryPerformanceFrequency(&freq);
-#else
-    timespec time_funct_start, time_funct_end;
-#endif
-
-    // input data
-    int resolution = 300;
-    int wpl = pixGetWpl(input.pix);
-    int kThinLineFraction = 20; // tess constant
-    int kMinLineLengthFraction = 4; // tess constant
-    int max_line_width = resolution / kThinLineFraction;
-    int min_line_length = resolution / kMinLineLengthFraction;
-    int closing_brick = max_line_width / 3;
-   
-    // function call
-    if (type == DS_DEVICE_OPENCL_DEVICE) {
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_start);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
-#endif
-        Pix *src_pix = input.pix;
-        OpenclDevice::gpuEnv = *env;
-        OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
-        Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL;
-        OpenclDevice::pixGetLinesCL(NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true, closing_brick, closing_brick, max_line_width, max_line_width, min_line_length, min_line_length);
-
-        OpenclDevice::releaseMorphCLBuffers();
-
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_end);
-        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
-        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
-#endif
-    } else {
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_start);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
-#endif
-
-        // native serial code
-        Pix *src_pix = input.pix;
-        Pix *pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
-        Pix *pix_solid  = pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width);
-        Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
-        pixDestroy(&pix_solid);
-        Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
-        Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
-        pixDestroy(&pix_hollow);
-
-#if ON_WINDOWS
-        QueryPerformanceCounter(&time_funct_end);
-        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
-#else
-        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
-        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
-#endif
-    }
-
-    return time;
-}
-
-
-
-/******************************************************************************
- * Device Selection
- *****************************************************************************/
-
-#include "stdlib.h"
-
- 
-// encode score object as byte string
-ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) {
-    *serializedScoreSize = sizeof(TessDeviceScore);
-    *serializedScore = (void *) new unsigned char[*serializedScoreSize];
-    memcpy(*serializedScore, device->score, *serializedScoreSize);
-    return DS_SUCCESS;
-}
-
-// parses byte string and stores in score object
-ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) {
-    // check that serializedScoreSize == sizeof(TessDeviceScore);
-    device->score = new TessDeviceScore;
-    memcpy(device->score, serializedScore, serializedScoreSize);
-    return DS_SUCCESS;
-}
-
-
-
-// evaluate devices
-ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
-    
-    // overwrite statuc gpuEnv w/ current device
-    // so native opencl calls can be used; they use static gpuEnv
-    printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
-    GPUEnv *env = NULL;
-    if (device->type == DS_DEVICE_OPENCL_DEVICE) {
-        env = new GPUEnv;
-        //printf("[DS] populating tmp GPUEnv from device\n");
-        populateGPUEnvFromDevice( env, device->oclDeviceID);
-        env->mnFileCount = 0; //argc;
-        env->mnKernelCount = 0UL;
-        //printf("[DS] compiling kernels for tmp GPUEnv\n");
-        OpenclDevice::gpuEnv = *env;
-        OpenclDevice::CompileKernelFile(env, "");
-    }
-    
-
-    TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
-    
-    // pixReadTiff
-    double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
-
-    // HistogramRect
-    double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
-
-    // ThresholdRectToPix
-    double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
-
-    // getLineMasks
-    double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
-
-
-    // weigh times (% of cpu time)
-    // these weights should be the % execution time that the native cpu code took
-    float composeRGBPixelWeight     = 1.2f;
-    float histogramRectWeight       = 2.4f;
-    float thresholdRectToPixWeight  = 4.5f;
-    float getLineMasksMorphWeight        = 5.0f;
-    
-    float weightedTime = 
-        composeRGBPixelWeight       * composeRGBPixelTime +
-        histogramRectWeight         * histogramRectTime +
-        thresholdRectToPixWeight    * thresholdRectToPixTime +
-        getLineMasksMorphWeight     * getLineMasksMorphTime
-        ;
-    device->score = (void *)new TessDeviceScore;
-    ((TessDeviceScore *)device->score)->time = weightedTime;
-    
-    printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
-    printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
-    printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight );
-    printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
-    printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
-    printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time );
-    return DS_SUCCESS;
-}
-
-// initial call to select device
-ds_device OpenclDevice::getDeviceSelection( ) {
-//PERF_COUNT_START("getDeviceSelection")
-    if (!deviceIsSelected) {
-PERF_COUNT_START("getDeviceSelection")
-        // check if opencl is available at runtime
-        if( 1 == LoadOpencl() ) {
-            // opencl is available
-//PERF_COUNT_SUB("LoadOpencl")
-    // setup devices
-    ds_status status;
-    ds_profile *profile;
-    status = initDSProfile( &profile, "v0.1" );
-PERF_COUNT_SUB("initDSProfile")
-    // try reading scores from file
-    char *fileName = "tesseract_opencl_profile_devices.dat";
-    status = readProfileFromFile( profile, deserializeScore, fileName);
-    if (status != DS_SUCCESS) {
-        // need to run evaluation
-                printf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
-
-        // create input data
-        TessScoreEvaluationInputData input;
-                populateTessScoreEvaluationInputData( &input );
-//PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
-        // perform evaluations
-        unsigned int numUpdates;
-        status =  profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates );
-PERF_COUNT_SUB("profileDevices")
-        // write scores to file
-        if ( status == DS_SUCCESS ) {
-            status = writeProfileToFile( profile, serializeScore, fileName);
-PERF_COUNT_SUB("writeProfileToFile")
-            if ( status == DS_SUCCESS ) {
-                        printf("[DS] Scores written to file (%s).\n", fileName);
-            } else {
-                        printf("[DS] Error saving scores to file (%s); scores not written to file.\n", fileName);
-            }
-        } else {
-                    printf("[DS] Unable to evaluate performance; scores not written to file.\n");
-        }
-
-    } else {
-
-PERF_COUNT_SUB("readProfileFromFile")
-                printf("[DS] Profile read from file (%s).\n", fileName);
-    }
-
-    // we now have device scores either from file or evaluation
-    // select fastest using custom Tesseract selection algorithm
-    float bestTime = FLT_MAX; // begin search with worst possible time
-    int bestDeviceIdx = -1;
-    for (int d = 0; d < profile->numDevices; d++) {
-        //((TessDeviceScore *)device->score)->time
-        ds_device device = profile->devices[d];
-        TessDeviceScore score = *(TessDeviceScore *)device.score;
-        
-        float time = score.time;
-                printf("[DS] Device[%i] %i:%s score is %f\n", d+1, device.type, device.oclDeviceName, time);
-        if (time < bestTime) {
-                    bestTime = time;
-            bestDeviceIdx = d;
-        }
-    }
-            printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
-    // cleanup
-            // TODO: call destructor for profile object?
-
-            bool overrided = false;
-            char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
-            if (overrideDeviceStr != NULL) {
-                int overrideDeviceIdx = atoi(overrideDeviceStr);
-                if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices ) {
-                    printf("[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, %i)\n", overrideDeviceStr, overrideDeviceIdx);
-                    bestDeviceIdx = overrideDeviceIdx - 1;
-                    overrided = true;
-                } else {
-                    printf("[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are valid devices).\n", overrideDeviceStr, profile->numDevices);
-                }
-}
-
-            if (overrided) {
-                printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
-            }
-            selectedDevice = profile->devices[bestDeviceIdx];
-
-        } else {
-            // opencl isn't available at runtime, select native cpu device
-            printf("[DS] OpenCL runtime not available.\n");
-            selectedDevice.type = DS_DEVICE_NATIVE_CPU;
-            selectedDevice.oclDeviceName = "(null)";
-            selectedDevice.score = NULL;
-            selectedDevice.oclDeviceID = NULL;
-            selectedDevice.oclDriverVersion = NULL;
-        }
-        deviceIsSelected = true;
-PERF_COUNT_SUB("select from Profile")
-PERF_COUNT_END
-    }
-//PERF_COUNT_END
-    return selectedDevice;
-}
-
-#endif
-
-bool OpenclDevice::selectedDeviceIsOpenCL() {
-#if USE_DEVICE_SELECTION
-    ds_device device = getDeviceSelection();
-    return (device.type == DS_DEVICE_OPENCL_DEVICE);
-#else
-    return true;
-#endif
-}
-
-bool OpenclDevice::selectedDeviceIsNativeCPU() {
-#if USE_DEVICE_SELECTION
-    ds_device device = getDeviceSelection();
-    return (device.type == DS_DEVICE_NATIVE_CPU);
-#else
-    return false;
-#endif
-}
-
-
-
-#endif
+#ifdef _WIN32
+#include <Windows.h>
+#include <io.h>
+#else
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+#include <float.h>
+
+#include "openclwrapper.h"
+#include "oclkernels.h"
+
+// for micro-benchmark
+#include "otsuthr.h"
+#include "thresholder.h"
+
+#ifdef USE_OPENCL
+
+#include "opencl_device_selection.h"
+
+GPUEnv OpenclDevice::gpuEnv;
+
+#if USE_DEVICE_SELECTION
+bool OpenclDevice::deviceIsSelected = false;
+ds_device OpenclDevice::selectedDevice;
+#endif
+
+int OpenclDevice::isInited =0;
+
+struct tiff_transform {
+    int vflip;    /* if non-zero, image needs a vertical fip */
+    int hflip;    /* if non-zero, image needs a horizontal flip */
+    int rotate;   /* -1 -> counterclockwise 90-degree rotation,
+                      0 -> no rotation
+                      1 -> clockwise 90-degree rotation */
+};
+
+static struct tiff_transform tiff_orientation_transforms[] = {
+    {0, 0, 0},
+    {0, 1, 0},
+    {1, 1, 0},
+    {1, 0, 0},
+    {0, 1, -1},
+    {0, 0, 1},
+    {0, 1, 1},
+    {0, 0, -1}
+};
+
+static const l_int32  MAX_PAGES_IN_TIFF_FILE = 3000; 
+
+cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers
+cl_mem pixThBuffer; //output from thresholdtopix calculation
+cl_int clStatus;
+KernelEnv rEnv;
+
+// substitute invalid characters in device name with _
+void legalizeFileName( char *fileName) {
+    //printf("fileName: %s\n", fileName);
+    char *invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches
+    // for each invalid char
+    for (int i = 0; i < strlen(invalidChars); i++) {
+        char invalidStr[4];
+        invalidStr[0] = invalidChars[i];
+        invalidStr[1] = NULL;
+        //printf("eliminating %s\n", invalidStr);
+        //char *pos = strstr(fileName, invalidStr);
+        // initial ./ is valid for present directory
+        //if (*pos == '.') pos++;
+        //if (*pos == '/') pos++;
+        for ( char *pos = strstr(fileName, invalidStr); pos != NULL; pos = strstr(pos+1, invalidStr)) {
+            //printf("\tfound: %s, ", pos);
+            pos[0] = '_';
+            //printf("fileName: %s\n", fileName);
+        }
+    }
+}
+
+void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
+    //printf("[DS] populateGPUEnvFromDevice\n");
+    size_t size;
+    gpuInfo->mnIsUserCreated = 1;
+    // device
+    gpuInfo->mpDevID = device;
+    gpuInfo->mpArryDevsID = new cl_device_id[1];
+    gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
+    clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE       , sizeof(cl_device_type), (void *) &gpuInfo->mDevType       , &size);
+    CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
+    // platform
+    clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM   , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID   , &size);
+    CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
+    // context
+    cl_context_properties props[3];
+    props[0] = CL_CONTEXT_PLATFORM;
+    props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
+    props[2] = 0;
+    gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus);
+    CHECK_OPENCL( clStatus, "populateGPUEnv::createContext");
+    // queue
+    cl_command_queue_properties queueProperties = 0;
+    gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
+    CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue");
+    
+}
+
+int OpenclDevice::LoadOpencl()
+{
+#ifdef WIN32
+    HINSTANCE HOpenclDll = NULL;
+  void * OpenclDll = NULL;
+    //fprintf(stderr, " LoadOpenclDllxx... \n");
+    OpenclDll = static_cast<HINSTANCE>( HOpenclDll );
+    OpenclDll = LoadLibrary( "openCL.dll" );
+    if ( !static_cast<HINSTANCE>( OpenclDll ) )
+    {
+        fprintf(stderr, "[OD] Load opencl.dll failed!\n");
+        FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) );
+        return 0;
+        
+    }
+    fprintf(stderr, "[OD] Load opencl.dll successful!\n");
+#endif
+    return 1;
+}
+int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
+{
+    envInfo->mpkContext = gpuEnv.mpContext;
+    envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
+    envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
+
+    return 1;
+}
+
+cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus)
+{
+    cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
+                                        nElements * sizeof(l_uint32), hostbuffer, pStatus);
+
+    return membuffer;
+}
+
+PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true)
+{   
+    PROCNAME("mapOutputCLBuffer");
+    if (!pixd)
+    {
+        if (memcopy)
+        {
+            if ((pixd = pixCreateTemplate(pixs)) == NULL)
+                (PIX *)ERROR_PTR("pixd not made", procName, NULL);
+        }
+        else
+        {
+            if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL)
+                (PIX *)ERROR_PTR("pixd not made", procName, NULL);
+        }
+    }
+    l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
+                                                    elements * sizeof(l_uint32), 0, NULL, NULL, NULL );
+    
+    if (memcopy)
+    {
+        memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
+    }
+    else
+    {
+        pixSetData(pixd, pValues);
+    }
+
+    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL);
+    
+    if (sync)
+    {
+        clFinish( rEnv.mpkCmdQueue );
+    }
+
+    return pixd;
+}
+
+ cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false)
+{
+    cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+        nElements * sizeof(l_int32), NULL, pStatus);
+
+    if (_pValues != NULL)
+    {
+        l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
+            nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
+
+        memcpy(pValues, _pValues, nElements * sizeof(l_int32));
+
+        clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
+
+        if (sync)
+            clFinish( rEnv.mpkCmdQueue );
+    }
+
+    return xValues;
+}
+
+int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
+{
+    size_t length;
+    cl_int clStatus;
+    cl_uint numPlatforms, numDevices;
+    cl_platform_id *platforms;
+    cl_context_properties cps[3];
+    char platformName[256];
+    unsigned int i;
+
+
+    // Have a look at the available platforms.
+
+    if ( !gpuInfo->mnIsUserCreated )
+    {
+        clStatus = clGetPlatformIDs( 0, NULL, &numPlatforms );
+        if ( clStatus != CL_SUCCESS )
+        {
+            return 1;
+        }
+        gpuInfo->mpPlatformID = NULL;
+
+        if ( 0 < numPlatforms )
+        {
+            platforms = (cl_platform_id*) malloc( numPlatforms * sizeof( cl_platform_id ) );
+            if ( platforms == (cl_platform_id*) NULL )
+            {
+                return 1;
+            }
+            clStatus = clGetPlatformIDs( numPlatforms, platforms, NULL );
+
+            if ( clStatus != CL_SUCCESS )
+            {
+                return 1;
+            }
+
+            for ( i = 0; i < numPlatforms; i++ )
+            {
+                clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
+                    sizeof( platformName ), platformName, NULL );
+
+                if ( clStatus != CL_SUCCESS )
+                {
+                    return 1;
+                }
+                gpuInfo->mpPlatformID = platforms[i];
+
+                //if (!strcmp(platformName, "Intel(R) Coporation"))
+                //if( !strcmp( platformName, "Advanced Micro Devices, Inc." ))
+                {
+                    gpuInfo->mpPlatformID = platforms[i];
+                    
+                    if ( getenv("SC_OPENCLCPU") )
+                    {
+                        clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
+                                                  CL_DEVICE_TYPE_CPU,    // device_type for CPU device
+                                                  0,                     // num_entries
+                                                  NULL,                  // devices
+                                                  &numDevices);
+                        printf("Selecting OpenCL device: CPU (a)\n");
+                    }
+                    else
+                    {
+                          clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
+                                                  CL_DEVICE_TYPE_GPU,      // device_type for GPU device
+                                                  0,                       // num_entries
+                                                  NULL,                    // devices
+                                                  &numDevices);
+                          printf("Selecting OpenCL device: GPU (a)\n");
+                    }
+                    if ( clStatus != CL_SUCCESS )
+                        continue;
+
+                    if ( numDevices )
+                        break;
+                }
+            }
+            if ( clStatus != CL_SUCCESS )
+                return 1;
+            free( platforms );
+        }
+        if ( NULL == gpuInfo->mpPlatformID )
+            return 1;
+
+        // Use available platform.
+        cps[0] = CL_CONTEXT_PLATFORM;
+        cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
+        cps[2] = 0;
+        // Set device type for OpenCL
+        
+        if ( getenv("SC_OPENCLCPU") )
+        {
+            gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
+            printf("Selecting OpenCL device: CPU (b)\n");
+        }
+        else
+        {
+            gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
+            printf("Selecting OpenCL device: GPU (b)\n");
+        }
+
+        gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
+
+        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
+        {
+            gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
+            gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
+            printf("Selecting OpenCL device: CPU (c)\n");
+        }
+        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
+        {
+            gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT;
+            gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
+            printf("Selecting OpenCL device: DEFAULT (c)\n");
+        }
+        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
+            return 1;
+        // Detect OpenCL devices.
+        // First, get the size of device list data
+        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0, NULL, &length );
+        if ( ( clStatus != CL_SUCCESS ) || ( length == 0 ) )
+            return 1;
+        // Now allocate memory for device list based on the size we got earlier
+        gpuInfo->mpArryDevsID = (cl_device_id*) malloc( length );
+        if ( gpuInfo->mpArryDevsID == (cl_device_id*) NULL )
+            return 1;
+        // Now, get the device list data
+        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, length,
+                       gpuInfo->mpArryDevsID, NULL );
+        if ( clStatus != CL_SUCCESS )
+            return 1;
+
+        // Create OpenCL command queue.
+        gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpArryDevsID[0], 0, &clStatus );
+
+        if ( clStatus != CL_SUCCESS )
+            return 1;
+    }
+
+    clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL );
+    // Check device extensions for double type
+    size_t aDevExtInfoSize = 0;
+
+    clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize );
+    CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
+
+    char *aExtInfo = new char[aDevExtInfoSize];
+
+    clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS,
+                   sizeof(char) * aDevExtInfoSize, aExtInfo, NULL);
+    CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
+
+    gpuInfo->mnKhrFp64Flag = 0;
+    gpuInfo->mnAmdFp64Flag = 0;
+
+    if ( strstr( aExtInfo, "cl_khr_fp64" ) )
+    {
+        gpuInfo->mnKhrFp64Flag = 1;
+    }
+    else
+    {
+        // Check if cl_amd_fp64 extension is supported
+        if ( strstr( aExtInfo, "cl_amd_fp64" ) )
+            gpuInfo->mnAmdFp64Flag = 1;
+    }
+    delete []aExtInfo;
+
+    return 0;
+}
+
+void OpenclDevice::releaseMorphCLBuffers()
+{
+    if (pixdCLIntermediate != NULL)
+        clReleaseMemObject(pixdCLIntermediate);
+    if (pixsCLBuffer != NULL)
+        clReleaseMemObject(pixsCLBuffer);
+    if (pixdCLBuffer != NULL)
+        clReleaseMemObject(pixdCLBuffer);
+    if (pixThBuffer != NULL)
+        clReleaseMemObject(pixThBuffer);
+}
+
+int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs)
+{
+    SetKernelEnv( &rEnv );
+    
+    if (pixThBuffer != NULL)
+    {
+        pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
+        
+        //Get the output from ThresholdToPix operation
+        clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL);
+    }
+    else
+    {
+        //Get data from the source image
+        l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32));
+        memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32));
+    
+        pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
+    }
+    
+    pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
+
+    pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
+
+    return (int)clStatus;
+}
+
+int OpenclDevice::InitEnv()
+{
+//PERF_COUNT_START("OD::InitEnv")
+//    printf("[OD] OpenclDevice::InitEnv()\n");
+#ifdef SAL_WIN32
+    while( 1 )
+    {
+        if( 1 == LoadOpencl() )
+            break;
+    }
+PERF_COUNT_SUB("LoadOpencl")
+#endif
+    // sets up environment, compiles programs
+    
+
+#if USE_DEVICE_SELECTION
+    
+    InitOpenclRunEnv_DeviceSelection( 0 );
+//PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
+#else
+    // init according to device
+    InitOpenclRunEnv( 0 );
+#endif
+//PERF_COUNT_END
+    return 1;
+}
+
+int OpenclDevice::ReleaseOpenclRunEnv()
+{
+    ReleaseOpenclEnv( &gpuEnv );
+#ifdef SAL_WIN32
+    FreeOpenclDll();
+#endif
+    return 1;
+}
+inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
+{
+    if ( kCount < 1 )
+        fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
+    strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
+    gpuEnv.mnKernelCount++;
+    return 0;
+}
+int OpenclDevice::RegistOpenclKernel()
+{
+    if ( !gpuEnv.mnIsUserCreated )
+        memset( &gpuEnv, 0, sizeof(gpuEnv) );
+
+    gpuEnv.mnFileCount = 0; //argc;
+    gpuEnv.mnKernelCount = 0UL;
+
+    AddKernelConfig( 1, (const char*) "oclAverageSub1" );
+    return 0;
+}
+int OpenclDevice::InitOpenclRunEnv( int argc )
+{
+    int status = 0;
+    if ( MAX_CLKERNEL_NUM <= 0 )
+    {
+        return 1;
+    }
+    if ( ( argc > MAX_CLFILE_NUM ) || ( argc < 0 ) )
+        return 1;
+
+    if ( !isInited )
+    {
+        RegistOpenclKernel();
+        //initialize devices, context, comand_queue
+        status = InitOpenclRunEnv( &gpuEnv );
+        if ( status )
+        {
+            fprintf(stderr,"init_opencl_env failed.\n");
+            return 1;
+        }
+        fprintf(stderr,"init_opencl_env successed.\n");
+        //initialize program, kernelName, kernelCount
+        if( getenv( "SC_FLOAT" ) )
+        {
+            gpuEnv.mnKhrFp64Flag = 0;
+            gpuEnv.mnAmdFp64Flag = 0;
+        }
+        if( gpuEnv.mnKhrFp64Flag )
+        {
+            fprintf(stderr,"----use khr double type in kernel----\n");
+            status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
+        }
+        else if( gpuEnv.mnAmdFp64Flag )
+        {
+            fprintf(stderr,"----use amd double type in kernel----\n");
+            status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
+        }
+        else
+        {
+            fprintf(stderr,"----use float type in kernel----\n");
+            status = CompileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" );
+        }
+        if ( status == 0 || gpuEnv.mnKernelCount == 0 )
+        {
+            fprintf(stderr,"CompileKernelFile failed.\n");
+            return 1;
+        }
+        fprintf(stderr,"CompileKernelFile successed.\n");
+        isInited = 1;
+    }
+    return 0;
+}
+
+int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
+//PERF_COUNT_START("InitOpenclRunEnv_DS")
+#if USE_DEVICE_SELECTION
+    if (!isInited) {
+        // after programs compiled, selects best device
+        //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n");
+        ds_device bestDevice_DS = getDeviceSelection( );
+//PERF_COUNT_SUB("called getDeviceSelection()")
+        cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
+        // overwrite global static GPUEnv with new device
+        if (selectedDeviceIsOpenCL() ) {
+            //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
+        populateGPUEnvFromDevice( &gpuEnv, bestDevice );
+        gpuEnv.mnFileCount = 0; //argc;
+        gpuEnv.mnKernelCount = 0UL;
+//PERF_COUNT_SUB("populate gpuEnv")
+        CompileKernelFile(&gpuEnv, "");
+//PERF_COUNT_SUB("CompileKernelFile")
+        } else {
+            //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
+        }
+        isInited = 1;
+    }
+#endif
+//PERF_COUNT_END
+    return 0;
+}
+
+
+OpenclDevice::OpenclDevice()
+{
+    //InitEnv();
+}
+
+OpenclDevice::~OpenclDevice()
+{
+    //ReleaseOpenclRunEnv();
+}
+
+int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
+{
+    int i = 0;
+    int clStatus = 0;
+
+    if ( !isInited )
+    {
+        return 1;
+    }
+
+    for ( i = 0; i < gpuEnv.mnFileCount; i++ )
+    {
+        if ( gpuEnv.mpArryPrograms[i] )
+        {
+            clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
+            CHECK_OPENCL( clStatus, "clReleaseProgram" );
+            gpuEnv.mpArryPrograms[i] = NULL;
+        }
+    }
+    if ( gpuEnv.mpCmdQueue )
+    {
+        clReleaseCommandQueue( gpuEnv.mpCmdQueue );
+        gpuEnv.mpCmdQueue = NULL;
+    }
+    if ( gpuEnv.mpContext )
+    {
+        clReleaseContext( gpuEnv.mpContext );
+        gpuEnv.mpContext = NULL;
+    }
+    isInited = 0;
+    gpuInfo->mnIsUserCreated = 0;
+    free( gpuInfo->mpArryDevsID );
+    return 1;
+}
+int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
+{
+    unsigned int i = 0;
+    cl_int clStatus;
+    int status = 0;
+    char *str = NULL;
+    FILE *fd = NULL;
+    cl_uint numDevices=0;
+    if ( getenv("SC_OPENCLCPU") )
+    {
+        clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
+                                  CL_DEVICE_TYPE_CPU,  // device_type for CPU device
+                                  0,                   // num_entries
+                                  NULL,                // devices ID
+                                  &numDevices);
+    }
+    else
+    {
+        clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
+                                  CL_DEVICE_TYPE_GPU,  // device_type for GPU device
+                                  0,                   // num_entries
+                                  NULL,                // devices ID
+                                  &numDevices);
+    }
+    CHECK_OPENCL( clStatus, "clGetDeviceIDs" );
+    for ( i = 0; i < numDevices; i++ )
+    {
+        char fileName[256] = { 0 }, cl_name[128] = { 0 };
+        if ( gpuEnv.mpArryDevsID[i] != 0 )
+        {
+            char deviceName[1024];
+            clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
+            CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
+            str = (char*) strstr( clFileName, (char*) ".cl" );
+            memcpy( cl_name, clFileName, str - clFileName );
+            cl_name[str - clFileName] = '\0';
+            sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
+            legalizeFileName(fileName);
+            fd = fopen( fileName, "rb" );
+            status = ( fd != NULL ) ? 1 : 0;
+        }
+    }
+    if ( fd != NULL )
+    {
+        *fhandle = fd;
+    }
+    return status;
+
+}
+int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
+{
+    int i;
+    for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
+    {
+        if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
+        {
+            if ( gpuEnvCached->mpArryPrograms[i] != NULL )
+            {
+                return 1;
+            }
+        }
+    }
+
+    return 0;
+}
+int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
+{
+    FILE *output = NULL;
+    output = fopen( fileName, "wb" );
+    if ( output == NULL )
+    {
+        return 0;
+    }
+
+    fwrite( birary, sizeof(char), numBytes, output );
+    fclose( output );
+
+    return 1;
+
+}
+int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
+{
+    unsigned int i = 0;
+    cl_int clStatus;
+    size_t *binarySizes, numDevices;
+    cl_device_id *mpArryDevsID;
+    char **binaries, *str = NULL;
+
+    clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES,
+                   sizeof(numDevices), &numDevices, NULL );
+    CHECK_OPENCL( clStatus, "clGetProgramInfo" );
+
+    mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
+    if ( mpArryDevsID == NULL )
+    {
+        return 0;
+    }
+    /* grab the handles to all of the devices in the program. */
+    clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES,
+                   sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL );
+    CHECK_OPENCL( clStatus, "clGetProgramInfo" );
+
+    /* figure out the sizes of each of the binaries. */
+    binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
+
+    clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES,
+                   sizeof(size_t) * numDevices, binarySizes, NULL );
+    CHECK_OPENCL( clStatus, "clGetProgramInfo" );
+
+    /* copy over all of the generated binaries. */
+    binaries = (char**) malloc( sizeof(char *) * numDevices );
+    if ( binaries == NULL )
+    {
+        return 0;
+    }
+
+    for ( i = 0; i < numDevices; i++ )
+    {
+        if ( binarySizes[i] != 0 )
+        {
+            binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
+            if ( binaries[i] == NULL )
+            {
+                return 0;
+            }
+        }
+        else
+        {
+            binaries[i] = NULL;
+        }
+    }
+
+    clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES,
+                   sizeof(char *) * numDevices, binaries, NULL );
+    CHECK_OPENCL(clStatus,"clGetProgramInfo");
+
+    /* dump out each binary into its own separate file. */
+    for ( i = 0; i < numDevices; i++ )
+    {
+        char fileName[256] = { 0 }, cl_name[128] = { 0 };
+
+        if ( binarySizes[i] != 0 )
+        {
+            char deviceName[1024];
+            clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
+                           sizeof(deviceName), deviceName, NULL);
+            CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
+
+            str = (char*) strstr( clFileName, (char*) ".cl" );
+            memcpy( cl_name, clFileName, str - clFileName );
+            cl_name[str - clFileName] = '\0';
+            sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
+            legalizeFileName(fileName);
+            if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
+            {
+                printf("[OD] write binary[%s] failed\n", fileName);
+                return 0;
+            } //else
+            printf("[OD] write binary[%s] succesfully\n", fileName);
+        }
+    }
+
+    // Release all resouces and memory
+    for ( i = 0; i < numDevices; i++ )
+    {
+        if ( binaries[i] != NULL )
+        {
+            free( binaries[i] );
+            binaries[i] = NULL;
+        }
+    }
+
+    if ( binaries != NULL )
+    {
+        free( binaries );
+        binaries = NULL;
+    }
+
+    if ( binarySizes != NULL )
+    {
+        free( binarySizes );
+        binarySizes = NULL;
+    }
+
+    if ( mpArryDevsID != NULL )
+    {
+        free( mpArryDevsID );
+        mpArryDevsID = NULL;
+    }
+    return 1;
+}
+
+void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus )
+{
+    l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
+        nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
+    clFinish( rEnv.mpkCmdQueue );
+    if (_pValues != NULL)
+    {
+        for ( int i = 0; i < (int)nElements; i++ )
+            pValues[i] = (l_int32)_pValues[i];
+    }
+
+    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
+    //clFinish( rEnv.mpkCmdQueue );
+    return;
+}
+
+int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
+{
+//PERF_COUNT_START("CompileKernelFile")
+    cl_int clStatus = 0;
+    size_t length;
+    char *buildLog = NULL, *binary;
+    const char *source;
+    size_t source_size[1];
+    int b_error, binary_status, binaryExisted, idx;
+    size_t numDevices;
+    cl_device_id *mpArryDevsID;
+    FILE *fd, *fd1;
+    const char* filename = "kernel.cl";
+    //fprintf(stderr, "[OD] CompileKernelFile ... \n");
+    if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
+    {
+        return 1;
+    }
+
+    idx = gpuInfo->mnFileCount;
+
+    source = kernel_src;
+
+    source_size[0] = strlen( source );
+    binaryExisted = 0;
+        binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
+//PERF_COUNT_SUB("BinaryGenerated")
+    if ( binaryExisted == 1 )
+    {
+        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
+                       sizeof(numDevices), &numDevices, NULL );
+        CHECK_OPENCL( clStatus, "clGetContextInfo" );
+
+        mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
+        if ( mpArryDevsID == NULL )
+        {
+            return 0;
+        }
+//PERF_COUNT_SUB("get numDevices")
+        b_error = 0;
+        length = 0;
+        b_error |= fseek( fd, 0, SEEK_END ) < 0;
+        b_error |= ( length = ftell(fd) ) <= 0;
+        b_error |= fseek( fd, 0, SEEK_SET ) < 0;
+        if ( b_error )
+        {
+            return 0;
+        }
+
+        binary = (char*) malloc( length + 2 );
+        if ( !binary )
+        {
+            return 0;
+        }
+
+        memset( binary, 0, length + 2 );
+        b_error |= fread( binary, 1, length, fd ) != length;
+
+
+        fclose( fd );
+//PERF_COUNT_SUB("read file")
+        fd = NULL;
+        // grab the handles to all of the devices in the context.
+        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES,
+                       sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL );
+        CHECK_OPENCL( clStatus, "clGetContextInfo" );
+//PERF_COUNT_SUB("get devices")
+        //fprintf(stderr, "[OD] Create kernel from binary\n");
+        gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
+                                           mpArryDevsID, &length, (const unsigned char**) &binary,
+                                           &binary_status, &clStatus );
+        CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
+//PERF_COUNT_SUB("clCreateProgramWithBinary")
+        free( binary );
+        free( mpArryDevsID );
+        mpArryDevsID = NULL;
+//PERF_COUNT_SUB("binaryExisted")
+    }
+    else
+    {
+        // create a CL program using the kernel source
+        //fprintf(stderr, "[OD] Create kernel from source\n");
+        gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
+                                         source_size, &clStatus);
+        CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
+//PERF_COUNT_SUB("!binaryExisted")
+    }
+
+    if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL )
+    {
+        return 0;
+    }
+
+    //char options[512];
+    // create a cl program executable for all the devices specified
+    //printf("[OD] BuildProgram.\n");
+PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
+    if (!gpuInfo->mnIsUserCreated)
+    {
+        clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
+                       buildOption, NULL, NULL);
+//PERF_COUNT_SUB("clBuildProgram notUserCreated")
+    }
+    else
+    {
+        clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
+                       buildOption, NULL, NULL);
+//PERF_COUNT_SUB("clBuildProgram isUserCreated")
+    }
+PERF_COUNT_END
+    if ( clStatus != CL_SUCCESS )
+    {
+        printf ("BuildProgram error!\n");
+        if ( !gpuInfo->mnIsUserCreated )
+        {
+            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
+                           CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
+        }
+        else
+        {
+            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
+                           CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
+        }
+        if ( clStatus != CL_SUCCESS )
+        {
+            printf("opencl create build log fail\n");
+            return 0;
+        }
+        buildLog = (char*) malloc( length );
+        if ( buildLog == (char*) NULL )
+        {
+            return 0;
+        }
+        if ( !gpuInfo->mnIsUserCreated )
+        {
+            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
+                           CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
+        }
+        else
+        {
+            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
+                           CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
+        }
+        if ( clStatus != CL_SUCCESS )
+        {
+            printf("opencl program build info fail\n");
+            return 0;
+        }
+
+        fd1 = fopen( "kernel-build.log", "w+" );
+        if ( fd1 != NULL )
+        {
+            fwrite( buildLog, sizeof(char), length, fd1 );
+            fclose( fd1 );
+        }
+
+        free( buildLog );
+//PERF_COUNT_SUB("build error log")
+        return 0;
+    }
+
+    strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
+//PERF_COUNT_SUB("strcpy")
+    if ( binaryExisted == 0 ) {
+        GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
+        PERF_COUNT_SUB("GenerateBinFromKernelSource")
+    }
+
+    gpuInfo->mnFileCount += 1;
+//PERF_COUNT_END
+    return 1;
+}
+
+l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
+{
+PERF_COUNT_START("pixReadFromTiffKernel")
+    cl_int clStatus;
+    KernelEnv rEnv;
+    size_t globalThreads[2];
+    size_t localThreads[2];
+    int gsize;
+    cl_mem valuesCl; 
+    cl_mem outputCl;
+
+    //global and local work dimensions for Horizontal pass
+    gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
+    globalThreads[0] = gsize;
+    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
+    globalThreads[1] = gsize;
+    localThreads[0] = GROUPSIZE_X;
+    localThreads[1] = GROUPSIZE_Y;
+
+    SetKernelEnv( &rEnv );
+    
+    l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
+    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateKernel");
+    
+    //Allocate input and output OCL buffers
+    valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
+    outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
+
+    //Kernel arguments
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl );
+    CHECK_OPENCL( clStatus, "clSetKernelArg");
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w );
+    CHECK_OPENCL( clStatus, "clSetKernelArg" );
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h );
+    CHECK_OPENCL( clStatus, "clSetKernelArg" );
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl );
+    CHECK_OPENCL( clStatus, "clSetKernelArg" );
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl );
+    CHECK_OPENCL( clStatus, "clSetKernelArg");
+    
+    //Kernel enqueue
+PERF_COUNT_SUB("before")
+    clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL );
+    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
+    
+     /* map results back from gpu */
+    void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus);
+    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl");
+    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL);
+
+    //Sync
+    clFinish( rEnv.mpkCmdQueue );
+PERF_COUNT_SUB("kernel & map")
+PERF_COUNT_END
+    return pResult;
+}
+
+
+PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n )
+{
+PERF_COUNT_START("pixReadTiffCL")
+    FILE  *fp;
+PIX   *pix;
+
+    //printf("pixReadTiffCl file");
+    PROCNAME("pixReadTiff");
+
+    if (!filename)
+        return (PIX *)ERROR_PTR("filename not defined", procName, NULL);
+
+    if ((fp = fopenReadStream(filename)) == NULL)
+        return (PIX *)ERROR_PTR("image file not found", procName, NULL);
+    if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) {
+        fclose(fp);
+        return (PIX *)ERROR_PTR("pix not read", procName, NULL);
+    }
+    fclose(fp);
+PERF_COUNT_END
+    return pix;
+    
+}
+TIFF *
+OpenclDevice::fopenTiffCl(FILE        *fp,
+          const char  *modestring)
+{
+l_int32  fd;
+
+    PROCNAME("fopenTiff");
+
+    if (!fp)
+        return (TIFF *)ERROR_PTR("stream not opened", procName, NULL);
+    if (!modestring)
+        return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL);
+
+    if ((fd = fileno(fp)) < 0)
+        return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL);
+    lseek(fd, 0, SEEK_SET);
+
+    return TIFFFdOpen(fd, "TIFFstream", modestring);
+}
+l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF     *tif,
+                        l_int32  *pxres,
+                        l_int32  *pyres)
+{
+l_uint16   resunit;
+l_int32    foundxres, foundyres;
+l_float32  fxres, fyres;
+
+    PROCNAME("getTiffStreamResolution");
+
+    if (!tif)
+        return ERROR_INT("tif not opened", procName, 1);
+    if (!pxres || !pyres)
+        return ERROR_INT("&xres and &yres not both defined", procName, 1);
+    *pxres = *pyres = 0;
+
+    TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit);
+    foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres);
+    foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres);
+    if (!foundxres && !foundyres) return 1;
+    if (!foundxres && foundyres)
+        fxres = fyres;
+    else if (foundxres && !foundyres)
+        fyres = fxres;
+
+    if (resunit == RESUNIT_CENTIMETER) {  /* convert to ppi */
+        *pxres = (l_int32)(2.54 * fxres + 0.5);
+        *pyres = (l_int32)(2.54 * fyres + 0.5);
+    }
+    else {
+        *pxres = (l_int32)fxres;
+        *pyres = (l_int32)fyres;
+    }
+
+    return 0;
+}
+PIX *
+OpenclDevice::pixReadStreamTiffCl(FILE    *fp,
+                  l_int32  n)
+{
+l_int32  i, pagefound;
+PIX     *pix;
+TIFF    *tif;
+
+    PROCNAME("pixReadStreamTiff");
+
+    if (!fp)
+        return (PIX *)ERROR_PTR("stream not defined", procName, NULL);
+
+    if ((tif = fopenTiffCl(fp, "rb")) == NULL)
+        return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
+
+    pagefound = FALSE;
+    pix = NULL;
+    for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
+        if (i == n) {
+            pagefound = TRUE;
+            if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
+                TIFFCleanup(tif);
+                return (PIX *)ERROR_PTR("pix not read", procName, NULL);
+            }
+            break;
+        }
+        if (TIFFReadDirectory(tif) == 0)
+            break;
+    }
+
+    if (pagefound == FALSE) {
+        L_WARNING_INT("tiff page %d not found", procName, n);
+        TIFFCleanup(tif);
+        return NULL;
+    }
+
+    TIFFCleanup(tif);
+    return pix;
+}
+
+static l_int32
+getTiffCompressedFormat(l_uint16  tiffcomp)
+{
+l_int32  comptype;
+
+    switch (tiffcomp)
+    {
+    case COMPRESSION_CCITTFAX4:
+        comptype = IFF_TIFF_G4;
+        break;
+    case COMPRESSION_CCITTFAX3:
+        comptype = IFF_TIFF_G3;
+        break;
+    case COMPRESSION_CCITTRLE:
+        comptype = IFF_TIFF_RLE;
+        break;
+    case COMPRESSION_PACKBITS:
+        comptype = IFF_TIFF_PACKBITS;
+        break;
+    case COMPRESSION_LZW:
+        comptype = IFF_TIFF_LZW;
+        break;
+    case COMPRESSION_ADOBE_DEFLATE:
+        comptype = IFF_TIFF_ZIP;
+        break;
+    default:
+        comptype = IFF_TIFF;
+        break;
+    }
+    return comptype;
+}
+
+void compare(l_uint32  *cpu, l_uint32  *gpu,int size)
+{
+    for(int i=0;i<size;i++)
+    {
+        if(cpu[i]!=gpu[i])
+        {
+            printf("\ndoesnot match\n");
+            return;
+        }
+    }
+    printf("\nit matches\n");
+    
+}
+
+//OpenCL implementation of pixReadFromTiffStream.
+//Similar to the CPU implentation of pixReadFromTiffStream
+PIX *
+OpenclDevice::pixReadFromTiffStreamCl(TIFF  *tif)
+{
+l_uint8   *linebuf, *data;
+l_uint16   spp, bps, bpp, tiffbpl, photometry, tiffcomp, orientation;
+l_uint16  *redmap, *greenmap, *bluemap;
+l_int32    d, wpl, bpl, comptype, i, ncolors;
+l_int32    xres, yres;
+l_uint32   w, h;
+l_uint32  *line, *tiffdata;
+PIX       *pix;
+PIXCMAP   *cmap;
+
+    PROCNAME("pixReadFromTiffStream");
+
+    if (!tif)
+        return (PIX *)ERROR_PTR("tif not defined", procName, NULL);
+
+    
+    TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bps);
+    TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
+    bpp = bps * spp;
+    if (bpp > 32)
+        return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL);
+    if (spp == 1)
+        d = bps;
+    else if (spp == 3 || spp == 4)
+        d = 32;
+    else
+        return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL);
+
+    TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w);
+    TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h);
+    tiffbpl = TIFFScanlineSize(tif);
+
+    if ((pix = pixCreate(w, h, d)) == NULL)
+        return (PIX *)ERROR_PTR("pix not made", procName, NULL);
+    data = (l_uint8 *)pixGetData(pix);
+    wpl = pixGetWpl(pix);
+    bpl = 4 * wpl;
+
+   
+    if (spp == 1) {
+        if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL)
+            return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL);
+        
+        for (i = 0 ; i < h ; i++) {
+            if (TIFFReadScanline(tif, linebuf, i, 0) < 0) {
+                FREE(linebuf);
+                pixDestroy(&pix);
+                return (PIX *)ERROR_PTR("line read fail", procName, NULL);
+            }
+            memcpy((char *)data, (char *)linebuf, tiffbpl);
+            data += bpl;
+        }
+        if (bps <= 8)
+            pixEndianByteSwap(pix);
+        else   
+            pixEndianTwoByteSwap(pix);
+        FREE(linebuf);
+    }
+    else {  
+        if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) {
+            pixDestroy(&pix);
+            return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL);
+        }
+        if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata,
+                                       ORIENTATION_TOPLEFT, 0)) {
+            FREE(tiffdata);
+            pixDestroy(&pix);
+            return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL);
+        }
+        line = pixGetData(pix);
+
+        //Invoke the OpenCL kernel for pixReadFromTiff
+        l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
+        pixSetData(pix, output_gpu);
+        
+        FREE(tiffdata);
+    }
+
+    if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) {
+        pixSetXRes(pix, xres);
+        pixSetYRes(pix, yres);
+    }
+
+
+    TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp);
+    comptype = getTiffCompressedFormat(tiffcomp);
+    pixSetInputFormat(pix, comptype);
+
+    if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) {
+           
+        if ((cmap = pixcmapCreate(bps)) == NULL) {
+            pixDestroy(&pix);
+            return (PIX *)ERROR_PTR("cmap not made", procName, NULL);
+        }
+        ncolors = 1 << bps;
+        for (i = 0; i < ncolors; i++)
+            pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8,
+                            bluemap[i] >> 8);
+        pixSetColormap(pix, cmap);
+    }
+    else {  
+        if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) {
+       
+            if (tiffcomp == COMPRESSION_CCITTFAX3 ||
+                tiffcomp == COMPRESSION_CCITTFAX4 ||
+                tiffcomp == COMPRESSION_CCITTRLE ||
+                tiffcomp == COMPRESSION_CCITTRLEW) {
+                photometry = PHOTOMETRIC_MINISWHITE;
+            }
+            else
+                photometry = PHOTOMETRIC_MINISBLACK;
+        }
+        if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) ||
+            (d == 8 && photometry == PHOTOMETRIC_MINISWHITE))
+            pixInvert(pix, pix);
+    }
+
+    if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) {
+        if (orientation >= 1 && orientation <= 8) {
+            struct tiff_transform *transform =
+              &tiff_orientation_transforms[orientation - 1];
+            if (transform->vflip) pixFlipTB(pix, pix);
+            if (transform->hflip) pixFlipLR(pix, pix);
+            if (transform->rotate) {
+                PIX *oldpix = pix;
+                pix = pixRotate90(oldpix, transform->rotate);
+                pixDestroy(&oldpix);
+           }
+        }
+    }
+
+    return pix;
+}
+
+//Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
+cl_int
+pixDilateCL_55(l_int32  wpl, l_int32  h)
+{
+    size_t globalThreads[2];
+    cl_mem pixtemp;
+    cl_int status;
+    int gsize;
+    size_t localThreads[2];
+
+    //Horizontal pass
+    gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
+    globalThreads[0] = gsize;
+    globalThreads[1] = GROUPSIZE_HMORY;
+    localThreads[0] = GROUPSIZE_HMORX;
+    localThreads[1] = GROUPSIZE_HMORY;
+
+    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status );
+    
+    status = clSetKernelArg(rEnv.mpkKernel,
+        0,
+        sizeof(cl_mem),
+        &pixsCLBuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        1,
+        sizeof(cl_mem),
+        &pixdCLBuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        2,
+        sizeof(wpl),
+        (const void *)&wpl);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        3,
+        sizeof(h),
+        (const void *)&h);
+
+    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                            rEnv.mpkKernel,
+                            2,
+                            NULL,
+                            globalThreads,
+                            localThreads,
+                            0,
+                            NULL,
+                            NULL);
+    
+    //Swap source and dest buffers
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLBuffer;
+    pixdCLBuffer = pixtemp;
+
+    //Vertical
+    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
+    globalThreads[0] = gsize;
+    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
+    globalThreads[1] = gsize;
+    localThreads[0] = GROUPSIZE_X;
+    localThreads[1] = GROUPSIZE_Y;
+
+    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status );
+    
+    status = clSetKernelArg(rEnv.mpkKernel,
+        0,
+        sizeof(cl_mem),
+        &pixsCLBuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        1,
+        sizeof(cl_mem),
+        &pixdCLBuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        2,
+        sizeof(wpl),
+        (const void *)&wpl);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        3,
+        sizeof(h),
+        (const void *)&h);
+    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                            rEnv.mpkKernel,
+                            2,
+                            NULL,
+                            globalThreads,
+                            localThreads,
+                            0,
+                            NULL,
+                            NULL);
+
+    return status;
+}
+
+//Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
+cl_int
+pixErodeCL_55(l_int32  wpl, l_int32  h)
+{
+    size_t globalThreads[2];
+    cl_mem pixtemp;
+    cl_int status;
+    int gsize;
+    l_uint32 fwmask, lwmask;
+    size_t localThreads[2];
+
+    lwmask = lmask32[32 - 2];
+    fwmask = rmask32[32 - 2];
+
+    //Horizontal pass
+    gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
+    globalThreads[0] = gsize;
+    globalThreads[1] = GROUPSIZE_HMORY;
+    localThreads[0] = GROUPSIZE_HMORX;
+    localThreads[1] = GROUPSIZE_HMORY;
+
+    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status );
+    
+    status = clSetKernelArg(rEnv.mpkKernel,
+        0,
+        sizeof(cl_mem),
+        &pixsCLBuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        1,
+        sizeof(cl_mem),
+        &pixdCLBuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        2,
+        sizeof(wpl),
+        (const void *)&wpl);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        3,
+        sizeof(h),
+        (const void *)&h);
+
+    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                            rEnv.mpkKernel,
+                            2,
+                            NULL,
+                            globalThreads,
+                            localThreads,
+                            0,
+                            NULL,
+                            NULL);
+    
+    //Swap source and dest buffers
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLBuffer;
+    pixdCLBuffer = pixtemp;
+
+    //Vertical
+    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
+    globalThreads[0] = gsize;
+    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
+    globalThreads[1] = gsize;
+    localThreads[0] = GROUPSIZE_X;
+    localThreads[1] = GROUPSIZE_Y;
+
+    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status );
+    
+    status = clSetKernelArg(rEnv.mpkKernel,
+        0,
+        sizeof(cl_mem),
+        &pixsCLBuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        1,
+        sizeof(cl_mem),
+        &pixdCLBuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        2,
+        sizeof(wpl),
+        (const void *)&wpl);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        3,
+        sizeof(h),
+        (const void *)&h);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        4,
+        sizeof(fwmask),
+        (const void *)&fwmask);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        5,
+        sizeof(lwmask),
+        (const void *)&lwmask);
+    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                            rEnv.mpkKernel,
+                            2,
+                            NULL,
+                            globalThreads,
+                            localThreads,
+                            0,
+                            NULL,
+                            NULL);
+
+    return status;
+}
+
+//Morphology Dilate operation. Invokes the relevant OpenCL kernels
+cl_int
+pixDilateCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
+{
+    l_int32  xp, yp, xn, yn;
+    SEL* sel;
+    size_t globalThreads[2];
+    cl_mem pixtemp;
+    cl_int status;
+    int gsize;
+    size_t localThreads[2];
+    char isEven;
+
+    OpenclDevice::SetKernelEnv( &rEnv );
+    
+    if (hsize == 5 && vsize == 5)
+    {
+        //Specific case for 5x5
+        status = pixDilateCL_55(wpl, h);
+        return status;
+    }
+    
+    sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
+    
+    selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
+
+    //global and local work dimensions for Horizontal pass
+    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
+    globalThreads[0] = gsize;
+    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
+    globalThreads[1] = gsize;
+    localThreads[0] = GROUPSIZE_X;
+    localThreads[1] = GROUPSIZE_Y;
+
+    if (xp > 31 || xn > 31)
+    {
+        //Generic case. 
+        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status );
+    
+        status = clSetKernelArg(rEnv.mpkKernel,
+            0,
+            sizeof(cl_mem),
+            &pixsCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+            1,
+            sizeof(cl_mem),
+            &pixdCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                2,
+                sizeof(xp),
+                (const void *)&xp);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                3,
+                sizeof(xn),
+                (const void *)&xn);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                4,
+                sizeof(wpl),
+                (const void *)&wpl);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                5,
+                sizeof(h),
+                (const void *)&h);
+        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                                rEnv.mpkKernel,
+                                2,
+                                NULL,
+                                globalThreads,
+                                localThreads,
+                                0,
+                                NULL,
+                                NULL);
+
+        if (yp > 0 || yn > 0)
+        {
+            pixtemp = pixsCLBuffer;
+            pixsCLBuffer = pixdCLBuffer;
+            pixdCLBuffer = pixtemp;
+        }
+    }
+    else if (xp > 0 || xn > 0 )
+    {
+        //Specfic Horizontal pass kernel for half width < 32
+        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status );
+        isEven = (xp != xn);
+        
+        status = clSetKernelArg(rEnv.mpkKernel,
+            0,
+            sizeof(cl_mem),
+            &pixsCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+            1,
+            sizeof(cl_mem),
+            &pixdCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                2,
+                sizeof(xp),
+                (const void *)&xp);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                3,
+                sizeof(wpl),
+                (const void *)&wpl);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                4,
+                sizeof(h),
+                (const void *)&h);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                5,
+                sizeof(isEven),
+                (const void *)&isEven);
+        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                                rEnv.mpkKernel,
+                                2,
+                                NULL,
+                                globalThreads,
+                                localThreads,
+                                0,
+                                NULL,
+                                NULL);
+
+        if (yp > 0 || yn > 0)
+        {
+            pixtemp = pixsCLBuffer;
+            pixsCLBuffer = pixdCLBuffer;
+            pixdCLBuffer = pixtemp;
+        }
+    } 
+
+    if (yp > 0 || yn > 0)
+    {
+        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status );
+        
+        status = clSetKernelArg(rEnv.mpkKernel,
+            0,
+            sizeof(cl_mem),
+            &pixsCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+            1,
+            sizeof(cl_mem),
+            &pixdCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                2,
+                sizeof(yp),
+                (const void *)&yp);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                3,
+                sizeof(wpl),
+                (const void *)&wpl);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                4,
+                sizeof(h),
+                (const void *)&h);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                5,
+                sizeof(yn),
+                (const void *)&yn);
+        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                                rEnv.mpkKernel,
+                                2,
+                                NULL,
+                                globalThreads,
+                                localThreads,
+                                0,
+                                NULL,
+                                NULL);
+    }
+    
+
+    return status;
+}
+
+//Morphology Erode operation. Invokes the relevant OpenCL kernels
+cl_int 
+pixErodeCL(l_int32  hsize, l_int32  vsize, l_uint32 wpl, l_uint32 h)
+{
+
+    l_int32  xp, yp, xn, yn;
+    SEL* sel;
+    size_t globalThreads[2];
+    size_t localThreads[2];
+    cl_mem pixtemp;
+    cl_int status;
+    int gsize;
+    char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
+    l_uint32 rwmask, lwmask;
+    char isEven;
+
+    sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
+    
+    selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
+    
+    OpenclDevice::SetKernelEnv( &rEnv );
+
+    if (hsize == 5 && vsize == 5 && isAsymmetric)
+    {
+        //Specific kernel for 5x5
+        status = pixErodeCL_55(wpl, h);
+        return status;
+    }
+
+    rwmask = rmask32[32 - (xp & 31)];
+    lwmask = lmask32[32 - (xn & 31)];
+
+    //global and local work dimensions for Horizontal pass
+    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
+    globalThreads[0] = gsize;
+    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
+    globalThreads[1] = gsize;
+    localThreads[0] = GROUPSIZE_X;
+    localThreads[1] = GROUPSIZE_Y;
+    
+    //Horizontal Pass
+    if (xp > 31 || xn > 31 )
+    {
+        //Generic case. 
+        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status );
+        
+        status = clSetKernelArg(rEnv.mpkKernel,
+            0,
+            sizeof(cl_mem),
+            &pixsCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+            1,
+            sizeof(cl_mem),
+            &pixdCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                2,
+                sizeof(xp),
+                (const void *)&xp);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                3,
+                sizeof(xn),
+                (const void *)&xn);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                4,
+                sizeof(wpl),
+                (const void *)&wpl);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                5,
+                sizeof(h),
+                (const void *)&h);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                6,
+                sizeof(isAsymmetric),
+                (const void *)&isAsymmetric);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                7,
+                sizeof(rwmask),
+                (const void *)&rwmask);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                8,
+                sizeof(lwmask),
+                (const void *)&lwmask);
+        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                                rEnv.mpkKernel,
+                                2,
+                                NULL,
+                                globalThreads,
+                                localThreads,
+                                0,
+                                NULL,
+                                NULL);
+
+        if (yp > 0 || yn > 0)
+        {
+            pixtemp = pixsCLBuffer;
+            pixsCLBuffer = pixdCLBuffer;
+            pixdCLBuffer = pixtemp;
+        }
+    }
+    else if (xp > 0 || xn > 0)
+    {
+        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status );
+        isEven = (xp != xn);
+
+        status = clSetKernelArg(rEnv.mpkKernel,
+            0,
+            sizeof(cl_mem),
+            &pixsCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+            1,
+            sizeof(cl_mem),
+            &pixdCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                2,
+                sizeof(xp),
+                (const void *)&xp);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                3,
+                sizeof(wpl),
+                (const void *)&wpl);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                4,
+                sizeof(h),
+                (const void *)&h);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                5,
+                sizeof(isAsymmetric),
+                (const void *)&isAsymmetric);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                6,
+                sizeof(rwmask),
+                (const void *)&rwmask);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                7,
+                sizeof(lwmask),
+                (const void *)&lwmask);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                8,
+                sizeof(isEven),
+                (const void *)&isEven);
+        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                                rEnv.mpkKernel,
+                                2,
+                                NULL,
+                                globalThreads,
+                                localThreads,
+                                0,
+                                NULL,
+                                NULL);
+    
+        if (yp > 0 || yn > 0)
+        {
+            pixtemp = pixsCLBuffer;
+            pixsCLBuffer = pixdCLBuffer;
+            pixdCLBuffer = pixtemp;
+        }
+    }
+
+    //Vertical Pass
+    if (yp > 0 || yn > 0)
+    {
+        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status );
+        
+        status = clSetKernelArg(rEnv.mpkKernel,
+            0,
+            sizeof(cl_mem),
+            &pixsCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+            1,
+            sizeof(cl_mem),
+            &pixdCLBuffer);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                2,
+                sizeof(yp),
+                (const void *)&yp);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                3,
+                sizeof(wpl),
+                (const void *)&wpl);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                4,
+                sizeof(h),
+                (const void *)&h);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                5,
+                sizeof(isAsymmetric),
+                (const void *)&isAsymmetric);
+        status = clSetKernelArg(rEnv.mpkKernel,
+                6,
+                sizeof(yn),
+                (const void *)&yn);
+        status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                                rEnv.mpkKernel,
+                                2,
+                                NULL,
+                                globalThreads,
+                                localThreads,
+                                0,
+                                NULL,
+                                NULL);
+    }
+
+    return status;
+}
+
+// OpenCL implementation of Morphology Dilate
+//Note: Assumes the source and dest opencl buffer are initialized. No check done
+PIX* 
+OpenclDevice::pixDilateBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy = false)
+{
+    l_uint32 wpl, h;
+
+    wpl = pixGetWpl(pixs);
+    h = pixGetHeight(pixs);
+    
+    clStatus = pixDilateCL(hsize, vsize, wpl, h);
+
+    if (reqDataCopy)
+    {
+        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false);
+    }
+
+    return pixd;
+}
+
+// OpenCL implementation of Morphology Erode
+//Note: Assumes the source and dest opencl buffer are initialized. No check done
+PIX* 
+OpenclDevice::pixErodeBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy = false)
+{
+    l_uint32 wpl, h;
+    
+    wpl = pixGetWpl(pixs);
+    h = pixGetHeight(pixs);
+
+    clStatus = pixErodeCL(hsize, vsize, wpl, h);
+    
+    if (reqDataCopy)
+    {
+        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
+    }
+
+    return pixd;
+}
+
+//Morphology Open operation. Invokes the relevant OpenCL kernels
+cl_int
+pixOpenCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
+{
+    cl_int status;
+    cl_mem pixtemp;
+    
+    //Erode followed by Dilate
+    status = pixErodeCL(hsize, vsize, wpl, h);
+    
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLBuffer;
+    pixdCLBuffer = pixtemp;
+
+    status = pixDilateCL(hsize, vsize, wpl, h);
+
+    return status;
+}
+
+//Morphology Close operation. Invokes the relevant OpenCL kernels
+cl_int
+pixCloseCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
+{
+    cl_int status;
+    cl_mem pixtemp;
+    
+    //Dilate followed by Erode
+    status = pixDilateCL(hsize, vsize, wpl, h);
+    
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLBuffer;
+    pixdCLBuffer = pixtemp;
+
+    status = pixErodeCL(hsize, vsize, wpl, h);
+
+    return status;
+}
+
+// OpenCL implementation of Morphology Close
+//Note: Assumes the source and dest opencl buffer are initialized. No check done
+PIX* 
+OpenclDevice::pixCloseBrickCL(PIX  *pixd, 
+                              PIX  *pixs, 
+                              l_int32  hsize, 
+                              l_int32  vsize, 
+                              bool reqDataCopy = false)
+{
+    l_uint32 wpl, h;
+    
+    wpl = pixGetWpl(pixs);
+    h = pixGetHeight(pixs);
+
+    clStatus = pixCloseCL(hsize, vsize, wpl, h);
+
+    if (reqDataCopy)
+    {
+        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
+    }
+
+    return pixd;
+}
+
+// OpenCL implementation of Morphology Open
+//Note: Assumes the source and dest opencl buffer are initialized. No check done
+PIX* 
+OpenclDevice::pixOpenBrickCL(PIX  *pixd, 
+                              PIX  *pixs, 
+                              l_int32  hsize, 
+                              l_int32  vsize, 
+                              bool reqDataCopy = false)
+{
+    l_uint32 wpl, h;
+    
+    wpl = pixGetWpl(pixs);
+    h = pixGetHeight(pixs);
+
+    clStatus = pixOpenCL(hsize, vsize, wpl, h);
+
+    if (reqDataCopy)
+    {
+        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
+    }
+
+    return pixd;
+}
+
+//pix OR operation: outbuffer = buffer1 | buffer2
+cl_int
+pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
+{
+    cl_int status;
+    size_t globalThreads[2];
+    int gsize;
+    size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
+
+    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
+    globalThreads[0] = gsize;
+    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
+    globalThreads[1] = gsize;
+
+    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status );
+
+    status = clSetKernelArg(rEnv.mpkKernel,
+        0,
+        sizeof(cl_mem),
+        &buffer1);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        1,
+        sizeof(cl_mem),
+        &buffer2);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        2,
+        sizeof(cl_mem),
+        &outbuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+            3,
+            sizeof(wpl),
+            (const void *)&wpl);
+    status = clSetKernelArg(rEnv.mpkKernel,
+            4,
+            sizeof(h),
+            (const void *)&h);
+    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                            rEnv.mpkKernel,
+                            2,
+                            NULL,
+                            globalThreads,
+                            localThreads,
+                            0,
+                            NULL,
+                            NULL);
+
+    return status;
+}
+
+//pix AND operation: outbuffer = buffer1 & buffer2
+cl_int
+pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
+{
+    cl_int status;
+    size_t globalThreads[2];
+    int gsize;
+    size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
+
+    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
+    globalThreads[0] = gsize;
+    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
+    globalThreads[1] = gsize;
+
+    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status );
+                
+    // Enqueue a kernel run call.
+    status = clSetKernelArg(rEnv.mpkKernel,
+        0,
+        sizeof(cl_mem),
+        &buffer1);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        1,
+        sizeof(cl_mem),
+        &buffer2);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        2,
+        sizeof(cl_mem),
+        &outbuffer);
+    status = clSetKernelArg(rEnv.mpkKernel,
+            3,
+            sizeof(wpl),
+            (const void *)&wpl);
+    status = clSetKernelArg(rEnv.mpkKernel,
+            4,
+            sizeof(h),
+            (const void *)&h);
+    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                            rEnv.mpkKernel,
+                            2,
+                            NULL,
+                            globalThreads,
+                            localThreads,
+                            0,
+                            NULL,
+                            NULL);
+
+    return status;
+}
+
+//output = buffer1 & ~(buffer2)
+cl_int
+pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL)
+{
+    cl_int status;
+    size_t globalThreads[2];
+    int gsize;
+    size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
+
+    gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
+    globalThreads[0] = gsize;
+    gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
+    globalThreads[1] = gsize;
+
+    if (outBuffer != NULL)
+    {
+        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status );
+    }
+    else
+    {
+        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status );
+    }
+
+    // Enqueue a kernel run call.
+    status = clSetKernelArg(rEnv.mpkKernel,
+        0,
+        sizeof(cl_mem),
+        &buffer1);
+    status = clSetKernelArg(rEnv.mpkKernel,
+        1,
+        sizeof(cl_mem),
+        &buffer2);
+    status = clSetKernelArg(rEnv.mpkKernel,
+            2,
+            sizeof(wpl),
+            (const void *)&wpl);
+    status = clSetKernelArg(rEnv.mpkKernel,
+            3,
+            sizeof(h),
+            (const void *)&h);
+    if (outBuffer != NULL)
+    {
+        status = clSetKernelArg(rEnv.mpkKernel,
+            4,
+            sizeof(cl_mem),
+            (const void *)&outBuffer);
+    }
+    status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
+                            rEnv.mpkKernel,
+                            2,
+                            NULL,
+                            globalThreads,
+                            localThreads,
+                            0,
+                            NULL,
+                            NULL);
+
+    return status;
+}
+
+// OpenCL implementation of Subtract pix
+//Note: Assumes the source and dest opencl buffer are initialized. No check done
+PIX* 
+OpenclDevice::pixSubtractCL(PIX  *pixd, PIX  *pixs1, PIX  *pixs2, bool reqDataCopy = false)
+{
+    l_uint32 wpl, h;
+    
+    PROCNAME("pixSubtractCL");
+
+    if (!pixs1)
+        return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd);
+    if (!pixs2)
+        return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd);
+    if (pixGetDepth(pixs1) != pixGetDepth(pixs2))
+        return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd);
+
+#if  EQUAL_SIZE_WARNING
+    if (!pixSizesEqual(pixs1, pixs2))
+        L_WARNING("pixs1 and pixs2 not equal sizes", procName);
+#endif  /* EQUAL_SIZE_WARNING */
+
+    wpl = pixGetWpl(pixs1);
+    h = pixGetHeight(pixs1);
+
+    clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
+
+    if (reqDataCopy)
+    {
+        //Read back output data from OCL buffer to cpu
+        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ);
+    }
+
+    return pixd;
+}
+
+// OpenCL implementation of Hollow pix
+//Note: Assumes the source and dest opencl buffer are initialized. No check done
+PIX* 
+OpenclDevice::pixHollowCL(PIX  *pixd, 
+                        PIX  *pixs, 
+                        l_int32  close_hsize, 
+                        l_int32  close_vsize, 
+                        l_int32  open_hsize, 
+                        l_int32  open_vsize,
+                        bool reqDataCopy = false)
+{
+    l_uint32 wpl, h;
+    cl_mem pixtemp;
+
+    wpl = pixGetWpl(pixs);
+    h = pixGetHeight(pixs);
+
+    //First step : Close Morph operation: Dilate followed by Erode
+    clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
+
+    //Store the output of close operation in an intermediate buffer
+    //this will be later used for pixsubtract
+    clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
+
+    //Second step: Open Operation - Erode followed by Dilate
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLBuffer;
+    pixdCLBuffer = pixtemp;
+
+    clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
+
+    //Third step: Subtract : (Close - Open)
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLBuffer;
+    pixdCLBuffer = pixdCLIntermediate;
+    pixdCLIntermediate = pixtemp;
+
+    clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
+
+    if (reqDataCopy)
+    {
+        //Read back output data from OCL buffer to cpu
+        pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
+    }
+    return pixd;
+}
+
+// OpenCL implementation of Get Lines from pix function
+//Note: Assumes the source and dest opencl buffer are initialized. No check done
+void 
+OpenclDevice::pixGetLinesCL(PIX  *pixd, 
+                            PIX  *pixs, 
+                            PIX** pix_vline, 
+                            PIX** pix_hline, 
+                            PIX** pixClosed,
+                            bool  getpixClosed,
+                            l_int32  close_hsize, l_int32  close_vsize, 
+                            l_int32  open_hsize, l_int32  open_vsize,
+                            l_int32  line_hsize, l_int32  line_vsize)
+{
+    l_uint32 wpl, h;
+    cl_mem pixtemp;
+
+    wpl = pixGetWpl(pixs);
+    h = pixGetHeight(pixs);
+
+    //First step : Close Morph operation: Dilate followed by Erode
+    clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
+
+    //Copy the Close output to CPU buffer
+    if (getpixClosed)
+    {
+        *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false);
+    }
+
+    //Store the output of close operation in an intermediate buffer
+    //this will be later used for pixsubtract
+    clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
+    
+    //Second step: Open Operation - Erode followed by Dilate
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLBuffer;
+    pixdCLBuffer = pixtemp;
+
+    clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
+
+    //Third step: Subtract : (Close - Open)
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLBuffer;
+    pixdCLBuffer = pixdCLIntermediate;
+    pixdCLIntermediate = pixtemp;
+
+    clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
+
+    //Store the output of Hollow operation in an intermediate buffer
+    //this will be later used 
+    clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
+
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLBuffer;
+    pixdCLBuffer = pixtemp;
+
+    //Fourth step: Get vertical line
+    //pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
+    clStatus = pixOpenCL(1, line_vsize, wpl, h);
+
+    //Copy the vertical line output to CPU buffer
+    *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false);
+    
+    pixtemp = pixsCLBuffer;
+    pixsCLBuffer = pixdCLIntermediate;
+    pixdCLIntermediate = pixtemp;
+
+    //Fifth step: Get horizontal line
+    //pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
+    clStatus = pixOpenCL(line_hsize, 1, wpl, h);
+        
+    //Copy the horizontal line output to CPU buffer
+    *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true);
+
+    return;
+}
+
+
+/*************************************************************************
+ *  HistogramRect
+ *  Otsu Thresholding Operations
+ *  histogramAllChannels is layed out as all channel 0, then all channel 1...
+ *  only supports 1 or 4 channels (bytes_per_pixel)
+ ************************************************************************/
+void OpenclDevice::HistogramRectOCL(
+    const unsigned char* imageData,
+    int bytes_per_pixel,
+    int bytes_per_line,
+    int left, // always 0
+    int top, // always 0
+    int width,
+    int height,
+    int kHistogramSize,
+    int* histogramAllChannels)
+{
+PERF_COUNT_START("HistogramRectOCL")
+    cl_int clStatus;
+    KernelEnv histKern;
+    SetKernelEnv( &histKern );
+    KernelEnv histRedKern;
+    SetKernelEnv( &histRedKern );
+    /* map imagedata to device as read only */
+    // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
+    // faster option would be to allocate initial image buffer
+    // using a garlic bus memory type
+    cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
+
+    /* setup work group size parameters */
+    int block_size = 256;
+    cl_uint numCUs;
+    clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
+    CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
+
+    int requestedOccupancy = 10;
+    int numWorkGroups = numCUs * requestedOccupancy;
+    int numThreads = block_size*numWorkGroups;
+    size_t local_work_size[] = {block_size};
+    size_t global_work_size[] = {numThreads};
+    size_t red_global_work_size[] = {block_size*kHistogramSize*bytes_per_pixel}; 
+
+    /* map histogramAllChannels as write only */
+    int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups;
+    
+    cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer");
+
+    /* intermediate histogram buffer */
+    int histRed = 256;
+    int tmpHistogramBins =  kHistogramSize*bytes_per_pixel*histRed; 
+
+    cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer");
+
+    /* atomic sync buffer */
+    int *zeroBuffer = new int[1];
+    zeroBuffer[0] = 0;
+    cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer");
+
+    //Create kernel objects based on bytes_per_pixel
+    if (bytes_per_pixel == 1)
+    {
+        histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus );
+        CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
+
+        histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectOneChannelReduction", &clStatus );
+        CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
+    } else {
+    histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
+
+    histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
+    }
+
+    void *ptr;
+    
+    //Initialize tmpHistogramBuffer buffer
+    ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, tmpHistogramBins*sizeof(cl_uint), 0, NULL, NULL, &clStatus);
+    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
+    
+    memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
+    clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, NULL, NULL);
+
+    /* set kernel 1 arguments */
+    clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
+    cl_uint numPixels = width*height;
+    clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels );
+    CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
+    clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
+
+    /* set kernel 2 arguments */
+    int n = numThreads/bytes_per_pixel;
+    clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n );
+    CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
+    clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
+    clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
+
+    /* launch histogram */
+PERF_COUNT_SUB("before")
+    clStatus = clEnqueueNDRangeKernel(
+        histKern.mpkCmdQueue,
+        histKern.mpkKernel,
+        1, NULL, global_work_size, local_work_size,
+        0, NULL, NULL );
+    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" );
+    clFinish( histKern.mpkCmdQueue );
+
+    /* launch histogram */
+    clStatus = clEnqueueNDRangeKernel(
+        histRedKern.mpkCmdQueue,
+        histRedKern.mpkKernel,
+        1, NULL, red_global_work_size, local_work_size,
+        0, NULL, NULL );
+    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
+    clFinish( histRedKern.mpkCmdQueue );
+
+PERF_COUNT_SUB("redKernel")
+
+    /* map results back from gpu */
+    ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus);
+    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
+    
+    clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL);
+   
+    clReleaseMemObject(histogramBuffer);
+    clReleaseMemObject(imageBuffer);
+PERF_COUNT_SUB("after")
+PERF_COUNT_END
+
+}
+
+/*************************************************************************
+ * Threshold the rectangle, taking everything except the image buffer pointer
+ * from the class, using thresholds/hi_values to the output IMAGE.
+ * only supports 1 or 4 channels
+ ************************************************************************/
+void OpenclDevice::ThresholdRectToPixOCL(
+    const unsigned char* imageData,
+    int bytes_per_pixel,
+    int bytes_per_line,
+    const int* thresholds,
+    const int* hi_values,
+    Pix** pix,
+    int height,
+    int width,
+    int top,
+    int left) {
+PERF_COUNT_START("ThresholdRectToPixOCL")
+
+    /* create pix result buffer */                                 
+    *pix = pixCreate(width, height, 1);
+    uinT32* pixData = pixGetData(*pix);
+    int wpl = pixGetWpl(*pix);
+    int pixSize = wpl*height*sizeof(uinT32);
+
+    cl_int clStatus;
+    KernelEnv rEnv;
+    SetKernelEnv( &rEnv );
+
+    /* setup work group size parameters */
+    int block_size = 256;
+    cl_uint numCUs = 6;
+     clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
+    CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
+
+    int requestedOccupancy = 10;
+    int numWorkGroups = numCUs * requestedOccupancy;
+    int numThreads = block_size*numWorkGroups;
+    size_t local_work_size[] = {(size_t) block_size};
+    size_t global_work_size[] = {(size_t) numThreads};
+
+    /* map imagedata to device as read only */
+    // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
+    // faster option would be to allocate initial image buffer
+    // using a garlic bus memory type
+    cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
+
+    /* map pix as write only */
+    pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer pix");
+
+    /* map thresholds and hi_values */
+    cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer");
+    cl_mem hiValuesBuffer   = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer");
+
+    /* compile kernel */
+    if (bytes_per_pixel == 4) {
+    rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix");
+    } else {
+        rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus );
+        CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
+    }
+
+    /* set kernel arguments */
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
+    cl_uint numPixels = width*height;
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height );
+    CHECK_OPENCL( clStatus, "clSetKernelArg height" );
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width );
+    CHECK_OPENCL( clStatus, "clSetKernelArg width" );
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl );
+    CHECK_OPENCL( clStatus, "clSetKernelArg wpl" );
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" );
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" );
+    clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer );
+    CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer");
+
+    /* launch kernel & wait */
+PERF_COUNT_SUB("before")
+    clStatus = clEnqueueNDRangeKernel(
+        rEnv.mpkCmdQueue,
+        rEnv.mpkKernel,
+        1, NULL, global_work_size, local_work_size,
+        0, NULL, NULL );
+    CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" );
+    clFinish( rEnv.mpkCmdQueue );
+PERF_COUNT_SUB("kernel")
+    
+    /* map results back from gpu */
+    void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus);
+    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
+    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL);
+    
+    clReleaseMemObject(imageBuffer);
+    clReleaseMemObject(thresholdsBuffer);
+    clReleaseMemObject(hiValuesBuffer);
+
+PERF_COUNT_SUB("after")
+PERF_COUNT_END
+}
+
+
+#if USE_DEVICE_SELECTION
+
+/******************************************************************************
+ * Data Types for Device Selection
+ *****************************************************************************/
+
+typedef struct _TessScoreEvaluationInputData {
+    int height;
+    int width;
+    int numChannels;
+    unsigned char *imageData;
+    Pix *pix;
+} TessScoreEvaluationInputData;
+
+void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) {
+    srand(1);
+    // 8.5x11 inches @ 300dpi rounded to clean multiples
+    int height = 3328; // %256
+    int width = 2560; // %512
+    int numChannels = 4;
+    input->height = height;
+    input->width = width;
+    input->numChannels = numChannels;
+    unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width];
+    input->imageData = (unsigned char *) &imageData4[0];
+    
+    // zero out image
+    unsigned char pixelWhite[4] = {  0,   0,   0, 255};
+    unsigned char pixelBlack[4] = {255, 255, 255, 255};
+    for (int p = 0; p < height*width; p++) {
+        //unsigned char tmp[4] = imageData4[0];
+        imageData4[p][0] = pixelWhite[0];
+        imageData4[p][1] = pixelWhite[1];
+        imageData4[p][2] = pixelWhite[2];
+        imageData4[p][3] = pixelWhite[3];
+    }
+    // random lines to be eliminated
+    int maxLineWidth = 64; // pixels wide
+    int numLines = 10;
+    // vertical lines
+    for (int i = 0; i < numLines; i++) {
+        int lineWidth = rand()%maxLineWidth;
+        int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
+        //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
+        for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
+            for (int col = 0; col < height; col++) {
+                //imageData4[row*width+col] = pixelBlack;
+                imageData4[row*width+col][0] = pixelBlack[0];
+                imageData4[row*width+col][1] = pixelBlack[1];
+                imageData4[row*width+col][2] = pixelBlack[2];
+                imageData4[row*width+col][3] = pixelBlack[3];
+            }
+        }
+    }
+    // horizontal lines
+    for (int i = 0; i < numLines; i++) {
+        int lineWidth = rand()%maxLineWidth;
+        int horLinePos = lineWidth + rand()%(height-2*lineWidth);
+        //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
+        for (int row = 0; row < width; row++) {
+            for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
+                //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
+                //imageData4[row*width+col] = pixelBlack;
+                imageData4[row*width+col][0] = pixelBlack[0];
+                imageData4[row*width+col][1] = pixelBlack[1];
+                imageData4[row*width+col][2] = pixelBlack[2];
+                imageData4[row*width+col][3] = pixelBlack[3];
+            }
+        }
+    }
+    // spots (noise, squares)
+    float fractionBlack = 0.1; // how much of the image should be blackened
+    int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
+    for (int i = 0; i < numSpots; i++) {
+        
+        int lineWidth = rand()%maxLineWidth;
+        int col = lineWidth + rand()%(width-2*lineWidth);
+        int row = lineWidth + rand()%(height-2*lineWidth);
+        //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
+        for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
+            for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
+                //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
+                //imageData4[row*width+col] = pixelBlack;
+                imageData4[r*width+c][0] = pixelBlack[0];
+                imageData4[r*width+c][1] = pixelBlack[1];
+                imageData4[r*width+c][2] = pixelBlack[2];
+                imageData4[r*width+c][3] = pixelBlack[3];
+            }
+        }
+    }
+
+    input->pix = pixCreate(input->width, input->height, 1);
+}
+
+typedef struct _TessDeviceScore {
+    float time; // small time means faster device
+    bool clError; // were there any opencl errors
+    bool valid; // was the correct response generated
+} TessDeviceScore;
+
+/******************************************************************************
+ * Micro Benchmarks for Device Selection
+ *****************************************************************************/
+
+double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
+    
+    double time = 0;
+#if ON_WINDOWS
+    LARGE_INTEGER freq, time_funct_start, time_funct_end;
+    QueryPerformanceFrequency(&freq);
+#else
+    timespec time_funct_start, time_funct_end;
+#endif
+    // input data
+    l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
+
+    // function call
+    if (type == DS_DEVICE_OPENCL_DEVICE) {
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_start);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
+#endif
+
+        OpenclDevice::gpuEnv = *env;
+        int wpl = pixGetWpl(input.pix);
+        OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, NULL);
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_end);
+        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
+        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
+#endif
+
+    } else {
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_start);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
+#endif
+        Pix *pix = pixCreate(input.width, input.height, 32);
+        l_uint32 *pixData = pixGetData(pix);
+        int wpl = pixGetWpl(pix);
+        //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
+        //pixSetData(pix, output_gpu);
+        int i, j;
+        int idx = 0;
+        for (i = 0; i < input.height ; i++) {
+            for (j = 0; j < input.width; j++) {
+                
+                l_uint32 tiffword = tiffdata[i * input.width + j];
+                l_int32 rval = ((tiffword) & 0xff);
+                l_int32 gval = (((tiffword) >> 8) & 0xff);
+                l_int32 bval = (((tiffword) >> 16) & 0xff);
+                l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
+                pixData[idx] = value;
+                idx++;
+            }
+        }
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_end);
+        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
+        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
+#endif
+        pixDestroy(&pix);
+    }
+
+
+    // cleanup
+
+    return time;
+}
+
+double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
+    
+    double time;
+#if ON_WINDOWS
+    LARGE_INTEGER freq, time_funct_start, time_funct_end;
+    QueryPerformanceFrequency(&freq);
+#else
+    timespec time_funct_start, time_funct_end;
+#endif
+    
+    unsigned char pixelHi = (unsigned char)255;
+    
+    int left = 0;
+    int top = 0;
+    int kHistogramSize = 256;
+    int bytes_per_line = input.width*input.numChannels;
+    int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
+
+    // function call
+    if (type == DS_DEVICE_OPENCL_DEVICE) {
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_start);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
+#endif
+
+        OpenclDevice::gpuEnv = *env;
+        int wpl = pixGetWpl(input.pix);
+        OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);
+
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_end);
+        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
+        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
+#endif
+    } else {
+    
+        int *histogram = new int[kHistogramSize];
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_start);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
+#endif
+        for (int ch = 0; ch < input.numChannels; ++ch) { 
+            tesseract::HistogramRect(input.pix, input.numChannels,
+                  left, top, input.width, input.height, histogram);
+        }
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_end);
+        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
+        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
+#endif
+        delete[] histogram;
+    }
+
+    // cleanup
+    //delete[] imageData;
+    delete[] histogramAllChannels;
+    return time;
+}
+
+//Reproducing the ThresholdRectToPix native version
+void ThresholdRectToPix_Native(const unsigned char* imagedata,
+                                          int bytes_per_pixel,
+                                          int bytes_per_line,
+                                          const int* thresholds,
+                                          const int* hi_values,
+                                          Pix** pix) {
+    int top = 0;
+    int left = 0;
+    int width = pixGetWidth(*pix);
+    int height = pixGetHeight(*pix);
+
+  *pix = pixCreate(width, height, 1);
+  uinT32* pixdata = pixGetData(*pix);
+  int wpl = pixGetWpl(*pix);
+  const unsigned char* srcdata = imagedata + top * bytes_per_line +
+                                 left * bytes_per_pixel;
+  for (int y = 0; y < height; ++y) {
+    const uinT8* linedata = srcdata;
+    uinT32* pixline = pixdata + y * wpl;
+    for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
+      bool white_result = true;
+      for (int ch = 0; ch < bytes_per_pixel; ++ch) {
+        if (hi_values[ch] >= 0 &&
+            (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
+          white_result = false;
+          break;
+        }
+      }
+      if (white_result)
+        CLEAR_DATA_BIT(pixline, x);
+      else
+        SET_DATA_BIT(pixline, x);
+    }
+    srcdata += bytes_per_line;
+  }
+}
+
+double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
+    
+    double time;
+#if ON_WINDOWS
+    LARGE_INTEGER freq, time_funct_start, time_funct_end;
+    QueryPerformanceFrequency(&freq);
+#else
+    timespec time_funct_start, time_funct_end;
+#endif
+    
+    // input data
+    unsigned char pixelHi = (unsigned char)255;
+    int* thresholds = new int[4];
+    thresholds[0] = pixelHi/2;
+    thresholds[1] = pixelHi/2;
+    thresholds[2] = pixelHi/2;
+    thresholds[3] = pixelHi/2;
+    int *hi_values = new int[4];
+    thresholds[0] = pixelHi;
+    thresholds[1] = pixelHi;
+    thresholds[2] = pixelHi;
+    thresholds[3] = pixelHi;
+    //Pix* pix = pixCreate(width, height, 1);
+    int top = 0;
+    int left = 0;
+    int bytes_per_line = input.width*input.numChannels;
+
+    // function call
+    if (type == DS_DEVICE_OPENCL_DEVICE) {
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_start);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
+#endif
+
+        OpenclDevice::gpuEnv = *env;
+        int wpl = pixGetWpl(input.pix);
+        OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);
+
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_end);
+        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
+        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
+#endif
+    } else {
+
+
+        tesseract::ImageThresholder thresholder;
+        thresholder.SetImage( input.pix );
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_start);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
+#endif
+        ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
+            thresholds, hi_values, &input.pix );
+
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_end);
+        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
+        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
+#endif
+    }
+
+    // cleanup
+    delete[] thresholds;
+    delete[] hi_values;
+    return time;
+}
+
+double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
+
+    double time = 0;
+#if ON_WINDOWS
+    LARGE_INTEGER freq, time_funct_start, time_funct_end;
+    QueryPerformanceFrequency(&freq);
+#else
+    timespec time_funct_start, time_funct_end;
+#endif
+
+    // input data
+    int resolution = 300;
+    int wpl = pixGetWpl(input.pix);
+    int kThinLineFraction = 20; // tess constant
+    int kMinLineLengthFraction = 4; // tess constant
+    int max_line_width = resolution / kThinLineFraction;
+    int min_line_length = resolution / kMinLineLengthFraction;
+    int closing_brick = max_line_width / 3;
+   
+    // function call
+    if (type == DS_DEVICE_OPENCL_DEVICE) {
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_start);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
+#endif
+        Pix *src_pix = input.pix;
+        OpenclDevice::gpuEnv = *env;
+        OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
+        Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL;
+        OpenclDevice::pixGetLinesCL(NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true, closing_brick, closing_brick, max_line_width, max_line_width, min_line_length, min_line_length);
+
+        OpenclDevice::releaseMorphCLBuffers();
+
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_end);
+        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
+        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
+#endif
+    } else {
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_start);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
+#endif
+
+        // native serial code
+        Pix *src_pix = input.pix;
+        Pix *pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
+        Pix *pix_solid  = pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width);
+        Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
+        pixDestroy(&pix_solid);
+        Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
+        Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
+        pixDestroy(&pix_hollow);
+
+#if ON_WINDOWS
+        QueryPerformanceCounter(&time_funct_end);
+        time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
+#else
+        clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
+        time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
+#endif
+    }
+
+    return time;
+}
+
+
+
+/******************************************************************************
+ * Device Selection
+ *****************************************************************************/
+
+#include "stdlib.h"
+
+ 
+// encode score object as byte string
+ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) {
+    *serializedScoreSize = sizeof(TessDeviceScore);
+    *serializedScore = (void *) new unsigned char[*serializedScoreSize];
+    memcpy(*serializedScore, device->score, *serializedScoreSize);
+    return DS_SUCCESS;
+}
+
+// parses byte string and stores in score object
+ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) {
+    // check that serializedScoreSize == sizeof(TessDeviceScore);
+    device->score = new TessDeviceScore;
+    memcpy(device->score, serializedScore, serializedScoreSize);
+    return DS_SUCCESS;
+}
+
+
+
+// evaluate devices
+ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
+    
+    // overwrite statuc gpuEnv w/ current device
+    // so native opencl calls can be used; they use static gpuEnv
+    printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
+    GPUEnv *env = NULL;
+    if (device->type == DS_DEVICE_OPENCL_DEVICE) {
+        env = new GPUEnv;
+        //printf("[DS] populating tmp GPUEnv from device\n");
+        populateGPUEnvFromDevice( env, device->oclDeviceID);
+        env->mnFileCount = 0; //argc;
+        env->mnKernelCount = 0UL;
+        //printf("[DS] compiling kernels for tmp GPUEnv\n");
+        OpenclDevice::gpuEnv = *env;
+        OpenclDevice::CompileKernelFile(env, "");
+    }
+    
+
+    TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
+    
+    // pixReadTiff
+    double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
+
+    // HistogramRect
+    double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
+
+    // ThresholdRectToPix
+    double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
+
+    // getLineMasks
+    double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
+
+
+    // weigh times (% of cpu time)
+    // these weights should be the % execution time that the native cpu code took
+    float composeRGBPixelWeight     = 1.2f;
+    float histogramRectWeight       = 2.4f;
+    float thresholdRectToPixWeight  = 4.5f;
+    float getLineMasksMorphWeight        = 5.0f;
+    
+    float weightedTime = 
+        composeRGBPixelWeight       * composeRGBPixelTime +
+        histogramRectWeight         * histogramRectTime +
+        thresholdRectToPixWeight    * thresholdRectToPixTime +
+        getLineMasksMorphWeight     * getLineMasksMorphTime
+        ;
+    device->score = (void *)new TessDeviceScore;
+    ((TessDeviceScore *)device->score)->time = weightedTime;
+    
+    printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
+    printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
+    printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight );
+    printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
+    printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
+    printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time );
+    return DS_SUCCESS;
+}
+
+// initial call to select device
+ds_device OpenclDevice::getDeviceSelection( ) {
+//PERF_COUNT_START("getDeviceSelection")
+    if (!deviceIsSelected) {
+PERF_COUNT_START("getDeviceSelection")
+        // check if opencl is available at runtime
+        if( 1 == LoadOpencl() ) {
+            // opencl is available
+//PERF_COUNT_SUB("LoadOpencl")
+    // setup devices
+    ds_status status;
+    ds_profile *profile;
+    status = initDSProfile( &profile, "v0.1" );
+PERF_COUNT_SUB("initDSProfile")
+    // try reading scores from file
+    char *fileName = "tesseract_opencl_profile_devices.dat";
+    status = readProfileFromFile( profile, deserializeScore, fileName);
+    if (status != DS_SUCCESS) {
+        // need to run evaluation
+                printf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
+
+        // create input data
+        TessScoreEvaluationInputData input;
+                populateTessScoreEvaluationInputData( &input );
+//PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
+        // perform evaluations
+        unsigned int numUpdates;
+        status =  profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates );
+PERF_COUNT_SUB("profileDevices")
+        // write scores to file
+        if ( status == DS_SUCCESS ) {
+            status = writeProfileToFile( profile, serializeScore, fileName);
+PERF_COUNT_SUB("writeProfileToFile")
+            if ( status == DS_SUCCESS ) {
+                        printf("[DS] Scores written to file (%s).\n", fileName);
+            } else {
+                        printf("[DS] Error saving scores to file (%s); scores not written to file.\n", fileName);
+            }
+        } else {
+                    printf("[DS] Unable to evaluate performance; scores not written to file.\n");
+        }
+
+    } else {
+
+PERF_COUNT_SUB("readProfileFromFile")
+                printf("[DS] Profile read from file (%s).\n", fileName);
+    }
+
+    // we now have device scores either from file or evaluation
+    // select fastest using custom Tesseract selection algorithm
+    float bestTime = FLT_MAX; // begin search with worst possible time
+    int bestDeviceIdx = -1;
+    for (int d = 0; d < profile->numDevices; d++) {
+        //((TessDeviceScore *)device->score)->time
+        ds_device device = profile->devices[d];
+        TessDeviceScore score = *(TessDeviceScore *)device.score;
+        
+        float time = score.time;
+                printf("[DS] Device[%i] %i:%s score is %f\n", d+1, device.type, device.oclDeviceName, time);
+        if (time < bestTime) {
+                    bestTime = time;
+            bestDeviceIdx = d;
+        }
+    }
+            printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
+    // cleanup
+            // TODO: call destructor for profile object?
+
+            bool overrided = false;
+            char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
+            if (overrideDeviceStr != NULL) {
+                int overrideDeviceIdx = atoi(overrideDeviceStr);
+                if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices ) {
+                    printf("[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, %i)\n", overrideDeviceStr, overrideDeviceIdx);
+                    bestDeviceIdx = overrideDeviceIdx - 1;
+                    overrided = true;
+                } else {
+                    printf("[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are valid devices).\n", overrideDeviceStr, profile->numDevices);
+                }
+}
+
+            if (overrided) {
+                printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
+            }
+            selectedDevice = profile->devices[bestDeviceIdx];
+
+        } else {
+            // opencl isn't available at runtime, select native cpu device
+            printf("[DS] OpenCL runtime not available.\n");
+            selectedDevice.type = DS_DEVICE_NATIVE_CPU;
+            selectedDevice.oclDeviceName = "(null)";
+            selectedDevice.score = NULL;
+            selectedDevice.oclDeviceID = NULL;
+            selectedDevice.oclDriverVersion = NULL;
+        }
+        deviceIsSelected = true;
+PERF_COUNT_SUB("select from Profile")
+PERF_COUNT_END
+    }
+//PERF_COUNT_END
+    return selectedDevice;
+}
+
+#endif
+
+bool OpenclDevice::selectedDeviceIsOpenCL() {
+#if USE_DEVICE_SELECTION
+    ds_device device = getDeviceSelection();
+    return (device.type == DS_DEVICE_OPENCL_DEVICE);
+#else
+    return true;
+#endif
+}
+
+bool OpenclDevice::selectedDeviceIsNativeCPU() {
+#if USE_DEVICE_SELECTION
+    ds_device device = getDeviceSelection();
+    return (device.type == DS_DEVICE_NATIVE_CPU);
+#else
+    return false;
+#endif
+}
+
+
+
+#endif
diff --git a/opencl/openclwrapper.h b/opencl/openclwrapper.h
index 839ca331..d94c45ee 100644
--- a/opencl/openclwrapper.h
+++ b/opencl/openclwrapper.h
@@ -1,345 +1,345 @@
-#include <stdio.h>
-#include "allheaders.h"
-#include "pix.h"
-#ifdef USE_OPENCL
-#include "tiff.h"
-#include "tiffio.h"
-#endif
-#include "tprintf.h"
-
-// including CL/cl.h doesn't occur until USE_OPENCL defined below
-
-// platform preprocessor commands
-#if defined( WIN32 ) || defined( __WIN32__ ) || defined( _WIN32 ) || defined( __CYGWIN32__ ) || defined( __MINGW32__ )
-#define ON_WINDOWS 1
-#define ON_LINUX   0
-#define ON_APPLE   0
-#define ON_OTHER   0
-#define IF_WINDOWS(X) X
-#define IF_LINUX(X)
-#define IF_APPLE(X)
-#define IF_OTHER(X)
-#define NOT_WINDOWS(X)
-#elif defined( __linux__ )
-#define ON_WINDOWS 0
-#define ON_LINUX   1
-#define ON_APPLE   0
-#define ON_OTHER   0
-#define IF_WINDOWS(X)
-#define IF_LINUX(X) X
-#define IF_APPLE(X)
-#define IF_OTHER(X)
-#define NOT_WINDOWS(X) X
-#elif defined( __APPLE__ )
-#define ON_WINDOWS 0
-#define ON_LINUX   0
-#define ON_APPLE   1
-#define ON_OTHER   0
-#define IF_WINDOWS(X)
-#define IF_LINUX(X)
-#define IF_APPLE(X) X
-#define IF_OTHER(X)
-#define NOT_WINDOWS(X) X
-#else
-#define ON_WINDOWS 0
-#define ON_LINUX   0
-#define ON_APPLE   0
-#define ON_OTHER   1
-#define IF_WINDOWS(X)
-#define IF_LINUX(X)
-#define IF_APPLE(X)
-#define IF_OTHER(X) X
-#define NOT_WINDOWS(X) X
-#endif
-
-#if ON_LINUX
-#include <time.h>
-#endif
-
-/************************************************************************************
- * enable/disable reporting of performance
- * PERF_REPORT_LEVEL
- * 0 - no reporting
- * 1 - no reporting
- * 2 - report total function call time for functions we're tracking
- * 3 - optionally report breakdown of function calls (kernel launch, kernel time, data copies)
- ************************************************************************************/
-#define PERF_COUNT_VERBOSE 1
-#define PERF_COUNT_REPORT_STR "[%36s], %24s, %11.6f\n"
-
-
-#if ON_WINDOWS
-
-#if PERF_COUNT_VERBOSE >= 2
-#define PERF_COUNT_START(FUNCT_NAME) \
-    char *funct_name = FUNCT_NAME; \
-    double elapsed_time_sec; \
-    LARGE_INTEGER freq, time_funct_start, time_funct_end, time_sub_start, time_sub_end; \
-    QueryPerformanceFrequency(&freq); \
-    QueryPerformanceCounter(&time_funct_start); \
-    time_sub_start = time_funct_start; \
-    time_sub_end = time_funct_start;
-
-#define PERF_COUNT_END \
-    QueryPerformanceCounter(&time_funct_end); \
-    elapsed_time_sec = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); \
-    tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
-#else
-#define PERF_COUNT_START(FUNCT_NAME)
-#define PERF_COUNT_END
-#endif
-
-#if PERF_COUNT_VERBOSE >= 3
-#define PERF_COUNT_SUB(SUB) \
-    QueryPerformanceCounter(&time_sub_end); \
-    elapsed_time_sec = (time_sub_end.QuadPart-time_sub_start.QuadPart)/(double)(freq.QuadPart); \
-    tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
-    time_sub_start = time_sub_end;
-#else
-#define PERF_COUNT_SUB(SUB)
-#endif
-
-
-// not on windows
-#else
-
-#if PERF_COUNT_VERBOSE >= 2
-#define PERF_COUNT_START(FUNCT_NAME) \
-    char *funct_name = FUNCT_NAME; \
-    double elapsed_time_sec; \
-    timespec time_funct_start, time_funct_end, time_sub_start, time_sub_end; \
-    clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); \
-    time_sub_start = time_funct_start; \
-    time_sub_end = time_funct_start;
-
-#define PERF_COUNT_END \
-    clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); \
-    elapsed_time_sec = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; \
-    tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
-#else
-#define PERF_COUNT_START(FUNCT_NAME)
-#define PERF_COUNT_END
-#endif
-
-#if PERF_COUNT_VERBOSE >= 3
-#define PERF_COUNT_SUB(SUB) \
-    clock_gettime( CLOCK_MONOTONIC, &time_sub_end ); \
-    elapsed_time_sec = (time_sub_end.tv_sec - time_sub_start.tv_sec)*1.0 + (time_sub_end.tv_nsec - time_sub_start.tv_nsec)/1000000000.0; \
-    tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
-    time_sub_start = time_sub_end;
-#else
-#define PERF_COUNT_SUB(SUB)
-#endif
-
-#endif
-/**************************************************************************
- * enable/disable use of OpenCL
- **************************************************************************/
-
-#ifdef USE_OPENCL
-
-#define USE_DEVICE_SELECTION 1
-
-#include "opencl_device_selection.h"
-
-#ifndef strcasecmp
-#define strcasecmp strcmp
-#endif
-
-#define MAX_KERNEL_STRING_LEN 64
-#define MAX_CLFILE_NUM 50
-#define MAX_CLKERNEL_NUM 200
-#define MAX_KERNEL_NAME_LEN 64
-#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
-#define GROUPSIZE_X 16
-#define GROUPSIZE_Y 16
-#define GROUPSIZE_HMORX 256
-#define GROUPSIZE_HMORY 1
-
-typedef struct _KernelEnv
-{
-    cl_context mpkContext;
-    cl_command_queue mpkCmdQueue;
-    cl_program mpkProgram;
-    cl_kernel mpkKernel;
-    char mckKernelName[150];
-} KernelEnv;
-
-typedef struct _OpenCLEnv
-{
-    cl_platform_id mpOclPlatformID;
-    cl_context mpOclContext;
-    cl_device_id mpOclDevsID;
-    cl_command_queue mpOclCmdQueue;
-} OpenCLEnv;
-typedef int ( *cl_kernel_function )( void **userdata, KernelEnv *kenv );
-
-
-static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
-
-static const l_uint32 lmask32[] = {0x0,
-    0x80000000, 0xc0000000, 0xe0000000, 0xf0000000,
-    0xf8000000, 0xfc000000, 0xfe000000, 0xff000000,
-    0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
-    0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000,
-    0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000,
-    0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
-    0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0,
-    0xfffffff8, 0xfffffffc, 0xfffffffe, 0xffffffff};
-
-static const l_uint32 rmask32[] = {0x0,
-    0x00000001, 0x00000003, 0x00000007, 0x0000000f,
-    0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
-    0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
-    0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
-    0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff,
-    0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
-    0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff,
-    0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff};
-
-#define CHECK_OPENCL(status,name)    \
-if( status != CL_SUCCESS )    \
-{    \
-    printf ("OpenCL error code is %d at   when %s .\n", status, name);    \
-}
-
-
-typedef struct _GPUEnv
-{
-    //share vb in all modules in hb library
-    cl_platform_id mpPlatformID;
-    cl_device_type mDevType;
-    cl_context mpContext;
-    cl_device_id *mpArryDevsID;
-    cl_device_id mpDevID;
-    cl_command_queue mpCmdQueue;
-    cl_kernel mpArryKernels[MAX_CLFILE_NUM];
-    cl_program mpArryPrograms[MAX_CLFILE_NUM]; //one program object maps one kernel source file
-    char mArryKnelSrcFile[MAX_CLFILE_NUM][256], //the max len of kernel file name is 256
-         mArrykernelNames[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN + 1];
-         cl_kernel_function mpArryKnelFuncs[MAX_CLKERNEL_NUM];
-    int mnKernelCount, mnFileCount, // only one kernel file
-        mnIsUserCreated; // 1: created , 0:no create and needed to create by opencl wrapper
-    int mnKhrFp64Flag;
-    int mnAmdFp64Flag;
-
-} GPUEnv;
-
-
-class OpenclDevice
-{
-
-public:
-    static GPUEnv gpuEnv;
-    static int isInited;
-    OpenclDevice();
-    ~OpenclDevice();
-    static int InitEnv(); // load dll, call InitOpenclRunEnv(0)
-    static int InitOpenclRunEnv( int argc ); // RegistOpenclKernel, double flags, compile kernels
-    static int InitOpenclRunEnv_DeviceSelection( int argc ); // RegistOpenclKernel, double flags, compile kernels
-    static int InitOpenclRunEnv( GPUEnv *gpu ); // select device by env_CPU or selector
-    static int RegistOpenclKernel();
-    static int ReleaseOpenclRunEnv();
-    static int ReleaseOpenclEnv( GPUEnv *gpuInfo );
-    static int CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption );
-    static int CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName );
-    static int GeneratBinFromKernelSource( cl_program program, const char * clFileName );
-    static int WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes );
-    static int BinaryGenerated( const char * clFileName, FILE ** fhandle );
-    //static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption );
-    static l_uint32* pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl, l_uint32 *line);
-    static Pix* pixReadTiffCl( const char *filename, l_int32 n );
-    static PIX * pixReadStreamTiffCl ( FILE *fp, l_int32 n );
-    static PIX* pixReadFromTiffStreamCl(TIFF  *tif);
-    static int composeRGBPixelCl(int *tiffdata,int *line,int h,int w);
-    static l_int32 getTiffStreamResolutionCl(TIFF *tif,l_int32  *pxres,l_int32  *pyres);
-    static TIFF* fopenTiffCl(FILE *fp,const char  *modestring);
-
-/* OpenCL implementations of Morphological operations*/
-    
-    //Initialiation of OCL buffers used in Morph operations
-    static int initMorphCLAllocations(l_int32  wpl, l_int32  h, PIX* pixs);
-    static void releaseMorphCLBuffers();
-
-    // OpenCL implementation of Morphology Dilate
-    static PIX* pixDilateBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy);
-    
-    // OpenCL implementation of Morphology Erode
-    static PIX* pixErodeBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy);
-    
-    // OpenCL implementation of Morphology Close
-    static PIX* pixCloseBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy);
-
-    // OpenCL implementation of Morphology Open
-    static PIX* pixOpenBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy);
-    
-    // OpenCL implementation of Morphology Open
-    static PIX* pixSubtractCL(PIX  *pixd, PIX  *pixs1, PIX  *pixs2, bool reqDataCopy);
-
-    // OpenCL implementation of Morphology (Hollow = Closed - Open)
-    static PIX* pixHollowCL(PIX  *pixd, PIX  *pixs, l_int32  close_hsize, l_int32  close_vsize, l_int32  open_hsize, l_int32  open_vsize, bool reqDataCopy);
-
-    static void pixGetLinesCL(PIX  *pixd, PIX  *pixs, 
-                                            PIX** pix_vline, PIX** pix_hline, 
-                                            PIX** pixClosed, bool  getpixClosed,
-                                            l_int32  close_hsize, l_int32  close_vsize, 
-                                            l_int32  open_hsize, l_int32  open_vsize,
-                                            l_int32  line_hsize, l_int32  line_vsize);
-
-    //int InitOpenclAttr( OpenCLEnv * env );
-    //int ReleaseKernel( KernelEnv * env );
-    static int SetKernelEnv( KernelEnv *envInfo );
-    //int CreateKernel( char * kernelname, KernelEnv * env );
-    //int RunKernel( const char *kernelName, void **userdata );
-    //int ConvertToString( const char *filename, char **source );
-    //int CheckKernelName( KernelEnv *envInfo, const char *kernelName );
-    //int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function );
-    //int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata );
-    //int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function );
-    // static cl_device_id performDeviceSelection( );
-    //static bool thresholdRectToPixMicroBench( TessScoreEvaluationInputData input, ds_device_type type);
-
-    static int LoadOpencl();
-#ifdef WIN32
-    //static int OpenclInite();
-    static void FreeOpenclDll();
-#endif
-
-    //int GetOpenclState();
-    //void SetOpenclState( int state );
-    inline static int AddKernelConfig( int kCount, const char *kName );
-
-    /* for binarization */
-    static void HistogramRectOCL(
-        const unsigned char *imagedata,
-        int bytes_per_pixel,
-        int bytes_per_line,
-        int left,
-        int top,
-        int width,
-        int height,
-        int kHistogramSize,
-        int *histogramAllChannels);
-    static void ThresholdRectToPixOCL(
-        const unsigned char* imagedata,
-        int bytes_per_pixel,
-        int bytes_per_line,
-        const int* thresholds,
-        const int* hi_values,
-        Pix** pix,
-        int rect_height,
-        int rect_width,
-        int rect_top,
-        int rect_left);
-#if USE_DEVICE_SELECTION
-    static ds_device getDeviceSelection();
-    static ds_device selectedDevice;
-    static bool deviceIsSelected;
-#endif
-    static bool selectedDeviceIsOpenCL();
-    static bool selectedDeviceIsNativeCPU();
-
-};
-
-
-#endif
+#include <stdio.h>
+#include "allheaders.h"
+#include "pix.h"
+#ifdef USE_OPENCL
+#include "tiff.h"
+#include "tiffio.h"
+#endif
+#include "tprintf.h"
+
+// including CL/cl.h doesn't occur until USE_OPENCL defined below
+
+// platform preprocessor commands
+#if defined( WIN32 ) || defined( __WIN32__ ) || defined( _WIN32 ) || defined( __CYGWIN32__ ) || defined( __MINGW32__ )
+#define ON_WINDOWS 1
+#define ON_LINUX   0
+#define ON_APPLE   0
+#define ON_OTHER   0
+#define IF_WINDOWS(X) X
+#define IF_LINUX(X)
+#define IF_APPLE(X)
+#define IF_OTHER(X)
+#define NOT_WINDOWS(X)
+#elif defined( __linux__ )
+#define ON_WINDOWS 0
+#define ON_LINUX   1
+#define ON_APPLE   0
+#define ON_OTHER   0
+#define IF_WINDOWS(X)
+#define IF_LINUX(X) X
+#define IF_APPLE(X)
+#define IF_OTHER(X)
+#define NOT_WINDOWS(X) X
+#elif defined( __APPLE__ )
+#define ON_WINDOWS 0
+#define ON_LINUX   0
+#define ON_APPLE   1
+#define ON_OTHER   0
+#define IF_WINDOWS(X)
+#define IF_LINUX(X)
+#define IF_APPLE(X) X
+#define IF_OTHER(X)
+#define NOT_WINDOWS(X) X
+#else
+#define ON_WINDOWS 0
+#define ON_LINUX   0
+#define ON_APPLE   0
+#define ON_OTHER   1
+#define IF_WINDOWS(X)
+#define IF_LINUX(X)
+#define IF_APPLE(X)
+#define IF_OTHER(X) X
+#define NOT_WINDOWS(X) X
+#endif
+
+#if ON_LINUX
+#include <time.h>
+#endif
+
+/************************************************************************************
+ * enable/disable reporting of performance
+ * PERF_REPORT_LEVEL
+ * 0 - no reporting
+ * 1 - no reporting
+ * 2 - report total function call time for functions we're tracking
+ * 3 - optionally report breakdown of function calls (kernel launch, kernel time, data copies)
+ ************************************************************************************/
+#define PERF_COUNT_VERBOSE 1
+#define PERF_COUNT_REPORT_STR "[%36s], %24s, %11.6f\n"
+
+
+#if ON_WINDOWS
+
+#if PERF_COUNT_VERBOSE >= 2
+#define PERF_COUNT_START(FUNCT_NAME) \
+    char *funct_name = FUNCT_NAME; \
+    double elapsed_time_sec; \
+    LARGE_INTEGER freq, time_funct_start, time_funct_end, time_sub_start, time_sub_end; \
+    QueryPerformanceFrequency(&freq); \
+    QueryPerformanceCounter(&time_funct_start); \
+    time_sub_start = time_funct_start; \
+    time_sub_end = time_funct_start;
+
+#define PERF_COUNT_END \
+    QueryPerformanceCounter(&time_funct_end); \
+    elapsed_time_sec = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); \
+    tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
+#else
+#define PERF_COUNT_START(FUNCT_NAME)
+#define PERF_COUNT_END
+#endif
+
+#if PERF_COUNT_VERBOSE >= 3
+#define PERF_COUNT_SUB(SUB) \
+    QueryPerformanceCounter(&time_sub_end); \
+    elapsed_time_sec = (time_sub_end.QuadPart-time_sub_start.QuadPart)/(double)(freq.QuadPart); \
+    tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
+    time_sub_start = time_sub_end;
+#else
+#define PERF_COUNT_SUB(SUB)
+#endif
+
+
+// not on windows
+#else
+
+#if PERF_COUNT_VERBOSE >= 2
+#define PERF_COUNT_START(FUNCT_NAME) \
+    char *funct_name = FUNCT_NAME; \
+    double elapsed_time_sec; \
+    timespec time_funct_start, time_funct_end, time_sub_start, time_sub_end; \
+    clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); \
+    time_sub_start = time_funct_start; \
+    time_sub_end = time_funct_start;
+
+#define PERF_COUNT_END \
+    clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); \
+    elapsed_time_sec = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; \
+    tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
+#else
+#define PERF_COUNT_START(FUNCT_NAME)
+#define PERF_COUNT_END
+#endif
+
+#if PERF_COUNT_VERBOSE >= 3
+#define PERF_COUNT_SUB(SUB) \
+    clock_gettime( CLOCK_MONOTONIC, &time_sub_end ); \
+    elapsed_time_sec = (time_sub_end.tv_sec - time_sub_start.tv_sec)*1.0 + (time_sub_end.tv_nsec - time_sub_start.tv_nsec)/1000000000.0; \
+    tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
+    time_sub_start = time_sub_end;
+#else
+#define PERF_COUNT_SUB(SUB)
+#endif
+
+#endif
+/**************************************************************************
+ * enable/disable use of OpenCL
+ **************************************************************************/
+
+#ifdef USE_OPENCL
+
+#define USE_DEVICE_SELECTION 1
+
+#include "opencl_device_selection.h"
+
+#ifndef strcasecmp
+#define strcasecmp strcmp
+#endif
+
+#define MAX_KERNEL_STRING_LEN 64
+#define MAX_CLFILE_NUM 50
+#define MAX_CLKERNEL_NUM 200
+#define MAX_KERNEL_NAME_LEN 64
+#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
+#define GROUPSIZE_X 16
+#define GROUPSIZE_Y 16
+#define GROUPSIZE_HMORX 256
+#define GROUPSIZE_HMORY 1
+
+typedef struct _KernelEnv
+{
+    cl_context mpkContext;
+    cl_command_queue mpkCmdQueue;
+    cl_program mpkProgram;
+    cl_kernel mpkKernel;
+    char mckKernelName[150];
+} KernelEnv;
+
+typedef struct _OpenCLEnv
+{
+    cl_platform_id mpOclPlatformID;
+    cl_context mpOclContext;
+    cl_device_id mpOclDevsID;
+    cl_command_queue mpOclCmdQueue;
+} OpenCLEnv;
+typedef int ( *cl_kernel_function )( void **userdata, KernelEnv *kenv );
+
+
+static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
+
+static const l_uint32 lmask32[] = {0x0,
+    0x80000000, 0xc0000000, 0xe0000000, 0xf0000000,
+    0xf8000000, 0xfc000000, 0xfe000000, 0xff000000,
+    0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
+    0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000,
+    0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000,
+    0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
+    0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0,
+    0xfffffff8, 0xfffffffc, 0xfffffffe, 0xffffffff};
+
+static const l_uint32 rmask32[] = {0x0,
+    0x00000001, 0x00000003, 0x00000007, 0x0000000f,
+    0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
+    0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
+    0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
+    0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff,
+    0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
+    0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff,
+    0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff};
+
+#define CHECK_OPENCL(status,name)    \
+if( status != CL_SUCCESS )    \
+{    \
+    printf ("OpenCL error code is %d at   when %s .\n", status, name);    \
+}
+
+
+typedef struct _GPUEnv
+{
+    //share vb in all modules in hb library
+    cl_platform_id mpPlatformID;
+    cl_device_type mDevType;
+    cl_context mpContext;
+    cl_device_id *mpArryDevsID;
+    cl_device_id mpDevID;
+    cl_command_queue mpCmdQueue;
+    cl_kernel mpArryKernels[MAX_CLFILE_NUM];
+    cl_program mpArryPrograms[MAX_CLFILE_NUM]; //one program object maps one kernel source file
+    char mArryKnelSrcFile[MAX_CLFILE_NUM][256], //the max len of kernel file name is 256
+         mArrykernelNames[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN + 1];
+         cl_kernel_function mpArryKnelFuncs[MAX_CLKERNEL_NUM];
+    int mnKernelCount, mnFileCount, // only one kernel file
+        mnIsUserCreated; // 1: created , 0:no create and needed to create by opencl wrapper
+    int mnKhrFp64Flag;
+    int mnAmdFp64Flag;
+
+} GPUEnv;
+
+
+class OpenclDevice
+{
+
+public:
+    static GPUEnv gpuEnv;
+    static int isInited;
+    OpenclDevice();
+    ~OpenclDevice();
+    static int InitEnv(); // load dll, call InitOpenclRunEnv(0)
+    static int InitOpenclRunEnv( int argc ); // RegistOpenclKernel, double flags, compile kernels
+    static int InitOpenclRunEnv_DeviceSelection( int argc ); // RegistOpenclKernel, double flags, compile kernels
+    static int InitOpenclRunEnv( GPUEnv *gpu ); // select device by env_CPU or selector
+    static int RegistOpenclKernel();
+    static int ReleaseOpenclRunEnv();
+    static int ReleaseOpenclEnv( GPUEnv *gpuInfo );
+    static int CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption );
+    static int CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName );
+    static int GeneratBinFromKernelSource( cl_program program, const char * clFileName );
+    static int WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes );
+    static int BinaryGenerated( const char * clFileName, FILE ** fhandle );
+    //static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption );
+    static l_uint32* pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl, l_uint32 *line);
+    static Pix* pixReadTiffCl( const char *filename, l_int32 n );
+    static PIX * pixReadStreamTiffCl ( FILE *fp, l_int32 n );
+    static PIX* pixReadFromTiffStreamCl(TIFF  *tif);
+    static int composeRGBPixelCl(int *tiffdata,int *line,int h,int w);
+    static l_int32 getTiffStreamResolutionCl(TIFF *tif,l_int32  *pxres,l_int32  *pyres);
+    static TIFF* fopenTiffCl(FILE *fp,const char  *modestring);
+
+/* OpenCL implementations of Morphological operations*/
+    
+    //Initialiation of OCL buffers used in Morph operations
+    static int initMorphCLAllocations(l_int32  wpl, l_int32  h, PIX* pixs);
+    static void releaseMorphCLBuffers();
+
+    // OpenCL implementation of Morphology Dilate
+    static PIX* pixDilateBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy);
+    
+    // OpenCL implementation of Morphology Erode
+    static PIX* pixErodeBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy);
+    
+    // OpenCL implementation of Morphology Close
+    static PIX* pixCloseBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy);
+
+    // OpenCL implementation of Morphology Open
+    static PIX* pixOpenBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy);
+    
+    // OpenCL implementation of Morphology Open
+    static PIX* pixSubtractCL(PIX  *pixd, PIX  *pixs1, PIX  *pixs2, bool reqDataCopy);
+
+    // OpenCL implementation of Morphology (Hollow = Closed - Open)
+    static PIX* pixHollowCL(PIX  *pixd, PIX  *pixs, l_int32  close_hsize, l_int32  close_vsize, l_int32  open_hsize, l_int32  open_vsize, bool reqDataCopy);
+
+    static void pixGetLinesCL(PIX  *pixd, PIX  *pixs, 
+                                            PIX** pix_vline, PIX** pix_hline, 
+                                            PIX** pixClosed, bool  getpixClosed,
+                                            l_int32  close_hsize, l_int32  close_vsize, 
+                                            l_int32  open_hsize, l_int32  open_vsize,
+                                            l_int32  line_hsize, l_int32  line_vsize);
+
+    //int InitOpenclAttr( OpenCLEnv * env );
+    //int ReleaseKernel( KernelEnv * env );
+    static int SetKernelEnv( KernelEnv *envInfo );
+    //int CreateKernel( char * kernelname, KernelEnv * env );
+    //int RunKernel( const char *kernelName, void **userdata );
+    //int ConvertToString( const char *filename, char **source );
+    //int CheckKernelName( KernelEnv *envInfo, const char *kernelName );
+    //int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function );
+    //int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata );
+    //int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function );
+    // static cl_device_id performDeviceSelection( );
+    //static bool thresholdRectToPixMicroBench( TessScoreEvaluationInputData input, ds_device_type type);
+
+    static int LoadOpencl();
+#ifdef WIN32
+    //static int OpenclInite();
+    static void FreeOpenclDll();
+#endif
+
+    //int GetOpenclState();
+    //void SetOpenclState( int state );
+    inline static int AddKernelConfig( int kCount, const char *kName );
+
+    /* for binarization */
+    static void HistogramRectOCL(
+        const unsigned char *imagedata,
+        int bytes_per_pixel,
+        int bytes_per_line,
+        int left,
+        int top,
+        int width,
+        int height,
+        int kHistogramSize,
+        int *histogramAllChannels);
+    static void ThresholdRectToPixOCL(
+        const unsigned char* imagedata,
+        int bytes_per_pixel,
+        int bytes_per_line,
+        const int* thresholds,
+        const int* hi_values,
+        Pix** pix,
+        int rect_height,
+        int rect_width,
+        int rect_top,
+        int rect_left);
+#if USE_DEVICE_SELECTION
+    static ds_device getDeviceSelection();
+    static ds_device selectedDevice;
+    static bool deviceIsSelected;
+#endif
+    static bool selectedDeviceIsOpenCL();
+    static bool selectedDeviceIsNativeCPU();
+
+};
+
+
+#endif