mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 09:52:40 +08:00
Fixes for OpenCL issues reported on Apple Mac. Error -54 (CL_INVALID_WORK_GROUP_SIZE) still occurs on Apple Mac when running on the OpenCL CPU device; however, it is now ignored.
This commit is contained in:
parent
2c8bc4a2ac
commit
8e9159b091
@ -1100,10 +1100,13 @@ void kernel_ThresholdRectToPix(
|
||||
}
|
||||
}
|
||||
|
||||
// only supports 1 channel
|
||||
\n#define CHAR_VEC_WIDTH 8 \n
|
||||
\n#define PIXELS_PER_WORD 32 \n
|
||||
\n#define PIXELS_PER_BURST 8 \n
|
||||
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
|
||||
typedef union {
|
||||
uchar s[PIXELS_PER_BURST];
|
||||
uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
|
||||
uchar s[PIXELS_PER_BURST*1];
|
||||
uchar8 v[(PIXELS_PER_BURST*1)/CHAR_VEC_WIDTH];
|
||||
} charVec1;
|
||||
|
||||
__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
@ -1112,7 +1115,7 @@ void kernel_ThresholdRectToPix_OneChan(
|
||||
__global const uchar8 *imageData,
|
||||
int height,
|
||||
int width,
|
||||
int wpl, // words per line
|
||||
int wpl, // words per line of output image
|
||||
__global int *thresholds,
|
||||
__global int *hi_values,
|
||||
__global int *pix) {
|
||||
@ -1134,96 +1137,71 @@ void kernel_ThresholdRectToPix_OneChan(
|
||||
|
||||
// load burst
|
||||
charVec1 pixels;
|
||||
for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
|
||||
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
|
||||
}
|
||||
// for each char8 in burst
|
||||
pixels.v[0] = imageData[
|
||||
w*BURSTS_PER_WORD
|
||||
+ b
|
||||
+ 0 ];
|
||||
|
||||
// for each pixel in burst
|
||||
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
|
||||
for ( int c = 0; c < 1; c++) {
|
||||
unsigned char pixChan = pixels.s[p + c];
|
||||
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
|
||||
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
|
||||
}
|
||||
|
||||
//int littleEndianIdx = p ^ 3;
|
||||
//int bigEndianIdx = p;
|
||||
int idx =
|
||||
\n#ifdef __ENDIAN_LITTLE__\n
|
||||
p ^ 3;
|
||||
\n#else\n
|
||||
p;
|
||||
\n#endif\n
|
||||
unsigned char pixChan = pixels.s[idx];
|
||||
if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
|
||||
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
|
||||
}
|
||||
}
|
||||
}
|
||||
pix[w] = word;
|
||||
}
|
||||
}
|
||||
|
||||
)
|
||||
|
||||
|
||||
KERNEL(
|
||||
\n#define RED_SHIFT 24\n
|
||||
\n#define GREEN_SHIFT 16\n
|
||||
\n#define BLUE_SHIFT 8\n
|
||||
\n#define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))\n
|
||||
\n
|
||||
\n__attribute__((reqd_work_group_size(256, 1, 1)))\n
|
||||
\n__kernel\n
|
||||
\nvoid kernel_RGBToGray(
|
||||
__global const unsigned int *srcData,
|
||||
__global unsigned char *dstData,
|
||||
int srcWPL,
|
||||
int dstWPL,
|
||||
int height,
|
||||
int width,
|
||||
float rwt,
|
||||
float gwt,
|
||||
float bwt ) {
|
||||
|
||||
// pixel index
|
||||
int pixelIdx = get_global_id(0);
|
||||
if (pixelIdx >= height*width) return;
|
||||
|
||||
unsigned int word = srcData[pixelIdx];
|
||||
int output = (rwt * ((word >> RED_SHIFT) & 0xff) +
|
||||
gwt * ((word >> GREEN_SHIFT) & 0xff) +
|
||||
bwt * ((word >> BLUE_SHIFT) & 0xff) + 0.5);
|
||||
// SET_DATA_BYTE
|
||||
dstData[pixelIdx] = output;
|
||||
}
|
||||
)
|
||||
#endif
|
||||
|
||||
; // close char*
|
||||
|
||||
#endif // USE_EXTERNAL_KERNEL
|
||||
#endif //_OCL_KERNEL_H_
|
||||
//#endif //_OCL_KERNEL_H_
|
||||
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
||||
|
||||
// Alternative histogram kernel written to use uchar and different global memory scattered write
|
||||
// was a little better for Intel platforms but still not faster than native serial code
|
||||
#if 0
|
||||
/* data laid out as
|
||||
bin0 bin1 bin2...
|
||||
r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
|
||||
*/
|
||||
\n__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
\n __kernel
|
||||
\n void kernel_HistogramRectAllChannels_uchar(
|
||||
\n volatile __global const uchar *data,
|
||||
\n uint numPixels,
|
||||
\n volatile __global uint *histBuffer) {
|
||||
\n
|
||||
\n // for each pixel/channel, accumulate in global memory
|
||||
\n for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
|
||||
\n uchar value = data[pc];
|
||||
\n int idx = value*get_global_size(0) + get_global_id(0);
|
||||
\n histBuffer[ idx ]++; // coalesced if same value
|
||||
\n }
|
||||
\n } // kernel_HistogramRectAllChannels
|
||||
\n
|
||||
\n __attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
\n __kernel
|
||||
\n void kernel_HistogramRectAllChannelsReduction_uchar(
|
||||
\n int n, // pixel redundancy that needs to be accumulated = nthreads/4
|
||||
\n __global uint4 *histBuffer,
|
||||
\n __global uint* histResult) { // each wg accumulates 1 bin (all channels within it
|
||||
\n
|
||||
\n // declare variables
|
||||
\n int binIdx = get_group_id(0);
|
||||
\n size_t groupId = get_group_id(0);
|
||||
\n size_t localId = get_local_id(0); // 0 -> 256-1
|
||||
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
|
||||
\n uint numThreads = get_global_size(0);
|
||||
\n uint4 hist = {0, 0, 0, 0};
|
||||
\n
|
||||
\n // accumulate in register
|
||||
\n for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
|
||||
\n hist += histBuffer[binIdx*n+p];
|
||||
\n }
|
||||
\n
|
||||
\n // reduction in local memory
|
||||
\n __local uint4 localHist[GROUP_SIZE];
|
||||
\n localHist[localId] = hist;
|
||||
\n barrier(CLK_LOCAL_MEM_FENCE);
|
||||
\n
|
||||
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
||||
\n if (localId < stride) {
|
||||
\n hist = localHist[ localId+stride];
|
||||
\n }
|
||||
\n barrier(CLK_LOCAL_MEM_FENCE);
|
||||
\n if (localId < stride) {
|
||||
\n localHist[ localId] += hist;
|
||||
\n }
|
||||
\n barrier(CLK_LOCAL_MEM_FENCE);
|
||||
\n }
|
||||
\n
|
||||
\n // write reduction to final result
|
||||
\n if (localId == 0) {
|
||||
\n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
|
||||
\n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
|
||||
\n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
|
||||
\n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
|
||||
\n }
|
||||
\n
|
||||
\n } // kernel_HistogramRectAllChannels
|
||||
#endif
|
||||
|
@ -63,6 +63,8 @@ static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
|
||||
if (profile->devices!=NULL && sr!=NULL) {
|
||||
unsigned int i;
|
||||
for (i = 0; i < profile->numDevices; i++) {
|
||||
if (profile->devices[i].oclDeviceName) free(profile->devices[i].oclDeviceName);
|
||||
if (profile->devices[i].oclDriverVersion) free(profile->devices[i].oclDriverVersion);
|
||||
status = sr(profile->devices[i].score);
|
||||
if (status != DS_SUCCESS)
|
||||
break;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -56,13 +56,6 @@
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#if ON_APPLE
|
||||
#include <mach/clock.h>
|
||||
#include <mach/mach.h>
|
||||
#define CLOCK_MONOTONIC SYSTEM_CLOCK
|
||||
#define clock_gettime clock_get_time
|
||||
#endif
|
||||
|
||||
/************************************************************************************
|
||||
* enable/disable reporting of performance
|
||||
* PERF_REPORT_LEVEL
|
||||
@ -74,13 +67,6 @@
|
||||
#define PERF_COUNT_VERBOSE 1
|
||||
#define PERF_COUNT_REPORT_STR "[%36s], %24s, %11.6f\n"
|
||||
|
||||
#if ON_APPLE
|
||||
#include <time.h>
|
||||
#include <mach/clock.h>
|
||||
#include <mach/mach.h>
|
||||
#define CLOCK_MONOTONIC SYSTEM_CLOCK
|
||||
#define clock_gettime clock_get_time
|
||||
#endif
|
||||
|
||||
#if ON_WINDOWS
|
||||
|
||||
@ -97,7 +83,7 @@
|
||||
#define PERF_COUNT_END \
|
||||
QueryPerformanceCounter(&time_funct_end); \
|
||||
elapsed_time_sec = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); \
|
||||
tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
|
||||
printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
|
||||
#else
|
||||
#define PERF_COUNT_START(FUNCT_NAME)
|
||||
#define PERF_COUNT_END
|
||||
@ -107,7 +93,7 @@
|
||||
#define PERF_COUNT_SUB(SUB) \
|
||||
QueryPerformanceCounter(&time_sub_end); \
|
||||
elapsed_time_sec = (time_sub_end.QuadPart-time_sub_start.QuadPart)/(double)(freq.QuadPart); \
|
||||
tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
|
||||
printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
|
||||
time_sub_start = time_sub_end;
|
||||
#else
|
||||
#define PERF_COUNT_SUB(SUB)
|
||||
@ -129,7 +115,7 @@
|
||||
#define PERF_COUNT_END \
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); \
|
||||
elapsed_time_sec = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; \
|
||||
tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
|
||||
printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
|
||||
#else
|
||||
#define PERF_COUNT_START(FUNCT_NAME)
|
||||
#define PERF_COUNT_END
|
||||
@ -139,7 +125,7 @@
|
||||
#define PERF_COUNT_SUB(SUB) \
|
||||
clock_gettime( CLOCK_MONOTONIC, &time_sub_end ); \
|
||||
elapsed_time_sec = (time_sub_end.tv_sec - time_sub_start.tv_sec)*1.0 + (time_sub_end.tv_nsec - time_sub_start.tv_nsec)/1000000000.0; \
|
||||
tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
|
||||
printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
|
||||
time_sub_start = time_sub_end;
|
||||
#else
|
||||
#define PERF_COUNT_SUB(SUB)
|
||||
@ -151,9 +137,6 @@
|
||||
**************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
|
||||
#define USE_DEVICE_SELECTION 1
|
||||
|
||||
#include "opencl_device_selection.h"
|
||||
|
||||
#ifndef strcasecmp
|
||||
@ -251,7 +234,6 @@ public:
|
||||
static int InitEnv(); // load dll, call InitOpenclRunEnv(0)
|
||||
static int InitOpenclRunEnv( int argc ); // RegistOpenclKernel, double flags, compile kernels
|
||||
static int InitOpenclRunEnv_DeviceSelection( int argc ); // RegistOpenclKernel, double flags, compile kernels
|
||||
static int InitOpenclRunEnv( GPUEnv *gpu ); // select device by env_CPU or selector
|
||||
static int RegistOpenclKernel();
|
||||
static int ReleaseOpenclRunEnv();
|
||||
static int ReleaseOpenclEnv( GPUEnv *gpuInfo );
|
||||
@ -271,33 +253,33 @@ public:
|
||||
static TIFF* fopenTiffCl(FILE *fp,const char *modestring);
|
||||
|
||||
/* OpenCL implementations of Morphological operations*/
|
||||
|
||||
|
||||
//Initialization of OCL buffers used in Morph operations
|
||||
static int initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs);
|
||||
static void releaseMorphCLBuffers();
|
||||
|
||||
// OpenCL implementation of Morphology Dilate
|
||||
static PIX* pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy);
|
||||
|
||||
|
||||
// OpenCL implementation of Morphology Erode
|
||||
static PIX* pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy);
|
||||
|
||||
|
||||
// OpenCL implementation of Morphology Close
|
||||
static PIX* pixCloseBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy);
|
||||
|
||||
// OpenCL implementation of Morphology Open
|
||||
static PIX* pixOpenBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy);
|
||||
|
||||
|
||||
// OpenCL implementation of Subtract
|
||||
static PIX* pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy);
|
||||
|
||||
// OpenCL implementation of Morphology (Hollow = Closed - Open)
|
||||
static PIX* pixHollowCL(PIX *pixd, PIX *pixs, l_int32 close_hsize, l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize, bool reqDataCopy);
|
||||
|
||||
static void pixGetLinesCL(PIX *pixd, PIX *pixs,
|
||||
PIX** pix_vline, PIX** pix_hline,
|
||||
static void pixGetLinesCL(PIX *pixd, PIX *pixs,
|
||||
PIX** pix_vline, PIX** pix_hline,
|
||||
PIX** pixClosed, bool getpixClosed,
|
||||
l_int32 close_hsize, l_int32 close_vsize,
|
||||
l_int32 close_hsize, l_int32 close_vsize,
|
||||
l_int32 open_hsize, l_int32 open_vsize,
|
||||
l_int32 line_hsize, l_int32 line_vsize);
|
||||
|
||||
@ -320,12 +302,11 @@ public:
|
||||
static void FreeOpenclDll();
|
||||
#endif
|
||||
|
||||
//int GetOpenclState();
|
||||
//void SetOpenclState( int state );
|
||||
|
||||
inline static int AddKernelConfig( int kCount, const char *kName );
|
||||
|
||||
/* for binarization */
|
||||
static void HistogramRectOCL(
|
||||
static int HistogramRectOCL(
|
||||
const unsigned char *imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
@ -335,7 +316,8 @@ public:
|
||||
int height,
|
||||
int kHistogramSize,
|
||||
int *histogramAllChannels);
|
||||
static void ThresholdRectToPixOCL(
|
||||
|
||||
static int ThresholdRectToPixOCL(
|
||||
const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
@ -346,11 +328,12 @@ public:
|
||||
int rect_width,
|
||||
int rect_top,
|
||||
int rect_left);
|
||||
#if USE_DEVICE_SELECTION
|
||||
|
||||
static Pix * pixConvertRGBToGrayOCL( Pix *pix, float weightRed = 0.3, float weightGreen = 0.5, float weightBlue = 0.2 );
|
||||
|
||||
static ds_device getDeviceSelection();
|
||||
static ds_device selectedDevice;
|
||||
static bool deviceIsSelected;
|
||||
#endif
|
||||
static bool selectedDeviceIsOpenCL();
|
||||
static bool selectedDeviceIsNativeCPU();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user