Fixes for OpenCL issues reported on Apple Mac. Error -54 is still returned on Apple Mac when running on the OpenCL CPU device; however, it is now ignored.

This commit is contained in:
gargrahul 2015-08-24 15:57:18 +05:30 committed by Zdenko Podobný
parent 2c8bc4a2ac
commit 8e9159b091
4 changed files with 563 additions and 628 deletions

View File

@ -1100,10 +1100,13 @@ void kernel_ThresholdRectToPix(
}
}
// only supports 1 channel
\n#define CHAR_VEC_WIDTH 8 \n
\n#define PIXELS_PER_WORD 32 \n
\n#define PIXELS_PER_BURST 8 \n
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
typedef union {
uchar s[PIXELS_PER_BURST];
uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
uchar s[PIXELS_PER_BURST*1];
uchar8 v[(PIXELS_PER_BURST*1)/CHAR_VEC_WIDTH];
} charVec1;
__attribute__((reqd_work_group_size(256, 1, 1)))
@ -1112,7 +1115,7 @@ void kernel_ThresholdRectToPix_OneChan(
__global const uchar8 *imageData,
int height,
int width,
int wpl, // words per line
int wpl, // words per line of output image
__global int *thresholds,
__global int *hi_values,
__global int *pix) {
@ -1134,96 +1137,71 @@ void kernel_ThresholdRectToPix_OneChan(
// load burst
charVec1 pixels;
for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
}
// for each char8 in burst
pixels.v[0] = imageData[
w*BURSTS_PER_WORD
+ b
+ 0 ];
// for each pixel in burst
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
for ( int c = 0; c < 1; c++) {
unsigned char pixChan = pixels.s[p + c];
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
}
//int littleEndianIdx = p ^ 3;
//int bigEndianIdx = p;
int idx =
\n#ifdef __ENDIAN_LITTLE__\n
p ^ 3;
\n#else\n
p;
\n#endif\n
unsigned char pixChan = pixels.s[idx];
if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
}
}
}
pix[w] = word;
}
}
)
KERNEL(
\n#define RED_SHIFT 24\n
\n#define GREEN_SHIFT 16\n
\n#define BLUE_SHIFT 8\n
\n#define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))\n
\n
// Converts one 32-bit source word per work-item into one gray byte.
// The work-item with global id i reads srcData[i], extracts the three 8-bit
// fields selected by RED_SHIFT, GREEN_SHIFT and BLUE_SHIFT, combines them
// with the weights rwt, gwt and bwt, and stores the result in dstData[i].
//   srcData        input words, indexed directly by global id
//   dstData        output bytes, indexed directly by global id
//   srcWPL, dstWPL not read anywhere in this kernel
//   height, width  their product is the number of valid global ids
//   rwt, gwt, bwt  weights for the red, green and blue fields
// NOTE(review): rows are addressed as if tightly packed (no words-per-line
// padding); confirm the host passes buffers laid out that way.
\n__attribute__((reqd_work_group_size(256, 1, 1)))\n
\n__kernel\n
\nvoid kernel_RGBToGray(
__global const unsigned int *srcData,
__global unsigned char *dstData,
int srcWPL,
int dstWPL,
int height,
int width,
float rwt,
float gwt,
float bwt ) {
// pixel index: one work-item per pixel
int pixelIdx = get_global_id(0);
// work-items whose id is past the last pixel do nothing
if (pixelIdx >= height*width) return;
unsigned int word = srcData[pixelIdx];
// Weighted sum of the three fields; adding 0.5 before the conversion to int
// rounds a non-negative sum to the nearest integer.
// NOTE(review): the 0.5 literal has no f suffix; check how the target
// device treats unsuffixed floating-point literals.
int output = (rwt * ((word >> RED_SHIFT) & 0xff) +
gwt * ((word >> GREEN_SHIFT) & 0xff) +
bwt * ((word >> BLUE_SHIFT) & 0xff) + 0.5);
// plain indexed store; the SET_DATA_BYTE macro is not applied here
dstData[pixelIdx] = output;
}
)
#endif
; // close char*
#endif // USE_EXTERNAL_KERNEL
#endif //_OCL_KERNEL_H_
//#endif //_OCL_KERNEL_H_
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
// Alternative histogram kernel written to use uchar and a different scattered global memory write.
// It was a little better on Intel platforms but still not faster than native serial code.
#if 0
/* data laid out as
bin0 bin1 bin2...
r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
*/
// First pass of the uchar histogram: every work-item walks the input bytes
// starting at its global id and advancing by the global size, and counts each
// byte value in a slot of histBuffer that belongs to that work-item alone.
//   data       input bytes; numPixels*NUM_CHANNELS of them are read
//   numPixels  pixel count; multiplied by NUM_CHANNELS for the loop bound
//   histBuffer counters indexed as value*global_size + global_id
// NOTE(review): nothing here zeroes histBuffer; presumably the host clears it
// before launch - confirm.
\n__attribute__((reqd_work_group_size(256, 1, 1)))
\n __kernel
\n void kernel_HistogramRectAllChannels_uchar(
\n volatile __global const uchar *data,
\n uint numPixels,
\n volatile __global uint *histBuffer) {
\n
\n // for each pixel/channel, accumulate in global memory
\n for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
\n uchar value = data[pc];
// The slot depends on both the byte value and the global id, so two
// work-items never increment the same counter.
\n int idx = value*get_global_size(0) + get_global_id(0);
\n histBuffer[ idx ]++; // coalesced if same value
\n }
\n } // kernel_HistogramRectAllChannels_uchar
\n
// Second pass of the uchar histogram: each work-group reduces the n partial
// uint4 counters of one bin (bin index = group id) to a single uint4 and
// writes its four components to histResult.
//   n          number of uint4 partial counters stored per bin in histBuffer
//   histBuffer partial counters; bin b occupies entries [b*n, b*n + n)
//   histResult output; component c of bin b is stored at c*HIST_SIZE + b
// GROUP_SIZE and HIST_SIZE are defined outside this block.
\n __attribute__((reqd_work_group_size(256, 1, 1)))
\n __kernel
\n void kernel_HistogramRectAllChannelsReduction_uchar(
\n int n, // pixel redundancy that needs to be accumulated = nthreads/4
\n __global uint4 *histBuffer,
\n __global uint* histResult) { // each work-group accumulates 1 bin (all channels within it)
\n
\n // declare variables
// NOTE(review): groupId, globalId and numThreads are never read below.
\n int binIdx = get_group_id(0);
\n size_t groupId = get_group_id(0);
\n size_t localId = get_local_id(0); // 0 -> 256-1
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
\n uint numThreads = get_global_size(0);
\n uint4 hist = {0, 0, 0, 0};
\n
\n // accumulate in register: each work-item sums every GROUP_SIZE-th entry of its bin
// NOTE(review): p is uint and n is int, so the loop test mixes signedness.
\n for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
\n hist += histBuffer[binIdx*n+p];
\n }
\n
\n // reduction in local memory
\n __local uint4 localHist[GROUP_SIZE];
\n localHist[localId] = hist;
\n barrier(CLK_LOCAL_MEM_FENCE);
\n
// Halve the active range each round: the lower half reads its partner from
// the upper half, then adds it in; a barrier separates the read, the update
// and the next round.
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
\n if (localId < stride) {
\n hist = localHist[ localId+stride];
\n }
\n barrier(CLK_LOCAL_MEM_FENCE);
\n if (localId < stride) {
\n localHist[ localId] += hist;
\n }
\n barrier(CLK_LOCAL_MEM_FENCE);
\n }
\n
\n // write reduction to final result: work-item 0 stores one value per uint4 component
\n if (localId == 0) {
\n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
\n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
\n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
\n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
\n }
\n
\n } // kernel_HistogramRectAllChannelsReduction_uchar
#endif

View File

@ -63,6 +63,8 @@ static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
if (profile->devices!=NULL && sr!=NULL) {
unsigned int i;
for (i = 0; i < profile->numDevices; i++) {
if (profile->devices[i].oclDeviceName) free(profile->devices[i].oclDeviceName);
if (profile->devices[i].oclDriverVersion) free(profile->devices[i].oclDriverVersion);
status = sr(profile->devices[i].score);
if (status != DS_SUCCESS)
break;

File diff suppressed because it is too large Load Diff

View File

@ -56,13 +56,6 @@
#include <time.h>
#endif
#if ON_APPLE
#include <mach/clock.h>
#include <mach/mach.h>
#define CLOCK_MONOTONIC SYSTEM_CLOCK
#define clock_gettime clock_get_time
#endif
/************************************************************************************
* enable/disable reporting of performance
* PERF_REPORT_LEVEL
@ -74,13 +67,6 @@
#define PERF_COUNT_VERBOSE 1
#define PERF_COUNT_REPORT_STR "[%36s], %24s, %11.6f\n"
#if ON_APPLE
#include <time.h>
#include <mach/clock.h>
#include <mach/mach.h>
#define CLOCK_MONOTONIC SYSTEM_CLOCK
#define clock_gettime clock_get_time
#endif
#if ON_WINDOWS
@ -97,7 +83,7 @@
#define PERF_COUNT_END \
QueryPerformanceCounter(&time_funct_end); \
elapsed_time_sec = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); \
tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
#else
#define PERF_COUNT_START(FUNCT_NAME)
#define PERF_COUNT_END
@ -107,7 +93,7 @@
#define PERF_COUNT_SUB(SUB) \
QueryPerformanceCounter(&time_sub_end); \
elapsed_time_sec = (time_sub_end.QuadPart-time_sub_start.QuadPart)/(double)(freq.QuadPart); \
tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
time_sub_start = time_sub_end;
#else
#define PERF_COUNT_SUB(SUB)
@ -129,7 +115,7 @@
#define PERF_COUNT_END \
clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); \
elapsed_time_sec = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; \
tprintf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec);
#else
#define PERF_COUNT_START(FUNCT_NAME)
#define PERF_COUNT_END
@ -139,7 +125,7 @@
#define PERF_COUNT_SUB(SUB) \
clock_gettime( CLOCK_MONOTONIC, &time_sub_end ); \
elapsed_time_sec = (time_sub_end.tv_sec - time_sub_start.tv_sec)*1.0 + (time_sub_end.tv_nsec - time_sub_start.tv_nsec)/1000000000.0; \
tprintf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \
time_sub_start = time_sub_end;
#else
#define PERF_COUNT_SUB(SUB)
@ -151,9 +137,6 @@
**************************************************************************/
#ifdef USE_OPENCL
#define USE_DEVICE_SELECTION 1
#include "opencl_device_selection.h"
#ifndef strcasecmp
@ -251,7 +234,6 @@ public:
static int InitEnv(); // load dll, call InitOpenclRunEnv(0)
static int InitOpenclRunEnv( int argc ); // RegistOpenclKernel, double flags, compile kernels
static int InitOpenclRunEnv_DeviceSelection( int argc ); // RegistOpenclKernel, double flags, compile kernels
static int InitOpenclRunEnv( GPUEnv *gpu ); // select device by env_CPU or selector
static int RegistOpenclKernel();
static int ReleaseOpenclRunEnv();
static int ReleaseOpenclEnv( GPUEnv *gpuInfo );
@ -271,33 +253,33 @@ public:
static TIFF* fopenTiffCl(FILE *fp,const char *modestring);
/* OpenCL implementations of Morphological operations*/
// Initialization of OCL buffers used in Morph operations
static int initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs);
static void releaseMorphCLBuffers();
// OpenCL implementation of Morphology Dilate
static PIX* pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy);
// OpenCL implementation of Morphology Erode
static PIX* pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy);
// OpenCL implementation of Morphology Close
static PIX* pixCloseBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy);
// OpenCL implementation of Morphology Open
static PIX* pixOpenBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy);
// OpenCL implementation of Subtract
static PIX* pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy);
// OpenCL implementation of Morphology (Hollow = Closed - Open)
static PIX* pixHollowCL(PIX *pixd, PIX *pixs, l_int32 close_hsize, l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize, bool reqDataCopy);
static void pixGetLinesCL(PIX *pixd, PIX *pixs,
PIX** pix_vline, PIX** pix_hline,
static void pixGetLinesCL(PIX *pixd, PIX *pixs,
PIX** pix_vline, PIX** pix_hline,
PIX** pixClosed, bool getpixClosed,
l_int32 close_hsize, l_int32 close_vsize,
l_int32 close_hsize, l_int32 close_vsize,
l_int32 open_hsize, l_int32 open_vsize,
l_int32 line_hsize, l_int32 line_vsize);
@ -320,12 +302,11 @@ public:
static void FreeOpenclDll();
#endif
//int GetOpenclState();
//void SetOpenclState( int state );
inline static int AddKernelConfig( int kCount, const char *kName );
/* for binarization */
static void HistogramRectOCL(
static int HistogramRectOCL(
const unsigned char *imagedata,
int bytes_per_pixel,
int bytes_per_line,
@ -335,7 +316,8 @@ public:
int height,
int kHistogramSize,
int *histogramAllChannels);
static void ThresholdRectToPixOCL(
static int ThresholdRectToPixOCL(
const unsigned char* imagedata,
int bytes_per_pixel,
int bytes_per_line,
@ -346,11 +328,12 @@ public:
int rect_width,
int rect_top,
int rect_left);
#if USE_DEVICE_SELECTION
static Pix * pixConvertRGBToGrayOCL( Pix *pix, float weightRed = 0.3, float weightGreen = 0.5, float weightBlue = 0.2 );
static ds_device getDeviceSelection();
static ds_device selectedDevice;
static bool deviceIsSelected;
#endif
static bool selectedDeviceIsOpenCL();
static bool selectedDeviceIsNativeCPU();