Merge pull request #11409 from tomoaki0705/fixCLAHEfailure

Arm: fix the test failure of OCL_Imgproc/CLAHETest.Accuracy on ODROID-XU4 (#11409)

* fix the test failure of OCL_Imgproc/CLAHETest.Accuracy on ODROID-XU4
  * avoid the race condition in the reduce

* imgproc(ocl): simplify CLAHE code

* remove unused class
This commit is contained in:
Tomoaki Teshima 2018-04-27 22:41:56 +09:00 committed by Alexander Alekhin
parent 71d406b40c
commit 87a4f4ab3a
2 changed files with 17 additions and 68 deletions

View File

@ -54,16 +54,7 @@ namespace clahe
const int tilesX, const int tilesY, const cv::Size tileSize,
const int clipLimit, const float lutScale)
{
cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
cv::String opts;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple());
cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts);
cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
if(k.empty())
return false;

View File

@ -43,10 +43,6 @@
//
//M*/
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
inline int calc_lut(__local int* smem, int val, int tid)
{
smem[tid] = val;
@ -60,8 +56,7 @@ inline int calc_lut(__local int* smem, int val, int tid)
return smem[tid];
}
#ifdef CPU
inline void reduce(volatile __local int* smem, int val, int tid)
inline int reduce(__local volatile int* smem, int val, int tid)
{
smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE);
@ -75,69 +70,39 @@ inline void reduce(volatile __local int* smem, int val, int tid)
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
{
smem[tid] += smem[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem[tid] += smem[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem[tid] += smem[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem[tid] += smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
smem[tid] += smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
smem[256] = smem[tid] + smem[tid + 1];
barrier(CLK_LOCAL_MEM_FENCE);
}
#else
inline void reduce(__local volatile int* smem, int val, int tid)
{
smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
smem[tid] = val += smem[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
smem[tid] = val += smem[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
{
smem[tid] += smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
#endif
smem[tid] += smem[tid + 16];
#if WAVE_SIZE < 16
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
if (tid == 0)
{
#endif
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
smem[0] = (smem[0] + smem[1]) + (smem[2] + smem[3]);
}
barrier(CLK_LOCAL_MEM_FENCE);
val = smem[0];
barrier(CLK_LOCAL_MEM_FENCE);
return val;
}
#endif
__kernel void calcLut(__global __const uchar * src, const int srcStep,
const int src_offset, __global uchar * lut,
@ -179,14 +144,7 @@ __kernel void calcLut(__global __const uchar * src, const int srcStep,
}
// find number of overall clipped samples
reduce(smem, clipped, tid);
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU
clipped = smem[256];
#else
clipped = smem[0];
#endif
barrier(CLK_LOCAL_MEM_FENCE);
clipped = reduce(smem, clipped, tid);
// redistribute clipped samples evenly
int redistBatch = clipped / 256;