Merge pull request #11409 from tomoaki0705/fixCLAHEfailure

Arm: fix the test failure of OCL_Imgproc/CLAHETest.Accuracy on ODROID-XU4 (#11409) * fix the test failure of OCL_Imgproc/CLAHETest.Accuracy on ODROID-XU4 * avoid the race condition in the reduce * imgproc(ocl): simplify CLAHE code * remove unused class
2025-08-05 22:19:14 +08:00 · 2018-04-27 22:41:56 +09:00 · 2018-04-27 22:41:56 +09:00 · 87a4f4ab3a
commit 87a4f4ab3a
parent 71d406b40c
2 changed files with 17 additions and 68 deletions
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@ -54,16 +54,7 @@ namespace clahe
        const int tilesX, const int tilesY, const cv::Size tileSize,
        const int clipLimit, const float lutScale)
    {
-        cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
-
-        bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
-        cv::String opts;
-        if(is_cpu)
-            opts = "-D CPU ";
-        else
-            opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple());
-
-        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts);
+        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
        if(k.empty())
            return false;

--- a/modules/imgproc/src/opencl/clahe.cl
+++ b/modules/imgproc/src/opencl/clahe.cl
@ -43,10 +43,6 @@
 //
 //M*/

-#ifndef WAVE_SIZE
-#define WAVE_SIZE 1
-#endif
-
 inline int calc_lut(__local int* smem, int val, int tid)
 {
    smem[tid] = val;
@ -60,8 +56,7 @@ inline int calc_lut(__local int* smem, int val, int tid)
    return smem[tid];
 }

-#ifdef CPU
-inline void reduce(volatile __local int* smem, int val, int tid)
+inline int reduce(__local volatile int* smem, int val, int tid)
 {
    smem[tid] = val;
    barrier(CLK_LOCAL_MEM_FENCE);
@ -75,69 +70,39 @@ inline void reduce(volatile __local int* smem, int val, int tid)
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 32)
+    {
        smem[tid] += smem[tid + 32];
+    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 16)
+    {
        smem[tid] += smem[tid + 16];
+    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 8)
+    {
        smem[tid] += smem[tid + 8];
+    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 4)
+    {
        smem[tid] += smem[tid + 4];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 2)
-        smem[tid] += smem[tid + 2];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 1)
-        smem[256] = smem[tid] + smem[tid + 1];
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-#else
-
-inline void reduce(__local volatile int* smem, int val, int tid)
-{
-    smem[tid] = val;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 128)
-        smem[tid] = val += smem[tid + 128];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 64)
-        smem[tid] = val += smem[tid + 64];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 32)
-    {
-        smem[tid] += smem[tid + 32];
-#if WAVE_SIZE < 32
-    } barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 16)
-    {
-#endif
-        smem[tid] += smem[tid + 16];
-#if WAVE_SIZE < 16
    }
    barrier(CLK_LOCAL_MEM_FENCE);

-    if (tid < 8)
+    if (tid == 0)
    {
-#endif
-        smem[tid] += smem[tid + 8];
-        smem[tid] += smem[tid + 4];
-        smem[tid] += smem[tid + 2];
-        smem[tid] += smem[tid + 1];
+        smem[0] = (smem[0] + smem[1]) + (smem[2] + smem[3]);
    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    val = smem[0];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return val;
 }
-#endif

 __kernel void calcLut(__global __const uchar * src, const int srcStep,
                      const int src_offset, __global uchar * lut,
@ -179,14 +144,7 @@ __kernel void calcLut(__global __const uchar * src, const int srcStep,
        }

        // find number of overall clipped samples
-        reduce(smem, clipped, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef CPU
-        clipped = smem[256];
-#else
-        clipped = smem[0];
-#endif
-        barrier(CLK_LOCAL_MEM_FENCE);
+        clipped = reduce(smem, clipped, tid);

        // redistribute clipped samples evenly
        int redistBatch = clipped / 256;