dnn(ocl4dnn): refactor pooling OpenCL calls

2025-07-20 19:17:36 +08:00 · 2017-11-23 20:10:53 +00:00 · 2017-11-23 20:10:53 +00:00 · e34b64c979
commit e34b64c979
parent f071a48ec7
3 changed files with 107 additions and 111 deletions
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@ -351,8 +351,6 @@ class OCL4DNNPool
                     UMat& top_data,
                     UMat& top_mask);
    private:
-        UMat mask_idx_;
-
        // Pooling parameters
        std::vector<int32_t> pad_;
        std::vector<int32_t> stride_;
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@ -88,7 +88,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
 template<typename Dtype>
 OCL4DNNPool<Dtype>::~OCL4DNNPool()
 {
-    mask_idx_.release();
+    // nothing
 }

 template<typename Dtype>
@ -99,99 +99,103 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
    bool ret = true;
    size_t global[] = { 128 * 128 };
    size_t local[] = { 128 };
-    cl_uint argIdx = 0;

    // support 2D case
    switch (pool_method_)
    {
    case LIBDNN_POOLING_METHOD_MAX:
        {
-            if (top_mask.empty() && mask_idx_.empty())
-            {
-                mask_idx_.create(1, count_, CV_32FC1);
-            }
-            ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+            bool haveMask = !top_mask.empty();
+            ocl::Kernel oclk_max_pool_forward(
+                haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d"
+                       " -D PAD_W=%d -D PAD_H=%d%s",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_,
+                       pad_w_, pad_h_,
+                       haveMask ? " -D HAVE_MASK=1" : ""
+                ));

            if (oclk_max_pool_forward.empty())
                return false;

-            argIdx = 0;
-            oclk_max_pool_forward.set(argIdx++, count_);
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            oclk_max_pool_forward.set(argIdx++, batch_size_);
-            oclk_max_pool_forward.set(argIdx++, channels_);
-            oclk_max_pool_forward.set(argIdx++, height_);
-            oclk_max_pool_forward.set(argIdx++, width_);
-            oclk_max_pool_forward.set(argIdx++, pooled_height_);
-            oclk_max_pool_forward.set(argIdx++, pooled_width_);
-            oclk_max_pool_forward.set(argIdx++, kernel_h_);
-            oclk_max_pool_forward.set(argIdx++, kernel_w_);
-            oclk_max_pool_forward.set(argIdx++, stride_h_);
-            oclk_max_pool_forward.set(argIdx++, stride_w_);
-            oclk_max_pool_forward.set(argIdx++, pad_h_);
-            oclk_max_pool_forward.set(argIdx++, pad_w_);
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-            oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
-            if (mask_idx_.empty())
-                oclk_max_pool_forward.set(argIdx++, (void *)NULL);
-            else
-                oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
+            oclk_max_pool_forward.args(
+                count_,
+                ocl::KernelArg::PtrReadOnly(bottom),
+                batch_size_,
+                channels_,
+                height_,
+                width_,
+                pooled_height_,
+                pooled_width_,
+                ocl::KernelArg::PtrWriteOnly(top),
+                ocl::KernelArg::PtrWriteOnly(top_mask)
+            );

            ret = oclk_max_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_AVE:
        {
+            CV_Assert(top_mask.empty());
+
            ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d"
+                       " -D PAD_W=%d -D PAD_H=%d",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_,
+                       pad_w_, pad_h_
+                ));

            if (oclk_ave_pool_forward.empty())
                return false;

-            argIdx = 0;
-            oclk_ave_pool_forward.set(argIdx++, count_);
-            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            oclk_ave_pool_forward.set(argIdx++, batch_size_);
-            oclk_ave_pool_forward.set(argIdx++, channels_);
-            oclk_ave_pool_forward.set(argIdx++, height_);
-            oclk_ave_pool_forward.set(argIdx++, width_);
-            oclk_ave_pool_forward.set(argIdx++, pooled_height_);
-            oclk_ave_pool_forward.set(argIdx++, pooled_width_);
-            oclk_ave_pool_forward.set(argIdx++, kernel_h_);
-            oclk_ave_pool_forward.set(argIdx++, kernel_w_);
-            oclk_ave_pool_forward.set(argIdx++, stride_h_);
-            oclk_ave_pool_forward.set(argIdx++, stride_w_);
-            oclk_ave_pool_forward.set(argIdx++, pad_h_);
-            oclk_ave_pool_forward.set(argIdx++, pad_w_);
-            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            oclk_ave_pool_forward.args(
+                count_,
+                ocl::KernelArg::PtrReadOnly(bottom),
+                batch_size_,
+                channels_,
+                height_,
+                width_,
+                pooled_height_,
+                pooled_width_,
+                ocl::KernelArg::PtrWriteOnly(top)
+            );

            ret = oclk_ave_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_STO:
        {
+            CV_Assert(top_mask.empty());
+
            ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_
+                ));
+

            if (oclk_sto_pool_forward.empty())
                return false;

-            argIdx = 0;
-            oclk_sto_pool_forward.set(argIdx++, count_);
-            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            oclk_sto_pool_forward.set(argIdx++, batch_size_);
-            oclk_sto_pool_forward.set(argIdx++, channels_);
-            oclk_sto_pool_forward.set(argIdx++, height_);
-            oclk_sto_pool_forward.set(argIdx++, width_);
-            oclk_sto_pool_forward.set(argIdx++, pooled_height_);
-            oclk_sto_pool_forward.set(argIdx++, pooled_width_);
-            oclk_sto_pool_forward.set(argIdx++, kernel_h_);
-            oclk_sto_pool_forward.set(argIdx++, kernel_w_);
-            oclk_sto_pool_forward.set(argIdx++, stride_h_);
-            oclk_sto_pool_forward.set(argIdx++, stride_w_);
-            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            oclk_sto_pool_forward.args(
+                count_,
+                ocl::KernelArg::PtrReadOnly(bottom),
+                batch_size_,
+                channels_,
+                height_,
+                width_,
+                pooled_height_,
+                pooled_width_,
+                ocl::KernelArg::PtrWriteOnly(top)
+            );

            ret = oclk_sto_pool_forward.run(1, global, local, false);
        }
--- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl
+++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl
@ -44,14 +44,23 @@
 #define TEMPLATE(name,type) CONCAT(name,type)
 #define Dtype float

-void TEMPLATE(max_pool_forward_impl, Dtype)(
+#if defined KERNEL_MAX_POOL
+
+__kernel void
+#ifdef HAVE_MASK
+    TEMPLATE(max_pool_forward_mask, Dtype)
+#else
+    TEMPLATE(max_pool_forward, Dtype)
+#endif
+(
    const int nthreads, __global const Dtype* bottom_data, const int num,
    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w,
-    __global Dtype* top_data,
-    const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask)
+    const int pooled_height, const int pooled_width,
+    __global Dtype* top_data
+#ifdef HAVE_MASK
+    , __global Dtype* mask
+#endif
+)
 {
  for (int index = get_global_id(0); index < nthreads;
      index += get_global_size(0))
@ -60,10 +69,10 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    const int hend = min(hstart + kernel_h, height);
-    const int wend = min(wstart + kernel_w, width);
+    int hstart = ph * STRIDE_H - PAD_H;
+    int wstart = pw * STRIDE_W - PAD_W;
+    const int hend = min(hstart + KERNEL_H, height);
+    const int wend = min(wstart + KERNEL_W, width);
    hstart = max(hstart, (int)0);
    wstart = max(wstart, (int)0);
    Dtype maxval = -FLT_MAX;
@ -79,38 +88,19 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
      }
    }
    top_data[index] = maxval;
-    if (!no_mask) {
-      if (use_mask == 1) {
-        mask[index] = maxidx;
-      } else {
-        top_mask[index] = maxidx;
-      }
-    }
+#ifdef HAVE_MASK
+    mask[index] = maxidx;
+#endif
  }
 }

-__kernel void TEMPLATE(max_pool_forward, Dtype)(
-    const int nthreads, __global const Dtype* bottom_data, const int num,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w,
-    __global Dtype* top_data,
-    const int use_mask, __global int* mask, __global Dtype* top_mask)
-{
-    TEMPLATE(max_pool_forward_impl, Dtype)(
-      nthreads, bottom_data, num, channels, height, width,
-      pooled_height, pooled_width, kernel_h,
-      kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false
-    );
-}
+#elif defined KERNEL_AVE_POOL

 __kernel void TEMPLATE(ave_pool_forward, Dtype)(
    const int nthreads, __global const Dtype* const bottom_data, const int num,
    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w, __global Dtype* top_data)
+    const int pooled_height, const int pooled_width,
+    __global Dtype* top_data)
 {
  for (int index = get_global_id(0); index < nthreads;
      index += get_global_size(0))
@ -120,10 +110,10 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
      const int ph = (index / pooled_width) % pooled_height;
      const int c = (index / pooled_width / pooled_height) % channels;
      const int n = index / pooled_width / pooled_height / channels;
-      int hstart = ph * stride_h - pad_h;
-      int wstart = pw * stride_w - pad_w;
-      int hend = min(hstart + kernel_h, height + pad_h);
-      int wend = min(wstart + kernel_w, width + pad_w);
+      int hstart = ph * STRIDE_H - PAD_H;
+      int wstart = pw * STRIDE_W - PAD_W;
+      int hend = min(hstart + KERNEL_H, height + PAD_H);
+      int wend = min(wstart + KERNEL_W, width + PAD_W);
      const int pool_size = (hend - hstart) * (wend - wstart);
      hstart = max(hstart, (int)0);
      wstart = max(wstart, (int)0);
@ -142,11 +132,12 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
  }
 }

+#elif defined KERNEL_STO_POOL
+
 __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
    const int nthreads, __global const Dtype* const bottom_data, const int num,
    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w,
+    const int pooled_height, const int pooled_width,
    __global Dtype* top_data)
 {
  for (int index = get_global_id(0); index < nthreads;
@ -156,10 +147,10 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
+    const int hstart = ph * STRIDE_H;
+    const int hend = min(hstart + KERNEL_H, height);
+    const int wstart = pw * STRIDE_W;
+    const int wend = min(wstart + KERNEL_W, width);
    // We set cumsum to be 0 to avoid divide-by-zero problems
    Dtype cumsum = FLT_MIN;
    Dtype cumvalues = 0.;
@ -168,10 +159,13 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
    // First pass: get sum
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+        Dtype v = bottom_slice[h * width + w];
+        cumsum += v;
+        cumvalues += v * v;
      }
    }
    top_data[index] = cumvalues / cumsum;
  }
 }
+
+#endif // KERNEL_*