dnn(ocl): fix buffer offsets in IDLF kernel

- drop CreateSubBuffer - fix FUSED_CONV_ELTWISE mode
2025-07-26 07:07:37 +08:00 · 2021-09-04 04:35:00 +00:00 · 2021-09-04 04:35:00 +00:00 · 407adc7061
commit 407adc7061
parent 7c23ec90a9
3 changed files with 41 additions and 72 deletions
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
        void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
                                      int blockM, int blockK, int simd_size);
        void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
-        void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
+        void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);
        int32_t group_;
        bool bias_term_;
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
 }
 template<typename Dtype>
-void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
+void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
 {
    if (fused_eltwise)
-        kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
+    {
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
        if (fused_eltwise_offset >= 0)
            kernel.set(argIdx++, fused_eltwise_offset);
    }
    switch (fused_activ) {
        case OCL4DNN_CONV_FUSED_ACTIV_RELU:
            kernel.set(argIdx++, (float)negative_slope_);
            break;
        case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
-            kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
            break;
        case OCL4DNN_CONV_FUSED_ACTIV_POWER:
            kernel.set(argIdx++, (float)power_);
@ -895,10 +899,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
    if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
        if (!swizzleWeight(weight, config->workItem_output[2], false))
            return false;
 #if 0
        size_t total_bottom_size = bottom_dim_ * numImages;
        size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
        size_t total_bias_size = M_ * group_;
        size_t total_top_size = top_dim_ * numImages;
 #endif
        for (int32_t g = 0; g < group_; ++g) {
            bias_offset = M_ * g;
            int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
@ -910,72 +916,22 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                return false;
            cl_uint argIdx = 0;
-            setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+            setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
-            UMat img_buffer;
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            if (image_offset)
+            kernel.set(argIdx++, image_offset);
            {
                CreateSubBuffer(bottom, img_buffer, image_offset,
                                total_bottom_size - image_offset, false);
                if (img_buffer.empty())
                    return false;
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
-            }
+            kernel.set(argIdx++, kernel_offset);
            else
            {
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
            }
            UMat kernel_buffer;
            if (kernel_offset)
            {
                CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
                                total_kernel_size - kernel_offset, false);
                if (kernel_buffer.empty())
                    return false;
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
            }
            else
            {
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
            }
            UMat bias_buffer;
            if (bias_term_)
            {
-                if (bias_offset)
+                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
-                {
+                kernel.set(argIdx++, bias_offset);
                    CreateSubBuffer(bias, bias_buffer, bias_offset,
                                    total_bias_size - bias_offset, false);
                    if (bias_buffer.empty())
                        return false;
                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
                }
                else
                {
                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
                }
            }
-            UMat out_buffer;
+            kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-            if (output_image_offset)
+            kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
            {
                CreateSubBuffer(top, out_buffer, output_image_offset,
                                total_top_size - output_image_offset, true);
                if (out_buffer.empty())
                    return false;
                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
            }
            else
            {
                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
                kernel.set(argIdx++, (int)(top.offset / element_size));
            }
            kernel.set(argIdx++, (uint16_t)width_);
            kernel.set(argIdx++, (uint16_t)height_);
@ -1005,7 +961,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                return false;
            cl_uint argIdx = 0;
-            setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+            setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
            UMat img_buffer;
            if (image_offset)
@ -1112,7 +1068,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
            return false;
        cl_uint argIdx = 0;
-        setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+        setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
        if (bias_term_)
@ -1152,7 +1108,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                    return false;
                cl_uint argIdx = 0;
-                setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+                setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
                kernel.set(argIdx++, image_offset);
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@ -74,18 +74,22 @@
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
 } while(0)
 #define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
 #define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
 #else
 #define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
    const Dtype _x_ = (_data_); \
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
 } while(0)
 #define ELTWISE_DATA_ARG
 #define ELTWISE_DATA_ARG_WITH_OFFSET
 #endif
 #if APPLY_BIAS
 #define BIAS_KERNEL_ARG __global Dtype * biases_base,
 #define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
 #else
 #define BIAS_KERNEL_ARG
 #define BIAS_KERNEL_ARG_WITH_OFFSET
 #endif
 #define __CAT(x, y) x##y
@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
 __kernel void
 convolve_simd(
-    ELTWISE_DATA_ARG
+    ELTWISE_DATA_ARG_WITH_OFFSET
    FUSED_ARG
-    __global Dtype* inputs,
+    __global Dtype* inputs_ptr, const int inputs_offset,
-    __global Dtype* weights,
+    __global Dtype* weights_ptr, const int weights_offset,
-    BIAS_KERNEL_ARG
+    BIAS_KERNEL_ARG_WITH_OFFSET
-    __global Dtype* outputs_base,
+    __global Dtype* outputs_base, const int outputs_offset,
    const int outputs_offset,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height)
 {
  __global Dtype* inputs = inputs_ptr + inputs_offset;
  __global Dtype* weights = weights_ptr + weights_offset;
 #if APPLY_BIAS
  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
 #endif
  __global Dtype* outputs = outputs_base + outputs_offset;
 #ifdef FUSED_CONV_ELTWISE
  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
 #endif
  unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
  unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
  unsigned int fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth