dnn(ocl): fix automatic globalsize adjusting

- if kernel code doesn't support that
2025-06-10 19:24:07 +08:00 · 2021-09-04 16:27:51 +00:00 · 2021-09-04 16:27:51 +00:00 · 5578ad5e14
commit 5578ad5e14
parent d11f0a709d
9 changed files with 50 additions and 16 deletions
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@ -562,7 +562,9 @@ public:
        i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11);
        i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this;
    }
-    /** @brief Run the OpenCL kernel.
+
    /** @brief Run the OpenCL kernel (globalsize value may be adjusted)
    @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
    @param globalsize work items for each dimension. It is not the final globalsize passed to
      OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
@ -571,12 +573,26 @@ public:
    @param localsize work-group size for each dimension.
    @param sync specify whether to wait for OpenCL computation to finish before return.
    @param q command queue
    @note Use run_() if your kernel code doesn't support adjusted globalsize.
    */
    bool run(int dims, size_t globalsize[],
             size_t localsize[], bool sync, const Queue& q=Queue());
    /** @brief Run the OpenCL kernel
     *
     * @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
     * @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
     * @param localsize work-group size for each dimension.
     * @param sync specify whether to wait for OpenCL computation to finish before return.
     * @param q command queue
     */
    bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
    bool runTask(bool sync, const Queue& q=Queue());
-    /** @brief Similar to synchronized run() call with returning of kernel execution time
+    /** @brief Similar to synchronized run_() call with returning of kernel execution time
     *
     * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
     * @return Execution time in nanoseconds or negative number on error
     */
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -3160,6 +3160,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
 }
 bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
                  bool sync, const Queue& q)
 {
    CV_Assert(p);
    return p->run(dims, _globalsize, _localsize, sync, NULL, q);
 }
 static bool isRaiseErrorOnReuseAsyncKernel()
 {
    static bool initialized = false;
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@ -231,7 +231,7 @@ public:
                kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
                kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
                kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
-                bool ret = kernel.run(2, global, NULL, false);
+                bool ret = kernel.run_(2, global, NULL, false);
                if (!ret)
                    return false;
            }
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@ -191,7 +191,7 @@ public:
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
            k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
-            ret = k1.run(1, globalsize, localsize, false);
+            ret = k1.run_(1, globalsize, localsize, false);
            if (!ret)
                return false;
        }
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -482,7 +482,7 @@ public:
                    ocl::KernelArg::PtrReadOnly(input),
                    ocl::KernelArg::PtrWriteOnly(output)
                )
-                .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
+                .run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
            if (!ret)
                return false;
        }  // for outputs.size()
--- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp
+++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp
@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
                .args(
                    ocl::KernelArg::PtrReadOnly(buffer),
                    image, offset,
                    padded_width, padded_height,
                    width, height,
                    ld)
                .run(2, global_copy, NULL, false);
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@ -769,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
            swizzled_factor
        );
-        size_t global_work_size_copy[3] = {
+        size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };
            (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
-        if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
+        if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
        {
-            std::cout << "Swizzle kernel run failed." << std::endl;
+            CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
            return false;
        }
    } else {
@ -937,7 +936,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
            kernel.set(argIdx++, (uint16_t)height_);
            kernel.set(argIdx++, (uint16_t)output_w_);
            kernel.set(argIdx++, (uint16_t)output_h_);
-            if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
+            if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
            {
                std::cout << "IDLF kernel run failed." << std::endl;
                return false;
@ -1056,7 +1055,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
            gy = alignSize(gy, blockK);
            size_t global_size[3] = { gx, gy, config->global_work_size[2] };
-            if (!kernel.run(3, global_size, config->local_work_size, false))
+            if (!kernel.run_(3, global_size, config->local_work_size, false))
            {
                std::cout << "GEMM like kernel run failed." << std::endl;
                return false;
@ -1085,9 +1084,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
        global_size[1] = output_h_;
        global_size[2] = num_output_ * num_;
-        if (!kernel.run(3, global_size, NULL, false))
+        if (!kernel.run_(3, global_size, NULL, false))
        {
-            std::cout << "DWCONV kernel run failed." << std::endl;
+            CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
            return false;
        }
    } else {
@ -1127,11 +1126,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                kernel.set(argIdx++, (uint16_t)output_h_);
                kernel.set(argIdx++, (uint16_t)pad_w_);
                kernel.set(argIdx++, (uint16_t)pad_h_);
-                if (!kernel.run(3, config->global_work_size,
+                if (!kernel.run_(3, config->global_work_size,
                                (config->use_null_local) ? NULL : config->local_work_size,
                                false))
                {
-                    std::cout << "Basic kernel run failed." << std::endl;
+                    CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
                    return false;
                }
            }
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
            oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
            oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
        }
-        ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false);
+        ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
    }
    return ret;
 }
--- a/modules/dnn/src/opencl/gemm_image.cl
+++ b/modules/dnn/src/opencl/gemm_image.cl
@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
 {
    const int gidx = get_global_id(0);
    const int gidy = get_global_id(1);
    if (gidx >= width || gidy >= height)
        return;
    int2 coord_dst = (int2)(gidx, gidy);
    __global Dtype* A_off = A + offA;
    Dtype srcA = A_off[gidy * ldA + gidx];
@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
    __global Dtype* A,
    __write_only image2d_t ImA,
    int offA,
    int padded_width,
    int padded_height,
    int width,
    int height,
    int ldA)
 {
    const int gidx = get_global_id(0);
    const int gidy = get_global_id(1);
    if (gidx >= padded_width || gidy >= padded_height)
        return;
    int2 coord_dst = (int2)(gidx, gidy);
 #if TYPE == TYPE_HALF
    if (gidx >= width || gidy >= height) {