Merge pull request #20840 from alalek:dnn_ocl_cleanup_code

2025-08-06 14:36:36 +08:00 · 2021-10-08 05:07:51 +00:00 · 2021-10-08 05:07:51 +00:00 · 81e7988eb9
commit 81e7988eb9
parent 05348f3250 8c2dd5fb9a
3 changed files with 42 additions and 99 deletions
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@ -222,8 +222,6 @@ class OCL4DNNConvSpatial
        bool createDWConvKernel(int32_t blockWidth,
                                int32_t blockHeight,
                                int32_t blockDepth);
-        void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
-                             int32_t offset, int32_t size, bool write_only);
        bool convolve(const UMat &bottom, UMat &top,
                      const UMat &weight, const UMat &bias,
                      int32_t numImages,
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@ -219,14 +219,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
 #endif
        if (!use_cache_path_)
        {
-            static int warn_ = 0;
-            if (!warn_)
-            {
-                std::cerr
-                    << "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl
-                    << std::endl;
-                warn_ = true;
-            }
+            CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_);
        }
    }

@ -419,7 +412,6 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
        addDef("CHANNELS", channels_ / group_);
        addDef("APPLY_BIAS", bias_term_);
        addDef("OUTPUT_Z", M_);
-        addDef("ZPAR", 1);
        setFusionDefine(fused_activ_, fused_eltwise_);

        src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
@ -673,8 +665,7 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
                      int r, int c, int interleavedRows, int nonInterleavedRows,
                      int blockWidth, int rowAlignment )
 {
-    CHECK_EQ(interleavedRows % 2, 0) <<
-             "interleaveMatrix only supports even values for interleavedRows.";
+    CV_Check(interleavedRows, interleavedRows % 2 == 0, "interleaveMatrix only supports even values for interleavedRows.");

    size_t memSize = r * c * sizeof(float);
    size_t dstSize = memSize *
@ -686,9 +677,12 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
    const int yStride = c * 2;
    const Dtype *pSrc = mem;
    Dtype* pDst = mem_dst;
-    for (int y = 0; y < r;) {
-        for (int rows = 0; rows < interleavedRows; rows += 2) {
-            if ( y >= r ) break;
+    for (int y = 0; y < r;)
+    {
+        for (int rows = 0; rows < interleavedRows; rows += 2)
+        {
+            if (y >= r)
+                break;
            if ((c % xStride) == 0) {
                for (int x = 0; x < c / xStride; x++) {
                    memcpy(pDst + x * xStride * 2,                         // NOLINT
@ -713,11 +707,14 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
            y += 2;
        }

-        for (int rows = 0; rows < nonInterleavedRows; rows++) {
-            if (y >= r) break;
+        for (int rows = 0; rows < nonInterleavedRows; rows++)
+        {
+            if (y >= r)
+                break;
            const int stride = rowAlignment;
            int remaining = c;
-            for (int x = 0; x < c; x += stride) {
+            for (int x = 0; x < c; x += stride)
+            {
                if (remaining >= stride) {
                    memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype));    // NOLINT
                    remaining -=stride;
@ -853,34 +850,6 @@ bool OCL4DNNConvSpatial<float>::createBasicKernel(int32_t blockWidth,
        return false;
 }

-template<>
-void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
-                                                int32_t offset, int32_t size, bool write_only)
-{
-    cl_mem sub_mem;
-    cl_buffer_region region;
-    cl_int err;
-    size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
-
-    region.origin = offset * element_size + buffer.offset;
-    region.size = size * element_size;
-    sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
-                                write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
-                                CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
-    if (err)
-    {
-        std::cout << "Failed to create sub buffer." << std::endl;
-        return;
-    }
-
-    int step = element_size, rows = size, cols = 1;
-    ocl::convertFromBuffer(sub_mem, step, rows, cols,
-                           (use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
-
-    //decrease ocl mem refcount
-    clReleaseMemObject(sub_mem);
-}
-
 template<>
 bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                                         const UMat &weight, const UMat &bias,
@ -939,7 +908,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
            kernel.set(argIdx++, (uint16_t)output_h_);
            if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
            {
-                std::cout << "IDLF kernel run failed." << std::endl;
+                CV_LOG_ERROR(NULL, "DNN/OpenCL: IDLF kernel run failed");
                return false;
            }
        }
@ -1013,7 +982,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,

            if (!kernel.run_(3, global_size, config->local_work_size, false))
            {
-                std::cout << "GEMM like kernel run failed." << std::endl;
+                CV_LOG_ERROR(NULL, "DNN/OpenCL: GEMM like kernel run failed");
                return false;
            }
        }
@ -1116,14 +1085,9 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
    {
        queue = cv::ocl::Queue::getDefault();
    }
-    catch (const cv::Exception&)
+    catch (const std::exception& e)
    {
-        static int warn_ = 0;
-        if (!warn_)
-        {
-            std::cout << "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning." << std::endl;
-            warn_ = true;
-        }
+        CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning: " << e.what());
        return 1e6;
    }

@ -1327,9 +1291,9 @@ ocl::Program OCL4DNNConvSpatial<Dtype>::compileKernel()
    phash.insert(std::pair<std::string, ocl::Program>(kernel_name_, program));
    if (!program.ptr())
    {
-        std::cout << "Failed to compile kernel: " << kernel_name_
-                  << ", buildflags: " << options
-                  << ", errmsg: " << errmsg << std::endl;
+        CV_LOG_WARNING(NULL, "DNN/OpenCL: Failed to compile kernel: " << kernel_name_
+            << ", buildflags: '" << options << "', errmsg: '" << errmsg << "'"
+        );
    }
    return program;
 }
@ -1755,7 +1719,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
                    fastestTime = kernelQueue[x]->executionTime;
                }
            }
-            if (fastestKernel < 0) break;
+            if (fastestKernel < 0)
+                break;
            // Test fastest kernel
            bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
            if (verified == true) {
@ -1914,17 +1879,18 @@ bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int typ
    {
        if (z == 1)
            z = 16;
-        CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
+        CV_Check(z, z == 16 || z == 8, "DNN/OpenCL: IDLF - invalid SIMD size");
    }
    kernelQueue.clear();
    createConvolutionKernel(type, x, y, z);
-    if (kernelQueue.size() != 1) {
-        std::cerr << "Failed setup kernel by config:"
+    if (kernelQueue.size() != 1)
+    {
+        CV_LOG_ERROR(NULL, "DNN/OpenCL: Failed setup kernel by config: "
            << " x = " << x
            << " y = " << y
            << " z = " << z
            << " type = " << type
-            << std::endl;
+        );
        return false;
    }
    bestKernelConfig = kernelQueue[0];
@ -1956,13 +1922,9 @@ bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
    {
        if (cache_path_.empty())
        {
-            static int warn_ = 0;
-            if (!warn_)
-            {
-                std::cout << "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory " << std::endl
-                          << "                 via OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
-                warn_ = true;
-            }
+            CV_LOG_ONCE_WARNING(NULL, "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory "
+                "through OPENCV_OCL4DNN_CONFIG_PATH parameter."
+            );
        }
        return false;
    }
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@ -161,23 +161,15 @@ __kernel void ConvolveBasic(
    const int out_idx = get_global_id(0);  // 1D task layout: [output_width * output_height * OUTPUT_Z]
    const int plane_size = output_width * output_height;
    const int out_plane_idx = out_idx % plane_size;
-    const int outputZ = out_idx / plane_size;
+    const int outputZ = out_idx / plane_size;  // kernelNum
    const int outputY = out_plane_idx / output_width;
    const int outputX = out_plane_idx % output_width;
-    const int kernelNum = outputZ * ZPAR;
-    if (kernelNum < OUTPUT_Z)
+    if (outputZ < OUTPUT_Z)
    {
-        Dtype sum[ZPAR];
-        for (int kern = 0; kern < ZPAR; kern++)
-        {
-            sum[kern] = 0.0f;
-        }
+        Dtype sum = 0.0f;
        const int org_y = outputY * STRIDE_Y - pad_h;
        const int org_x = outputX * STRIDE_X - pad_w;
-        const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
-#if APPLY_BIAS
-        const int biasIndex = bias_offset + kernelNum;
-#endif
+        const int currentKernelOffset = kernel_offset + outputZ*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
        const int local_image_offset = org_y * input_width + org_x;
        const int imageSize = input_width * input_height;
        __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
@ -186,17 +178,13 @@ __kernel void ConvolveBasic(
        {
            for (int y = 0; y < KERNEL_HEIGHT; y++)
            {
+                int y_ = org_y + y * DILATION_Y;
                for (int x = 0; x < KERNEL_WIDTH; x++)
                {
-                    int y_ = org_y + y * DILATION_Y;
                    int x_ = org_x + x * DILATION_X;
-                    if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
+                    if (y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width)
                    {
-                        continue;
-                    }
-                    for (int kern = 0; kern < ZPAR; kern++)
-                    {
-                        sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
+                        sum = mad(image_dataPtr[x * DILATION_X], kernel_dataPtr[x], sum);
                    }
                }
                image_dataPtr += input_width * DILATION_Y;
@ -205,18 +193,13 @@ __kernel void ConvolveBasic(
            image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
        }

-        for (int kern = 0; kern < ZPAR; kern++)
-        {
-            if (kernelNum + kern < OUTPUT_Z)
-            {
-                int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
+        int offset = convolved_image_offset + out_idx;
 #if APPLY_BIAS
-                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);
+        int biasIndex = bias_offset + outputZ;
+        ACTIVATION_FUNCTION(convolved_image, offset, sum + bias[biasIndex], biasIndex);
 #else
-                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], kernelNum + kern);
+        ACTIVATION_FUNCTION(convolved_image, offset, sum, outputZ);
 #endif
-            }
-        }
    }
 }