Fix an issue with Kernel object reset/release when Kernel::run is called consecutively

Kernel::run launches OCL GPU kernels and sets an event callback function to decrement the ref count of a UMat, or remove the UMat, when the launched workloads are completed. However, some OCL kernels require multiple calls of the Kernel::run function with some kernel parameter changes (e.g., input and output buffer offsets) to get the final computation result. In that case, the current implementation performs unnecessary synchronization and cleanupUMats calls. This fix requires the user to specify whether there will be more work or not. If there is no remaining computation, Kernel::run will reset the kernel object.

Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
2025-08-06 06:26:29 +08:00 · 2017-01-25 15:28:32 -08:00 · 2017-01-25 15:28:32 -08:00 · cc7f9f5469
commit cc7f9f5469
parent 12569dc730
2 changed files with 15 additions and 8 deletions
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@ -578,9 +578,17 @@ public:
    @param localsize work-group size for each dimension.
    @param sync specify whether to wait for OpenCL computation to finish before return.
    @param q command queue
+    @param moreWorkDone specify whether there will be remaining work to be computed (more Kernel::run calls).
+    Use this when a computation requires multiple kernel executions, with the input and output buffer offsets
+    changed between them, to get the final computation results. For example:
+    kernel.setArg(0, ..);
+    kernel.setArg(1, offset);
+    kernel.run(..., q, true);
+    kernel.setArg(1, offset+256);
+    kernel.run(..., q, false);
    */
    bool run(int dims, size_t globalsize[],
-             size_t localsize[], bool sync, const Queue& q=Queue());
+             size_t localsize[], bool sync, const Queue& q=Queue(), bool moreWorkDone = false);
    bool runTask(bool sync, const Queue& q=Queue());

    size_t workGroupSize() const;
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -3185,7 +3185,7 @@ struct Kernel::Impl

    void cleanupUMats()
    {
-        for( int i = 0; i < MAX_ARRS; i++ )
+        for( int i = 0; i < nu; i++ )
            if( u[i] )
            {
                if( CV_XADD(&u[i]->urefcount, -1) == 1 )
@ -3446,9 +3446,8 @@ int Kernel::set(int i, const KernelArg& arg)
    return i+1;
 }

-
 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
-                 bool sync, const Queue& q)
+                 bool sync, const Queue& q, bool moreWorkDone)
 {
    CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str());

@ -3469,11 +3468,11 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
    }
    if( total == 0 )
        return true;
-    if( p->haveTempDstUMats )
+    if( p->haveTempDstUMats && !moreWorkDone)
        sync = true;
    cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
                                           offset, globalsize, _localsize, 0, 0,
-                                           sync ? 0 : &p->e);
+                                           sync ? 0 : (moreWorkDone? 0: &p->e ));
 #if CV_OPENCL_SHOW_RUN_ERRORS
    if (retval != CL_SUCCESS)
    {
@ -3484,9 +3483,9 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
    if( sync || retval != CL_SUCCESS )
    {
        CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
-        p->cleanupUMats();
+        if (!moreWorkDone) p->cleanupUMats();
    }
-    else
+    else if (!moreWorkDone)
    {
        p->addref();
        CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);