mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 06:26:29 +08:00
Merge pull request #9604 from alalek:ocl_kernel_profiling
This commit is contained in:
commit
af8fc3e209
@ -333,8 +333,12 @@ public:
|
||||
void* ptr() const;
|
||||
static Queue& getDefault();
|
||||
|
||||
/// @brief Returns OpenCL command queue with enable profiling mode support
|
||||
const Queue& getProfilingQueue() const;
|
||||
|
||||
struct Impl; friend struct Impl;
|
||||
inline Impl* getImpl() const { return p; }
|
||||
protected:
|
||||
struct Impl;
|
||||
Impl* p;
|
||||
};
|
||||
|
||||
@ -569,6 +573,12 @@ public:
|
||||
size_t localsize[], bool sync, const Queue& q=Queue());
|
||||
bool runTask(bool sync, const Queue& q=Queue());
|
||||
|
||||
/** @brief Similar to synchronized run() call with returning of kernel execution time
|
||||
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
|
||||
* @return Execution time in nanoseconds or negative number on error
|
||||
*/
|
||||
int64 runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q=Queue());
|
||||
|
||||
size_t workGroupSize() const;
|
||||
size_t preferedWorkGroupSizeMultiple() const;
|
||||
bool compileWorkGroupSize(size_t wsz[]) const;
|
||||
|
@ -1840,9 +1840,35 @@ void initializeContextFromHandle(Context& ctx, void* platform, void* _context, v
|
||||
|
||||
struct Queue::Impl
|
||||
{
|
||||
Impl(const Context& c, const Device& d)
|
||||
inline void __init()
|
||||
{
|
||||
refcount = 1;
|
||||
handle = 0;
|
||||
isProfilingQueue_ = false;
|
||||
}
|
||||
|
||||
Impl(cl_command_queue q)
|
||||
{
|
||||
__init();
|
||||
handle = q;
|
||||
|
||||
cl_command_queue_properties props = 0;
|
||||
cl_int result = clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL);
|
||||
CV_Assert(result && "clGetCommandQueueInfo(CL_QUEUE_PROPERTIES)");
|
||||
isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
|
||||
}
|
||||
|
||||
Impl(cl_command_queue q, bool isProfilingQueue)
|
||||
{
|
||||
__init();
|
||||
handle = q;
|
||||
isProfilingQueue_ = isProfilingQueue;
|
||||
}
|
||||
|
||||
Impl(const Context& c, const Device& d, bool withProfiling = false)
|
||||
{
|
||||
__init();
|
||||
|
||||
const Context* pc = &c;
|
||||
cl_context ch = (cl_context)pc->ptr();
|
||||
if( !ch )
|
||||
@ -1854,8 +1880,10 @@ struct Queue::Impl
|
||||
if( !dh )
|
||||
dh = (cl_device_id)pc->device(0).ptr();
|
||||
cl_int retval = 0;
|
||||
handle = clCreateCommandQueue(ch, dh, 0, &retval);
|
||||
cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
|
||||
handle = clCreateCommandQueue(ch, dh, props, &retval);
|
||||
CV_OclDbgAssert(retval == CL_SUCCESS);
|
||||
isProfilingQueue_ = withProfiling;
|
||||
}
|
||||
|
||||
~Impl()
|
||||
@ -1873,9 +1901,37 @@ struct Queue::Impl
|
||||
}
|
||||
}
|
||||
|
||||
const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
|
||||
{
|
||||
if (isProfilingQueue_)
|
||||
return self;
|
||||
|
||||
if (profiling_queue_.ptr())
|
||||
return profiling_queue_;
|
||||
|
||||
cl_context ctx = 0;
|
||||
CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
|
||||
|
||||
cl_device_id device = 0;
|
||||
CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
|
||||
|
||||
cl_int result = CL_SUCCESS;
|
||||
cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
|
||||
cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
|
||||
CV_Assert(result == CL_SUCCESS && "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
|
||||
|
||||
Queue queue;
|
||||
queue.p = new Impl(q, true);
|
||||
profiling_queue_ = queue;
|
||||
|
||||
return profiling_queue_;
|
||||
}
|
||||
|
||||
IMPLEMENT_REFCOUNTABLE();
|
||||
|
||||
cl_command_queue handle;
|
||||
bool isProfilingQueue_;
|
||||
cv::ocl::Queue profiling_queue_;
|
||||
};
|
||||
|
||||
Queue::Queue()
|
||||
@ -1929,6 +1985,12 @@ void Queue::finish()
|
||||
}
|
||||
}
|
||||
|
||||
const Queue& Queue::getProfilingQueue() const
|
||||
{
|
||||
CV_Assert(p);
|
||||
return p->getProfilingQueue(*this);
|
||||
}
|
||||
|
||||
void* Queue::ptr() const
|
||||
{
|
||||
return p ? p->handle : 0;
|
||||
@ -2032,6 +2094,9 @@ struct Kernel::Impl
|
||||
release();
|
||||
}
|
||||
|
||||
bool run(int dims, size_t _globalsize[], size_t _localsize[],
|
||||
bool sync, int64* timeNS, const Queue& q);
|
||||
|
||||
~Impl()
|
||||
{
|
||||
if(handle)
|
||||
@ -2259,19 +2324,15 @@ int Kernel::set(int i, const KernelArg& arg)
|
||||
return i+1;
|
||||
}
|
||||
|
||||
|
||||
bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
|
||||
bool sync, const Queue& q)
|
||||
{
|
||||
CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str());
|
||||
|
||||
if(!p || !p->handle || p->isInProgress)
|
||||
if (!p)
|
||||
return false;
|
||||
|
||||
cl_command_queue qq = getQueue(q);
|
||||
size_t globalsize[CV_MAX_DIM] = {1,1,1};
|
||||
size_t total = 1;
|
||||
CV_Assert(_globalsize != 0);
|
||||
CV_Assert(_globalsize != NULL);
|
||||
for (int i = 0; i < dims; i++)
|
||||
{
|
||||
size_t val = _localsize ? _localsize[i] :
|
||||
@ -2283,12 +2344,28 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
|
||||
globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
|
||||
}
|
||||
CV_Assert(total > 0);
|
||||
if( p->haveTempDstUMats )
|
||||
|
||||
return p->run(dims, globalsize, _localsize, sync, NULL, q);
|
||||
}
|
||||
|
||||
|
||||
bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
|
||||
bool sync, int64* timeNS, const Queue& q)
|
||||
{
|
||||
CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str());
|
||||
|
||||
if (!handle || isInProgress)
|
||||
return false;
|
||||
|
||||
cl_command_queue qq = getQueue(q);
|
||||
if (haveTempDstUMats)
|
||||
sync = true;
|
||||
if (timeNS)
|
||||
sync = true;
|
||||
cl_event asyncEvent = 0;
|
||||
cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
|
||||
NULL, globalsize, _localsize, 0, 0,
|
||||
sync ? 0 : &asyncEvent);
|
||||
cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
|
||||
NULL, globalsize, localsize, 0, 0,
|
||||
(sync && !timeNS) ? 0 : &asyncEvent);
|
||||
#if CV_OPENCL_SHOW_RUN_ERRORS
|
||||
if (retval != CL_SUCCESS)
|
||||
{
|
||||
@ -2296,16 +2373,31 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
|
||||
fflush(stdout);
|
||||
}
|
||||
#endif
|
||||
if( sync || retval != CL_SUCCESS )
|
||||
if (sync || retval != CL_SUCCESS)
|
||||
{
|
||||
CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
|
||||
p->cleanupUMats();
|
||||
if (timeNS)
|
||||
{
|
||||
if (retval == CL_SUCCESS)
|
||||
{
|
||||
clWaitForEvents(1, &asyncEvent);
|
||||
cl_ulong startTime, stopTime;
|
||||
CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
|
||||
CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
|
||||
*timeNS = (int64)(stopTime - startTime);
|
||||
}
|
||||
else
|
||||
{
|
||||
*timeNS = -1;
|
||||
}
|
||||
}
|
||||
cleanupUMats();
|
||||
}
|
||||
else
|
||||
{
|
||||
p->addref();
|
||||
p->isInProgress = true;
|
||||
CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
|
||||
addref();
|
||||
isInProgress = true;
|
||||
CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this) == CL_SUCCESS);
|
||||
}
|
||||
if (asyncEvent)
|
||||
clReleaseEvent(asyncEvent);
|
||||
@ -2336,6 +2428,17 @@ bool Kernel::runTask(bool sync, const Queue& q)
|
||||
return retval == CL_SUCCESS;
|
||||
}
|
||||
|
||||
int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
|
||||
{
|
||||
CV_Assert(p && p->handle && !p->isInProgress);
|
||||
Queue q = q_.ptr() ? q_ : Queue::getDefault();
|
||||
CV_Assert(q.ptr());
|
||||
q.finish(); // call clFinish() on base queue
|
||||
Queue profilingQueue = q.getProfilingQueue();
|
||||
int64 timeNs = -1;
|
||||
bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
|
||||
return res ? timeNs : -1;
|
||||
}
|
||||
|
||||
size_t Kernel::workGroupSize() const
|
||||
{
|
||||
|
@ -759,15 +759,15 @@ OCL_FUNC_P(cl_mem, clCreateBuffer,
|
||||
|
||||
/*
|
||||
OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
|
||||
|
||||
*/
|
||||
OCL_FUNC(cl_int, clGetCommandQueueInfo,
|
||||
(cl_command_queue command_queue,
|
||||
cl_command_queue_info param_name,
|
||||
size_t param_value_size,
|
||||
void * param_value,
|
||||
size_t * param_value_size_ret),
|
||||
(command_queue, param_name, param_value_size, param_value, param_value_size_ret))
|
||||
|
||||
(cl_command_queue command_queue,
|
||||
cl_command_queue_info param_name,
|
||||
size_t param_value_size,
|
||||
void * param_value,
|
||||
size_t * param_value_size_ret),
|
||||
(command_queue, param_name, param_value_size, param_value, param_value_size_ret))
|
||||
/*
|
||||
OCL_FUNC_P(cl_mem, clCreateSubBuffer,
|
||||
(cl_mem buffer,
|
||||
cl_mem_flags flags,
|
||||
@ -1202,6 +1202,19 @@ OCL_FUNC(cl_int, clSetEventCallback,
|
||||
|
||||
OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
|
||||
|
||||
OCL_FUNC(cl_int, clWaitForEvents,
|
||||
(cl_uint num_events, const cl_event *event_list),
|
||||
(num_events, event_list))
|
||||
|
||||
|
||||
OCL_FUNC(cl_int, clGetEventProfilingInfo, (
|
||||
cl_event event,
|
||||
cl_profiling_info param_name,
|
||||
size_t param_value_size,
|
||||
void *param_value,
|
||||
size_t *param_value_size_ret),
|
||||
(event, param_name, param_value_size, param_value, param_value_size_ret))
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user