mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 01:13:28 +08:00
dnn(ocl): fix automatic globalsize adjusting
- if kernel code doesn't support that
This commit is contained in:
parent
d11f0a709d
commit
5578ad5e14
@ -562,7 +562,9 @@ public:
|
||||
i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11);
|
||||
i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this;
|
||||
}
|
||||
/** @brief Run the OpenCL kernel.
|
||||
|
||||
/** @brief Run the OpenCL kernel (globalsize value may be adjusted)
|
||||
|
||||
@param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
|
||||
@param globalsize work items for each dimension. It is not the final globalsize passed to
|
||||
OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
|
||||
@ -571,12 +573,26 @@ public:
|
||||
@param localsize work-group size for each dimension.
|
||||
@param sync specify whether to wait for OpenCL computation to finish before return.
|
||||
@param q command queue
|
||||
|
||||
@note Use run_() if your kernel code doesn't support adjusted globalsize.
|
||||
*/
|
||||
bool run(int dims, size_t globalsize[],
|
||||
size_t localsize[], bool sync, const Queue& q=Queue());
|
||||
|
||||
/** @brief Run the OpenCL kernel
|
||||
*
|
||||
* @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
|
||||
* @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
|
||||
* @param localsize work-group size for each dimension.
|
||||
* @param sync specify whether to wait for OpenCL computation to finish before return.
|
||||
* @param q command queue
|
||||
*/
|
||||
bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
|
||||
|
||||
bool runTask(bool sync, const Queue& q=Queue());
|
||||
|
||||
/** @brief Similar to synchronized run() call with returning of kernel execution time
|
||||
/** @brief Similar to synchronized run_() call with returning of kernel execution time
|
||||
*
|
||||
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
|
||||
* @return Execution time in nanoseconds or negative number on error
|
||||
*/
|
||||
|
@ -3160,6 +3160,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
|
||||
}
|
||||
|
||||
|
||||
bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
|
||||
bool sync, const Queue& q)
|
||||
{
|
||||
CV_Assert(p);
|
||||
return p->run(dims, _globalsize, _localsize, sync, NULL, q);
|
||||
}
|
||||
|
||||
|
||||
static bool isRaiseErrorOnReuseAsyncKernel()
|
||||
{
|
||||
static bool initialized = false;
|
||||
|
@ -231,7 +231,7 @@ public:
|
||||
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
|
||||
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
|
||||
kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
|
||||
bool ret = kernel.run(2, global, NULL, false);
|
||||
bool ret = kernel.run_(2, global, NULL, false);
|
||||
if (!ret)
|
||||
return false;
|
||||
}
|
||||
|
@ -191,7 +191,7 @@ public:
|
||||
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
|
||||
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
|
||||
k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
|
||||
ret = k1.run(1, globalsize, localsize, false);
|
||||
ret = k1.run_(1, globalsize, localsize, false);
|
||||
if (!ret)
|
||||
return false;
|
||||
}
|
||||
|
@ -482,7 +482,7 @@ public:
|
||||
ocl::KernelArg::PtrReadOnly(input),
|
||||
ocl::KernelArg::PtrWriteOnly(output)
|
||||
)
|
||||
.run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
|
||||
.run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
|
||||
if (!ret)
|
||||
return false;
|
||||
} // for outputs.size()
|
||||
|
@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
|
||||
.args(
|
||||
ocl::KernelArg::PtrReadOnly(buffer),
|
||||
image, offset,
|
||||
padded_width, padded_height,
|
||||
width, height,
|
||||
ld)
|
||||
.run(2, global_copy, NULL, false);
|
||||
|
@ -769,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
swizzled_factor
|
||||
);
|
||||
|
||||
size_t global_work_size_copy[3] = {
|
||||
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
|
||||
size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };
|
||||
|
||||
if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
|
||||
if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
|
||||
{
|
||||
std::cout << "Swizzle kernel run failed." << std::endl;
|
||||
CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
@ -937,7 +936,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
kernel.set(argIdx++, (uint16_t)height_);
|
||||
kernel.set(argIdx++, (uint16_t)output_w_);
|
||||
kernel.set(argIdx++, (uint16_t)output_h_);
|
||||
if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
|
||||
if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
|
||||
{
|
||||
std::cout << "IDLF kernel run failed." << std::endl;
|
||||
return false;
|
||||
@ -1056,7 +1055,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
gy = alignSize(gy, blockK);
|
||||
size_t global_size[3] = { gx, gy, config->global_work_size[2] };
|
||||
|
||||
if (!kernel.run(3, global_size, config->local_work_size, false))
|
||||
if (!kernel.run_(3, global_size, config->local_work_size, false))
|
||||
{
|
||||
std::cout << "GEMM like kernel run failed." << std::endl;
|
||||
return false;
|
||||
@ -1085,9 +1084,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
global_size[1] = output_h_;
|
||||
global_size[2] = num_output_ * num_;
|
||||
|
||||
if (!kernel.run(3, global_size, NULL, false))
|
||||
if (!kernel.run_(3, global_size, NULL, false))
|
||||
{
|
||||
std::cout << "DWCONV kernel run failed." << std::endl;
|
||||
CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
@ -1127,11 +1126,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
kernel.set(argIdx++, (uint16_t)output_h_);
|
||||
kernel.set(argIdx++, (uint16_t)pad_w_);
|
||||
kernel.set(argIdx++, (uint16_t)pad_h_);
|
||||
if (!kernel.run(3, config->global_work_size,
|
||||
if (!kernel.run_(3, config->global_work_size,
|
||||
(config->use_null_local) ? NULL : config->local_work_size,
|
||||
false))
|
||||
{
|
||||
std::cout << "Basic kernel run failed." << std::endl;
|
||||
CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
|
||||
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||
}
|
||||
ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false);
|
||||
ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
|
||||
{
|
||||
const int gidx = get_global_id(0);
|
||||
const int gidy = get_global_id(1);
|
||||
|
||||
if (gidx >= width || gidy >= height)
|
||||
return;
|
||||
|
||||
int2 coord_dst = (int2)(gidx, gidy);
|
||||
__global Dtype* A_off = A + offA;
|
||||
Dtype srcA = A_off[gidy * ldA + gidx];
|
||||
@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
|
||||
__global Dtype* A,
|
||||
__write_only image2d_t ImA,
|
||||
int offA,
|
||||
int padded_width,
|
||||
int padded_height,
|
||||
int width,
|
||||
int height,
|
||||
int ldA)
|
||||
{
|
||||
const int gidx = get_global_id(0);
|
||||
const int gidy = get_global_id(1);
|
||||
|
||||
if (gidx >= padded_width || gidy >= padded_height)
|
||||
return;
|
||||
|
||||
int2 coord_dst = (int2)(gidx, gidy);
|
||||
#if TYPE == TYPE_HALF
|
||||
if (gidx >= width || gidy >= height) {
|
||||
|
Loading…
Reference in New Issue
Block a user