mirror of
https://github.com/opencv/opencv.git
synced 2025-06-10 19:24:07 +08:00
dnn(ocl): fix automatic globalsize adjusting
- if kernel code doesn't support that
This commit is contained in:
parent
d11f0a709d
commit
5578ad5e14
@ -562,7 +562,9 @@ public:
|
|||||||
i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11);
|
i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11);
|
||||||
i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this;
|
i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this;
|
||||||
}
|
}
|
||||||
/** @brief Run the OpenCL kernel.
|
|
||||||
|
/** @brief Run the OpenCL kernel (globalsize value may be adjusted)
|
||||||
|
|
||||||
@param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
|
@param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
|
||||||
@param globalsize work items for each dimension. It is not the final globalsize passed to
|
@param globalsize work items for each dimension. It is not the final globalsize passed to
|
||||||
OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
|
OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
|
||||||
@ -571,12 +573,26 @@ public:
|
|||||||
@param localsize work-group size for each dimension.
|
@param localsize work-group size for each dimension.
|
||||||
@param sync specify whether to wait for OpenCL computation to finish before return.
|
@param sync specify whether to wait for OpenCL computation to finish before return.
|
||||||
@param q command queue
|
@param q command queue
|
||||||
|
|
||||||
|
@note Use run_() if your kernel code doesn't support adjusted globalsize.
|
||||||
*/
|
*/
|
||||||
bool run(int dims, size_t globalsize[],
|
bool run(int dims, size_t globalsize[],
|
||||||
size_t localsize[], bool sync, const Queue& q=Queue());
|
size_t localsize[], bool sync, const Queue& q=Queue());
|
||||||
|
|
||||||
|
/** @brief Run the OpenCL kernel
|
||||||
|
*
|
||||||
|
* @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
|
||||||
|
* @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
|
||||||
|
* @param localsize work-group size for each dimension.
|
||||||
|
* @param sync specify whether to wait for OpenCL computation to finish before return.
|
||||||
|
* @param q command queue
|
||||||
|
*/
|
||||||
|
bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
|
||||||
|
|
||||||
bool runTask(bool sync, const Queue& q=Queue());
|
bool runTask(bool sync, const Queue& q=Queue());
|
||||||
|
|
||||||
/** @brief Similar to synchronized run() call with returning of kernel execution time
|
/** @brief Similar to synchronized run_() call with returning of kernel execution time
|
||||||
|
*
|
||||||
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
|
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
|
||||||
* @return Execution time in nanoseconds or negative number on error
|
* @return Execution time in nanoseconds or negative number on error
|
||||||
*/
|
*/
|
||||||
|
@ -3160,6 +3160,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
|
||||||
|
bool sync, const Queue& q)
|
||||||
|
{
|
||||||
|
CV_Assert(p);
|
||||||
|
return p->run(dims, _globalsize, _localsize, sync, NULL, q);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static bool isRaiseErrorOnReuseAsyncKernel()
|
static bool isRaiseErrorOnReuseAsyncKernel()
|
||||||
{
|
{
|
||||||
static bool initialized = false;
|
static bool initialized = false;
|
||||||
|
@ -231,7 +231,7 @@ public:
|
|||||||
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
|
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
|
||||||
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
|
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
|
||||||
kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
|
kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
|
||||||
bool ret = kernel.run(2, global, NULL, false);
|
bool ret = kernel.run_(2, global, NULL, false);
|
||||||
if (!ret)
|
if (!ret)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -191,7 +191,7 @@ public:
|
|||||||
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
|
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
|
||||||
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
|
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
|
||||||
k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
|
k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
|
||||||
ret = k1.run(1, globalsize, localsize, false);
|
ret = k1.run_(1, globalsize, localsize, false);
|
||||||
if (!ret)
|
if (!ret)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -482,7 +482,7 @@ public:
|
|||||||
ocl::KernelArg::PtrReadOnly(input),
|
ocl::KernelArg::PtrReadOnly(input),
|
||||||
ocl::KernelArg::PtrWriteOnly(output)
|
ocl::KernelArg::PtrWriteOnly(output)
|
||||||
)
|
)
|
||||||
.run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
|
.run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
|
||||||
if (!ret)
|
if (!ret)
|
||||||
return false;
|
return false;
|
||||||
} // for outputs.size()
|
} // for outputs.size()
|
||||||
|
@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
|
|||||||
.args(
|
.args(
|
||||||
ocl::KernelArg::PtrReadOnly(buffer),
|
ocl::KernelArg::PtrReadOnly(buffer),
|
||||||
image, offset,
|
image, offset,
|
||||||
|
padded_width, padded_height,
|
||||||
width, height,
|
width, height,
|
||||||
ld)
|
ld)
|
||||||
.run(2, global_copy, NULL, false);
|
.run(2, global_copy, NULL, false);
|
||||||
|
@ -769,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
|||||||
swizzled_factor
|
swizzled_factor
|
||||||
);
|
);
|
||||||
|
|
||||||
size_t global_work_size_copy[3] = {
|
size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };
|
||||||
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
|
|
||||||
|
|
||||||
if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
|
if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
|
||||||
{
|
{
|
||||||
std::cout << "Swizzle kernel run failed." << std::endl;
|
CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -937,7 +936,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
kernel.set(argIdx++, (uint16_t)height_);
|
kernel.set(argIdx++, (uint16_t)height_);
|
||||||
kernel.set(argIdx++, (uint16_t)output_w_);
|
kernel.set(argIdx++, (uint16_t)output_w_);
|
||||||
kernel.set(argIdx++, (uint16_t)output_h_);
|
kernel.set(argIdx++, (uint16_t)output_h_);
|
||||||
if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
|
if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
|
||||||
{
|
{
|
||||||
std::cout << "IDLF kernel run failed." << std::endl;
|
std::cout << "IDLF kernel run failed." << std::endl;
|
||||||
return false;
|
return false;
|
||||||
@ -1056,7 +1055,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
gy = alignSize(gy, blockK);
|
gy = alignSize(gy, blockK);
|
||||||
size_t global_size[3] = { gx, gy, config->global_work_size[2] };
|
size_t global_size[3] = { gx, gy, config->global_work_size[2] };
|
||||||
|
|
||||||
if (!kernel.run(3, global_size, config->local_work_size, false))
|
if (!kernel.run_(3, global_size, config->local_work_size, false))
|
||||||
{
|
{
|
||||||
std::cout << "GEMM like kernel run failed." << std::endl;
|
std::cout << "GEMM like kernel run failed." << std::endl;
|
||||||
return false;
|
return false;
|
||||||
@ -1085,9 +1084,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
global_size[1] = output_h_;
|
global_size[1] = output_h_;
|
||||||
global_size[2] = num_output_ * num_;
|
global_size[2] = num_output_ * num_;
|
||||||
|
|
||||||
if (!kernel.run(3, global_size, NULL, false))
|
if (!kernel.run_(3, global_size, NULL, false))
|
||||||
{
|
{
|
||||||
std::cout << "DWCONV kernel run failed." << std::endl;
|
CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -1127,11 +1126,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
kernel.set(argIdx++, (uint16_t)output_h_);
|
kernel.set(argIdx++, (uint16_t)output_h_);
|
||||||
kernel.set(argIdx++, (uint16_t)pad_w_);
|
kernel.set(argIdx++, (uint16_t)pad_w_);
|
||||||
kernel.set(argIdx++, (uint16_t)pad_h_);
|
kernel.set(argIdx++, (uint16_t)pad_h_);
|
||||||
if (!kernel.run(3, config->global_work_size,
|
if (!kernel.run_(3, config->global_work_size,
|
||||||
(config->use_null_local) ? NULL : config->local_work_size,
|
(config->use_null_local) ? NULL : config->local_work_size,
|
||||||
false))
|
false))
|
||||||
{
|
{
|
||||||
std::cout << "Basic kernel run failed." << std::endl;
|
CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
|
|||||||
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||||
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||||
}
|
}
|
||||||
ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false);
|
ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
|
|||||||
{
|
{
|
||||||
const int gidx = get_global_id(0);
|
const int gidx = get_global_id(0);
|
||||||
const int gidy = get_global_id(1);
|
const int gidy = get_global_id(1);
|
||||||
|
|
||||||
|
if (gidx >= width || gidy >= height)
|
||||||
|
return;
|
||||||
|
|
||||||
int2 coord_dst = (int2)(gidx, gidy);
|
int2 coord_dst = (int2)(gidx, gidy);
|
||||||
__global Dtype* A_off = A + offA;
|
__global Dtype* A_off = A + offA;
|
||||||
Dtype srcA = A_off[gidy * ldA + gidx];
|
Dtype srcA = A_off[gidy * ldA + gidx];
|
||||||
@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
|
|||||||
__global Dtype* A,
|
__global Dtype* A,
|
||||||
__write_only image2d_t ImA,
|
__write_only image2d_t ImA,
|
||||||
int offA,
|
int offA,
|
||||||
|
int padded_width,
|
||||||
|
int padded_height,
|
||||||
int width,
|
int width,
|
||||||
int height,
|
int height,
|
||||||
int ldA)
|
int ldA)
|
||||||
{
|
{
|
||||||
const int gidx = get_global_id(0);
|
const int gidx = get_global_id(0);
|
||||||
const int gidy = get_global_id(1);
|
const int gidy = get_global_id(1);
|
||||||
|
|
||||||
|
if (gidx >= padded_width || gidy >= padded_height)
|
||||||
|
return;
|
||||||
|
|
||||||
int2 coord_dst = (int2)(gidx, gidy);
|
int2 coord_dst = (int2)(gidx, gidy);
|
||||||
#if TYPE == TYPE_HALF
|
#if TYPE == TYPE_HALF
|
||||||
if (gidx >= width || gidy >= height) {
|
if (gidx >= width || gidy >= height) {
|
||||||
|
Loading…
Reference in New Issue
Block a user