mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
Merge pull request #20840 from alalek:dnn_ocl_cleanup_code
This commit is contained in:
commit
81e7988eb9
@ -222,8 +222,6 @@ class OCL4DNNConvSpatial
|
||||
bool createDWConvKernel(int32_t blockWidth,
|
||||
int32_t blockHeight,
|
||||
int32_t blockDepth);
|
||||
void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
|
||||
int32_t offset, int32_t size, bool write_only);
|
||||
bool convolve(const UMat &bottom, UMat &top,
|
||||
const UMat &weight, const UMat &bias,
|
||||
int32_t numImages,
|
||||
|
@ -219,14 +219,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
|
||||
#endif
|
||||
if (!use_cache_path_)
|
||||
{
|
||||
static int warn_ = 0;
|
||||
if (!warn_)
|
||||
{
|
||||
std::cerr
|
||||
<< "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl
|
||||
<< std::endl;
|
||||
warn_ = true;
|
||||
}
|
||||
CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_);
|
||||
}
|
||||
}
|
||||
|
||||
@ -419,7 +412,6 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
|
||||
addDef("CHANNELS", channels_ / group_);
|
||||
addDef("APPLY_BIAS", bias_term_);
|
||||
addDef("OUTPUT_Z", M_);
|
||||
addDef("ZPAR", 1);
|
||||
setFusionDefine(fused_activ_, fused_eltwise_);
|
||||
|
||||
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
|
||||
@ -673,8 +665,7 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
|
||||
int r, int c, int interleavedRows, int nonInterleavedRows,
|
||||
int blockWidth, int rowAlignment )
|
||||
{
|
||||
CHECK_EQ(interleavedRows % 2, 0) <<
|
||||
"interleaveMatrix only supports even values for interleavedRows.";
|
||||
CV_Check(interleavedRows, interleavedRows % 2 == 0, "interleaveMatrix only supports even values for interleavedRows.");
|
||||
|
||||
size_t memSize = r * c * sizeof(float);
|
||||
size_t dstSize = memSize *
|
||||
@ -686,9 +677,12 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
|
||||
const int yStride = c * 2;
|
||||
const Dtype *pSrc = mem;
|
||||
Dtype* pDst = mem_dst;
|
||||
for (int y = 0; y < r;) {
|
||||
for (int rows = 0; rows < interleavedRows; rows += 2) {
|
||||
if ( y >= r ) break;
|
||||
for (int y = 0; y < r;)
|
||||
{
|
||||
for (int rows = 0; rows < interleavedRows; rows += 2)
|
||||
{
|
||||
if (y >= r)
|
||||
break;
|
||||
if ((c % xStride) == 0) {
|
||||
for (int x = 0; x < c / xStride; x++) {
|
||||
memcpy(pDst + x * xStride * 2, // NOLINT
|
||||
@ -713,11 +707,14 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
|
||||
y += 2;
|
||||
}
|
||||
|
||||
for (int rows = 0; rows < nonInterleavedRows; rows++) {
|
||||
if (y >= r) break;
|
||||
for (int rows = 0; rows < nonInterleavedRows; rows++)
|
||||
{
|
||||
if (y >= r)
|
||||
break;
|
||||
const int stride = rowAlignment;
|
||||
int remaining = c;
|
||||
for (int x = 0; x < c; x += stride) {
|
||||
for (int x = 0; x < c; x += stride)
|
||||
{
|
||||
if (remaining >= stride) {
|
||||
memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype)); // NOLINT
|
||||
remaining -=stride;
|
||||
@ -853,34 +850,6 @@ bool OCL4DNNConvSpatial<float>::createBasicKernel(int32_t blockWidth,
|
||||
return false;
|
||||
}
|
||||
|
||||
template<>
|
||||
void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
|
||||
int32_t offset, int32_t size, bool write_only)
|
||||
{
|
||||
cl_mem sub_mem;
|
||||
cl_buffer_region region;
|
||||
cl_int err;
|
||||
size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
|
||||
|
||||
region.origin = offset * element_size + buffer.offset;
|
||||
region.size = size * element_size;
|
||||
sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
|
||||
write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
if (err)
|
||||
{
|
||||
std::cout << "Failed to create sub buffer." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
int step = element_size, rows = size, cols = 1;
|
||||
ocl::convertFromBuffer(sub_mem, step, rows, cols,
|
||||
(use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
|
||||
|
||||
//decrease ocl mem refcount
|
||||
clReleaseMemObject(sub_mem);
|
||||
}
|
||||
|
||||
template<>
|
||||
bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
const UMat &weight, const UMat &bias,
|
||||
@ -939,7 +908,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
kernel.set(argIdx++, (uint16_t)output_h_);
|
||||
if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
|
||||
{
|
||||
std::cout << "IDLF kernel run failed." << std::endl;
|
||||
CV_LOG_ERROR(NULL, "DNN/OpenCL: IDLF kernel run failed");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -1013,7 +982,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
|
||||
if (!kernel.run_(3, global_size, config->local_work_size, false))
|
||||
{
|
||||
std::cout << "GEMM like kernel run failed." << std::endl;
|
||||
CV_LOG_ERROR(NULL, "DNN/OpenCL: GEMM like kernel run failed");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -1116,14 +1085,9 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
|
||||
{
|
||||
queue = cv::ocl::Queue::getDefault();
|
||||
}
|
||||
catch (const cv::Exception&)
|
||||
catch (const std::exception& e)
|
||||
{
|
||||
static int warn_ = 0;
|
||||
if (!warn_)
|
||||
{
|
||||
std::cout << "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning." << std::endl;
|
||||
warn_ = true;
|
||||
}
|
||||
CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning: " << e.what());
|
||||
return 1e6;
|
||||
}
|
||||
|
||||
@ -1327,9 +1291,9 @@ ocl::Program OCL4DNNConvSpatial<Dtype>::compileKernel()
|
||||
phash.insert(std::pair<std::string, ocl::Program>(kernel_name_, program));
|
||||
if (!program.ptr())
|
||||
{
|
||||
std::cout << "Failed to compile kernel: " << kernel_name_
|
||||
<< ", buildflags: " << options
|
||||
<< ", errmsg: " << errmsg << std::endl;
|
||||
CV_LOG_WARNING(NULL, "DNN/OpenCL: Failed to compile kernel: " << kernel_name_
|
||||
<< ", buildflags: '" << options << "', errmsg: '" << errmsg << "'"
|
||||
);
|
||||
}
|
||||
return program;
|
||||
}
|
||||
@ -1755,7 +1719,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
|
||||
fastestTime = kernelQueue[x]->executionTime;
|
||||
}
|
||||
}
|
||||
if (fastestKernel < 0) break;
|
||||
if (fastestKernel < 0)
|
||||
break;
|
||||
// Test fastest kernel
|
||||
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
|
||||
if (verified == true) {
|
||||
@ -1914,17 +1879,18 @@ bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int typ
|
||||
{
|
||||
if (z == 1)
|
||||
z = 16;
|
||||
CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
|
||||
CV_Check(z, z == 16 || z == 8, "DNN/OpenCL: IDLF - invalid SIMD size");
|
||||
}
|
||||
kernelQueue.clear();
|
||||
createConvolutionKernel(type, x, y, z);
|
||||
if (kernelQueue.size() != 1) {
|
||||
std::cerr << "Failed setup kernel by config:"
|
||||
if (kernelQueue.size() != 1)
|
||||
{
|
||||
CV_LOG_ERROR(NULL, "DNN/OpenCL: Failed setup kernel by config: "
|
||||
<< " x = " << x
|
||||
<< " y = " << y
|
||||
<< " z = " << z
|
||||
<< " type = " << type
|
||||
<< std::endl;
|
||||
);
|
||||
return false;
|
||||
}
|
||||
bestKernelConfig = kernelQueue[0];
|
||||
@ -1956,13 +1922,9 @@ bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
|
||||
{
|
||||
if (cache_path_.empty())
|
||||
{
|
||||
static int warn_ = 0;
|
||||
if (!warn_)
|
||||
{
|
||||
std::cout << "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory " << std::endl
|
||||
<< " via OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
|
||||
warn_ = true;
|
||||
}
|
||||
CV_LOG_ONCE_WARNING(NULL, "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory "
|
||||
"through OPENCV_OCL4DNN_CONFIG_PATH parameter."
|
||||
);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -161,23 +161,15 @@ __kernel void ConvolveBasic(
|
||||
const int out_idx = get_global_id(0); // 1D task layout: [output_width * output_height * OUTPUT_Z]
|
||||
const int plane_size = output_width * output_height;
|
||||
const int out_plane_idx = out_idx % plane_size;
|
||||
const int outputZ = out_idx / plane_size;
|
||||
const int outputZ = out_idx / plane_size; // kernelNum
|
||||
const int outputY = out_plane_idx / output_width;
|
||||
const int outputX = out_plane_idx % output_width;
|
||||
const int kernelNum = outputZ * ZPAR;
|
||||
if (kernelNum < OUTPUT_Z)
|
||||
if (outputZ < OUTPUT_Z)
|
||||
{
|
||||
Dtype sum[ZPAR];
|
||||
for (int kern = 0; kern < ZPAR; kern++)
|
||||
{
|
||||
sum[kern] = 0.0f;
|
||||
}
|
||||
Dtype sum = 0.0f;
|
||||
const int org_y = outputY * STRIDE_Y - pad_h;
|
||||
const int org_x = outputX * STRIDE_X - pad_w;
|
||||
const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
|
||||
#if APPLY_BIAS
|
||||
const int biasIndex = bias_offset + kernelNum;
|
||||
#endif
|
||||
const int currentKernelOffset = kernel_offset + outputZ*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
|
||||
const int local_image_offset = org_y * input_width + org_x;
|
||||
const int imageSize = input_width * input_height;
|
||||
__global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
|
||||
@ -186,17 +178,13 @@ __kernel void ConvolveBasic(
|
||||
{
|
||||
for (int y = 0; y < KERNEL_HEIGHT; y++)
|
||||
{
|
||||
int y_ = org_y + y * DILATION_Y;
|
||||
for (int x = 0; x < KERNEL_WIDTH; x++)
|
||||
{
|
||||
int y_ = org_y + y * DILATION_Y;
|
||||
int x_ = org_x + x * DILATION_X;
|
||||
if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
|
||||
if (y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
for (int kern = 0; kern < ZPAR; kern++)
|
||||
{
|
||||
sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
|
||||
sum = mad(image_dataPtr[x * DILATION_X], kernel_dataPtr[x], sum);
|
||||
}
|
||||
}
|
||||
image_dataPtr += input_width * DILATION_Y;
|
||||
@ -205,18 +193,13 @@ __kernel void ConvolveBasic(
|
||||
image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
|
||||
}
|
||||
|
||||
for (int kern = 0; kern < ZPAR; kern++)
|
||||
{
|
||||
if (kernelNum + kern < OUTPUT_Z)
|
||||
{
|
||||
int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
|
||||
int offset = convolved_image_offset + out_idx;
|
||||
#if APPLY_BIAS
|
||||
ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);
|
||||
int biasIndex = bias_offset + outputZ;
|
||||
ACTIVATION_FUNCTION(convolved_image, offset, sum + bias[biasIndex], biasIndex);
|
||||
#else
|
||||
ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], kernelNum + kern);
|
||||
ACTIVATION_FUNCTION(convolved_image, offset, sum, outputZ);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user