enable concat layer fuse for OCL target
Signed-off-by: Li Peng <peng.li@intel.com>
parent 625d20b9b4
commit f0cadaa6e3
@@ -1492,7 +1492,8 @@ struct Net::Impl
         // TODO: OpenCL target support more fusion styles.
         if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
              (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
-             ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) )
+             ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
+             ld.layerInstance->type != "Concat")) )
             continue;
 
         Ptr<Layer>& currLayer = ld.layerInstance;
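Note: the hunk above only widens the set of layer types the OpenCL fusion pass will even consider. A minimal paraphrase of the updated condition as a free-standing helper (the helper name is mine, not part of the commit):

#include <string>

// Hedged sketch: with the OpenCV backend on an OpenCL target, fusion is only
// attempted for these layer types; "Concat" is the one added by this commit.
static bool isOclFusionCandidate(const std::string& layerType)
{
    return layerType == "Convolution" ||
           layerType == "MVN" ||
           layerType == "Pooling" ||
           layerType == "Concat";
}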
@@ -1701,6 +1702,31 @@ struct Net::Impl
             ld.outputBlobs.size() == 1 )
         {
             Mat& output = ld.outputBlobs[0];
+            UMat umat_output;
+            if (!ld.outputBlobsWrappers.empty() &&
+                (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
+            {
+                size_t i, ninputs = ld.inputBlobsId.size();
+                bool conv_layer = true;
+                for( i = 0; i < ninputs; i++ )
+                {
+                    LayerPin pin = ld.inputBlobsId[i];
+                    LayerData* inp_i_data = &layers[pin.lid];
+                    while(inp_i_data->skip &&
+                          inp_i_data->inputBlobsId.size() == 1 &&
+                          inp_i_data->consumers.size() == 1)
+                    {
+                        pin = inp_i_data->inputBlobsId[0];
+                        inp_i_data = &layers[pin.lid];
+                    }
+                    conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
+                }
+                if (!conv_layer)
+                    continue;
+                std::vector<UMat> umat_outputBlobs;
+                umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
+                umat_output = umat_outputBlobs[0];
+            }
 
             // TODO: in general, this optimization can always be done, but
             // many layers currently check that the input/output blobs are
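The block added above walks each Concat input back through layers that have already been fused away (skip == true, single input, single consumer) to find the layer that really produces the data, and keeps the optimization only if every such producer is a Convolution. A standalone sketch of that walk with a simplified stand-in for LayerData (struct and field names here are illustrative, not OpenCV's):

#include <string>
#include <vector>

struct SimpleLayer               // illustrative stand-in for dnn's LayerData
{
    std::string type;
    bool skip = false;           // true if the layer was fused into a neighbour
    int producer = -1;           // index of its single input layer, -1 if none
    int numConsumers = 0;
};

// Follow a chain of skipped single-input/single-consumer layers until the
// layer that actually writes the data is reached.
static const SimpleLayer& realProducer(const std::vector<SimpleLayer>& layers, int id)
{
    while (layers[id].skip && layers[id].producer >= 0 && layers[id].numConsumers == 1)
        id = layers[id].producer;
    return layers[id];
}

// Mirrors the conv_layer accumulation and the `if (!conv_layer) continue;` bail-out.
static bool allProducersAreConvolutions(const std::vector<SimpleLayer>& layers,
                                        const std::vector<int>& concatInputs)
{
    for (int id : concatInputs)
        if (realProducer(layers, id).type != "Convolution")
            return false;
    return true;
}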
@@ -1737,6 +1763,14 @@ struct Net::Impl
             // Allocate new memory to prevent collisions during memory
             // reusing (see https://github.com/opencv/opencv/pull/10456).
             output = output.clone();
+            if (preferableBackend == DNN_BACKEND_OPENCV &&
+                IS_DNN_OPENCL_TARGET(preferableTarget))
+            {
+                std::vector<UMat> umats(1);
+                umat_output = umat_output.clone();
+                umats[0] = umat_output;
+                OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
+            }
             Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
             int ofs = 0;
             for( i = 0; i < ninputs; i++ )
@@ -1753,6 +1787,12 @@ struct Net::Impl
                 CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
                 Mat* oldPtr = &curr_output;
                 curr_output = output_slice;
+                if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
+                {
+                    std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
+                    umats[pin.oid] = umat_output(chrange);
+                    OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
+                }
                 // Layers that refer old input Mat will refer to the
                 // new data but the same Mat object.
                 CV_Assert(curr_output.data == output_slice.data, oldPtr == &curr_output);
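Taken together, these hunks extend the existing concat-by-reference optimization to the OpenCL path: the Concat output UMat is cloned once, each input's output wrapper is repointed at a channel slice of it, and the Concat layer itself becomes a no-op on the GPU just as it already was on the CPU. A minimal CPU-side sketch of the slicing idea with cv::Mat and cv::Range (the shapes are made up for illustration):

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Hypothetical concat of two blobs with 3 and 5 channels along axis 1 (NCHW, batch 1).
    int outShape[] = { 1, 8, 4, 4 };
    cv::Mat output(4, outShape, CV_32F, cv::Scalar(0));

    cv::Range chrange[] = { cv::Range::all(), cv::Range::all(),
                            cv::Range::all(), cv::Range::all() };
    int ofs = 0;
    int inputChannels[] = { 3, 5 };

    for (int i = 0; i < 2; i++)
    {
        chrange[1] = cv::Range(ofs, ofs + inputChannels[i]);
        cv::Mat slice = output(chrange);   // a view into the concat output, not a copy
        ofs += inputChannels[i];

        // A producer that writes into `slice` fills the concat result directly,
        // so no copy is needed when Concat itself runs.
        std::cout << "slice " << i << " continuous: " << slice.isContinuous() << std::endl;
    }
    return 0;
}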
@@ -821,7 +821,7 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
     cl_int err;
     size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
 
-    region.origin = offset * element_size;
+    region.origin = offset * element_size + buffer.offset;
     region.size = size * element_size;
     sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
                                 write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
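The one-line change above matters because the UMat passed to CreateSubBuffer can now itself be a view into a larger allocation (the fused Concat output), so its own byte offset has to be folded into the sub-buffer region. A hedged sketch of the same computation against the raw OpenCL C API (function and variable names are mine; error handling trimmed):

#include <CL/cl.h>
#include <cstddef>

// Create a read-only sub-buffer that starts `elemOffset` elements into a view
// that itself starts `viewByteOffset` bytes into the parent cl_mem.
static cl_mem makeSubBuffer(cl_mem parent, size_t viewByteOffset,
                            size_t elemOffset, size_t numElems, size_t elemSize,
                            cl_int* err)
{
    cl_buffer_region region;
    region.origin = elemOffset * elemSize + viewByteOffset;  // include the view's own offset
    region.size   = numElems * elemSize;
    return clCreateSubBuffer(parent, CL_MEM_READ_ONLY,
                             CL_BUFFER_CREATE_TYPE_REGION, &region, err);
}

Keep in mind that clCreateSubBuffer requires region.origin to respect the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN, which is worth checking when offsets come from arbitrary UMat views.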
@@ -853,6 +853,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
         return false;
 
     int32_t bias_offset;
+    int32_t element_size = use_half_ ? sizeof(short) : sizeof(float);
 
     if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
         if (!swizzleWeight(weight, config->workItem_output[2], false))
@@ -931,10 +932,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 return false;
 
             kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
+            kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
         }
         else
         {
             kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            kernel.set(argIdx++, (int)(top.offset / element_size));
         }
 
         kernel.set(argIdx++, (uint16_t)width_);
@@ -1024,10 +1027,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 return false;
 
             kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
+            kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
         }
         else
         {
             kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            kernel.set(argIdx++, (int)(top.offset / element_size));
         }
 
         kernel.set(argIdx++, (uint16_t)width_);
@@ -1079,6 +1084,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
         if (bias_term_)
             kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
         kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+        kernel.set(argIdx++, (int)(top.offset / element_size));
         kernel.set(argIdx++, (uint16_t)width_);
         kernel.set(argIdx++, (uint16_t)height_);
         kernel.set(argIdx++, (uint16_t)output_w_);
@@ -1126,6 +1132,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             kernel.set(argIdx++, (void *)NULL);
         kernel.set(argIdx++, bias_offset);
         kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+        kernel.set(argIdx++, (int)(top.offset / element_size));
         kernel.set(argIdx++, output_image_offset);
         kernel.set(argIdx++, (uint16_t)width_);
         kernel.set(argIdx++, (uint16_t)height_);
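Each host-side hunk above adds one extra integer kernel argument: the output UMat's byte offset divided by the element size. UMat::offset is expressed in bytes, while the kernels index Dtype elements, hence the division. A small sketch of where such an offset comes from when a UMat is a slice of a bigger buffer (shape and type chosen arbitrarily):

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::UMat big(16, 16, CV_32F, cv::Scalar(0));
    cv::UMat view = big(cv::Range(4, 8), cv::Range::all());  // a slice, not a copy

    const size_t elementSize = sizeof(float);
    // view.offset is a byte offset into big's buffer; the modified kernels
    // receive it as an element count, i.e. offset / element_size.
    std::cout << "byte offset: " << view.offset
              << ", element offset: " << view.offset / elementSize << std::endl;
    return 0;
}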
@@ -136,7 +136,8 @@ __kernel void ConvolveBasic(
     int kernel_offset,
     __global Dtype* bias,
     const int bias_offset,
-    __global Dtype* convolved_image,
+    __global Dtype* convolved_image_base,
+    const int convolved_image_base_offset,
     const int convolved_image_offset,
     const ushort input_width,
     const ushort input_height,
@@ -146,6 +147,7 @@ __kernel void ConvolveBasic(
     const ushort pad_h
 )
 {
+    __global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;
     const int outputX = get_global_id(0);
     const int outputY = get_global_id(1);
     const int kernelNum = get_global_id(2) * ZPAR;
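On the device side the pattern is the same in every kernel touched by this commit: the output pointer argument becomes a base pointer plus an integer element offset, and the first statement of the kernel rebuilds the effective pointer. A toy kernel source embedded in a C++ raw string shows the shape of that idiom (the kernel and its arguments are invented for illustration):

#include <string>

// Hedged sketch of the base-pointer + element-offset convention used above.
static const std::string kernelSrc = R"CLC(
__kernel void write_at_offset(__global float* dst_base,
                              const int dst_offset,     /* in elements, not bytes */
                              const float value)
{
    __global float* dst = dst_base + dst_offset;        /* same idiom as the hunks above */
    dst[get_global_id(0)] = value;
}
)CLC";

The host would then pass the buffer (e.g. via ocl::KernelArg::PtrWriteOnly) together with the element offset, exactly as the convolve() changes above do.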
@@ -220,12 +222,14 @@ convolve_simd(
     __global Dtype* inputs,
     __global Dtype* weights,
     BIAS_KERNEL_ARG
-    __global Dtype* outputs,
+    __global Dtype* outputs_base,
+    const int outputs_offset,
     const ushort input_width,
     const ushort input_height,
     const ushort output_width,
     const ushort output_height)
 {
+  __global Dtype* outputs = outputs_base + outputs_offset;
   unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
   unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
   unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
@@ -395,7 +399,8 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
     const __global Dtype *src0, \
     const __global Dtype *src1, \
     BIAS_KERNEL_ARG \
-    __global Dtype *dst, \
+    __global Dtype *dst_base, \
+    const int dst_offset, \
     const ushort input_width, \
     const ushort input_height, \
     const ushort output_width, \
@@ -425,6 +430,7 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    __global Dtype *dst = dst_base + dst_offset;
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -813,6 +819,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    __global Dtype *dst = dst_base + dst_offset;
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1374,6 +1381,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    __global Dtype *dst = dst_base + dst_offset;
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1559,6 +1567,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    __global Dtype *dst = dst_base + dst_offset;
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1770,12 +1779,13 @@ __kernel void DWCONV(
     __global Dtype* image_data,
     __global Dtype* kernel_data,
     BIAS_KERNEL_ARG
-    __global Dtype* convolved_image,
+    __global Dtype* convolved_image_base,
     const int convolved_image_offset,
     const ushort input_width,
     const ushort input_height,
     const ushort output_width,
     const ushort output_height) {
 
+  __global Dtype* convolved_image = convolved_image_base + convolved_image_offset;
   const int outputX = get_global_id(0);
   const int outputY = get_global_id(1);
   const int outputZ = get_global_id(2);