mirror of
https://github.com/opencv/opencv.git
synced 2025-07-25 14:47:07 +08:00
dnn(ocl): fix buffer offsets in IDLF kernel
- drop CreateSubBuffer - fix FUSED_CONV_ELTWISE mode
This commit is contained in:
parent
7c23ec90a9
commit
407adc7061
@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
|
||||
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
||||
int blockM, int blockK, int simd_size);
|
||||
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
|
||||
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
|
||||
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);
|
||||
|
||||
int32_t group_;
|
||||
bool bias_term_;
|
||||
|
@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
|
||||
}
|
||||
|
||||
template<typename Dtype>
|
||||
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
|
||||
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
|
||||
{
|
||||
if (fused_eltwise)
|
||||
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
|
||||
if (fused_eltwise_offset >= 0)
|
||||
kernel.set(argIdx++, fused_eltwise_offset);
|
||||
}
|
||||
|
||||
switch (fused_activ) {
|
||||
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
|
||||
kernel.set(argIdx++, (float)negative_slope_);
|
||||
break;
|
||||
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
|
||||
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
|
||||
break;
|
||||
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
|
||||
kernel.set(argIdx++, (float)power_);
|
||||
@ -895,10 +899,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
|
||||
if (!swizzleWeight(weight, config->workItem_output[2], false))
|
||||
return false;
|
||||
#if 0
|
||||
size_t total_bottom_size = bottom_dim_ * numImages;
|
||||
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
|
||||
size_t total_bias_size = M_ * group_;
|
||||
size_t total_top_size = top_dim_ * numImages;
|
||||
#endif
|
||||
for (int32_t g = 0; g < group_; ++g) {
|
||||
bias_offset = M_ * g;
|
||||
int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
|
||||
@ -910,72 +916,22 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
return false;
|
||||
|
||||
cl_uint argIdx = 0;
|
||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||
setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
|
||||
|
||||
UMat img_buffer;
|
||||
if (image_offset)
|
||||
{
|
||||
CreateSubBuffer(bottom, img_buffer, image_offset,
|
||||
total_bottom_size - image_offset, false);
|
||||
if (img_buffer.empty())
|
||||
return false;
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
kernel.set(argIdx++, image_offset);
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
}
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
|
||||
kernel.set(argIdx++, kernel_offset);
|
||||
|
||||
UMat kernel_buffer;
|
||||
if (kernel_offset)
|
||||
{
|
||||
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
|
||||
total_kernel_size - kernel_offset, false);
|
||||
if (kernel_buffer.empty())
|
||||
return false;
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
|
||||
}
|
||||
|
||||
UMat bias_buffer;
|
||||
if (bias_term_)
|
||||
{
|
||||
if (bias_offset)
|
||||
{
|
||||
CreateSubBuffer(bias, bias_buffer, bias_offset,
|
||||
total_bias_size - bias_offset, false);
|
||||
if (bias_buffer.empty())
|
||||
return false;
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
||||
}
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
||||
kernel.set(argIdx++, bias_offset);
|
||||
}
|
||||
|
||||
UMat out_buffer;
|
||||
if (output_image_offset)
|
||||
{
|
||||
CreateSubBuffer(top, out_buffer, output_image_offset,
|
||||
total_top_size - output_image_offset, true);
|
||||
if (out_buffer.empty())
|
||||
return false;
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
|
||||
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||
kernel.set(argIdx++, (int)(top.offset / element_size));
|
||||
}
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||
kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
|
||||
|
||||
kernel.set(argIdx++, (uint16_t)width_);
|
||||
kernel.set(argIdx++, (uint16_t)height_);
|
||||
@ -1005,7 +961,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
return false;
|
||||
|
||||
cl_uint argIdx = 0;
|
||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
|
||||
|
||||
UMat img_buffer;
|
||||
if (image_offset)
|
||||
@ -1112,7 +1068,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
return false;
|
||||
|
||||
cl_uint argIdx = 0;
|
||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||
if (bias_term_)
|
||||
@ -1152,7 +1108,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
return false;
|
||||
|
||||
cl_uint argIdx = 0;
|
||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
kernel.set(argIdx++, image_offset);
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||
|
@ -74,18 +74,22 @@
|
||||
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
|
||||
} while(0)
|
||||
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
|
||||
#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
|
||||
#else
|
||||
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
|
||||
const Dtype _x_ = (_data_); \
|
||||
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
|
||||
} while(0)
|
||||
#define ELTWISE_DATA_ARG
|
||||
#define ELTWISE_DATA_ARG_WITH_OFFSET
|
||||
#endif
|
||||
|
||||
#if APPLY_BIAS
|
||||
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
|
||||
#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
|
||||
#else
|
||||
#define BIAS_KERNEL_ARG
|
||||
#define BIAS_KERNEL_ARG_WITH_OFFSET
|
||||
#endif
|
||||
|
||||
#define __CAT(x, y) x##y
|
||||
@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
|
||||
__kernel void
|
||||
convolve_simd(
|
||||
ELTWISE_DATA_ARG
|
||||
ELTWISE_DATA_ARG_WITH_OFFSET
|
||||
FUSED_ARG
|
||||
__global Dtype* inputs,
|
||||
__global Dtype* weights,
|
||||
BIAS_KERNEL_ARG
|
||||
__global Dtype* outputs_base,
|
||||
const int outputs_offset,
|
||||
__global Dtype* inputs_ptr, const int inputs_offset,
|
||||
__global Dtype* weights_ptr, const int weights_offset,
|
||||
BIAS_KERNEL_ARG_WITH_OFFSET
|
||||
__global Dtype* outputs_base, const int outputs_offset,
|
||||
const ushort input_width,
|
||||
const ushort input_height,
|
||||
const ushort output_width,
|
||||
const ushort output_height)
|
||||
{
|
||||
__global Dtype* inputs = inputs_ptr + inputs_offset;
|
||||
__global Dtype* weights = weights_ptr + weights_offset;
|
||||
#if APPLY_BIAS
|
||||
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
|
||||
#endif
|
||||
|
||||
__global Dtype* outputs = outputs_base + outputs_offset;
|
||||
#ifdef FUSED_CONV_ELTWISE
|
||||
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
|
||||
#endif
|
||||
|
||||
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
|
||||
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
|
||||
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
|
||||
|
Loading…
Reference in New Issue
Block a user