mirror of
https://github.com/opencv/opencv.git
synced 2025-07-26 07:07:37 +08:00
dnn(ocl): fix buffer offsets in IDLF kernel
- drop CreateSubBuffer - fix FUSED_CONV_ELTWISE mode
This commit is contained in:
parent
7c23ec90a9
commit
407adc7061
@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
|
|||||||
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
||||||
int blockM, int blockK, int simd_size);
|
int blockM, int blockK, int simd_size);
|
||||||
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
|
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
|
||||||
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
|
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);
|
||||||
|
|
||||||
int32_t group_;
|
int32_t group_;
|
||||||
bool bias_term_;
|
bool bias_term_;
|
||||||
|
@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename Dtype>
|
template<typename Dtype>
|
||||||
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
|
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
|
||||||
{
|
{
|
||||||
if (fused_eltwise)
|
if (fused_eltwise)
|
||||||
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
|
{
|
||||||
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
|
||||||
|
if (fused_eltwise_offset >= 0)
|
||||||
|
kernel.set(argIdx++, fused_eltwise_offset);
|
||||||
|
}
|
||||||
|
|
||||||
switch (fused_activ) {
|
switch (fused_activ) {
|
||||||
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
|
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
|
||||||
kernel.set(argIdx++, (float)negative_slope_);
|
kernel.set(argIdx++, (float)negative_slope_);
|
||||||
break;
|
break;
|
||||||
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
|
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
|
||||||
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
|
||||||
break;
|
break;
|
||||||
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
|
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
|
||||||
kernel.set(argIdx++, (float)power_);
|
kernel.set(argIdx++, (float)power_);
|
||||||
@ -895,10 +899,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
|
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
|
||||||
if (!swizzleWeight(weight, config->workItem_output[2], false))
|
if (!swizzleWeight(weight, config->workItem_output[2], false))
|
||||||
return false;
|
return false;
|
||||||
|
#if 0
|
||||||
size_t total_bottom_size = bottom_dim_ * numImages;
|
size_t total_bottom_size = bottom_dim_ * numImages;
|
||||||
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
|
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
|
||||||
size_t total_bias_size = M_ * group_;
|
size_t total_bias_size = M_ * group_;
|
||||||
size_t total_top_size = top_dim_ * numImages;
|
size_t total_top_size = top_dim_ * numImages;
|
||||||
|
#endif
|
||||||
for (int32_t g = 0; g < group_; ++g) {
|
for (int32_t g = 0; g < group_; ++g) {
|
||||||
bias_offset = M_ * g;
|
bias_offset = M_ * g;
|
||||||
int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
|
int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
|
||||||
@ -910,72 +916,22 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
cl_uint argIdx = 0;
|
cl_uint argIdx = 0;
|
||||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
|
||||||
|
|
||||||
UMat img_buffer;
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||||
if (image_offset)
|
kernel.set(argIdx++, image_offset);
|
||||||
{
|
|
||||||
CreateSubBuffer(bottom, img_buffer, image_offset,
|
|
||||||
total_bottom_size - image_offset, false);
|
|
||||||
if (img_buffer.empty())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
|
||||||
}
|
kernel.set(argIdx++, kernel_offset);
|
||||||
else
|
|
||||||
{
|
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
|
||||||
}
|
|
||||||
|
|
||||||
UMat kernel_buffer;
|
|
||||||
if (kernel_offset)
|
|
||||||
{
|
|
||||||
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
|
|
||||||
total_kernel_size - kernel_offset, false);
|
|
||||||
if (kernel_buffer.empty())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
|
|
||||||
}
|
|
||||||
|
|
||||||
UMat bias_buffer;
|
|
||||||
if (bias_term_)
|
if (bias_term_)
|
||||||
{
|
{
|
||||||
if (bias_offset)
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
||||||
{
|
kernel.set(argIdx++, bias_offset);
|
||||||
CreateSubBuffer(bias, bias_buffer, bias_offset,
|
|
||||||
total_bias_size - bias_offset, false);
|
|
||||||
if (bias_buffer.empty())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
UMat out_buffer;
|
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||||
if (output_image_offset)
|
kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
|
||||||
{
|
|
||||||
CreateSubBuffer(top, out_buffer, output_image_offset,
|
|
||||||
total_top_size - output_image_offset, true);
|
|
||||||
if (out_buffer.empty())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
|
|
||||||
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
|
||||||
kernel.set(argIdx++, (int)(top.offset / element_size));
|
|
||||||
}
|
|
||||||
|
|
||||||
kernel.set(argIdx++, (uint16_t)width_);
|
kernel.set(argIdx++, (uint16_t)width_);
|
||||||
kernel.set(argIdx++, (uint16_t)height_);
|
kernel.set(argIdx++, (uint16_t)height_);
|
||||||
@ -1005,7 +961,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
cl_uint argIdx = 0;
|
cl_uint argIdx = 0;
|
||||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
|
||||||
|
|
||||||
UMat img_buffer;
|
UMat img_buffer;
|
||||||
if (image_offset)
|
if (image_offset)
|
||||||
@ -1112,7 +1068,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
cl_uint argIdx = 0;
|
cl_uint argIdx = 0;
|
||||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||||
if (bias_term_)
|
if (bias_term_)
|
||||||
@ -1152,7 +1108,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
cl_uint argIdx = 0;
|
cl_uint argIdx = 0;
|
||||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||||
kernel.set(argIdx++, image_offset);
|
kernel.set(argIdx++, image_offset);
|
||||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||||
|
@ -74,18 +74,22 @@
|
|||||||
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
|
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
|
||||||
} while(0)
|
} while(0)
|
||||||
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
|
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
|
||||||
|
#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
|
||||||
#else
|
#else
|
||||||
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
|
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
|
||||||
const Dtype _x_ = (_data_); \
|
const Dtype _x_ = (_data_); \
|
||||||
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
|
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
|
||||||
} while(0)
|
} while(0)
|
||||||
#define ELTWISE_DATA_ARG
|
#define ELTWISE_DATA_ARG
|
||||||
|
#define ELTWISE_DATA_ARG_WITH_OFFSET
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if APPLY_BIAS
|
#if APPLY_BIAS
|
||||||
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
|
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
|
||||||
|
#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
|
||||||
#else
|
#else
|
||||||
#define BIAS_KERNEL_ARG
|
#define BIAS_KERNEL_ARG
|
||||||
|
#define BIAS_KERNEL_ARG_WITH_OFFSET
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define __CAT(x, y) x##y
|
#define __CAT(x, y) x##y
|
||||||
@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
|
|||||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
|
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
|
||||||
__kernel void
|
__kernel void
|
||||||
convolve_simd(
|
convolve_simd(
|
||||||
ELTWISE_DATA_ARG
|
ELTWISE_DATA_ARG_WITH_OFFSET
|
||||||
FUSED_ARG
|
FUSED_ARG
|
||||||
__global Dtype* inputs,
|
__global Dtype* inputs_ptr, const int inputs_offset,
|
||||||
__global Dtype* weights,
|
__global Dtype* weights_ptr, const int weights_offset,
|
||||||
BIAS_KERNEL_ARG
|
BIAS_KERNEL_ARG_WITH_OFFSET
|
||||||
__global Dtype* outputs_base,
|
__global Dtype* outputs_base, const int outputs_offset,
|
||||||
const int outputs_offset,
|
|
||||||
const ushort input_width,
|
const ushort input_width,
|
||||||
const ushort input_height,
|
const ushort input_height,
|
||||||
const ushort output_width,
|
const ushort output_width,
|
||||||
const ushort output_height)
|
const ushort output_height)
|
||||||
{
|
{
|
||||||
|
__global Dtype* inputs = inputs_ptr + inputs_offset;
|
||||||
|
__global Dtype* weights = weights_ptr + weights_offset;
|
||||||
|
#if APPLY_BIAS
|
||||||
|
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
|
||||||
|
#endif
|
||||||
|
|
||||||
__global Dtype* outputs = outputs_base + outputs_offset;
|
__global Dtype* outputs = outputs_base + outputs_offset;
|
||||||
|
#ifdef FUSED_CONV_ELTWISE
|
||||||
|
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
|
||||||
|
#endif
|
||||||
|
|
||||||
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
|
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
|
||||||
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
|
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
|
||||||
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
|
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
|
||||||
|
Loading…
Reference in New Issue
Block a user