mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 22:44:02 +08:00
add depthwise convolution kernel
Signed-off-by: Li Peng <peng.li@intel.com>
This commit is contained in:
parent
910d7dab1f
commit
436d7e4eaf
@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
|
||||
bool createGEMMLikeConvKernel(int32_t blockWidth,
|
||||
int32_t blockHeight,
|
||||
int32_t blockDepth);
|
||||
bool createDWConvKernel(int32_t blockWidth,
|
||||
int32_t blockHeight,
|
||||
int32_t blockDepth);
|
||||
void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
|
||||
int32_t offset, int32_t size, bool write_only);
|
||||
bool convolve(const UMat &bottom, UMat &top,
|
||||
@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
|
||||
int32_t M_;
|
||||
|
||||
bool tuned_;
|
||||
bool dwconv_;
|
||||
|
||||
std::string key_, key_sanitized_;
|
||||
std::string short_key_;
|
||||
std::string kernel_name_;
|
||||
|
@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
|
||||
top_dim_ = num_output_ * output_w_ * output_h_;
|
||||
|
||||
cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
|
||||
dwconv_ = (num_output_ == channels_ && channels_ == group_);
|
||||
|
||||
use_cache_path_ = false;
|
||||
if (!cache_path_.empty())
|
||||
@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
|
||||
typedef enum {
|
||||
KERNEL_TYPE_INTEL_IDLF = 2,
|
||||
KERNEL_TYPE_BASIC = 4,
|
||||
KERNEL_TYPE_GEMM_LIKE = 5
|
||||
KERNEL_TYPE_GEMM_LIKE = 5,
|
||||
KERNEL_TYPE_DWCONV = 6
|
||||
} ocl4dnnConvSpatialKernelType_t;
|
||||
|
||||
template<typename Dtype>
|
||||
@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
|
||||
if (clOptionSupport("-cl-no-subgroup-ifp"))
|
||||
options_ << " -cl-no-subgroup-ifp ";
|
||||
|
||||
addDef("KERNEL_GEMM_LIKE");
|
||||
addDef("INPUT_DEPTH", channels_);
|
||||
addDef("WIDTH1", M_);
|
||||
addDef("OUT_PADDING_LEFT", 0);
|
||||
@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
|
||||
setFusionDefine(fused_activ_, fused_eltwise_);
|
||||
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
|
||||
}
|
||||
else if (kernelType == KERNEL_TYPE_DWCONV)
|
||||
{
|
||||
kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
|
||||
kernel_name_ = "DWCONV_";
|
||||
kernel_name_ += kernelUKey.c_str();
|
||||
|
||||
options_ << " -cl-fast-relaxed-math ";
|
||||
if (clOptionSupport("-cl-no-subgroup-ifp"))
|
||||
options_ << " -cl-no-subgroup-ifp ";
|
||||
|
||||
addDef("KERNEL_DWCONV");
|
||||
addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
|
||||
addDef("KERNEL_W", kernel_w_);
|
||||
addDef("KERNEL_H", kernel_h_);
|
||||
addDef("APPLY_BIAS", bias_term_);
|
||||
addDef("OUTPUT_Z", num_output_ * num_);
|
||||
addDef("CHANNELS", num_output_);
|
||||
setFusionDefine(fused_activ_, fused_eltwise_);
|
||||
|
||||
options_ << " -D DWCONV=" << kernel_name_;
|
||||
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Dtype>
|
||||
@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else if (config->kernelType == KERNEL_TYPE_DWCONV) {
|
||||
ocl::Kernel kernel(config->kernelName.c_str(), program);
|
||||
if (kernel.empty())
|
||||
return false;
|
||||
|
||||
cl_uint argIdx = 0;
|
||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||
if (bias_term_)
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||
kernel.set(argIdx++, (uint16_t)width_);
|
||||
kernel.set(argIdx++, (uint16_t)height_);
|
||||
kernel.set(argIdx++, (uint16_t)output_w_);
|
||||
kernel.set(argIdx++, (uint16_t)output_h_);
|
||||
|
||||
size_t global_size[3];
|
||||
global_size[0] = output_w_;
|
||||
global_size[1] = output_h_;
|
||||
global_size[2] = num_output_ * num_;
|
||||
|
||||
if (!kernel.run(3, global_size, NULL, false))
|
||||
{
|
||||
std::cout << "DWCONV kernel run failed." << std::endl;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
for (int32_t n = 0; n < numImages; ++n) {
|
||||
for (int32_t g = 0; g < group_; ++g) {
|
||||
@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
|
||||
return false;
|
||||
}
|
||||
|
||||
template<>
|
||||
bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
|
||||
int32_t blockHeight,
|
||||
int32_t blockDepth)
|
||||
{
|
||||
if (!dwconv_)
|
||||
return false;
|
||||
|
||||
int workItemOutput[3] = { 1, 1, 1 };
|
||||
size_t local_size[3] = { 1, 1, 1 };
|
||||
size_t global_size[3];
|
||||
global_size[0] = divUp(output_w_, workItemOutput[0]);
|
||||
global_size[1] = divUp(output_h_, workItemOutput[1]);
|
||||
global_size[2] = divUp(M_ * num_, workItemOutput[2]);
|
||||
|
||||
kernelType_ = KERNEL_TYPE_DWCONV;
|
||||
blockM_ = blockWidth;
|
||||
blockK_ = blockHeight;
|
||||
blockN_ = blockDepth;
|
||||
|
||||
setupKernel();
|
||||
|
||||
ocl::Program program = compileKernel();
|
||||
if (program.ptr())
|
||||
{
|
||||
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
|
||||
&workItemOutput[0], false, KERNEL_TYPE_DWCONV));
|
||||
return true;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
template<>
|
||||
bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
|
||||
int32_t blockWidth,
|
||||
@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
|
||||
return createBasicKernel(blockWidth, blockHeight, blockDepth);
|
||||
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
|
||||
return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
|
||||
else if (kernelType == KERNEL_TYPE_DWCONV)
|
||||
return createDWConvKernel(blockWidth, blockHeight, blockDepth);
|
||||
else
|
||||
CV_Assert(0 && "Internal error");
|
||||
return false;
|
||||
@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
|
||||
template<>
|
||||
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
|
||||
{
|
||||
if (ocl::Device::getDefault().intelSubgroupsSupport()) {
|
||||
if (ocl::Device::getDefault().intelSubgroupsSupport())
|
||||
{
|
||||
//depth_wise kernels
|
||||
if (dwconv_)
|
||||
{
|
||||
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
|
||||
if (group_ > 8)
|
||||
return;
|
||||
}
|
||||
|
||||
/* IDLF kernels are using Intel specific extension which make
|
||||
them intel only. */
|
||||
// Generates static key_
|
||||
|
@ -383,7 +383,7 @@ convolve_simd(
|
||||
}
|
||||
}
|
||||
|
||||
#else // KERNEL_GEMM_LIKE
|
||||
#elif defined KERNEL_GEMM_LIKE
|
||||
|
||||
#if APPLY_BIAS
|
||||
// Dtype bias[4];
|
||||
@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
|
||||
}
|
||||
#endif
|
||||
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
|
||||
|
||||
#elif defined KERNEL_DWCONV
|
||||
|
||||
__kernel void DWCONV(
|
||||
ELTWISE_DATA_ARG
|
||||
NEGATIVE_SLOPE_ARG
|
||||
__global Dtype* image_data,
|
||||
__global Dtype* kernel_data,
|
||||
BIAS_KERNEL_ARG
|
||||
__global Dtype* convolved_image,
|
||||
const ushort input_width,
|
||||
const ushort input_height,
|
||||
const ushort output_width,
|
||||
const ushort output_height) {
|
||||
|
||||
const int outputX = get_global_id(0);
|
||||
const int outputY = get_global_id(1);
|
||||
const int outputZ = get_global_id(2);
|
||||
if(outputX < output_width && outputY < output_height)
|
||||
{
|
||||
Dtype sum = 0.;
|
||||
|
||||
const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
|
||||
const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
|
||||
const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
|
||||
const int biasIndex=outputZ%CHANNELS;
|
||||
const int local_image_offset = org_y*input_width + org_x;
|
||||
const int imageSize = input_width*input_height;
|
||||
|
||||
__global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
|
||||
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
|
||||
|
||||
for(int y = 0; y < KERNEL_H; y++)
|
||||
{
|
||||
for(int x = 0; x < KERNEL_W; x++)
|
||||
{
|
||||
if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
|
||||
}
|
||||
image_dataPtrFloat += input_width * DILATION_Y;
|
||||
kernel_dataPtrFloat += KERNEL_W;
|
||||
}
|
||||
|
||||
#if APPLY_BIAS
|
||||
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
|
||||
ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
|
||||
#else
|
||||
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
|
||||
ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV
|
||||
|
Loading…
Reference in New Issue
Block a user