mirror of
https://github.com/opencv/opencv.git
synced 2024-12-01 06:49:50 +08:00
add depthwise convolution kernel
Signed-off-by: Li Peng <peng.li@intel.com>
This commit is contained in:
parent
910d7dab1f
commit
436d7e4eaf
@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
|
|||||||
bool createGEMMLikeConvKernel(int32_t blockWidth,
|
bool createGEMMLikeConvKernel(int32_t blockWidth,
|
||||||
int32_t blockHeight,
|
int32_t blockHeight,
|
||||||
int32_t blockDepth);
|
int32_t blockDepth);
|
||||||
|
bool createDWConvKernel(int32_t blockWidth,
|
||||||
|
int32_t blockHeight,
|
||||||
|
int32_t blockDepth);
|
||||||
void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
|
void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
|
||||||
int32_t offset, int32_t size, bool write_only);
|
int32_t offset, int32_t size, bool write_only);
|
||||||
bool convolve(const UMat &bottom, UMat &top,
|
bool convolve(const UMat &bottom, UMat &top,
|
||||||
@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
|
|||||||
int32_t M_;
|
int32_t M_;
|
||||||
|
|
||||||
bool tuned_;
|
bool tuned_;
|
||||||
|
bool dwconv_;
|
||||||
|
|
||||||
std::string key_, key_sanitized_;
|
std::string key_, key_sanitized_;
|
||||||
std::string short_key_;
|
std::string short_key_;
|
||||||
std::string kernel_name_;
|
std::string kernel_name_;
|
||||||
|
@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
|
|||||||
top_dim_ = num_output_ * output_w_ * output_h_;
|
top_dim_ = num_output_ * output_w_ * output_h_;
|
||||||
|
|
||||||
cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
|
cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
|
||||||
|
dwconv_ = (num_output_ == channels_ && channels_ == group_);
|
||||||
|
|
||||||
use_cache_path_ = false;
|
use_cache_path_ = false;
|
||||||
if (!cache_path_.empty())
|
if (!cache_path_.empty())
|
||||||
@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
|
|||||||
typedef enum {
|
typedef enum {
|
||||||
KERNEL_TYPE_INTEL_IDLF = 2,
|
KERNEL_TYPE_INTEL_IDLF = 2,
|
||||||
KERNEL_TYPE_BASIC = 4,
|
KERNEL_TYPE_BASIC = 4,
|
||||||
KERNEL_TYPE_GEMM_LIKE = 5
|
KERNEL_TYPE_GEMM_LIKE = 5,
|
||||||
|
KERNEL_TYPE_DWCONV = 6
|
||||||
} ocl4dnnConvSpatialKernelType_t;
|
} ocl4dnnConvSpatialKernelType_t;
|
||||||
|
|
||||||
template<typename Dtype>
|
template<typename Dtype>
|
||||||
@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
|
|||||||
if (clOptionSupport("-cl-no-subgroup-ifp"))
|
if (clOptionSupport("-cl-no-subgroup-ifp"))
|
||||||
options_ << " -cl-no-subgroup-ifp ";
|
options_ << " -cl-no-subgroup-ifp ";
|
||||||
|
|
||||||
|
addDef("KERNEL_GEMM_LIKE");
|
||||||
addDef("INPUT_DEPTH", channels_);
|
addDef("INPUT_DEPTH", channels_);
|
||||||
addDef("WIDTH1", M_);
|
addDef("WIDTH1", M_);
|
||||||
addDef("OUT_PADDING_LEFT", 0);
|
addDef("OUT_PADDING_LEFT", 0);
|
||||||
@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
|
|||||||
setFusionDefine(fused_activ_, fused_eltwise_);
|
setFusionDefine(fused_activ_, fused_eltwise_);
|
||||||
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
|
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
|
||||||
}
|
}
|
||||||
|
else if (kernelType == KERNEL_TYPE_DWCONV)
|
||||||
|
{
|
||||||
|
kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
|
||||||
|
kernel_name_ = "DWCONV_";
|
||||||
|
kernel_name_ += kernelUKey.c_str();
|
||||||
|
|
||||||
|
options_ << " -cl-fast-relaxed-math ";
|
||||||
|
if (clOptionSupport("-cl-no-subgroup-ifp"))
|
||||||
|
options_ << " -cl-no-subgroup-ifp ";
|
||||||
|
|
||||||
|
addDef("KERNEL_DWCONV");
|
||||||
|
addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
|
||||||
|
addDef("KERNEL_W", kernel_w_);
|
||||||
|
addDef("KERNEL_H", kernel_h_);
|
||||||
|
addDef("APPLY_BIAS", bias_term_);
|
||||||
|
addDef("OUTPUT_Z", num_output_ * num_);
|
||||||
|
addDef("CHANNELS", num_output_);
|
||||||
|
setFusionDefine(fused_activ_, fused_eltwise_);
|
||||||
|
|
||||||
|
options_ << " -D DWCONV=" << kernel_name_;
|
||||||
|
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Dtype>
|
template<typename Dtype>
|
||||||
@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else if (config->kernelType == KERNEL_TYPE_DWCONV) {
|
||||||
|
ocl::Kernel kernel(config->kernelName.c_str(), program);
|
||||||
|
if (kernel.empty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
cl_uint argIdx = 0;
|
||||||
|
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||||
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||||
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||||
|
if (bias_term_)
|
||||||
|
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
||||||
|
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||||
|
kernel.set(argIdx++, (uint16_t)width_);
|
||||||
|
kernel.set(argIdx++, (uint16_t)height_);
|
||||||
|
kernel.set(argIdx++, (uint16_t)output_w_);
|
||||||
|
kernel.set(argIdx++, (uint16_t)output_h_);
|
||||||
|
|
||||||
|
size_t global_size[3];
|
||||||
|
global_size[0] = output_w_;
|
||||||
|
global_size[1] = output_h_;
|
||||||
|
global_size[2] = num_output_ * num_;
|
||||||
|
|
||||||
|
if (!kernel.run(3, global_size, NULL, false))
|
||||||
|
{
|
||||||
|
std::cout << "DWCONV kernel run failed." << std::endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int32_t n = 0; n < numImages; ++n) {
|
for (int32_t n = 0; n < numImages; ++n) {
|
||||||
for (int32_t g = 0; g < group_; ++g) {
|
for (int32_t g = 0; g < group_; ++g) {
|
||||||
@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
|
||||||
|
int32_t blockHeight,
|
||||||
|
int32_t blockDepth)
|
||||||
|
{
|
||||||
|
if (!dwconv_)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
int workItemOutput[3] = { 1, 1, 1 };
|
||||||
|
size_t local_size[3] = { 1, 1, 1 };
|
||||||
|
size_t global_size[3];
|
||||||
|
global_size[0] = divUp(output_w_, workItemOutput[0]);
|
||||||
|
global_size[1] = divUp(output_h_, workItemOutput[1]);
|
||||||
|
global_size[2] = divUp(M_ * num_, workItemOutput[2]);
|
||||||
|
|
||||||
|
kernelType_ = KERNEL_TYPE_DWCONV;
|
||||||
|
blockM_ = blockWidth;
|
||||||
|
blockK_ = blockHeight;
|
||||||
|
blockN_ = blockDepth;
|
||||||
|
|
||||||
|
setupKernel();
|
||||||
|
|
||||||
|
ocl::Program program = compileKernel();
|
||||||
|
if (program.ptr())
|
||||||
|
{
|
||||||
|
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
|
||||||
|
&workItemOutput[0], false, KERNEL_TYPE_DWCONV));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
|
bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
|
||||||
int32_t blockWidth,
|
int32_t blockWidth,
|
||||||
@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
|
|||||||
return createBasicKernel(blockWidth, blockHeight, blockDepth);
|
return createBasicKernel(blockWidth, blockHeight, blockDepth);
|
||||||
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
|
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
|
||||||
return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
|
return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
|
||||||
|
else if (kernelType == KERNEL_TYPE_DWCONV)
|
||||||
|
return createDWConvKernel(blockWidth, blockHeight, blockDepth);
|
||||||
else
|
else
|
||||||
CV_Assert(0 && "Internal error");
|
CV_Assert(0 && "Internal error");
|
||||||
return false;
|
return false;
|
||||||
@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
|
|||||||
template<>
|
template<>
|
||||||
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
|
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
|
||||||
{
|
{
|
||||||
if (ocl::Device::getDefault().intelSubgroupsSupport()) {
|
if (ocl::Device::getDefault().intelSubgroupsSupport())
|
||||||
|
{
|
||||||
|
//depth_wise kernels
|
||||||
|
if (dwconv_)
|
||||||
|
{
|
||||||
|
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
|
||||||
|
if (group_ > 8)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/* IDLF kernels are using Intel specific extension which make
|
/* IDLF kernels are using Intel specific extension which make
|
||||||
them intel only. */
|
them intel only. */
|
||||||
// Generates static key_
|
// Generates static key_
|
||||||
|
@ -383,7 +383,7 @@ convolve_simd(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // KERNEL_GEMM_LIKE
|
#elif defined KERNEL_GEMM_LIKE
|
||||||
|
|
||||||
#if APPLY_BIAS
|
#if APPLY_BIAS
|
||||||
// Dtype bias[4];
|
// Dtype bias[4];
|
||||||
@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
|||||||
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
|
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
|
|
||||||
|
#elif defined KERNEL_DWCONV
|
||||||
|
|
||||||
|
__kernel void DWCONV(
|
||||||
|
ELTWISE_DATA_ARG
|
||||||
|
NEGATIVE_SLOPE_ARG
|
||||||
|
__global Dtype* image_data,
|
||||||
|
__global Dtype* kernel_data,
|
||||||
|
BIAS_KERNEL_ARG
|
||||||
|
__global Dtype* convolved_image,
|
||||||
|
const ushort input_width,
|
||||||
|
const ushort input_height,
|
||||||
|
const ushort output_width,
|
||||||
|
const ushort output_height) {
|
||||||
|
|
||||||
|
const int outputX = get_global_id(0);
|
||||||
|
const int outputY = get_global_id(1);
|
||||||
|
const int outputZ = get_global_id(2);
|
||||||
|
if(outputX < output_width && outputY < output_height)
|
||||||
|
{
|
||||||
|
Dtype sum = 0.;
|
||||||
|
|
||||||
|
const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
|
||||||
|
const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
|
||||||
|
const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
|
||||||
|
const int biasIndex=outputZ%CHANNELS;
|
||||||
|
const int local_image_offset = org_y*input_width + org_x;
|
||||||
|
const int imageSize = input_width*input_height;
|
||||||
|
|
||||||
|
__global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
|
||||||
|
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
|
||||||
|
|
||||||
|
for(int y = 0; y < KERNEL_H; y++)
|
||||||
|
{
|
||||||
|
for(int x = 0; x < KERNEL_W; x++)
|
||||||
|
{
|
||||||
|
if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
|
||||||
|
}
|
||||||
|
image_dataPtrFloat += input_width * DILATION_Y;
|
||||||
|
kernel_dataPtrFloat += KERNEL_W;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if APPLY_BIAS
|
||||||
|
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
|
||||||
|
ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
|
||||||
|
#else
|
||||||
|
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
|
||||||
|
ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV
|
||||||
|
Loading…
Reference in New Issue
Block a user