add depthwise convolution kernel

Signed-off-by: Li Peng <peng.li@intel.com>
2024-12-01 06:49:50 +08:00 · 2017-12-19 17:59:13 +08:00 · 2017-12-19 17:59:13 +08:00 · 436d7e4eaf
commit 436d7e4eaf
parent 910d7dab1f
3 changed files with 160 additions and 4 deletions
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
        bool createGEMMLikeConvKernel(int32_t blockWidth,
                                      int32_t blockHeight,
                                      int32_t blockDepth);
        bool createDWConvKernel(int32_t blockWidth,
                                int32_t blockHeight,
                                int32_t blockDepth);
        void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
                             int32_t offset, int32_t size, bool write_only);
        bool convolve(const UMat &bottom, UMat &top,
@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
        int32_t M_;
        bool tuned_;
        bool dwconv_;
        std::string key_, key_sanitized_;
        std::string short_key_;
        std::string kernel_name_;
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
    top_dim_ = num_output_ * output_w_ * output_h_;
    cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
    dwconv_ = (num_output_ == channels_ && channels_ == group_);
    use_cache_path_ = false;
    if (!cache_path_.empty())
@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
 // Identifiers for the convolution kernel variants. The value is passed as
 // `kernelType` to setupKernelDetails() and createConvolutionKernel() to pick
 // which kernel gets configured and built.
 typedef enum {
    KERNEL_TYPE_INTEL_IDLF = 2,
    KERNEL_TYPE_BASIC = 4,
-    KERNEL_TYPE_GEMM_LIKE = 5
+    KERNEL_TYPE_GEMM_LIKE = 5,
    // Depthwise convolution kernel (built with the KERNEL_DWCONV define).
    KERNEL_TYPE_DWCONV = 6
 } ocl4dnnConvSpatialKernelType_t;
 template<typename Dtype>
@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
        if (clOptionSupport("-cl-no-subgroup-ifp"))
            options_ << " -cl-no-subgroup-ifp ";
        addDef("KERNEL_GEMM_LIKE");
        addDef("INPUT_DEPTH", channels_);
        addDef("WIDTH1", M_);
        addDef("OUT_PADDING_LEFT", 0);
@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
        setFusionDefine(fused_activ_, fused_eltwise_);
        src_ = ocl::dnn::conv_layer_spatial_oclsrc;
    }
    else if (kernelType == KERNEL_TYPE_DWCONV)
    {
        kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
        kernel_name_ = "DWCONV_";
        kernel_name_ += kernelUKey.c_str();
        options_ << " -cl-fast-relaxed-math ";
        if (clOptionSupport("-cl-no-subgroup-ifp"))
            options_ << " -cl-no-subgroup-ifp ";
        addDef("KERNEL_DWCONV");
        addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
        addDef("KERNEL_W", kernel_w_);
        addDef("KERNEL_H", kernel_h_);
        addDef("APPLY_BIAS", bias_term_);
        addDef("OUTPUT_Z", num_output_ * num_);
        addDef("CHANNELS", num_output_);
        setFusionDefine(fused_activ_, fused_eltwise_);
        options_ << " -D DWCONV=" << kernel_name_;
        src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
    }
 }
 template<typename Dtype>
@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                return false;
            }
        }
    } else if (config->kernelType == KERNEL_TYPE_DWCONV) {
        ocl::Kernel kernel(config->kernelName.c_str(), program);
        if (kernel.empty())
            return false;
        cl_uint argIdx = 0;
        setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
        if (bias_term_)
            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
        kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
        kernel.set(argIdx++, (uint16_t)width_);
        kernel.set(argIdx++, (uint16_t)height_);
        kernel.set(argIdx++, (uint16_t)output_w_);
        kernel.set(argIdx++, (uint16_t)output_h_);
        size_t global_size[3];
        global_size[0] = output_w_;
        global_size[1] = output_h_;
        global_size[2] = num_output_ * num_;
        if (!kernel.run(3, global_size, NULL, false))
        {
            std::cout << "DWCONV kernel run failed." << std::endl;
            return false;
        }
    } else {
        for (int32_t n = 0; n < numImages; ++n) {
            for (int32_t g = 0; g < group_; ++g) {
@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
        return false;
 }
 // Builds the depthwise-convolution kernel and, on success, appends its
 // launch configuration to kernelQueue.
 //
 // blockWidth / blockHeight / blockDepth are stored into blockM_ / blockK_ /
 // blockN_ before the kernel is set up.
 //
 // Returns false when the layer is not depthwise (dwconv_ is false) or when
 // the program could not be compiled; true otherwise.
 template<>
 bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
                                                    int32_t blockHeight,
                                                    int32_t blockDepth)
 {
    // The depthwise kernel is only valid for layers flagged as depthwise.
    if (!dwconv_)
        return false;
    // Each work-item produces a single output element, so the per-item
    // output block and the work-group shape are both 1x1x1.
    int outputsPerItem[3] = { 1, 1, 1 };
    size_t localDims[3] = { 1, 1, 1 };
    size_t globalDims[3];
    globalDims[0] = divUp(output_w_, outputsPerItem[0]);
    globalDims[1] = divUp(output_h_, outputsPerItem[1]);
    // NOTE(review): convolve() launches this kernel with
    // num_output_ * num_ in the third dimension; confirm that M_ * num_
    // is the intended value to record here.
    globalDims[2] = divUp(M_ * num_, outputsPerItem[2]);
    // Record the kernel type and block parameters before setupKernel() runs.
    blockM_ = blockWidth;
    blockK_ = blockHeight;
    blockN_ = blockDepth;
    kernelType_ = KERNEL_TYPE_DWCONV;
    setupKernel();
    ocl::Program program = compileKernel();
    // A null program handle means compilation failed; queue nothing.
    if (!program.ptr())
        return false;
    kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &globalDims[0], &localDims[0],
                          &outputsPerItem[0], false, KERNEL_TYPE_DWCONV));
    return true;
 }
 template<>
 bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
                                                        int32_t blockWidth,
@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
        return createBasicKernel(blockWidth, blockHeight, blockDepth);
    else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
        return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
    else if (kernelType == KERNEL_TYPE_DWCONV)
        return createDWConvKernel(blockWidth, blockHeight, blockDepth);
    else
        CV_Assert(0 && "Internal error");
    return false;
@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
 template<>
 void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
 {
-    if (ocl::Device::getDefault().intelSubgroupsSupport()) {
+    if (ocl::Device::getDefault().intelSubgroupsSupport())
    {
        //depth_wise kernels
        if (dwconv_)
        {
            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
            if (group_ > 8)
                return;
        }
        /* IDLF kernels are using Intel specific extension which make
           them intel only. */
        // Generates static key_
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@ -383,7 +383,7 @@ convolve_simd(
  }
 }
-#else // KERNEL_GEMM_LIKE
+#elif defined KERNEL_GEMM_LIKE
 #if APPLY_BIAS
 // Dtype bias[4];
@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
    INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
 }
 #endif
-#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
+
 #elif defined KERNEL_DWCONV
 // Depthwise convolution: every work-item computes one output element,
 // addressed by its global ids (x, y, z). The z index selects the
 // input/output plane; z % CHANNELS selects the filter and the bias entry.
 // Filter taps that fall outside the input image are skipped, so they
 // contribute nothing to the sum.
 __kernel void DWCONV(
    ELTWISE_DATA_ARG
    NEGATIVE_SLOPE_ARG
    __global Dtype* image_data,
    __global Dtype* kernel_data,
    BIAS_KERNEL_ARG
    __global Dtype* convolved_image,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height) {
  const int outputX = get_global_id(0);
  const int outputY = get_global_id(1);
  const int outputZ = get_global_id(2);
  // Work-items outside the output plane have nothing to do.
  if (outputX >= output_width || outputY >= output_height)
    return;

  // Top-left input coordinate covered by the filter (may be negative
  // because of padding).
  const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
  const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
  const int biasIndex = outputZ % CHANNELS;
  // Start of this channel's filter weights and of this plane's input data.
  const int weightBase = KERNEL_SIZE * (outputZ % CHANNELS);
  const int imageSize = input_width * input_height;
  const int planeBase = imageSize * outputZ;

  Dtype sum = 0.;
  for (int ky = 0; ky < KERNEL_H; ky++)
  {
    const int in_y = org_y + ky * DILATION_Y;
    // The whole filter row lies outside the image: skip it.
    if (in_y < 0 || in_y >= input_height)
      continue;
    for (int kx = 0; kx < KERNEL_W; kx++)
    {
      const int in_x = org_x + kx * DILATION_X;
      if (in_x < 0 || in_x >= input_width)
        continue;
      sum += image_data[planeBase + in_y * input_width + in_x]
           * kernel_data[weightBase + ky * KERNEL_W + kx];
    }
  }

  // Linear index of this output element inside convolved_image.
  int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
    #if APPLY_BIAS
  ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
    #else
  ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
    #endif
 }
 #endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV