From 910d7dab1f056ccd6531c8615e4f50e0c1c1c76d Mon Sep 17 00:00:00 2001
From: Li Peng <peng.li@intel.com>
Date: Tue, 5 Dec 2017 23:17:34 +0800
Subject: [PATCH 1/3] prior box layer ocl implementation

Signed-off-by: Li Peng <peng.li@intel.com>
---
 modules/dnn/src/layers/prior_box_layer.cpp | 106 +++++++++++++++
 modules/dnn/src/opencl/prior_box.cl        | 148 +++++++++++++++++++++
 2 files changed, 254 insertions(+)
 create mode 100644 modules/dnn/src/opencl/prior_box.cl
diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp
index 5fc852a82e..575ac5eb9c 100644
--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@@ -45,6 +45,7 @@
 #include <float.h>
 #include <algorithm>
 #include <cmath>
+#include "opencl_kernels_dnn.hpp"
 
 namespace cv
 {
@@ -270,11 +271,108 @@ public:
         return false;
     }
 
+#ifdef HAVE_OPENCL
+    // OpenCL implementation: the prior_box kernel fills the box coordinates at the start of
+    // outputs[0] and set_variance fills the variances after them; false means a kernel failed.
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        int _layerWidth = inputs[0].size[3];
+        int _layerHeight = inputs[0].size[2];
+        int _imageWidth = inputs[1].size[3];
+        int _imageHeight = inputs[1].size[2];
+
+        float stepX, stepY;
+        if (_stepX == 0 || _stepY == 0)
+        {
+            stepX = static_cast<float>(_imageWidth) / _layerWidth;
+            stepY = static_cast<float>(_imageHeight) / _layerHeight;
+        } else {
+            stepX = _stepX;
+            stepY = _stepY;
+        }
+
+        // Upload the constant parameter tables only once; later calls reuse the cached UMats.
+        if (umat_offsetsX.empty())
+        {
+            Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
+            Mat offsetsY(1, _offsetsY.size(), CV_32FC1, &_offsetsY[0]);
+            Mat aspectRatios(1, _aspectRatios.size(), CV_32FC1, &_aspectRatios[0]);
+            Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]);
+
+            offsetsX.copyTo(umat_offsetsX);
+            offsetsY.copyTo(umat_offsetsY);
+            aspectRatios.copyTo(umat_aspectRatios);
+            variance.copyTo(umat_variance);
+
+            int real_numPriors = _numPriors / pow(2, _offsetsX.size() - 1);
+            umat_scales = UMat(1, &real_numPriors, CV_32F, 1.0f);
+        }
+
+        size_t nthreads = _layerHeight * _layerWidth;
+        ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
+        kernel.set(0, (int)nthreads);
+        kernel.set(1, (float)stepX);
+        kernel.set(2, (float)stepY);
+        kernel.set(3, (float)_minSize);
+        kernel.set(4, (float)_maxSize);
+        kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
+        kernel.set(6, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
+        kernel.set(7, (int)_offsetsX.size());
+        kernel.set(8, ocl::KernelArg::PtrReadOnly(umat_aspectRatios));
+        kernel.set(9, (int)_aspectRatios.size());
+        kernel.set(10, ocl::KernelArg::PtrReadOnly(umat_scales));
+        kernel.set(11, ocl::KernelArg::PtrWriteOnly(outputs[0]));
+        kernel.set(12, (int)_layerHeight);
+        kernel.set(13, (int)_layerWidth);
+        kernel.set(14, (int)_imageHeight);
+        kernel.set(15, (int)_imageWidth);
+        if (!kernel.run(1, &nthreads, NULL, false))
+            return false;
+
+        // Clip the prior coordinates to [0, 1]. The boxes start at the beginning of outputs[0],
+        // and the mapping must be read-write so the clipped values reach the device buffer.
+        if (_clip)
+        {
+            Mat mat = outputs[0].getMat(ACCESS_RW);
+            float* outputPtr = mat.ptr<float>();
+            const size_t outChannelSize = (size_t)_layerHeight * _layerWidth * _numPriors * 4;
+            for (size_t d = 0; d < outChannelSize; ++d)
+            {
+                outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
+            }
+        }
+
+        // set the variance.
+        {
+            ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc);
+            int offset = total(shape(outputs[0]), 2);
+            size_t nthreads = _layerHeight * _layerWidth * _numPriors;
+            kernel.set(0, (int)nthreads);
+            kernel.set(1, (int)offset);
+            kernel.set(2, (int)_variance.size());
+            kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_variance));
+            kernel.set(4, ocl::KernelArg::PtrWriteOnly(outputs[0]));
+            if (!kernel.run(1, &nthreads, NULL, false))
+                return false;
+        }
+        return true;
+    }
+#endif
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+        // NOTE(review): presumably CV_OCL_RUN returns early when forward_ocl succeeds - confirm.
         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
     }
 
@@ -441,6 +539,14 @@ private:
     std::vector<float> _offsetsX;
     std::vector<float> _offsetsY;
 
+#ifdef HAVE_OPENCL
+    UMat umat_offsetsX;
+    UMat umat_offsetsY;
+    UMat umat_aspectRatios;
+    UMat umat_scales;
+    UMat umat_variance;
+#endif
+
     bool _flip;
     bool _clip;
     bool _explicitSizes;
diff --git a/modules/dnn/src/opencl/prior_box.cl b/modules/dnn/src/opencl/prior_box.cl
new file mode 100644
index 0000000000..660ccb64d5
--- /dev/null
+++ b/modules/dnn/src/opencl/prior_box.cl
@@ -0,0 +1,148 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define Dtype float
+#define Dtype4 float4
+// For each index < nthreads (cell h = index / _layerWidth, w = index % _layerWidth) writes boxes to dst.
+__kernel void prior_box(const int nthreads,
+                        const Dtype stepX,
+                        const Dtype stepY,
+                        const Dtype _minSize,
+                        const Dtype _maxSize,
+                        __global const Dtype* _offsetsX,
+                        __global const Dtype* _offsetsY,
+                        const int offsetsX_size,
+                        __global const Dtype* _aspectRatios,
+                        const int aspectRatios_size,
+                        __global const Dtype* scales,
+                        __global Dtype* dst,
+                        const int _layerHeight,
+                        const int _layerWidth,
+                        const int imgHeight,
+                        const int imgWidth)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        int w = index % _layerWidth;
+        int h = index / _layerWidth;
+        __global Dtype* outputPtr;
+        int aspect_count = (_maxSize > 0) ? 1 : 0;
+        outputPtr = dst + index * 4 * offsetsX_size * (1 + aspect_count + aspectRatios_size);
+        // First group: square boxes of side _minSize * scales[0], one (xmin, ymin, xmax, ymax) per offset.
+        Dtype _boxWidth, _boxHeight;
+        Dtype4 vec;
+        _boxWidth = _boxHeight = _minSize * scales[0];
+        for (int i = 0; i < offsetsX_size; ++i)
+        {
+            float center_x = (w + _offsetsX[i]) * stepX;
+            float center_y = (h + _offsetsY[i]) * stepY;
+
+            vec.x = (center_x - _boxWidth * 0.5f) / imgWidth;    // xmin
+            vec.y = (center_y - _boxHeight * 0.5f) / imgHeight;  // ymin
+            vec.z = (center_x + _boxWidth * 0.5f) / imgWidth;    // xmax
+            vec.w = (center_y + _boxHeight * 0.5f) / imgHeight;  // ymax
+            vstore4(vec, 0, outputPtr);
+
+            outputPtr += 4;
+        }
+        // Second group (only when _maxSize > 0): squares of side sqrt(_minSize * _maxSize) * scales[1].
+        if (_maxSize > 0)
+        {
+            _boxWidth = _boxHeight = native_sqrt(_minSize * _maxSize) * scales[1];
+
+            for (int i = 0; i < offsetsX_size; ++i)
+            {
+                float center_x = (w + _offsetsX[i]) * stepX;
+                float center_y = (h + _offsetsY[i]) * stepY;
+
+                vec.x = (center_x - _boxWidth * 0.5f) / imgWidth;    // xmin
+                vec.y = (center_y - _boxHeight * 0.5f) / imgHeight;  // ymin
+                vec.z = (center_x + _boxWidth * 0.5f) / imgWidth;    // xmax
+                vec.w = (center_y + _boxHeight * 0.5f) / imgHeight;  // ymax
+                vstore4(vec, 0, outputPtr);
+
+                outputPtr += 4;
+            }
+        }
+        // Remaining groups, one per aspect ratio: width = _minSize * sqrt(ratio) * scale, height = _minSize / sqrt(ratio) * scale.
+        for (int r = 0; r < aspectRatios_size; ++r)
+        {
+            float ar = native_sqrt(_aspectRatios[r]);
+            float scale = scales[(_maxSize > 0 ? 2 : 1) + r];
+
+            _boxWidth = _minSize * ar * scale;
+            _boxHeight = _minSize / ar * scale;
+
+            for (int i = 0; i < offsetsX_size; ++i)
+            {
+                float center_x = (w + _offsetsX[i]) * stepX;
+                float center_y = (h + _offsetsY[i]) * stepY;
+
+                vec.x = (center_x - _boxWidth * 0.5f) / imgWidth;    // xmin
+                vec.y = (center_y - _boxHeight * 0.5f) / imgHeight;  // ymin
+                vec.z = (center_x + _boxWidth * 0.5f) / imgWidth;    // xmax
+                vec.w = (center_y + _boxHeight * 0.5f) / imgHeight;  // ymax
+                vstore4(vec, 0, outputPtr);
+
+                outputPtr += 4;
+            }
+        }
+    }
+}
+// Stores nthreads float4 entries at dst + offset: variance[0] in all lanes if variance_size == 1, else variance[0..3].
+__kernel void set_variance(const int nthreads,
+                           const int offset,
+                           const int variance_size,
+                           __global const Dtype* variance,
+                           __global Dtype* dst)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        Dtype4 var_vec;
+
+        if (variance_size == 1)
+            var_vec = (Dtype4)(variance[0]);
+        else
+            var_vec = vload4(0, variance);
+
+        vstore4(var_vec, 0, dst + offset + index * 4);
+    }
+}

From 436d7e4eaf4315139e81fda29db316c4ef81eb04 Mon Sep 17 00:00:00 2001
From: Li Peng <peng.li@intel.com>
Date: Tue, 19 Dec 2017 17:59:13 +0800
Subject: [PATCH 2/3] add depthwise convolution kernel

Signed-off-by: Li Peng <peng.li@intel.com>
---
 modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp   |   5 +
 .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp  | 100 +++++++++++++++++-
 modules/dnn/src/opencl/conv_layer_spatial.cl  |  59 ++++++++++-
 3 files changed, 160 insertions(+), 4 deletions(-)

diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
index b137896bbe..f9a74ae4e7 100644
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
         bool createGEMMLikeConvKernel(int32_t blockWidth,
                                       int32_t blockHeight,
                                       int32_t blockDepth);
+        bool createDWConvKernel(int32_t blockWidth,
+                                int32_t blockHeight,
+                                int32_t blockDepth);
         void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
                              int32_t offset, int32_t size, bool write_only);
         bool convolve(const UMat &bottom, UMat &top,
@@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
         int32_t M_;
 
         bool tuned_;
+        bool dwconv_;
+
         std::string key_, key_sanitized_;
         std::string short_key_;
         std::string kernel_name_;
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 6a305558eb..ae188f763b 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
     top_dim_ = num_output_ * output_w_ * output_h_;
 
     cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
+    dwconv_ = (num_output_ == channels_ && channels_ == group_);
 
     use_cache_path_ = false;
     if (!cache_path_.empty())
@@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
 typedef enum {
     KERNEL_TYPE_INTEL_IDLF = 2,
     KERNEL_TYPE_BASIC = 4,
-    KERNEL_TYPE_GEMM_LIKE = 5
+    KERNEL_TYPE_GEMM_LIKE = 5,
+    KERNEL_TYPE_DWCONV = 6
 } ocl4dnnConvSpatialKernelType_t;
 
 template<typename Dtype>
@@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         if (clOptionSupport("-cl-no-subgroup-ifp"))
             options_ << " -cl-no-subgroup-ifp ";
 
+        addDef("KERNEL_GEMM_LIKE");
         addDef("INPUT_DEPTH", channels_);
         addDef("WIDTH1", M_);
         addDef("OUT_PADDING_LEFT", 0);
@@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         setFusionDefine(fused_activ_, fused_eltwise_);
         src_ = ocl::dnn::conv_layer_spatial_oclsrc;
     }
+    else if (kernelType == KERNEL_TYPE_DWCONV)
+    {
+        kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
+        kernel_name_ = "DWCONV_";
+        kernel_name_ += kernelUKey.c_str();
+
+        options_ << " -cl-fast-relaxed-math ";
+        if (clOptionSupport("-cl-no-subgroup-ifp"))
+            options_ << " -cl-no-subgroup-ifp ";
+
+        addDef("KERNEL_DWCONV");
+        addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
+        addDef("KERNEL_W", kernel_w_);
+        addDef("KERNEL_H", kernel_h_);
+        addDef("APPLY_BIAS", bias_term_);
+        addDef("OUTPUT_Z", num_output_ * num_);
+        addDef("CHANNELS", num_output_);
+        setFusionDefine(fused_activ_, fused_eltwise_);
+
+        options_ << " -D DWCONV=" << kernel_name_;
+        src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
+    }
 }
 
 template<typename Dtype>
@@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 return false;
             }
         }
+    } else if (config->kernelType == KERNEL_TYPE_DWCONV) {
+        ocl::Kernel kernel(config->kernelName.c_str(), program);
+        if (kernel.empty())
+            return false;
+
+        cl_uint argIdx = 0;
+        setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+        if (bias_term_)
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+        kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+        kernel.set(argIdx++, (uint16_t)width_);
+        kernel.set(argIdx++, (uint16_t)height_);
+        kernel.set(argIdx++, (uint16_t)output_w_);
+        kernel.set(argIdx++, (uint16_t)output_h_);
+
+        size_t global_size[3];
+        global_size[0] = output_w_;
+        global_size[1] = output_h_;
+        global_size[2] = num_output_ * num_;
+
+        if (!kernel.run(3, global_size, NULL, false))
+        {
+            std::cout << "DWCONV kernel run failed." << std::endl;
+            return false;
+        }
     } else {
         for (int32_t n = 0; n < numImages; ++n) {
             for (int32_t g = 0; g < group_; ++g) {
@@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
         return false;
 }
 
+// Sets up and compiles the KERNEL_TYPE_DWCONV kernel for the given block sizes and queues its
+// configuration; returns false when dwconv_ is not set or compileKernel() gives an empty program.
+template<>
+bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
+                                                   int32_t blockHeight,
+                                                   int32_t blockDepth)
+{
+    if (!dwconv_)
+        return false;
+
+    int itemOutput[3] = { 1, 1, 1 };
+    size_t localDims[3] = { 1, 1, 1 };
+    size_t globalDims[3];
+    globalDims[0] = divUp(output_w_, itemOutput[0]);
+    globalDims[1] = divUp(output_h_, itemOutput[1]);
+    globalDims[2] = divUp(M_ * num_, itemOutput[2]);
+
+    // Record the variant being built; setupKernel() is called right after these are set.
+    blockM_ = blockWidth;
+    blockK_ = blockHeight;
+    blockN_ = blockDepth;
+    kernelType_ = KERNEL_TYPE_DWCONV;
+    setupKernel();
+
+    ocl::Program program = compileKernel();
+    if (!program.ptr())
+        return false;
+
+    kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &globalDims[0], &localDims[0],
+                          &itemOutput[0], false, KERNEL_TYPE_DWCONV));
+    return true;
+}
+
 template<>
 bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
                                                         int32_t blockWidth,
@@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
         return createBasicKernel(blockWidth, blockHeight, blockDepth);
     else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
         return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
+    else if (kernelType == KERNEL_TYPE_DWCONV)
+        return createDWConvKernel(blockWidth, blockHeight, blockDepth);
     else
         CV_Assert(0 && "Internal error");
     return false;
@@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
 template<>
 void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
 {
-    if (ocl::Device::getDefault().intelSubgroupsSupport()) {
+    if (ocl::Device::getDefault().intelSubgroupsSupport())
+    {
+        //depth_wise kernels
+        if (dwconv_)
+        {
+            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
+            if (group_ > 8)
+                return;
+        }
+
         /* IDLF kernels are using Intel specific extension which make
            them intel only. */
         // Generates static key_
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index 91066bdbfd..2457cf7677 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -383,7 +383,7 @@ convolve_simd(
   }
 }
 
-#else // KERNEL_GEMM_LIKE
+#elif defined KERNEL_GEMM_LIKE
 
 #if APPLY_BIAS
 // Dtype bias[4];
@@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
 }
 #endif
-#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
+
+#elif defined KERNEL_DWCONV
+// Each work-item computes one output element (outputX, outputY, outputZ) from input plane outputZ and filter outputZ % CHANNELS.
+__kernel void DWCONV(
+    ELTWISE_DATA_ARG
+    NEGATIVE_SLOPE_ARG
+    __global Dtype* image_data,
+    __global Dtype* kernel_data,
+    BIAS_KERNEL_ARG
+    __global Dtype* convolved_image,
+    const ushort input_width,
+    const ushort input_height,
+    const ushort output_width,
+    const ushort output_height) {
+
+  const int outputX = get_global_id(0);
+  const int outputY = get_global_id(1);
+  const int outputZ = get_global_id(2);
+  if(outputX < output_width && outputY < output_height)
+  {
+    Dtype sum = 0.;
+
+    const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
+    const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
+    const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
+    const int biasIndex=outputZ%CHANNELS;
+    const int local_image_offset = org_y*input_width + org_x;
+    const int imageSize = input_width*input_height;
+
+    __global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
+    __global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
+    // Accumulate over the filter window; taps that fall outside the input are skipped (contribute nothing).
+    for(int y = 0; y < KERNEL_H; y++)
+    {
+      for(int x = 0; x < KERNEL_W; x++)
+      {
+        if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
+        {
+          continue;
+        }
+        sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
+      }
+      image_dataPtrFloat += input_width * DILATION_Y;
+      kernel_dataPtrFloat += KERNEL_W;
+    }
+
+    #if APPLY_BIAS
+    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
+    ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
+    #else
+    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
+    ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
+    #endif
+  }
+}
+#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV

From 3b84acfc48426dc5f6177cf9638edba2ef9872d9 Mon Sep 17 00:00:00 2001
From: Li Peng <peng.li@intel.com>
Date: Tue, 19 Dec 2017 16:59:46 +0800
Subject: [PATCH 3/3] add ocl accuracy test for tf mobilenet ssd

Signed-off-by: Li Peng <peng.li@intel.com>
---
 modules/dnn/test/test_tf_importer.cpp | 39 +++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index fbb90a7241..1badf74ab7 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -11,6 +11,8 @@ Test for Tensorflow models loading
 
 #include "test_precomp.hpp"
 #include "npy_blob.hpp"
+#include <opencv2/core/ocl.hpp>
+#include <opencv2/ts/ocl_test.hpp>
 
 namespace cvtest
 {
@@ -219,6 +221,43 @@ TEST(Test_TensorFlow, MobileNet_SSD)
     normAssert(target[2].reshape(1, 1), output[2].reshape(1, 1), "", 4e-5, 1e-2);
 }
 
+// Runs the TensorFlow MobileNet-SSD model on DNN_TARGET_OPENCL and compares three outputs with stored .npy blobs.
+OCL_TEST(Test_TensorFlow, MobileNet_SSD)
+{
+    std::string netPath = findDataFile("dnn/ssd_mobilenet_v1_coco.pb", false);
+    std::string netConfig = findDataFile("dnn/ssd_mobilenet_v1_coco.pbtxt", false);
+    std::string imgPath = findDataFile("dnn/street.png", false);
+
+    Mat inp;
+    resize(imread(imgPath), inp, Size(300, 300));
+    inp = blobFromImage(inp, 1.0f / 127.5, Size(), Scalar(127.5, 127.5, 127.5), true);
+
+    std::vector<String> outNames(3);
+    outNames[0] = "concat";
+    outNames[1] = "concat_1";
+    outNames[2] = "detection_out";
+
+    std::vector<Mat> target(outNames.size());
+    for (size_t i = 0; i < outNames.size(); ++i)
+    {
+        std::string path = findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco." + outNames[i] + ".npy", false);
+        target[i] = blobFromNPY(path);
+    }
+
+    Net net = readNetFromTensorflow(netPath, netConfig);
+    net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+    net.setPreferableTarget(DNN_TARGET_OPENCL);
+
+    net.setInput(inp);
+
+    std::vector<Mat> output;
+    net.forward(output, outNames);
+
+    normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1));
+    normAssert(target[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 2e-4);
+    normAssert(target[2].reshape(1, 1), output[2].reshape(1, 1), "", 4e-5, 1e-2);
+}
+
 TEST(Test_TensorFlow, lstm)
 {
     runTensorFlowNet("lstm", true);