From 2124361ff7c61f8e8693d7ac249552aacb0ae16f Mon Sep 17 00:00:00 2001
From: Li Peng <peng.li@intel.com>
Date: Tue, 16 Jan 2018 21:54:32 +0800
Subject: [PATCH] ocl support for Deconvolution layer

Signed-off-by: Li Peng <peng.li@intel.com>
---
 modules/dnn/src/layers/convolution_layer.cpp |  99 ++++++++++++++
 modules/dnn/src/opencl/col2im.cl             | 129 +++++++++++--------
 modules/dnn/test/test_layers.cpp             |   5 +
 modules/dnn/test/test_tf_importer.cpp        |   5 +
 modules/dnn/test/test_torch_importer.cpp     |   5 +
 5 files changed, 187 insertions(+), 56 deletions(-)
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 7abde1397a..e2ae78cf83 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -46,6 +46,7 @@
 #include "opencv2/core/hal/hal.hpp"
 #include "opencv2/core/hal/intrin.hpp"
 #include <iostream>
+#include "opencl_kernels_dnn.hpp"
 
 #ifdef HAVE_OPENCL
 using namespace cv::dnn::ocl4dnn;
@@ -1051,6 +1052,8 @@ class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
 {
 public:
     Mat weightsMat, biasesMat;
+    UMat umat_weights;
+    UMat umat_biases;
 
     MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
     {
@@ -1341,11 +1344,107 @@ public:
         }
     };
 
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+        std::vector<UMat> internals;
+
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
+        internals_.getUMatVector(internals);
+
+        int outCn = numOutput;
+        int inpCn = inputs[0].size[1];
+
+        if (is1x1())
+            return false;
+
+        if (umat_weights.empty())
+        {
+            transpose(blobs[0].reshape(1, inpCn), umat_weights);
+            umat_biases = hasBias() ? blobs[1].reshape(1, outCn).getUMat(ACCESS_READ) :
+                          UMat::zeros(outCn, 1, CV_32F);
+        }
+
+        String buildopt = format("-DT=%s ", ocl::typeToStr(inputs[0].type()));
+        buildopt += format("-DPAD_H=%d -DPAD_W=%d -DKERNEL_H=%d -DKERNEL_W=%d -DSTRIDE_H=%d -DSTRIDE_W=%d ",
+                           pad.height, pad.width, kernel.height, kernel.width, stride.height, stride.width);
+
+        for (size_t ii = 0; ii < outputs.size(); ii++)
+        {
+            int ngroups = outCn / blobs[0].size[1];
+            int inpGroupCn = inpCn / ngroups;
+            int outGroupCn = blobs[0].size[1];
+            const UMat& inp = inputs[ii];
+            UMat& out = outputs[ii];
+            int numImg = inp.size[0];
+            int inpH = inp.size[2], inpW = inp.size[3];
+            int outH = out.size[2], outW = out.size[3];
+
+            MatShape inpshape = shape(numImg*inpCn, inpH*inpW);
+            MatShape outshape = shape(numImg*outCn, outH*outW);
+            UMat convBlob = inputs[ii].reshape(1, inpshape.size(), &inpshape[0]);
+            UMat decnBlob = out.reshape(1, outshape.size(), &outshape[0]);
+            int rows = internals[0].rows / ngroups;
+
+            for (int n = 0; n < numImg; n++)
+            {
+                for (int g = 0; g < ngroups; g++)
+                {
+                    UMat colMat = internals[0].rowRange(_Range(g * rows, rows));
+                    UMat convMat = convBlob.rowRange(_Range((g + n * ngroups) * inpGroupCn, inpGroupCn));
+                    UMat wghtMat = umat_weights.colRange(_Range(g * inpGroupCn, inpGroupCn));
+                    gemm(wghtMat, convMat, 1, noArray(), 0, colMat, 0);
+                }
+
+                for (int g = 0; g < ngroups; g++)
+                {
+                    int total = outGroupCn * decnBlob.cols;
+                    int index = 0;
+                    int height_col = (outH + 2 * pad.height - kernel.height) / stride.height + 1;
+                    int width_col = (outW + 2 * pad.width - kernel.width) / stride.width + 1;
+                    int coeff_h = (1 - stride.height * kernel.width * height_col) * width_col;
+                    int coeff_w = (1 - stride.width * height_col * width_col);
+
+                    ocl::Kernel k("col2im", ocl::dnn::col2im_oclsrc, buildopt);
+                    k.set(index++, total);
+                    k.set(index++, ocl::KernelArg::PtrReadOnly(internals[0]));
+                    k.set(index++, (int)(g * rows * internals[0].cols));
+                    k.set(index++, outGroupCn);
+                    k.set(index++, outH);
+                    k.set(index++, outW);
+                    k.set(index++, height_col);
+                    k.set(index++, width_col);
+                    k.set(index++, coeff_h);
+                    k.set(index++, coeff_w);
+                    k.set(index++, ocl::KernelArg::PtrReadOnly(umat_biases));
+                    k.set(index++, (int)(g * outGroupCn * umat_biases.cols));
+                    k.set(index++, ocl::KernelArg::PtrWriteOnly(decnBlob));
+                    k.set(index++, (int)((g + n * ngroups) * outGroupCn * decnBlob.cols));
+
+                    size_t global[] = { (size_t)total };
+                    bool ret = k.run(1, global, NULL, false);
+                    if (!ret)
+                        return false;
+                }
+            }
+        }
+
+        return true;
+    }
+#endif
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
     }
 
diff --git a/modules/dnn/src/opencl/col2im.cl b/modules/dnn/src/opencl/col2im.cl
index 30d4664df8..f2ca95154c 100644
--- a/modules/dnn/src/opencl/col2im.cl
+++ b/modules/dnn/src/opencl/col2im.cl
@@ -1,62 +1,79 @@
-/*************************************************************************************
- * Copyright (c) 2015, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification,
- * are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation and/or
- *  other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- **************************************************************************************/
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
 
-__kernel void col2im(const int n, __global const T* data_col, const int col_offset,
-    const int height, const int width, const int channels,
-    const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    const int height_col, const int width_col,
-    __global T* data_im, const int img_offset)
+__kernel void col2im(const int n, __global const T* data_col,
+                     const int data_col_offset,
+                     const int channels,
+                     const int height, const int width,
+                     const int height_col, const int width_col,
+                     const int coeff_h, const int coeff_w,
+                     __global const T* biasvec,
+                     const int bias_offset,
+                     __global T* data_im,
+                     const int data_im_offset)
 {
-  data_col = data_col + col_offset;
-  data_im = data_im + img_offset;
-  int index = get_global_id(0);
-  if(index < n) {
-    T val = 0;
-    int w = index % width + pad_w;
-    int h = (index / width) % height + pad_h;
-    int c = index / (width * height);
+    data_col = data_col + data_col_offset;
+    biasvec = biasvec + bias_offset;
+    data_im = data_im + data_im_offset;
+    int index = get_global_id(0);
 
-    // compute the start and end of the output
-    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-    int w_col_end = min(w / stride_w + 1, width_col);
-    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-    int h_col_end = min(h / stride_h + 1, height_col);
+    if(index < n)
+    {
+        T val = 0.f;
+        int w = index % width + PAD_W;
+        int h = (index / width) % height + PAD_H;
+        int c = index / (width * height);
+        int h_col_start = (h < KERNEL_H) ? 0 : (h - KERNEL_H) / STRIDE_H + 1;
+        int h_col_end = min(h / STRIDE_H + 1, height_col);
+        int plane_size_col = height_col * width_col;
+        int offset = (c * KERNEL_H * KERNEL_W + h * KERNEL_W + w) * plane_size_col;
 
-    // equivalent implementation
-    int offset =
-    (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
-    int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
-    int coeff_w_col = (1 - stride_w * height_col * width_col);
-    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-      }
+        int w_col_start = (w < KERNEL_W) ? 0 : (w - KERNEL_W) / STRIDE_W + 1;
+        int w_col_end = min(w / STRIDE_W + 1, width_col);
+
+        for (int h_col = h_col_start; h_col < h_col_end; ++h_col)
+            for (int w_col = w_col_start; w_col < w_col_end; ++w_col)
+                val += data_col[offset + h_col * coeff_h + w_col * coeff_w];
+
+        data_im[index] = val + biasvec[c];
     }
-    data_im[index] = val;
-  }
 }
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index d88f01d380..0f90b42bfd 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -167,6 +167,11 @@ TEST(Layer_Test_DeConvolution, Accuracy)
     testLayerUsingCaffeModels("layer_deconvolution", DNN_TARGET_CPU, true, false);
 }
 
+OCL_TEST(Layer_Test_DeConvolution, Accuracy)
+{
+    testLayerUsingCaffeModels("layer_deconvolution", DNN_TARGET_OPENCL, true, false);
+}
+
 TEST(Layer_Test_InnerProduct, Accuracy)
 {
     testLayerUsingCaffeModels("layer_inner_product", DNN_TARGET_CPU, true);
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index bde5760bfc..8cf471df53 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -171,6 +171,11 @@ TEST(Test_TensorFlow, deconvolution)
     runTensorFlowNet("deconvolution");
 }
 
+OCL_TEST(Test_TensorFlow, deconvolution)
+{
+    runTensorFlowNet("deconvolution", DNN_TARGET_OPENCL);
+}
+
 TEST(Test_TensorFlow, matmul)
 {
     runTensorFlowNet("matmul");
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index f7471dd144..60bc3fedbe 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -165,6 +165,11 @@ TEST(Torch_Importer, run_deconv)
     runTorchNet("net_deconv");
 }
 
+OCL_TEST(Torch_Importer, run_deconv)
+{
+    runTorchNet("net_deconv", DNN_TARGET_OPENCL);
+}
+
 TEST(Torch_Importer, run_batch_norm)
 {
     runTorchNet("net_batch_norm", DNN_TARGET_CPU, "", false, true);