From ef937dd67642fe23f4c5095f4a252bec36d9daa5 Mon Sep 17 00:00:00 2001
From: Wu Zhiwen <zhiwen.wu@intel.com>
Date: Mon, 26 Feb 2018 14:57:04 +0800
Subject: [PATCH] ocl4dnn: Fix SAME padding mode for convolve

Signed-off-by: Wu, Zhiwen <zhiwen.wu@intel.com>
Signed-off-by: Li Peng <peng.li@intel.com>
---
 modules/dnn/src/layers/convolution_layer.cpp  |  3 --
 modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp   |  2 +
 .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp  |  8 +++
 modules/dnn/src/opencl/conv_layer_spatial.cl  | 51 ++++++++++---------
 modules/dnn/test/test_tf_importer.cpp         |  2 +-
 5 files changed, 38 insertions(+), 28 deletions(-)
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 40681c963f..0c719a21fa 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -824,9 +824,6 @@ public:
         for (int i = 0; i < inputs.size(); ++i)
             CV_Assert(inputs[i].u != outputs[0].u);
 
-        if (padMode == "SAME")
-            return false;
-
         if (convolutionOp.empty())
         {
             OCL4DNNConvConfig config;
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
index 70ced11276..2cc2377e8f 100644
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -285,6 +285,8 @@ class OCL4DNNConvSpatial
         int32_t width_;
         int32_t pad_h_;
         int32_t pad_w_;
+        int32_t pad_bottom_;
+        int32_t pad_right_;
         int32_t stride_h_;
         int32_t stride_w_;
         int32_t dilation_h_;
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 84ea1914dc..0f6cd8d01b 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -103,6 +103,12 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
     output_w_ = config.out_shape[dims - spatial_dims + 1];
     bottom_dim_ = channels_ * width_ * height_;
     top_dim_ = num_output_ * output_w_ * output_h_;
+    int Ph = (output_h_ - 1) * stride_h_ + (dilation_h_ * (kernel_h_ - 1) + 1) - height_;
+    int Pw = (output_w_ - 1) * stride_w_ + (dilation_w_ * (kernel_w_ - 1) + 1) - width_;
+    Ph = (Ph > 0) ? Ph : 0;
+    Pw = (Pw > 0) ? Pw : 0;
+    pad_right_  = (Pw + 1) / 2;
+    pad_bottom_ = (Ph + 1) / 2;
 
     cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
     dwconv_ = (num_output_ == channels_ && channels_ == group_);
@@ -379,6 +385,8 @@ void OCL4DNNConvSpatial<Dtype>::setupKernel()
     {
         addDef("INPUT_PAD_W", pad_w_);
         addDef("INPUT_PAD_H", pad_h_);
+        addDef("INPUT_PAD_RIGHT", pad_right_);
+        addDef("INPUT_PAD_BOTTOM", pad_bottom_);
     }
 
     setupKernelDetails(kernelType_, blockM_, blockK_, blockN_);
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index 8f6e5a38b2..5308bf1d1a 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -238,7 +238,7 @@ convolve_simd(
   int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
   int curr_y = or * STRIDE_Y + curr_local_y;
   int curr_x = oc * STRIDE_X + curr_local_x;
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
   int saved_y = curr_y;
 #endif
   in_addr = input_batch_offset
@@ -256,19 +256,22 @@ convolve_simd(
     LOOP(INVEC_SIZE, reg,
       {
         if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) {
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
         if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) {
           if (curr_x < INPUT_PAD_W) {
             in_buf.in_vec[reg].s0 = 0;
-            if (curr_x + 1 >= INPUT_PAD_W)
+            if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W)
               in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1);
             else
               in_buf.in_vec[reg].s1 = 0;
-            if (curr_x + 2 >= INPUT_PAD_W)
+            if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W)
               in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2);
             else
               in_buf.in_vec[reg].s2 = 0;
-            in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
+            if (curr_x + 3 < input_width + INPUT_PAD_W)
+              in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
+            else
+              in_buf.in_vec[reg].s3 = 0;
           } else {
             VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
             if (curr_x + 1 >= input_width + INPUT_PAD_W)
@@ -289,7 +292,7 @@ convolve_simd(
         in_offset += input_width * TILE_Y_STRIDE;
       });
     in_addr += input_height * input_width;
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
     curr_y = saved_y;
 #endif
 
@@ -492,7 +495,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         // atile is M rows x K columns.
         int curr_x = ( global_y % output_width ) * STRIDE_X;
         int curr_y = ( global_y / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
         int saved_y = curr_y;
 #endif
         const __global Dtype *src0_read = src0
@@ -512,7 +515,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         do
         {
             int patch_row = 0;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
             curr_y = saved_y;
 #endif
 
@@ -530,7 +533,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
                 // ...
                 const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
 
-#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
+#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
 #else
@@ -646,7 +649,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         // atile is M rows x K columns.
         int curr_x = ( global_y % output_width ) * STRIDE_X;
         int curr_y = ( global_y / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
         int saved_y = curr_y;
 #endif
         const __global Dtype *src0_read = src0
@@ -666,14 +669,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         do
         {
             int patch_row = 0;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
             curr_y = saved_y;
 #endif
             do
             {
                 // Load atile and interleaved btile.
                 const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
-#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
+#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
 #else
@@ -873,7 +876,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
         int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
         int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
         int saved_y0 = curr_y0;
         int saved_y1 = curr_y1;
 #endif
@@ -911,7 +914,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
                 // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                 // ...
                 const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
-#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
+#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
                 Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
@@ -997,7 +1000,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 
             //while( ++patch_row < 1 ); //debug
             while( ++patch_row < KERNEL_HEIGHT );
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
             curr_y0 = saved_y0;
             curr_y1 = saved_y1;
 #endif
@@ -1073,7 +1076,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
         int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
         int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
         int saved_y0 = curr_y0;
         int saved_y1 = curr_y1;
 #endif
@@ -1102,7 +1105,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
             {
                 // Load atile and interleaved btile.
                 const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
-#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
+#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
                 Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
@@ -1210,7 +1213,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 
             //while( ++patch_row < 1 ); //debug
             while( ++patch_row < KERNEL_HEIGHT );
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
             curr_y0 = saved_y0;
             curr_y1 = saved_y1;
 #endif
@@ -1377,7 +1380,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     // atile is M rows x K columns.
     int curr_x = ( global_y % output_width ) * STRIDE_X;
     int curr_y = ( global_y / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
     int saved_y = curr_y;
 #endif
     const __global Dtype *src0_read = src0
@@ -1419,7 +1422,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     do
     {
         int patch_row = 0;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
         curr_y = saved_y;
 #endif
         __attribute__((opencl_unroll_hint(1)))
@@ -1437,7 +1440,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
             // ...
             const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
 
-#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
+#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
             Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
             Dtype*  pblockA00 = (Dtype*)(&blockA00);
 #else
@@ -1580,7 +1583,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
         int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
         int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
         int saved_y0 = curr_y0;
         int saved_y1 = curr_y1;
 #endif
@@ -1618,7 +1621,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
                 // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                 // ...
                 const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
-#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
+#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
                 Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
@@ -1692,7 +1695,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 
             //while( ++patch_row < 1 ); //debug
             while( ++patch_row < KERNEL_HEIGHT );
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
             curr_y0 = saved_y0;
             curr_y1 = saved_y1;
 #endif
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index cfa66de3b1..62540db771 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -321,7 +321,7 @@ OCL_TEST(Test_TensorFlow, MobileNet_SSD)
     std::vector<Mat> output;
     net.forward(output, outNames);
 
-    normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1));
+    normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1), "", 1e-5, 1.5e-4);
     normAssert(target[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 3e-4);
     normAssert(target[2].reshape(1, 1), output[2].reshape(1, 1), "", 4e-5, 1e-2);
 }