From ef937dd67642fe23f4c5095f4a252bec36d9daa5 Mon Sep 17 00:00:00 2001 From: Wu Zhiwen Date: Mon, 26 Feb 2018 14:57:04 +0800 Subject: [PATCH] ocl4dnn: Fix SAME padding mode for convolve Signed-off-by: Wu, Zhiwen Signed-off-by: Li Peng --- modules/dnn/src/layers/convolution_layer.cpp | 3 -- modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp | 2 + .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp | 8 +++ modules/dnn/src/opencl/conv_layer_spatial.cl | 51 ++++++++++--------- modules/dnn/test/test_tf_importer.cpp | 2 +- 5 files changed, 38 insertions(+), 28 deletions(-) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 40681c963f..0c719a21fa 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -824,9 +824,6 @@ public: for (int i = 0; i < inputs.size(); ++i) CV_Assert(inputs[i].u != outputs[0].u); - if (padMode == "SAME") - return false; - if (convolutionOp.empty()) { OCL4DNNConvConfig config; diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp index 70ced11276..2cc2377e8f 100644 --- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -285,6 +285,8 @@ class OCL4DNNConvSpatial int32_t width_; int32_t pad_h_; int32_t pad_w_; + int32_t pad_bottom_; + int32_t pad_right_; int32_t stride_h_; int32_t stride_w_; int32_t dilation_h_; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index 84ea1914dc..0f6cd8d01b 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -103,6 +103,12 @@ OCL4DNNConvSpatial::OCL4DNNConvSpatial(OCL4DNNConvConfig config) output_w_ = config.out_shape[dims - spatial_dims + 1]; bottom_dim_ = channels_ * width_ * height_; top_dim_ = num_output_ * output_w_ * output_h_; + int Ph = (output_h_ - 1) * stride_h_ + (dilation_h_ * (kernel_h_ - 1) + 1) - height_; + int Pw = (output_w_ - 1) * stride_w_ + (dilation_w_ * (kernel_w_ - 1) + 1) - width_; + Ph = (Ph > 0) ? Ph : 0; + Pw = (Pw > 0) ? Pw : 0; + pad_right_ = (Pw + 1) / 2; + pad_bottom_ = (Ph + 1) / 2; cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", ""); dwconv_ = (num_output_ == channels_ && channels_ == group_); @@ -379,6 +385,8 @@ void OCL4DNNConvSpatial::setupKernel() { addDef("INPUT_PAD_W", pad_w_); addDef("INPUT_PAD_H", pad_h_); + addDef("INPUT_PAD_RIGHT", pad_right_); + addDef("INPUT_PAD_BOTTOM", pad_bottom_); } setupKernelDetails(kernelType_, blockM_, blockK_, blockN_); diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index 8f6e5a38b2..5308bf1d1a 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -238,7 +238,7 @@ convolve_simd( int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4; int curr_y = or * STRIDE_Y + curr_local_y; int curr_x = oc * STRIDE_X + curr_local_x; -#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 +#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 int saved_y = curr_y; #endif in_addr = input_batch_offset @@ -256,19 +256,22 @@ convolve_simd( LOOP(INVEC_SIZE, reg, { if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) { -#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 +#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) { if (curr_x < INPUT_PAD_W) { in_buf.in_vec[reg].s0 = 0; - if (curr_x + 1 >= INPUT_PAD_W) + if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W) in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1); else in_buf.in_vec[reg].s1 = 0; - if (curr_x + 2 >= INPUT_PAD_W) + if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W) in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2); else in_buf.in_vec[reg].s2 = 0; - in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3); + if (curr_x + 3 < input_width + INPUT_PAD_W) + in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3); + else + in_buf.in_vec[reg].s3 = 0; } else { VLOAD4(in_buf.in_vec[reg], inputs + in_offset); if (curr_x + 1 >= input_width + INPUT_PAD_W) @@ -289,7 +292,7 @@ convolve_simd( in_offset += input_width * TILE_Y_STRIDE; }); in_addr += input_height * input_width; -#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 +#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 curr_y = saved_y; #endif @@ -492,7 +495,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // atile is M rows x K columns. int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 @@ -512,7 +515,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) do { int patch_row = 0; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 curr_y = saved_y; #endif @@ -530,7 +533,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // ... const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 +#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); #else @@ -646,7 +649,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // atile is M rows x K columns. int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 @@ -666,14 +669,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) do { int patch_row = 0; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 curr_y = saved_y; #endif do { // Load atile and interleaved btile. const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 +#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); #else @@ -873,7 +876,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 int saved_y0 = curr_y0; int saved_y1 = curr_y1; #endif @@ -911,7 +914,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // (0, 2) (8, 2) (16, 2) (24, 2) ... ... // ... const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 +#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; Dtype* pblockA00 = (Dtype*)(&blockA00); @@ -997,7 +1000,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); -#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 curr_y0 = saved_y0; curr_y1 = saved_y1; #endif @@ -1073,7 +1076,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 int saved_y0 = curr_y0; int saved_y1 = curr_y1; #endif @@ -1102,7 +1105,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { // Load atile and interleaved btile. const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 +#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; Dtype* pblockA00 = (Dtype*)(&blockA00); @@ -1210,7 +1213,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); -#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 curr_y0 = saved_y0; curr_y1 = saved_y1; #endif @@ -1377,7 +1380,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // atile is M rows x K columns. int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 @@ -1419,7 +1422,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) do { int patch_row = 0; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 curr_y = saved_y; #endif __attribute__((opencl_unroll_hint(1))) @@ -1437,7 +1440,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // ... const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 +#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); #else @@ -1580,7 +1583,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 int saved_y0 = curr_y0; int saved_y1 = curr_y1; #endif @@ -1618,7 +1621,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // (0, 2) (8, 2) (16, 2) (24, 2) ... ... // ... const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 +#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; Dtype* pblockA00 = (Dtype*)(&blockA00); @@ -1692,7 +1695,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); -#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 +#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 curr_y0 = saved_y0; curr_y1 = saved_y1; #endif diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index cfa66de3b1..62540db771 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -321,7 +321,7 @@ OCL_TEST(Test_TensorFlow, MobileNet_SSD) std::vector output; net.forward(output, outNames); - normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1)); + normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1), "", 1e-5, 1.5e-4); normAssert(target[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 3e-4); normAssert(target[2].reshape(1, 1), output[2].reshape(1, 1), "", 4e-5, 1e-2); }