From 3fddd3bf93137b02acab03ca789e64e7c25b213e Mon Sep 17 00:00:00 2001 From: Yashas Samaga B L Date: Tue, 10 Dec 2019 00:56:58 +0530 Subject: [PATCH] Merge pull request #16069 from YashasSamaga:cuda4dnn-crop_and_resize add CropAndResize layer for CUDA backend * add CropAndResize layer * process multiple channels per iteration --- modules/dnn/src/cuda/crop_and_resize.cu | 168 ++++++++++++++++++ .../src/cuda4dnn/kernels/crop_and_resize.hpp | 19 ++ modules/dnn/src/cuda4dnn/kernels/resize.hpp | 2 - .../cuda4dnn/primitives/crop_and_resize.hpp | 51 ++++++ .../dnn/src/layers/crop_and_resize_layer.cpp | 22 +++ 5 files changed, 260 insertions(+), 2 deletions(-) create mode 100644 modules/dnn/src/cuda/crop_and_resize.cu create mode 100644 modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp diff --git a/modules/dnn/src/cuda/crop_and_resize.cu b/modules/dnn/src/cuda/crop_and_resize.cu new file mode 100644 index 0000000000..c7e95104da --- /dev/null +++ b/modules/dnn/src/cuda/crop_and_resize.cu @@ -0,0 +1,168 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include +#include + +#include "math.hpp" +#include "types.hpp" +#include "grid_stride_range.hpp" +#include "execution.hpp" + +#include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/span.hpp" + +#include + +#include + +using namespace cv::dnn::cuda4dnn::csl; +using namespace cv::dnn::cuda4dnn::csl::device; + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + + namespace raw { + + template + __global__ void crop_and_resize( + Span output, size_type out_height, size_type out_width, + View input, size_type in_height, size_type in_width, + View boxes, + size_type num_channels) + { + // input [1, num_channels, in_height, in_width] + // output [boxes, num_channels, out_height, out_width] + + const auto in_image_size = in_height * in_width; + const auto out_image_size = out_height * out_width; + const auto out_box_size = num_channels * out_image_size; + + /* we have to compute the output value for every combination of (box, c, y, x) in the output + * + * the computation involving (y, x) are identical for all non-spatial dimensions + * the computation and memory requests involving the box are identical for remaining three axes + * + * we process multiple channels every iteration to reuse the identical computation + * and memory requests involved with the box and spatial dimensions + */ + + /* + * if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need + * (num_channels / CHANNELS_PER_ITER) iterations per (box, x, y) + */ + auto num_channel_iters_per_box_xy = num_channels / CHANNELS_PER_ITER; + + /* we need `num_channel_iters_per_box_xy` iterations per (box, x, y) and there are + * `num_boxes` boxes and `out_image_size` combinations of (x, y) + */ + auto num_boxes = boxes.size() / 7; /* 7 values per box */ + auto iters_per_box = num_channel_iters_per_box_xy * out_image_size; + auto iters_required = num_boxes * iters_per_box; + + for (auto iter : grid_stride_range(iters_required)) { + const index_type box_no = iter / iters_per_box; + const index_type c_start = ((iter % iters_per_box) / out_image_size) * CHANNELS_PER_ITER; + + /* note here that consecutive `iter` values will often have consecutive `x` values + * => stores into output will be coalesced across threads + */ + const index_type y = (iter % out_image_size) / out_width; + const index_type x = iter % out_width; + + const index_type box_offset = box_no * 7; + const auto left = boxes[box_offset + 3], + top = boxes[box_offset + 4], + right = boxes[box_offset + 5], + bottom = boxes[box_offset + 6]; + + const auto box_width = right - left; + const auto box_height = bottom - top; + + const auto o2i_fy = static_cast(in_height - 1) / static_cast(out_height - 1); + const auto o2i_fx = static_cast(in_width - 1) / static_cast(out_width - 1); + + const auto height_scale = box_height * o2i_fy; + const auto width_scale = box_width * o2i_fx; + + const auto in_y = top * static_cast(in_height - 1) + static_cast(y) * height_scale; + const auto in_x = left * static_cast(in_width - 1) + static_cast(x) * width_scale; + + const auto in_y0 = static_cast(in_y); + const auto in_x0 = static_cast(in_x); + + using device::min; + const auto in_x1 = min(in_x0 + 1, in_width - 1); + const auto in_y1 = min(in_y0 + 1, in_height - 1); + + index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width; + index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width; + index_type out_idx = box_no * out_box_size + c_start * out_image_size + y * out_width + x; + + #pragma unroll 1 /* disable unrolling */ + for (int i = 0; i < CHANNELS_PER_ITER; i++) { + auto v_00 = input[in_offset_r0 + in_x0], + v_01 = input[in_offset_r0 + in_x1], + v_10 = input[in_offset_r1 + in_x0], + v_11 = input[in_offset_r1 + in_x1]; + + output[out_idx] = + v_00 + + T(in_y - T(in_y0)) * T(v_10 - v_00) + + T(in_x - T(in_x0)) * T(v_01 - v_00) + + T(in_y - T(in_y0)) * T(in_x - T(in_x0)) * T(v_11 - v_01 - v_10 + v_00); + + in_offset_r0 += in_image_size; + in_offset_r1 += in_image_size; + out_idx += out_image_size; + } + } + } + } + + template static + void launch_multichannel_crop_and_resize(const Stream& stream, + Span output, size_type out_height, size_type out_width, + View input, size_type in_height, size_type in_width, + View boxes, size_type num_channels) + { + auto kernel = raw::crop_and_resize; + auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream); + launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, boxes, num_channels); + } + + template + void crop_and_resize(const Stream& stream, TensorSpan output, TensorView input, View boxes) { + CV_Assert(input.get_axis_size(0) == 1); /* batch not supported */ + CV_Assert(input.get_axis_size(1) == output.get_axis_size(1)); + + auto out_height = output.get_axis_size(-2); + auto out_width = output.get_axis_size(-1); + + auto in_height = input.get_axis_size(-2); + auto in_width = input.get_axis_size(-1); + + auto num_channels = input.get_axis_size(1); + + if (num_channels % 64 == 0) { + launch_multichannel_crop_and_resize(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels); + } else if (num_channels % 32 == 0) { + launch_multichannel_crop_and_resize(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels); + } else if (num_channels % 16 == 0) { + launch_multichannel_crop_and_resize(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels); + } else if (num_channels % 8 == 0) { + launch_multichannel_crop_and_resize(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels); + } else if (num_channels % 4 == 0) { + launch_multichannel_crop_and_resize(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels); + } else if (num_channels % 2 == 0) { + launch_multichannel_crop_and_resize(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels); + } else { + launch_multichannel_crop_and_resize(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels); + } + } + + template void crop_and_resize<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, View<__half> boxes); + template void crop_and_resize(const Stream&, TensorSpan, TensorView, View boxes); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp b/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp new file mode 100644 index 0000000000..046223d49d --- /dev/null +++ b/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp @@ -0,0 +1,19 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP + +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/span.hpp" + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + + template + void crop_and_resize(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView input, csl::View boxes); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/kernels/resize.hpp b/modules/dnn/src/cuda4dnn/kernels/resize.hpp index 5c5cc3d9d6..31aee3d371 100644 --- a/modules/dnn/src/cuda4dnn/kernels/resize.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/resize.hpp @@ -8,8 +8,6 @@ #include "../csl/stream.hpp" #include "../csl/tensor.hpp" -#include - namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template diff --git a/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp b/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp new file mode 100644 index 0000000000..f0cfcc7a77 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp @@ -0,0 +1,51 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" + +#include "../kernels/crop_and_resize.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class CropAndResizeOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + CropAndResizeOp(csl::Stream stream_) : stream(std::move(stream_)) { } + + void forward( + const std::vector>& inputs, + const std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 2 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto box_wrapper = inputs[1].dynamicCast(); + auto boxes = box_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + kernels::crop_and_resize(stream, output, input, static_cast>(boxes)); + } + + private: + csl::Stream stream; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP */ diff --git a/modules/dnn/src/layers/crop_and_resize_layer.cpp b/modules/dnn/src/layers/crop_and_resize_layer.cpp index de87107d43..ab242e1b2e 100644 --- a/modules/dnn/src/layers/crop_and_resize_layer.cpp +++ b/modules/dnn/src/layers/crop_and_resize_layer.cpp @@ -7,6 +7,11 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#ifdef HAVE_CUDA +#include "../cuda4dnn/primitives/crop_and_resize.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn { class CropAndResizeLayerImpl CV_FINAL : public CropAndResizeLayer @@ -36,6 +41,11 @@ public: return false; } + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA; + } + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); @@ -111,6 +121,18 @@ public: } } +#ifdef HAVE_CUDA + Ptr initCUDA( + void *context_, + const std::vector>& inputs, + const std::vector>& outputs + ) override + { + auto context = reinterpret_cast(context_); + return make_cuda_node(preferableTarget, std::move(context->stream)); + } +#endif + private: int outWidth, outHeight; };