From 1f695c4532537ccd55fbc2bd3e0737fe84edf651 Mon Sep 17 00:00:00 2001
From: Yashas Samaga B L
Date: Thu, 20 Feb 2020 18:13:05 +0530
Subject: [PATCH] Merge pull request #16161 from YashasSamaga:cuda4dnn-concat-fusion

cuda4dnn(concat): write outputs from previous layers directly into concat's output

* eliminate concat by directly writing to its output buffer

* fix concat fusion not happening sometimes

* use a whitelist instead of a blacklist
---
 modules/dnn/src/dnn.cpp     | 48 +++++++++++++++++++++++++++++++++++--
 modules/dnn/src/op_cuda.hpp | 24 +++++++++++++++++--
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 0fba1578b4..7a87d46dc2 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -2470,7 +2470,9 @@ struct Net::Impl
                           ld.layerInstance->type != "Concat")) )
                 continue;
 
-            if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget) && ld.layerInstance->type != "Convolution")
+            if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget)
+                && ld.layerInstance->type != "Convolution"
+                && ld.layerInstance->type != "Concat")
                 continue;
 
             while (nextData)
@@ -2626,7 +2628,7 @@ struct Net::Impl
                 }
             }
 
-            if (preferableBackend != DNN_BACKEND_OPENCV)
+            if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
                 continue;  // Go to the next layer.
 
             // the optimization #2. if there is concat layer that concatenates channels
@@ -2694,6 +2696,15 @@ struct Net::Impl
 
                     if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                         break;
+#ifdef HAVE_CUDA
+                    if (preferableBackend == DNN_BACKEND_CUDA &&
+                        (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
+                         (inp_i_data->layerInstance->type != "Convolution" &&
+                          inp_i_data->layerInstance->type != "Pooling")))
+                    {
+                        break;
+                    }
+#endif
                     realinputs[i] = pin;
                 }
 
@@ -2711,6 +2722,10 @@ struct Net::Impl
                         umats[0] = umat_output;
                         OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
                     }
+#endif
+#ifdef HAVE_CUDA
+                    if (preferableBackend == DNN_BACKEND_CUDA)
+                        ld.outputBlobsWrappers[0] = wrap(output);
 #endif
                     Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
                     int ofs = 0;
@@ -2735,11 +2750,40 @@ struct Net::Impl
                             umats[pin.oid] = umat_output(chrange);
                             OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
                         }
+#endif
+#ifdef HAVE_CUDA
+                        if (preferableBackend == DNN_BACKEND_CUDA)
+                        {
+                            auto cuda_wrapper = wrap(output).dynamicCast<CUDABackendWrapper>();
+                            auto offset = chrange[1].start * (output.size[2] * output.size[3]);
+                            auto shape = MatShape{1, chrange[1].size(), output.size[2], output.size[3]};
+                            cuda_wrapper->update(shape, offset);
+                            inp_i_data->outputBlobsWrappers[pin.oid] = cuda_wrapper.staticCast<BackendWrapper>();
+                        }
 #endif
                         // Layers that refer old input Mat will refer to the
                         // new data but the same Mat object.
                         CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
                     }
+
+#ifdef HAVE_CUDA
+                    if (preferableBackend == DNN_BACKEND_CUDA)
+                    {
+                        for (int i = 0; i < ld.consumers.size(); i++)
+                        {
+                            LayerData& consumer = layers[ld.consumers[i].lid];
+                            for (int j = 0; j < consumer.inputBlobsId.size(); j++)
+                            {
+                                if (consumer.inputBlobsId[j].lid == ld.id)
+                                {
+                                    CV_Assert(consumer.inputBlobs[j]->data == ld.outputBlobs[0].data);
+                                    consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+                                    break;
+                                }
+                            }
+                        }
+                    }
+#endif
                     ld.skip = true;
                     printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
                 }
diff --git a/modules/dnn/src/op_cuda.hpp b/modules/dnn/src/op_cuda.hpp
index d702989c8c..44e4955c50 100644
--- a/modules/dnn/src/op_cuda.hpp
+++ b/modules/dnn/src/op_cuda.hpp
@@ -217,6 +217,8 @@ namespace cv { namespace dnn {
 
         /** @note setting the stream updates the stream for all wrappers which use the same tensor */
         virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
+
+        virtual void update(const MatShape& shape, std::size_t offset) = 0;
     };
 
     namespace cuda4dnn { namespace detail {
@@ -276,6 +278,7 @@ namespace cv { namespace dnn {
             : CUDABackendWrapper(TargetID)
         {
             shape = cv::dnn::shape(m);
+            offset = 0;
 
             shared_block = std::make_shared<shared_block_type>();
             shared_block->host_dirty = true;
@@ -300,6 +303,7 @@ namespace cv { namespace dnn {
             CV_Assert(base);
 
             shape = shape_;
+            offset = 0;
             shared_block = base->shared_block;
         }
 
@@ -313,6 +317,8 @@ namespace cv { namespace dnn {
 
         void copyToHost() override {
             if (shared_block->device_dirty) {
+                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
+
                 shared_block->host_dirty = false;
                 shared_block->device_dirty = false;
 
@@ -339,6 +345,8 @@ namespace cv { namespace dnn {
 
         void copyToDevice() override {
             if (shared_block->host_dirty) {
+                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
+
                 shared_block->host_dirty = false;
                 shared_block->device_dirty = false;
 
@@ -365,13 +373,24 @@ namespace cv { namespace dnn {
             shared_block->stream = std::move(stream);
         }
 
+        void update(const MatShape& shape_, std::size_t offset_) override {
+            auto total = std::accumulate(std::begin(shape_), std::end(shape_), 1, std::multiplies<int>());
+            if (offset_ + total > shared_block->device.size()) {
+                CV_Error(Error::BadOffset, "shape and offset provided can potentially leads to OOB access");
+            }
+            shape = shape_;
+            offset = offset_;
+        }
+
         cv::Mat getMutableHostMat() noexcept {
+            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
             copyToHost();
             setHostDirty();
             return shared_block->host;
         }
 
         const cv::Mat getImmutableHostMat() const noexcept {
+            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
             copyToHost();
             return shared_block->host;
         }
@@ -388,12 +407,12 @@ namespace cv { namespace dnn {
          */
         tensor_span_type getSpan() noexcept {
             setDeviceDirty();
-            return tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+            return tensor_span_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
         }
 
         tensor_view_type getView() noexcept {
             copyToDevice();
-            return tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+            return tensor_view_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
         }
 
     private:
@@ -407,6 +426,7 @@ namespace cv { namespace dnn {
          */
         MatShape shape;
+        std::size_t offset;
 
         struct shared_block_type {
             bool host_dirty;
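Note (not part of the patch): the core mechanism of this change is the (shape, offset) view that the new CUDABackendWrapper::update() provides. Each layer feeding the concat gets a wrapper that shares the concat output's device memory but points at a sub-region, so the producers write their results directly into the right slice and the Concat layer itself can be skipped. The CPU-only sketch below illustrates the same idea under simplified assumptions; SharedBlock, BufferView, and the NCHW sizes are invented for illustration and are not OpenCV types.

// Minimal sketch (assumed names, not the actual cuda4dnn classes): a shared buffer
// plus (shape, offset) views, mimicking how producers write straight into slices
// of the concat output instead of a Concat kernel copying the pieces together.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <functional>
#include <iostream>
#include <memory>
#include <numeric>
#include <vector>

// Stand-in for the device memory block shared by several backend wrappers.
struct SharedBlock {
    std::vector<float> data;
};

// Stand-in for a backend wrapper: a view (shape + element offset) into the block.
class BufferView {
public:
    BufferView(std::shared_ptr<SharedBlock> block, std::vector<int> shape)
        : block_(std::move(block)), shape_(std::move(shape)), offset_(0) {}

    // Re-point the view at a sub-region, analogous to update(shape, offset) in the patch.
    void update(std::vector<int> shape, std::size_t offset) {
        const std::size_t total = std::accumulate(shape.begin(), shape.end(),
                                                  std::size_t{1}, std::multiplies<std::size_t>());
        assert(offset + total <= block_->data.size() && "view would go out of bounds");
        shape_ = std::move(shape);
        offset_ = offset;
    }

    float* span() { return block_->data.data() + offset_; }  // writable slice
    std::size_t size() const {
        return std::accumulate(shape_.begin(), shape_.end(),
                               std::size_t{1}, std::multiplies<std::size_t>());
    }

private:
    std::shared_ptr<SharedBlock> block_;
    std::vector<int> shape_;
    std::size_t offset_;
};

int main() {
    // Channel-wise concat of two inputs with 2 and 3 channels of 2x2 elements (NCHW, N=1).
    auto block = std::make_shared<SharedBlock>();
    block->data.assign(5 * 4, 0.0f);  // concat output: 5 channels * 4 elements each

    BufferView out(block, {1, 5, 2, 2});

    // Producer #1 writes channels [0, 2): offset 0, 8 elements.
    BufferView in0 = out;
    in0.update({1, 2, 2, 2}, 0);
    std::fill(in0.span(), in0.span() + in0.size(), 1.0f);

    // Producer #2 writes channels [2, 5): offset 2 * 4 = 8, 12 elements.
    BufferView in1 = out;
    in1.update({1, 3, 2, 2}, 2 * 4);
    std::fill(in1.span(), in1.span() + in1.size(), 2.0f);

    // The "concat" output is already assembled; no copy was needed.
    for (float v : block->data) std::cout << v << ' ';
    std::cout << '\n';  // prints eight 1s followed by twelve 2s
}

This also motivates the CV_Assert(offset == 0) checks in the patch: once a wrapper aliases only a slice of a larger allocation, whole-buffer host/device synchronization can no longer be tracked per piece, so such wrappers are restricted to device-side use.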