From 1f695c4532537ccd55fbc2bd3e0737fe84edf651 Mon Sep 17 00:00:00 2001
From: Yashas Samaga B L
Date: Thu, 20 Feb 2020 18:13:05 +0530
Subject: [PATCH] Merge pull request #16161 from YashasSamaga:cuda4dnn-concat-fusion

cuda4dnn(concat): write outputs from previous layers directly into concat's output

* eliminate concat by directly writing to its output buffer

* fix concat fusion not happening sometimes

* use a whitelist instead of a blacklist
---
 modules/dnn/src/dnn.cpp     | 48 +++++++++++++++++++++++++++++++++++--
 modules/dnn/src/op_cuda.hpp | 24 +++++++++++++++++--
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 0fba1578b4..7a87d46dc2 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -2470,7 +2470,9 @@ struct Net::Impl
                           ld.layerInstance->type != "Concat")) )
                 continue;
 
-            if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget) && ld.layerInstance->type != "Convolution")
+            if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget)
+                && ld.layerInstance->type != "Convolution"
+                && ld.layerInstance->type != "Concat")
                 continue;
 
             while (nextData)
@@ -2626,7 +2628,7 @@ struct Net::Impl
                 }
             }
 
-            if (preferableBackend != DNN_BACKEND_OPENCV)
+            if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
                 continue;  // Go to the next layer.
 
             // the optimization #2. if there is concat layer that concatenates channels
@@ -2694,6 +2696,15 @@ struct Net::Impl
 
                     if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                         break;
+#ifdef HAVE_CUDA
+                    if (preferableBackend == DNN_BACKEND_CUDA &&
+                        (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
+                         (inp_i_data->layerInstance->type != "Convolution" &&
+                          inp_i_data->layerInstance->type != "Pooling")))
+                    {
+                        break;
+                    }
+#endif
                     realinputs[i] = pin;
                 }
 
@@ -2711,6 +2722,10 @@ struct Net::Impl
                         umats[0] = umat_output;
                         OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
                     }
+#endif
+#ifdef HAVE_CUDA
+                    if (preferableBackend == DNN_BACKEND_CUDA)
+                        ld.outputBlobsWrappers[0] = wrap(output);
 #endif
                     Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
                     int ofs = 0;
@@ -2735,11 +2750,40 @@ struct Net::Impl
                             umats[pin.oid] = umat_output(chrange);
                             OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
                         }
+#endif
+#ifdef HAVE_CUDA
+                        if (preferableBackend == DNN_BACKEND_CUDA)
+                        {
+                            auto cuda_wrapper = wrap(output).dynamicCast<CUDABackendWrapper>();
+                            auto offset = chrange[1].start * (output.size[2] * output.size[3]);
+                            auto shape = MatShape{1, chrange[1].size(), output.size[2], output.size[3]};
+                            cuda_wrapper->update(shape, offset);
+                            inp_i_data->outputBlobsWrappers[pin.oid] = cuda_wrapper.staticCast<BackendWrapper>();
+                        }
 #endif
                         // Layers that refer old input Mat will refer to the
                         // new data but the same Mat object.
                         CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
                     }
+
+#ifdef HAVE_CUDA
+                    if (preferableBackend == DNN_BACKEND_CUDA)
+                    {
+                        for (int i = 0; i < ld.consumers.size(); i++)
+                        {
+                            LayerData& consumer = layers[ld.consumers[i].lid];
+                            for (int j = 0; j < consumer.inputBlobsId.size(); j++)
+                            {
+                                if (consumer.inputBlobsId[j].lid == ld.id)
+                                {
+                                    CV_Assert(consumer.inputBlobs[j]->data == ld.outputBlobs[0].data);
+                                    consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+                                    break;
+                                }
+                            }
+                        }
+                    }
+#endif
                     ld.skip = true;
                     printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
                 }
diff --git a/modules/dnn/src/op_cuda.hpp b/modules/dnn/src/op_cuda.hpp
index d702989c8c..44e4955c50 100644
--- a/modules/dnn/src/op_cuda.hpp
+++ b/modules/dnn/src/op_cuda.hpp
@@ -217,6 +217,8 @@ namespace cv { namespace dnn {
 
         /** @note setting the stream updates the stream for all wrappers which use the same tensor */
         virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
+
+        virtual void update(const MatShape& shape, std::size_t offset) = 0;
     };
 
     namespace cuda4dnn { namespace detail {
@@ -276,6 +278,7 @@ namespace cv { namespace dnn {
             : CUDABackendWrapper(TargetID)
         {
             shape = cv::dnn::shape(m);
+            offset = 0;
 
             shared_block = std::make_shared<shared_block_type>();
             shared_block->host_dirty = true;
@@ -300,6 +303,7 @@ namespace cv { namespace dnn {
             CV_Assert(base);
 
             shape = shape_;
+            offset = 0;
             shared_block = base->shared_block;
         }
 
@@ -313,6 +317,8 @@ namespace cv { namespace dnn {
 
         void copyToHost() override {
             if (shared_block->device_dirty) {
+                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
+
                 shared_block->host_dirty = false;
                 shared_block->device_dirty = false;
 
@@ -339,6 +345,8 @@ namespace cv { namespace dnn {
 
         void copyToDevice() override {
             if (shared_block->host_dirty) {
+                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
+
                 shared_block->host_dirty = false;
                 shared_block->device_dirty = false;
 
@@ -365,13 +373,24 @@ namespace cv { namespace dnn {
             shared_block->stream = std::move(stream);
         }
 
+        void update(const MatShape& shape_, std::size_t offset_) override {
+            auto total = std::accumulate(std::begin(shape_), std::end(shape_), 1, std::multiplies<int>());
+            if (offset_ + total > shared_block->device.size()) {
+                CV_Error(Error::BadOffset, "shape and offset provided can potentially leads to OOB access");
+            }
+            shape = shape_;
+            offset = offset_;
+        }
+
         cv::Mat getMutableHostMat() noexcept {
+            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
             copyToHost();
             setHostDirty();
             return shared_block->host;
         }
 
         const cv::Mat getImmutableHostMat() const noexcept {
+            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
             copyToHost();
             return shared_block->host;
         }
@@ -388,12 +407,12 @@ namespace cv { namespace dnn {
          */
         tensor_span_type getSpan() noexcept {
             setDeviceDirty();
-            return tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+            return tensor_span_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
         }
 
         tensor_view_type getView() noexcept {
             copyToDevice();
-            return tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+            return tensor_view_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
         }
 
     private:
@@ -407,6 +426,7 @@ namespace cv { namespace dnn {
          */
         MatShape shape;
+        std::size_t offset;
 
         struct shared_block_type {
             bool host_dirty;
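Note (not part of the patch): the core mechanism of this change is the (shape, offset) view that the new CUDABackendWrapper::update() provides. Each layer feeding the concat gets a wrapper that shares the concat output's device memory but points at a sub-region, so the producers write their results directly into the right slice and the Concat layer itself can be skipped. The CPU-only sketch below illustrates the same idea under simplified assumptions; SharedBlock, BufferView, and the NCHW sizes are invented for illustration and are not OpenCV types.

// Minimal sketch (assumed names, not the actual cuda4dnn classes): a shared buffer
// plus (shape, offset) views, mimicking how producers write straight into slices
// of the concat output instead of a Concat kernel copying the pieces together.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <functional>
#include <iostream>
#include <memory>
#include <numeric>
#include <vector>

// Stand-in for the device memory block shared by several backend wrappers.
struct SharedBlock {
    std::vector<float> data;
};

// Stand-in for a backend wrapper: a view (shape + element offset) into the block.
class BufferView {
public:
    BufferView(std::shared_ptr<SharedBlock> block, std::vector<int> shape)
        : block_(std::move(block)), shape_(std::move(shape)), offset_(0) {}

    // Re-point the view at a sub-region, analogous to update(shape, offset) in the patch.
    void update(std::vector<int> shape, std::size_t offset) {
        const std::size_t total = std::accumulate(shape.begin(), shape.end(),
                                                  std::size_t{1}, std::multiplies<std::size_t>());
        assert(offset + total <= block_->data.size() && "view would go out of bounds");
        shape_ = std::move(shape);
        offset_ = offset;
    }

    float* span() { return block_->data.data() + offset_; }  // writable slice
    std::size_t size() const {
        return std::accumulate(shape_.begin(), shape_.end(),
                               std::size_t{1}, std::multiplies<std::size_t>());
    }

private:
    std::shared_ptr<SharedBlock> block_;
    std::vector<int> shape_;
    std::size_t offset_;
};

int main() {
    // Channel-wise concat of two inputs with 2 and 3 channels of 2x2 elements (NCHW, N=1).
    auto block = std::make_shared<SharedBlock>();
    block->data.assign(5 * 4, 0.0f);  // concat output: 5 channels * 4 elements each

    BufferView out(block, {1, 5, 2, 2});

    // Producer #1 writes channels [0, 2): offset 0, 8 elements.
    BufferView in0 = out;
    in0.update({1, 2, 2, 2}, 0);
    std::fill(in0.span(), in0.span() + in0.size(), 1.0f);

    // Producer #2 writes channels [2, 5): offset 2 * 4 = 8, 12 elements.
    BufferView in1 = out;
    in1.update({1, 3, 2, 2}, 2 * 4);
    std::fill(in1.span(), in1.span() + in1.size(), 2.0f);

    // The "concat" output is already assembled; no copy was needed.
    for (float v : block->data) std::cout << v << ' ';
    std::cout << '\n';  // prints eight 1s followed by twelve 2s
}

This also motivates the CV_Assert(offset == 0) checks in the patch: once a wrapper aliases only a slice of a larger allocation, whole-buffer host/device synchronization can no longer be tracked per piece, so such wrappers are restricted to device-side use.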