mirror of
https://github.com/opencv/opencv.git
synced 2025-08-05 22:19:14 +08:00
Merge pull request #17788 from YashasSamaga:cuda4dnn-nice-build
This commit is contained in:
commit
cd0f0384ef
@ -27,8 +27,18 @@ ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
|
||||
AND HAVE_CUDNN
|
||||
)
|
||||
|
||||
if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
|
||||
add_definitions(-DCV_CUDA4DNN=1)
|
||||
# Enable the cuda4dnn backend only when the user asked for it AND every
# hard dependency (CUDA Toolkit, cuBLAS, cuDNN) was detected; otherwise
# abort configuration with a message naming the first missing dependency.
if(OPENCV_DNN_CUDA)
  if(HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
    # CV_CUDA4DNN guards all CUDA backend code paths in the DNN module.
    add_definitions(-DCV_CUDA4DNN=1)
  else()
    # SEND_ERROR (not FATAL_ERROR) lets configuration continue so that
    # any further problems are reported in the same run.
    if(NOT HAVE_CUDA)
      message(SEND_ERROR "DNN: CUDA backend requires CUDA Toolkit. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
    elseif(NOT HAVE_CUBLAS)
      message(SEND_ERROR "DNN: CUDA backend requires cuBLAS. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
    elseif(NOT HAVE_CUDNN)
      message(SEND_ERROR "DNN: CUDA backend requires cuDNN. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
    endif()
  endif()
endif()
|
||||
|
||||
ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")
|
||||
|
96
modules/dnn/src/cuda4dnn/init.hpp
Normal file
96
modules/dnn/src/cuda4dnn/init.hpp
Normal file
@ -0,0 +1,96 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
|
||||
|
||||
#include "csl/error.hpp"

#include <cuda_runtime.h>
#include <cudnn.h>

#include <opencv2/core/cuda.hpp>
#include <opencv2/core/utils/logger.hpp>

#include <sstream>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn {
|
||||
|
||||
void checkVersions()
|
||||
{
|
||||
int cudart_version = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaRuntimeGetVersion(&cudart_version));
|
||||
if (cudart_version != CUDART_VERSION)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "CUDART reports version " << cudart_version << " which does not match with the version " << CUDART_VERSION << " with which OpenCV was built";
|
||||
CV_LOG_WARNING(NULL, oss.str().c_str());
|
||||
}
|
||||
|
||||
auto cudnn_version = cudnnGetVersion();
|
||||
if (cudnn_version != CUDNN_VERSION)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "cuDNN reports version " << cudnn_version << " which does not match with the version " << CUDNN_VERSION << " with which OpenCV was built";
|
||||
CV_LOG_WARNING(NULL, oss.str().c_str());
|
||||
}
|
||||
|
||||
auto cudnn_cudart_version = cudnnGetCudartVersion();
|
||||
if (cudart_version != cudnn_cudart_version)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "CUDART version " << cudnn_cudart_version << " reported by cuDNN " << cudnn_version << " does not match with the version reported by CUDART " << cudart_version;
|
||||
CV_LOG_WARNING(NULL, oss.str().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
int getDeviceCount()
|
||||
{
|
||||
return cuda::getCudaEnabledDeviceCount();
|
||||
}
|
||||
|
||||
int getDevice()
|
||||
{
|
||||
int device_id = -1;
|
||||
CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device_id));
|
||||
return device_id;
|
||||
}
|
||||
|
||||
bool isDeviceCompatible()
|
||||
{
|
||||
int device_id = getDevice();
|
||||
if (device_id < 0)
|
||||
return false;
|
||||
|
||||
int major = 0, minor = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
|
||||
|
||||
if (cv::cuda::TargetArchs::hasEqualOrLessPtx(major, minor))
|
||||
return true;
|
||||
|
||||
for (int i = minor; i >= 0; i--)
|
||||
if (cv::cuda::TargetArchs::hasBin(major, i))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool doesDeviceSupportFP16()
|
||||
{
|
||||
int device_id = getDevice();
|
||||
if (device_id < 0)
|
||||
return false;
|
||||
|
||||
int major = 0, minor = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
|
||||
|
||||
int version = major * 10 + minor;
|
||||
if (version < 53)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP */
|
@ -47,7 +47,8 @@
|
||||
#include "op_cuda.hpp"
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
#include "cuda4dnn/primitives/eltwise.hpp"
|
||||
#include "cuda4dnn/init.hpp"
|
||||
#include "cuda4dnn/primitives/eltwise.hpp" // required by fuseLayers
|
||||
#endif
|
||||
|
||||
#include "halide_scheduler.hpp"
|
||||
@ -66,8 +67,6 @@
|
||||
#include <opencv2/core/utils/configuration.private.hpp>
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
|
||||
#include <opencv2/core/cuda.hpp>
|
||||
|
||||
namespace cv {
|
||||
namespace dnn {
|
||||
CV__DNN_INLINE_NS_BEGIN
|
||||
@ -159,23 +158,6 @@ public:
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
static inline bool cudaDeviceSupportsFp16() {
|
||||
if (cv::cuda::getCudaEnabledDeviceCount() <= 0)
|
||||
return false;
|
||||
const int devId = cv::cuda::getDevice();
|
||||
if (devId<0)
|
||||
return false;
|
||||
cv::cuda::DeviceInfo dev_info(devId);
|
||||
if (!dev_info.isCompatible())
|
||||
return false;
|
||||
int version = dev_info.majorVersion() * 10 + dev_info.minorVersion();
|
||||
if (version < 53)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
BackendRegistry()
|
||||
{
|
||||
@ -247,9 +229,10 @@ private:
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
if (haveCUDA()) {
|
||||
if (haveCUDA() && cuda4dnn::isDeviceCompatible())
|
||||
{
|
||||
backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
|
||||
if (cudaDeviceSupportsFp16())
|
||||
if (cuda4dnn::doesDeviceSupportFP16())
|
||||
backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
|
||||
}
|
||||
#endif
|
||||
@ -1189,19 +1172,6 @@ struct Net::Impl : public detail::NetImplBase
|
||||
preferableBackend = DNN_BACKEND_DEFAULT;
|
||||
preferableTarget = DNN_TARGET_CPU;
|
||||
skipInfEngineInit = false;
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
if (cv::cuda::getCudaEnabledDeviceCount() > 0)
|
||||
{
|
||||
cuda4dnn::csl::CSLContext context;
|
||||
context.stream = cuda4dnn::csl::Stream(true);
|
||||
context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
|
||||
context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
|
||||
|
||||
auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
|
||||
cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
Ptr<DataLayer> netInputLayer;
|
||||
@ -1300,13 +1270,6 @@ struct Net::Impl : public detail::NetImplBase
|
||||
}
|
||||
|
||||
Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
|
||||
#ifdef HAVE_CUDA
|
||||
if (preferableBackend == DNN_BACKEND_CUDA)
|
||||
{
|
||||
auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
|
||||
cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
|
||||
}
|
||||
#endif
|
||||
backendWrappers[data] = wrapper;
|
||||
return wrapper;
|
||||
}
|
||||
@ -2374,10 +2337,57 @@ struct Net::Impl : public detail::NetImplBase
|
||||
#endif
|
||||
}
|
||||
|
||||
void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_) {
|
||||
void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
{
|
||||
CV_Assert(haveCUDA());
|
||||
CV_Assert(preferableBackend == DNN_BACKEND_CUDA);
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
if (cuda4dnn::getDeviceCount() <= 0)
|
||||
CV_Error(Error::StsError, "No CUDA capable device found.");
|
||||
|
||||
if (cuda4dnn::getDevice() < 0)
|
||||
CV_Error(Error::StsError, "No CUDA capable device selected.");
|
||||
|
||||
if (!cuda4dnn::isDeviceCompatible())
|
||||
CV_Error(Error::GpuNotSupported, "OpenCV was not built to work with the selected device. Please check CUDA_ARCH_PTX or CUDA_ARCH_BIN in your build configuration.");
|
||||
|
||||
if (preferableTarget == DNN_TARGET_CUDA_FP16 && !cuda4dnn::doesDeviceSupportFP16())
|
||||
CV_Error(Error::StsError, "The selected CUDA device does not support FP16 operations.");
|
||||
|
||||
if (!cudaInfo)
|
||||
{
|
||||
cuda4dnn::csl::CSLContext context;
|
||||
context.stream = cuda4dnn::csl::Stream(true);
|
||||
context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
|
||||
context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
|
||||
|
||||
auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
|
||||
cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
|
||||
cuda4dnn::checkVersions();
|
||||
}
|
||||
|
||||
cudaInfo->workspace = cuda4dnn::csl::Workspace(); // release workspace memory if any
|
||||
|
||||
for (auto& layer : layers)
|
||||
{
|
||||
auto& ld = layer.second;
|
||||
if (ld.id == 0)
|
||||
{
|
||||
for (auto& wrapper : ld.inputBlobsWrappers)
|
||||
{
|
||||
auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
|
||||
cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& wrapper : ld.outputBlobsWrappers)
|
||||
{
|
||||
auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
|
||||
cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& layer : layers)
|
||||
{
|
||||
auto& ld = layer.second;
|
||||
@ -2653,11 +2663,11 @@ struct Net::Impl : public detail::NetImplBase
|
||||
if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
|
||||
{
|
||||
// we create a temporary backend node for eltwise layer to obtain the eltwise configuration
|
||||
auto context = cudaInfo->context; /* make a copy so that initCUDA doesn't modify cudaInfo */
|
||||
cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp does not use the context during init
|
||||
const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
|
||||
const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
|
||||
if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
|
||||
nextEltwiseLayer = Ptr<EltwiseLayer>();
|
||||
nextEltwiseLayer = Ptr<EltwiseLayer>();
|
||||
|
||||
// check for variable channels
|
||||
auto& inputs = nextData->inputBlobs;
|
||||
|
Loading…
Reference in New Issue
Block a user