diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index ff9d17fcc2..a492b6ad45 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -27,8 +27,18 @@ ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
     AND HAVE_CUDNN
 )
 
-if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
-  add_definitions(-DCV_CUDA4DNN=1)
+if(OPENCV_DNN_CUDA)
+  if(HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
+    add_definitions(-DCV_CUDA4DNN=1)
+  else()
+    if(NOT HAVE_CUDA)
+      message(SEND_ERROR "DNN: CUDA backend requires CUDA Toolkit. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
+    elseif(NOT HAVE_CUBLAS)
+      message(SEND_ERROR "DNN: CUDA backend requires cuBLAS. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
+    elseif(NOT HAVE_CUDNN)
+      message(SEND_ERROR "DNN: CUDA backend requires cuDNN. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
+    endif()
+  endif()
 endif()
 
 ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")
diff --git a/modules/dnn/src/cuda4dnn/init.hpp b/modules/dnn/src/cuda4dnn/init.hpp
new file mode 100644
index 0000000000..b548f0958a
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/init.hpp
@@ -0,0 +1,96 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
+
+#include "csl/error.hpp"
+
+#include <opencv2/core/cuda.hpp>
+#include <opencv2/core/utils/logger.hpp>
+
+#include <cuda_runtime.h>
+#include <cudnn.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    void checkVersions()
+    {
+        int cudart_version = 0;
+        CUDA4DNN_CHECK_CUDA(cudaRuntimeGetVersion(&cudart_version));
+        if (cudart_version != CUDART_VERSION)
+        {
+            std::ostringstream oss;
+            oss << "CUDART reports version " << cudart_version << " which does not match with the version " << CUDART_VERSION << " with which OpenCV was built";
+            CV_LOG_WARNING(NULL, oss.str().c_str());
+        }
+
+        auto cudnn_version = cudnnGetVersion();
+        if (cudnn_version != CUDNN_VERSION)
+        {
+            std::ostringstream oss;
+            oss << "cuDNN reports version " << cudnn_version << " which does not match with the version " << CUDNN_VERSION << " with which OpenCV was built";
+            CV_LOG_WARNING(NULL, oss.str().c_str());
+        }
+
+        auto cudnn_cudart_version = cudnnGetCudartVersion();
+        if (cudart_version != cudnn_cudart_version)
+        {
+            std::ostringstream oss;
+            oss << "CUDART version " << cudnn_cudart_version << " reported by cuDNN " << cudnn_version << " does not match with the version reported by CUDART " << cudart_version;
+            CV_LOG_WARNING(NULL, oss.str().c_str());
+        }
+    }
+
+    int getDeviceCount()
+    {
+        return cuda::getCudaEnabledDeviceCount();
+    }
+
+    int getDevice()
+    {
+        int device_id = -1;
+        CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device_id));
+        return device_id;
+    }
+
+    bool isDeviceCompatible()
+    {
+        int device_id = getDevice();
+        if (device_id < 0)
+            return false;
+
+        int major = 0, minor = 0;
+        CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
+        CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
+
+        if (cv::cuda::TargetArchs::hasEqualOrLessPtx(major, minor))
+            return true;
+
+        for (int i = minor; i >= 0; i--)
+            if (cv::cuda::TargetArchs::hasBin(major, i))
+                return true;
+
+        return false;
+    }
+
+    bool doesDeviceSupportFP16()
+    {
+        int device_id = getDevice();
+        if (device_id < 0)
+            return false;
+
+        int major = 0, minor = 0;
+        CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
+        CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
+
+        int version = major * 10 + minor;
+        if (version < 53)
+            return false;
+        return true;
+    }
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP */
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 59c00323d6..a04db892fc 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -47,7 +47,8 @@
 #include "op_cuda.hpp"
 
 #ifdef HAVE_CUDA
-#include "cuda4dnn/primitives/eltwise.hpp"
+#include "cuda4dnn/init.hpp"
+#include "cuda4dnn/primitives/eltwise.hpp" // required by fuseLayers
 #endif
 
 #include "halide_scheduler.hpp"
@@ -66,8 +67,6 @@
 #include <opencv2/core/utils/configuration.private.hpp>
 #include <opencv2/core/utils/logger.hpp>
 
-#include <opencv2/core/cuda.hpp>
-
 namespace cv {
 namespace dnn {
 CV__DNN_INLINE_NS_BEGIN
@@ -159,23 +158,6 @@ public:
     }
 #endif
 
-#ifdef HAVE_CUDA
-    static inline bool cudaDeviceSupportsFp16() {
-        if (cv::cuda::getCudaEnabledDeviceCount() <= 0)
-            return false;
-        const int devId = cv::cuda::getDevice();
-        if (devId<0)
-            return false;
-        cv::cuda::DeviceInfo dev_info(devId);
-        if (!dev_info.isCompatible())
-            return false;
-        int version = dev_info.majorVersion() * 10 + dev_info.minorVersion();
-        if (version < 53)
-            return false;
-        return true;
-    }
-#endif
-
 private:
     BackendRegistry()
     {
@@ -247,9 +229,10 @@ private:
 #endif
 
 #ifdef HAVE_CUDA
-        if (haveCUDA()) {
+        if (haveCUDA() && cuda4dnn::isDeviceCompatible())
+        {
             backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
-            if (cudaDeviceSupportsFp16())
+            if (cuda4dnn::doesDeviceSupportFP16())
                 backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
         }
 #endif
@@ -1189,19 +1172,6 @@ struct Net::Impl : public detail::NetImplBase
         preferableBackend = DNN_BACKEND_DEFAULT;
         preferableTarget = DNN_TARGET_CPU;
         skipInfEngineInit = false;
-
-#ifdef HAVE_CUDA
-        if (cv::cuda::getCudaEnabledDeviceCount() > 0)
-        {
-            cuda4dnn::csl::CSLContext context;
-            context.stream = cuda4dnn::csl::Stream(true);
-            context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
-            context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
-
-            auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
-            cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
-        }
-#endif
     }
 
     Ptr<DataLayer> netInputLayer;
@@ -1300,13 +1270,6 @@ struct Net::Impl : public detail::NetImplBase
         }
 
         Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
-#ifdef HAVE_CUDA
-        if (preferableBackend == DNN_BACKEND_CUDA)
-        {
-            auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
-            cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
-        }
-#endif
         backendWrappers[data] = wrapper;
         return wrapper;
     }
@@ -2374,10 +2337,57 @@ struct Net::Impl : public detail::NetImplBase
 #endif
     }
 
-    void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_) {
+    void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_)
+    {
         CV_Assert(haveCUDA());
+        CV_Assert(preferableBackend == DNN_BACKEND_CUDA);
 
 #ifdef HAVE_CUDA
+        if (cuda4dnn::getDeviceCount() <= 0)
+            CV_Error(Error::StsError, "No CUDA capable device found.");
+
+        if (cuda4dnn::getDevice() < 0)
+            CV_Error(Error::StsError, "No CUDA capable device selected.");
+
+        if (!cuda4dnn::isDeviceCompatible())
+            CV_Error(Error::GpuNotSupported, "OpenCV was not built to work with the selected device. Please check CUDA_ARCH_PTX or CUDA_ARCH_BIN in your build configuration.");
+
+        if (preferableTarget == DNN_TARGET_CUDA_FP16 && !cuda4dnn::doesDeviceSupportFP16())
+            CV_Error(Error::StsError, "The selected CUDA device does not support FP16 operations.");
+
+        if (!cudaInfo)
+        {
+            cuda4dnn::csl::CSLContext context;
+            context.stream = cuda4dnn::csl::Stream(true);
+            context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
+            context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
+
+            auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
+            cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
+            cuda4dnn::checkVersions();
+        }
+
+        cudaInfo->workspace = cuda4dnn::csl::Workspace(); // release workspace memory if any
+
+        for (auto& layer : layers)
+        {
+            auto& ld = layer.second;
+            if (ld.id == 0)
+            {
+                for (auto& wrapper : ld.inputBlobsWrappers)
+                {
+                    auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
+                    cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
+                }
+            }
+
+            for (auto& wrapper : ld.outputBlobsWrappers)
+            {
+                auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
+                cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
+            }
+        }
+
         for (auto& layer : layers)
         {
             auto& ld = layer.second;
@@ -2653,11 +2663,11 @@ struct Net::Impl : public detail::NetImplBase
                 if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
                 {
                     // we create a temporary backend node for eltwise layer to obtain the eltwise configuration
-                    auto context = cudaInfo->context; /* make a copy so that initCUDA doesn't modify cudaInfo */
+                    cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp does not use the context during init
                     const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
                     const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
                     if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
-                        nextEltwiseLayer = Ptr<EltwiseLayer>();
+                        nextEltwiseLayer = Ptr<EltwiseLayer>();
 
                     // check for variable channels
                     auto& inputs = nextData->inputBlobs;
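
For reference, below is a minimal usage sketch, not part of the patch: it selects the CUDA backend through the existing public dnn API so that the device, cuBLAS/cuDNN and version checks added in initCUDABackend() above are exercised. The model path "model.onnx", the dummy input, and the input size are placeholders.

// Usage sketch (not part of the patch above), assuming OpenCV was built with OPENCV_DNN_CUDA=ON.
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/dnn.hpp>

int main()
{
    // Same device count that cuda4dnn::getDeviceCount() consults internally.
    if (cv::cuda::getCudaEnabledDeviceCount() <= 0)
    {
        std::cerr << "No CUDA capable device is available." << std::endl;
        return 1;
    }

    cv::dnn::Net net = cv::dnn::readNet("model.onnx"); // placeholder model path
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA); // DNN_TARGET_CUDA_FP16 additionally requires compute capability 5.3+

    cv::Mat image = cv::Mat::zeros(224, 224, CV_8UC3);         // dummy input image
    cv::Mat blob = cv::dnn::blobFromImage(image, 1.0 / 255.0); // NCHW float blob
    net.setInput(blob);

    try
    {
        // With this patch the CUDA context is created when the network is set up for
        // the CUDA backend, which for this sketch happens on the first forward() call;
        // the new device/version checks run at that point.
        cv::Mat out = net.forward();
        std::cout << "output elements: " << out.total() << std::endl;
    }
    catch (const cv::Exception& e)
    {
        // e.g. device not covered by CUDA_ARCH_BIN/CUDA_ARCH_PTX, or FP16 not supported
        std::cerr << e.what() << std::endl;
        return 1;
    }
    return 0;
}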