mirror of
https://github.com/opencv/opencv.git
synced 2025-08-05 22:19:14 +08:00
Merge pull request #17788 from YashasSamaga:cuda4dnn-nice-build
This commit is contained in:
commit
cd0f0384ef
@ -27,8 +27,18 @@ ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
|
||||
AND HAVE_CUDNN
|
||||
)
|
||||
|
||||
if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
|
||||
add_definitions(-DCV_CUDA4DNN=1)
|
||||
# Enable the cuda4dnn backend only when the user asked for it AND every
# hard dependency (CUDA Toolkit, cuBLAS, cuDNN) was detected; otherwise
# abort configuration with a message naming the first missing dependency.
if(OPENCV_DNN_CUDA)
  if(HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
    # CV_CUDA4DNN guards all CUDA backend code paths in the DNN module.
    add_definitions(-DCV_CUDA4DNN=1)
  else()
    # SEND_ERROR (not FATAL_ERROR) lets configuration continue so that
    # any further problems are reported in the same run.
    if(NOT HAVE_CUDA)
      message(SEND_ERROR "DNN: CUDA backend requires CUDA Toolkit. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
    elseif(NOT HAVE_CUBLAS)
      message(SEND_ERROR "DNN: CUDA backend requires cuBLAS. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
    elseif(NOT HAVE_CUDNN)
      message(SEND_ERROR "DNN: CUDA backend requires cuDNN. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
    endif()
  endif()
endif()
|
||||
|
||||
ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")
|
||||
|
96
modules/dnn/src/cuda4dnn/init.hpp
Normal file
96
modules/dnn/src/cuda4dnn/init.hpp
Normal file
@ -0,0 +1,96 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
|
||||
|
||||
#include "csl/error.hpp"

#include <cuda_runtime.h>
#include <cudnn.h>

#include <opencv2/core/cuda.hpp>
#include <opencv2/core/utils/logger.hpp>

#include <sstream>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn {
|
||||
|
||||
void checkVersions()
|
||||
{
|
||||
int cudart_version = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaRuntimeGetVersion(&cudart_version));
|
||||
if (cudart_version != CUDART_VERSION)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "CUDART reports version " << cudart_version << " which does not match with the version " << CUDART_VERSION << " with which OpenCV was built";
|
||||
CV_LOG_WARNING(NULL, oss.str().c_str());
|
||||
}
|
||||
|
||||
auto cudnn_version = cudnnGetVersion();
|
||||
if (cudnn_version != CUDNN_VERSION)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "cuDNN reports version " << cudnn_version << " which does not match with the version " << CUDNN_VERSION << " with which OpenCV was built";
|
||||
CV_LOG_WARNING(NULL, oss.str().c_str());
|
||||
}
|
||||
|
||||
auto cudnn_cudart_version = cudnnGetCudartVersion();
|
||||
if (cudart_version != cudnn_cudart_version)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "CUDART version " << cudnn_cudart_version << " reported by cuDNN " << cudnn_version << " does not match with the version reported by CUDART " << cudart_version;
|
||||
CV_LOG_WARNING(NULL, oss.str().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
int getDeviceCount()
|
||||
{
|
||||
return cuda::getCudaEnabledDeviceCount();
|
||||
}
|
||||
|
||||
int getDevice()
|
||||
{
|
||||
int device_id = -1;
|
||||
CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device_id));
|
||||
return device_id;
|
||||
}
|
||||
|
||||
bool isDeviceCompatible()
|
||||
{
|
||||
int device_id = getDevice();
|
||||
if (device_id < 0)
|
||||
return false;
|
||||
|
||||
int major = 0, minor = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
|
||||
|
||||
if (cv::cuda::TargetArchs::hasEqualOrLessPtx(major, minor))
|
||||
return true;
|
||||
|
||||
for (int i = minor; i >= 0; i--)
|
||||
if (cv::cuda::TargetArchs::hasBin(major, i))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool doesDeviceSupportFP16()
|
||||
{
|
||||
int device_id = getDevice();
|
||||
if (device_id < 0)
|
||||
return false;
|
||||
|
||||
int major = 0, minor = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
|
||||
|
||||
int version = major * 10 + minor;
|
||||
if (version < 53)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP */
|
@ -47,7 +47,8 @@
|
||||
#include "op_cuda.hpp"
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
#include "cuda4dnn/primitives/eltwise.hpp"
|
||||
#include "cuda4dnn/init.hpp"
|
||||
#include "cuda4dnn/primitives/eltwise.hpp" // required by fuseLayers
|
||||
#endif
|
||||
|
||||
#include "halide_scheduler.hpp"
|
||||
@ -66,8 +67,6 @@
|
||||
#include <opencv2/core/utils/configuration.private.hpp>
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
|
||||
#include <opencv2/core/cuda.hpp>
|
||||
|
||||
namespace cv {
|
||||
namespace dnn {
|
||||
CV__DNN_INLINE_NS_BEGIN
|
||||
@ -159,23 +158,6 @@ public:
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
static inline bool cudaDeviceSupportsFp16() {
|
||||
if (cv::cuda::getCudaEnabledDeviceCount() <= 0)
|
||||
return false;
|
||||
const int devId = cv::cuda::getDevice();
|
||||
if (devId<0)
|
||||
return false;
|
||||
cv::cuda::DeviceInfo dev_info(devId);
|
||||
if (!dev_info.isCompatible())
|
||||
return false;
|
||||
int version = dev_info.majorVersion() * 10 + dev_info.minorVersion();
|
||||
if (version < 53)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
BackendRegistry()
|
||||
{
|
||||
@ -247,9 +229,10 @@ private:
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
if (haveCUDA()) {
|
||||
if (haveCUDA() && cuda4dnn::isDeviceCompatible())
|
||||
{
|
||||
backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
|
||||
if (cudaDeviceSupportsFp16())
|
||||
if (cuda4dnn::doesDeviceSupportFP16())
|
||||
backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
|
||||
}
|
||||
#endif
|
||||
@ -1189,19 +1172,6 @@ struct Net::Impl : public detail::NetImplBase
|
||||
preferableBackend = DNN_BACKEND_DEFAULT;
|
||||
preferableTarget = DNN_TARGET_CPU;
|
||||
skipInfEngineInit = false;
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
if (cv::cuda::getCudaEnabledDeviceCount() > 0)
|
||||
{
|
||||
cuda4dnn::csl::CSLContext context;
|
||||
context.stream = cuda4dnn::csl::Stream(true);
|
||||
context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
|
||||
context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
|
||||
|
||||
auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
|
||||
cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
Ptr<DataLayer> netInputLayer;
|
||||
@ -1300,13 +1270,6 @@ struct Net::Impl : public detail::NetImplBase
|
||||
}
|
||||
|
||||
Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
|
||||
#ifdef HAVE_CUDA
|
||||
if (preferableBackend == DNN_BACKEND_CUDA)
|
||||
{
|
||||
auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
|
||||
cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
|
||||
}
|
||||
#endif
|
||||
backendWrappers[data] = wrapper;
|
||||
return wrapper;
|
||||
}
|
||||
@ -2374,10 +2337,57 @@ struct Net::Impl : public detail::NetImplBase
|
||||
#endif
|
||||
}
|
||||
|
||||
void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_) {
|
||||
void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
{
|
||||
CV_Assert(haveCUDA());
|
||||
CV_Assert(preferableBackend == DNN_BACKEND_CUDA);
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
if (cuda4dnn::getDeviceCount() <= 0)
|
||||
CV_Error(Error::StsError, "No CUDA capable device found.");
|
||||
|
||||
if (cuda4dnn::getDevice() < 0)
|
||||
CV_Error(Error::StsError, "No CUDA capable device selected.");
|
||||
|
||||
if (!cuda4dnn::isDeviceCompatible())
|
||||
CV_Error(Error::GpuNotSupported, "OpenCV was not built to work with the selected device. Please check CUDA_ARCH_PTX or CUDA_ARCH_BIN in your build configuration.");
|
||||
|
||||
if (preferableTarget == DNN_TARGET_CUDA_FP16 && !cuda4dnn::doesDeviceSupportFP16())
|
||||
CV_Error(Error::StsError, "The selected CUDA device does not support FP16 operations.");
|
||||
|
||||
if (!cudaInfo)
|
||||
{
|
||||
cuda4dnn::csl::CSLContext context;
|
||||
context.stream = cuda4dnn::csl::Stream(true);
|
||||
context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
|
||||
context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
|
||||
|
||||
auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
|
||||
cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
|
||||
cuda4dnn::checkVersions();
|
||||
}
|
||||
|
||||
cudaInfo->workspace = cuda4dnn::csl::Workspace(); // release workspace memory if any
|
||||
|
||||
for (auto& layer : layers)
|
||||
{
|
||||
auto& ld = layer.second;
|
||||
if (ld.id == 0)
|
||||
{
|
||||
for (auto& wrapper : ld.inputBlobsWrappers)
|
||||
{
|
||||
auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
|
||||
cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& wrapper : ld.outputBlobsWrappers)
|
||||
{
|
||||
auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
|
||||
cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& layer : layers)
|
||||
{
|
||||
auto& ld = layer.second;
|
||||
@ -2653,11 +2663,11 @@ struct Net::Impl : public detail::NetImplBase
|
||||
if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
|
||||
{
|
||||
// we create a temporary backend node for eltwise layer to obtain the eltwise configuration
|
||||
auto context = cudaInfo->context; /* make a copy so that initCUDA doesn't modify cudaInfo */
|
||||
cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp does not use the context during init
|
||||
const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
|
||||
const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
|
||||
if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
|
||||
nextEltwiseLayer = Ptr<EltwiseLayer>();
|
||||
nextEltwiseLayer = Ptr<EltwiseLayer>();
|
||||
|
||||
// check for variable channels
|
||||
auto& inputs = nextData->inputBlobs;
|
||||
|
Loading…
Reference in New Issue
Block a user