mirror of
https://github.com/opencv/opencv.git
synced 2025-06-13 04:52:53 +08:00
add cuDNN 8 support
This commit is contained in:
parent
206c843f36
commit
62a63021c7
@ -21,14 +21,10 @@ if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
|
|||||||
add_definitions(-DCV_OCL4DNN=1)
|
add_definitions(-DCV_OCL4DNN=1)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(NOT DEFINED OPENCV_DNN_CUDA AND HAVE_CUDNN AND CUDNN_VERSION VERSION_LESS 8.0)
|
|
||||||
message(STATUS "DNN: CUDNN 8.0 is not supported yes. Details: https://github.com/opencv/opencv/issues/17496")
|
|
||||||
endif()
|
|
||||||
ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
|
ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
|
||||||
HAVE_CUDA
|
HAVE_CUDA
|
||||||
AND HAVE_CUBLAS
|
AND HAVE_CUBLAS
|
||||||
AND HAVE_CUDNN
|
AND HAVE_CUDNN
|
||||||
AND CUDNN_VERSION VERSION_LESS 8.0
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
|
if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
|
||||||
|
@ -225,6 +225,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
|
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
|
||||||
|
|
||||||
|
#if CUDNN_MAJOR >= 8
|
||||||
|
/* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
|
||||||
|
* in the default setting. TF32 convolutions have lower precision than FP32.
|
||||||
|
* Hence, we set the math type to CUDNN_FMA_MATH to reproduce old behavior.
|
||||||
|
*/
|
||||||
|
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_FMA_MATH));
|
||||||
|
#endif
|
||||||
|
|
||||||
if (std::is_same<T, half>::value)
|
if (std::is_same<T, half>::value)
|
||||||
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
|
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
@ -254,15 +263,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
|
|||||||
*/
|
*/
|
||||||
ConvolutionAlgorithm(
|
ConvolutionAlgorithm(
|
||||||
const Handle& handle,
|
const Handle& handle,
|
||||||
const ConvolutionDescriptor<T>& conv,
|
const ConvolutionDescriptor<T>& convDesc,
|
||||||
const FilterDescriptor<T>& filter,
|
const FilterDescriptor<T>& filterDesc,
|
||||||
const TensorDescriptor<T>& input,
|
const TensorDescriptor<T>& inputDesc,
|
||||||
const TensorDescriptor<T>& output)
|
const TensorDescriptor<T>& outputDesc)
|
||||||
{
|
{
|
||||||
|
#if CUDNN_MAJOR >= 8
|
||||||
|
int requestedAlgoCount = 0, returnedAlgoCount = 0;
|
||||||
|
CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
|
||||||
|
std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount);
|
||||||
|
CUDA4DNN_CHECK_CUDNN(
|
||||||
|
cudnnGetConvolutionForwardAlgorithm_v7(
|
||||||
|
handle.get(),
|
||||||
|
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
|
||||||
|
requestedAlgoCount,
|
||||||
|
&returnedAlgoCount,
|
||||||
|
&results[0]
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
size_t free_memory, total_memory;
|
||||||
|
CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
|
||||||
|
|
||||||
|
bool found_conv_algorithm = false;
|
||||||
|
for (int i = 0; i < returnedAlgoCount; i++)
|
||||||
|
{
|
||||||
|
if (results[i].status == CUDNN_STATUS_SUCCESS &&
|
||||||
|
results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
|
||||||
|
results[i].memory < free_memory)
|
||||||
|
{
|
||||||
|
found_conv_algorithm = true;
|
||||||
|
algo = results[i].algo;
|
||||||
|
workspace_size = results[i].memory;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!found_conv_algorithm)
|
||||||
|
CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for convolution.");
|
||||||
|
#else
|
||||||
CUDA4DNN_CHECK_CUDNN(
|
CUDA4DNN_CHECK_CUDNN(
|
||||||
cudnnGetConvolutionForwardAlgorithm(
|
cudnnGetConvolutionForwardAlgorithm(
|
||||||
handle.get(),
|
handle.get(),
|
||||||
input.get(), filter.get(), conv.get(), output.get(),
|
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
|
||||||
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
|
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
|
||||||
0, /* no memory limit */
|
0, /* no memory limit */
|
||||||
&algo
|
&algo
|
||||||
@ -272,10 +315,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
|
|||||||
CUDA4DNN_CHECK_CUDNN(
|
CUDA4DNN_CHECK_CUDNN(
|
||||||
cudnnGetConvolutionForwardWorkspaceSize(
|
cudnnGetConvolutionForwardWorkspaceSize(
|
||||||
handle.get(),
|
handle.get(),
|
||||||
input.get(), filter.get(), conv.get(), output.get(),
|
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
|
||||||
algo, &workspace_size
|
algo, &workspace_size
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
|
ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
|
||||||
|
@ -30,15 +30,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
|
|||||||
|
|
||||||
TransposeConvolutionAlgorithm(
|
TransposeConvolutionAlgorithm(
|
||||||
const Handle& handle,
|
const Handle& handle,
|
||||||
const ConvolutionDescriptor<T>& conv,
|
const ConvolutionDescriptor<T>& convDesc,
|
||||||
const FilterDescriptor<T>& filter,
|
const FilterDescriptor<T>& filterDesc,
|
||||||
const TensorDescriptor<T>& input,
|
const TensorDescriptor<T>& inputDesc,
|
||||||
const TensorDescriptor<T>& output)
|
const TensorDescriptor<T>& outputDesc)
|
||||||
{
|
{
|
||||||
|
#if CUDNN_MAJOR >= 8
|
||||||
|
int requestedAlgoCount = 0, returnedAlgoCount = 0;
|
||||||
|
CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
|
||||||
|
std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(requestedAlgoCount);
|
||||||
|
CUDA4DNN_CHECK_CUDNN(
|
||||||
|
cudnnGetConvolutionBackwardDataAlgorithm_v7(
|
||||||
|
handle.get(),
|
||||||
|
filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
|
||||||
|
requestedAlgoCount,
|
||||||
|
&returnedAlgoCount,
|
||||||
|
&results[0]
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
size_t free_memory, total_memory;
|
||||||
|
CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
|
||||||
|
|
||||||
|
bool found_conv_algorithm = false;
|
||||||
|
for (int i = 0; i < returnedAlgoCount; i++)
|
||||||
|
{
|
||||||
|
if (results[i].status == CUDNN_STATUS_SUCCESS &&
|
||||||
|
results[i].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
|
||||||
|
results[i].memory < free_memory)
|
||||||
|
{
|
||||||
|
found_conv_algorithm = true;
|
||||||
|
dalgo = results[i].algo;
|
||||||
|
workspace_size = results[i].memory;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!found_conv_algorithm)
|
||||||
|
CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for transpose convolution.");
|
||||||
|
#else
|
||||||
CUDA4DNN_CHECK_CUDNN(
|
CUDA4DNN_CHECK_CUDNN(
|
||||||
cudnnGetConvolutionBackwardDataAlgorithm(
|
cudnnGetConvolutionBackwardDataAlgorithm(
|
||||||
handle.get(),
|
handle.get(),
|
||||||
filter.get(), input.get(), conv.get(), output.get(),
|
filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
|
||||||
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
|
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
|
||||||
0, /* no memory limit */
|
0, /* no memory limit */
|
||||||
&dalgo
|
&dalgo
|
||||||
@ -48,10 +82,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
|
|||||||
CUDA4DNN_CHECK_CUDNN(
|
CUDA4DNN_CHECK_CUDNN(
|
||||||
cudnnGetConvolutionBackwardDataWorkspaceSize(
|
cudnnGetConvolutionBackwardDataWorkspaceSize(
|
||||||
handle.get(),
|
handle.get(),
|
||||||
filter.get(), input.get(), conv.get(), output.get(),
|
filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
|
||||||
dalgo, &workspace_size
|
dalgo, &workspace_size
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
|
TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
|
||||||
|
Loading…
Reference in New Issue
Block a user