// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. #ifndef OPENCV_DNN_SRC_OP_CUDA_HPP #define OPENCV_DNN_SRC_OP_CUDA_HPP #ifdef HAVE_CUDA #include "cuda4dnn/csl/stream.hpp" #include "cuda4dnn/csl/event.hpp" #include "cuda4dnn/csl/cublas.hpp" #include "cuda4dnn/csl/cudnn.hpp" #include "cuda4dnn/csl/tensor.hpp" #include "cuda4dnn/csl/memory.hpp" #include "cuda4dnn/csl/workspace.hpp" #include "cuda4dnn/kernels/fp_conversion.hpp" #endif #include #include #include #include #include namespace cv { namespace dnn { constexpr bool IS_DNN_CUDA_TARGET(int id) { return id == DNN_TARGET_CUDA_FP16 || id == DNN_TARGET_CUDA; } constexpr bool haveCUDA() { #ifdef HAVE_CUDA return true; #else return false; #endif } #ifdef HAVE_CUDA namespace cuda4dnn { namespace csl { struct CSLContext { Stream stream; cublas::Handle cublas_handle; cudnn::Handle cudnn_handle; }; /** @brief creates Tensor object from cv::Mat (only the header is created, i.e. no data is copied) * * \tparam T element type for the tensor * \param[in] mat cv::Mat from which the shape must be inferred * * \return a Tensor object with the shape of \p mat */ template Tensor makeTensorHeader(const Mat& mat) { auto sizes = shape(mat); return Tensor(std::begin(sizes), std::end(sizes)); } /** @brief copies data from a cv::Mat to TensorType * * \tparam T the type of the elements contained in TensorType object * * \param[in] srcMat source matrix * \param[out] destTensor destination tensor * \param stream CUDA stream to use for the memory transfer * * The memory copy starts from beginning \p srcMat. The number of elements copied is * equal to the number of elements in \p destTensor. * * Pre-conditions: * - \p srcMat must contain elements of type CV_32F * - the size of \p srcMat must be larger than or equal to the size of \p destTensor * * @note best performance when \p srcMat is continuous and page-locked * @note blocks calling thread if \p srcMat is not page-locked */ template void copyMatToTensor(const Mat& srcMat, const TensorSpan destTensor, const Stream& stream); template <> inline void copyMatToTensor(const Mat& srcMat, const TensorSpan destTensor, const Stream& stream) { /* should perhaps convert cv::Mat of different type to the required type and copy */ CV_Assert(srcMat.type() == CV_32F); CV_Assert(srcMat.total() >= destTensor.size()); Mat temp; srcMat.convertTo(temp, CV_16F); CV_Assert(temp.isContinuous()); memcpy(destTensor.get(), reinterpret_cast(temp.data), destTensor.size(), stream); } template <> inline void copyMatToTensor(const Mat& srcMat, const TensorSpan destTensor, const Stream& stream) { /* should perhaps convert cv::Mat of different type to the required type and copy */ CV_Assert(srcMat.type() == CV_32F); CV_Assert(srcMat.total() >= destTensor.size()); Mat temp = srcMat.isContinuous() ? srcMat : srcMat.clone(); CV_Assert(temp.isContinuous()); memcpy(destTensor.get(), reinterpret_cast(temp.data), destTensor.size(), stream); } /** @brief copies data from a TensorType to a cv::Mat * * \tparam T the type of the elements contained in TensorType object * * \param[in] srcTensor source tensor * \param[out] destMat destination matrix * \param stream CUDA stream to use for the memory transfer * * The entire memory block held by the \p srcTensor is copied to \p destMat. * * Pre-conditions: * - \p destMat must contain elements of type CV_32F * - the size of \p destMat must be larger than or equal to the size of \p srcTensor * * @note best performance when \p destMat is continuous and page-locked * @note blocks calling thread if \p destMat is not page-locked */ template void copyTensorToMat(TensorView srcTensor, Mat& destMat, const Stream& stream); template <> inline void copyTensorToMat(TensorView srcTensor, Mat& destMat, const Stream& stream) { CV_Assert(destMat.type() == CV_32F); CV_Assert(destMat.total() >= srcTensor.size()); Mat temp(shape(destMat), CV_16F); CV_Assert(temp.isContinuous()); memcpy(reinterpret_cast(temp.data), srcTensor.get(), srcTensor.size(), stream); temp.convertTo(destMat, CV_32F); } template <> inline void copyTensorToMat(TensorView srcTensor, Mat& destMat, const Stream& stream) { CV_Assert(destMat.type() == CV_32F); CV_Assert(destMat.total() >= srcTensor.size()); Mat temp = destMat.isContinuous() ? destMat : destMat.clone(); CV_Assert(temp.isContinuous()); memcpy(reinterpret_cast(temp.data), srcTensor.get(), srcTensor.size(), stream); if (temp.data != destMat.data) temp.copyTo(destMat); } }} /* namespace cuda4dnn::csl */ /** base class for CUDA operation nodes (for all supported targets) */ class CUDABackendNode : public BackendNode { public: CUDABackendNode() : BackendNode(DNN_BACKEND_CUDA) { } virtual ~CUDABackendNode() { } virtual void forward( const std::vector>& inputs, const std::vector>& outputs, cuda4dnn::csl::Workspace& workspace) = 0; virtual std::size_t get_workspace_memory_in_bytes() const noexcept { return 0; } }; /** @brief utility function which creates CUDA node of correct type from `targetId` * * CUDA operation nodes take the type of data they operate on as a template parameter. * For example, ConcatOp is an operation node which concats tensors of `float` type * into a tensor of `float` type. * * This utility function aids the creation of nodes of different types and eliminates the * need for CUDA target constants (`DNN_TARGET_XXX`) to appear in the operation code which * reduces coupling between modules. * * Example: * template * class ConcatOp : public CUDABackendNode; * * // returns a cv::Ptr to a ConcatOp object * auto node = make_cuda_node(DNN_TARGET_CUDA_FP16, axis); * * // returns a cv::Ptr to a ConcatOp object * auto node = make_cuda_node(DNN_TARGET_CUDA, axis); */ template