Merge pull request #14827 from YashasSamaga:cuda4dnn-csl-low
CUDA backend for the DNN module
* stub cuda4dnn design
* minor fixes for tests and doxygen
* add csl public api directory to module headers
* add low-level CSL components
* add high-level CSL components
* integrate csl::Tensor into backbone code
* switch to CPU iff unsupported; otherwise, fail on error
* add fully connected layer
* add softmax layer
* add activation layers
* support arbitrary rank TensorDescriptor
* pass input wrappers to `initCUDA()`
* add 1d/2d/3d-convolution
* add pooling layer
* reorganize and refactor code
* fixes for gcc, clang and doxygen; remove cxx14/17 code
* add blank_layer
* add LRN layer
* add rounding modes for pooling layer
* split tensor.hpp into tensor.hpp and tensor_ops.hpp
* add concat layer
* add scale layer
* add batch normalization layer
* split math.cu into activations.cu and math.hpp
* add eltwise layer
* add flatten layer
* add tensor transform api
* add asymmetric padding support for convolution layer
* add reshape layer
* fix rebase issues
* add permute layer
* add padding support for concat layer
* refactor and reorganize code
* add normalize layer
* optimize bias addition in scale layer
* add prior box layer
* fix and optimize normalize layer
* add asymmetric padding support for pooling layer
* add event API
* improve pooling performance for some padding scenarios
* avoid over-allocation of compute resources to kernels
* improve prior box performance
* enable layer fusion
* add const layer
* add resize layer
* add slice layer
* add padding layer
* add deconvolution layer
* fix channelwise ReLU initialization
* add vector traits
* add vectorized versions of relu, clipped_relu, power
* add vectorized concat kernels
* improve concat_with_offsets performance
* vectorize scale and bias kernels
* add support for multi-billion element tensors
* vectorize prior box kernels
* fix address alignment check
* improve bias addition performance of conv/deconv/fc layers
* restructure code for supporting multiple targets
* add DNN_TARGET_CUDA_FP64
* add DNN_TARGET_FP16
* improve vectorization
* add region layer
* improve tensor API, add dynamic ranks
1. use ManagedPtr instead of a Tensor in backend wrapper
2. add new methods to tensor classes
- size_range: computes the combined size for a given axis range
- tensor span/view can be constructed from a raw pointer and shape
3. the tensor classes can change their rank at runtime (previously rank was fixed at compile-time)
4. remove device code from tensor classes (as they are unused)
5. enforce strict conditions on tensor class APIs to improve debugging ability
* fix parametric relu activation
* add squeeze/unsqueeze tensor API
* add reorg layer
* optimize permute and enable 2d permute
* enable 1d and 2d slice
* add split layer
* add shuffle channel layer
* allow tensors of different ranks in reshape primitive
* patch SliceOp to allow Crop Layer
* allow extra shape inputs in reshape layer
* use `std::move_backward` instead of `std::move` for insert in resizable_static_array
* improve workspace management
* add spatial LRN
* add nms (cpu) to region layer
* add max pooling with argmax ( and a fix to limits.hpp)
* add max unpooling layer
* rename DNN_TARGET_CUDA_FP32 to DNN_TARGET_CUDA
* update supportBackend to be more rigorous
* remove stray include from preventing non-cuda build
* include op_cuda.hpp outside condition #if
* refactoring, fixes and many optimizations
* drop DNN_TARGET_CUDA_FP64
* fix gcc errors
* increase max. tensor rank limit to six
* add Interp layer
* drop custom layers; use BackendNode
* vectorize activation kernels
* fixes for gcc
* remove wrong assertion
* fix broken assertion in unpooling primitive
* fix build errors in non-CUDA build
* completely remove workspace from public API
* fix permute layer
* enable accuracy and perf. tests for DNN_TARGET_CUDA
* add asynchronous forward
* vectorize eltwise ops
* vectorize fill kernel
* fixes for gcc
* remove CSL headers from public API
* remove csl header source group from cmake
* update min. cudnn version in cmake
* add numerically stable FP32 log1pexp
* refactor code
* add FP16 specialization to cudnn based tensor addition
* vectorize scale1 and bias1 + minor refactoring
* fix doxygen build
* fix invalid alignment assertion
* clear backend wrappers before allocateLayers
* ignore memory lock failures
* do not allocate internal blobs
* integrate NVTX
* add numerically stable half precision log1pexp
* fix indentation, following coding style, improve docs
* remove accidental modification of IE code
* Revert "add asynchronous forward"
This reverts commit 1154b9da9da07e9b52f8a81bdcea48cf31c56f70.
* [cmake] throw error for unsupported CC versions
* fix rebase issues
* add more docs, refactor code, fix bugs
* minor refactoring and fixes
* resolve warnings/errors from clang
* remove haveCUDA() checks from supportBackend()
* remove NVTX integration
* changes based on review comments
* avoid exception when no CUDA device is present
* add color code for CUDA in Net::dump
2019-10-21 19:28:00 +08:00
|
|
|
// This file is part of OpenCV project.
|
|
|
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
|
|
// of this distribution and at http://opencv.org/license.html.
|
|
|
|
|
|
|
|
#ifndef OPENCV_DNN_SRC_OP_CUDA_HPP
|
|
|
|
#define OPENCV_DNN_SRC_OP_CUDA_HPP
|
|
|
|
|
|
|
|
#ifdef HAVE_CUDA
|
|
|
|
#include "cuda4dnn/csl/stream.hpp"
|
|
|
|
#include "cuda4dnn/csl/cublas.hpp"
|
|
|
|
#include "cuda4dnn/csl/cudnn.hpp"
|
|
|
|
#include "cuda4dnn/csl/tensor.hpp"
|
|
|
|
#include "cuda4dnn/csl/memory.hpp"
|
|
|
|
#include "cuda4dnn/csl/fp16.hpp"
|
|
|
|
#include "cuda4dnn/csl/workspace.hpp"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <opencv2/dnn/shape_utils.hpp>
|
|
|
|
#include <opencv2/core.hpp>
|
|
|
|
|
|
|
|
#include <cstddef>
|
|
|
|
#include <memory>
|
|
|
|
#include <iterator>
|
|
|
|
|
|
|
|
namespace cv { namespace dnn {
|
|
|
|
|
|
|
|
constexpr bool IS_DNN_CUDA_TARGET(int id) {
|
|
|
|
return id == DNN_TARGET_CUDA_FP16 || id == DNN_TARGET_CUDA;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** @brief compile-time query for CUDA support
 *
 * \return true when this translation unit is compiled with `HAVE_CUDA` defined, false otherwise
 */
constexpr bool haveCUDA() {
#ifndef HAVE_CUDA
    return false;
#else
    return true;
#endif
}
|
|
|
|
|
|
|
|
#ifdef HAVE_CUDA
|
|
|
|
namespace cuda4dnn { namespace csl {
|
|
|
|
struct CSLContext {
|
|
|
|
Stream stream;
|
|
|
|
cublas::Handle cublas_handle;
|
|
|
|
cudnn::Handle cudnn_handle;
|
|
|
|
};
|
|
|
|
|
|
|
|
/** @brief creates Tensor object from cv::Mat (only the header is created, i.e. no data is copied)
|
|
|
|
*
|
|
|
|
* \tparam T element type for the tensor
|
|
|
|
* \param[in] mat cv::Mat from which the shape must be inferred
|
|
|
|
*
|
|
|
|
* \return a Tensor object with the shape of \p mat
|
|
|
|
*/
|
|
|
|
template <class T>
|
|
|
|
Tensor<T> makeTensorHeader(const Mat& mat) {
|
|
|
|
auto sizes = shape(mat);
|
|
|
|
return Tensor<T>(std::begin(sizes), std::end(sizes));
|
|
|
|
}
|
|
|
|
|
|
|
|
/** @brief copies data from a cv::Mat to TensorType
|
|
|
|
*
|
|
|
|
* \tparam T the type of the elements contained in TensorType object
|
|
|
|
*
|
|
|
|
* \param[in] srcMat source matrix
|
|
|
|
* \param[out] destTensor destination tensor
|
|
|
|
* \param stream CUDA stream to use for the memory transfer
|
|
|
|
*
|
2019-11-26 08:55:07 +08:00
|
|
|
* The memory copy starts from beginning \p srcMat. The number of elements copied is
|
Merge pull request #14827 from YashasSamaga:cuda4dnn-csl-low
CUDA backend for the DNN module
* stub cuda4dnn design
* minor fixes for tests and doxygen
* add csl public api directory to module headers
* add low-level CSL components
* add high-level CSL components
* integrate csl::Tensor into backbone code
* switch to CPU iff unsupported; otherwise, fail on error
* add fully connected layer
* add softmax layer
* add activation layers
* support arbitary rank TensorDescriptor
* pass input wrappers to `initCUDA()`
* add 1d/2d/3d-convolution
* add pooling layer
* reorganize and refactor code
* fixes for gcc, clang and doxygen; remove cxx14/17 code
* add blank_layer
* add LRN layer
* add rounding modes for pooling layer
* split tensor.hpp into tensor.hpp and tensor_ops.hpp
* add concat layer
* add scale layer
* add batch normalization layer
* split math.cu into activations.cu and math.hpp
* add eltwise layer
* add flatten layer
* add tensor transform api
* add asymmetric padding support for convolution layer
* add reshape layer
* fix rebase issues
* add permute layer
* add padding support for concat layer
* refactor and reorganize code
* add normalize layer
* optimize bias addition in scale layer
* add prior box layer
* fix and optimize normalize layer
* add asymmetric padding support for pooling layer
* add event API
* improve pooling performance for some padding scenarios
* avoid over-allocation of compute resources to kernels
* improve prior box performance
* enable layer fusion
* add const layer
* add resize layer
* add slice layer
* add padding layer
* add deconvolution layer
* fix channelwise ReLU initialization
* add vector traits
* add vectorized versions of relu, clipped_relu, power
* add vectorized concat kernels
* improve concat_with_offsets performance
* vectorize scale and bias kernels
* add support for multi-billion element tensors
* vectorize prior box kernels
* fix address alignment check
* improve bias addition performance of conv/deconv/fc layers
* restructure code for supporting multiple targets
* add DNN_TARGET_CUDA_FP64
* add DNN_TARGET_FP16
* improve vectorization
* add region layer
* improve tensor API, add dynamic ranks
1. use ManagedPtr instead of a Tensor in backend wrapper
2. add new methods to tensor classes
- size_range: computes the combined size of for a given axis range
- tensor span/view can be constructed from a raw pointer and shape
3. the tensor classes can change their rank at runtime (previously rank was fixed at compile-time)
4. remove device code from tensor classes (as they are unused)
5. enforce strict conditions on tensor class APIs to improve debugging ability
* fix parametric relu activation
* add squeeze/unsqueeze tensor API
* add reorg layer
* optimize permute and enable 2d permute
* enable 1d and 2d slice
* add split layer
* add shuffle channel layer
* allow tensors of different ranks in reshape primitive
* patch SliceOp to allow Crop Layer
* allow extra shape inputs in reshape layer
* use `std::move_backward` instead of `std::move` for insert in resizable_static_array
* improve workspace management
* add spatial LRN
* add nms (cpu) to region layer
* add max pooling with argmax ( and a fix to limits.hpp)
* add max unpooling layer
* rename DNN_TARGET_CUDA_FP32 to DNN_TARGET_CUDA
* update supportBackend to be more rigorous
* remove stray include from preventing non-cuda build
* include op_cuda.hpp outside condition #if
* refactoring, fixes and many optimizations
* drop DNN_TARGET_CUDA_FP64
* fix gcc errors
* increase max. tensor rank limit to six
* add Interp layer
* drop custom layers; use BackendNode
* vectorize activation kernels
* fixes for gcc
* remove wrong assertion
* fix broken assertion in unpooling primitive
* fix build errors in non-CUDA build
* completely remove workspace from public API
* fix permute layer
* enable accuracy and perf. tests for DNN_TARGET_CUDA
* add asynchronous forward
* vectorize eltwise ops
* vectorize fill kernel
* fixes for gcc
* remove CSL headers from public API
* remove csl header source group from cmake
* update min. cudnn version in cmake
* add numerically stable FP32 log1pexp
* refactor code
* add FP16 specialization to cudnn based tensor addition
* vectorize scale1 and bias1 + minor refactoring
* fix doxygen build
* fix invalid alignment assertion
* clear backend wrappers before allocateLayers
* ignore memory lock failures
* do not allocate internal blobs
* integrate NVTX
* add numerically stable half precision log1pexp
* fix indentation, following coding style, improve docs
* remove accidental modification of IE code
* Revert "add asynchronous forward"
This reverts commit 1154b9da9da07e9b52f8a81bdcea48cf31c56f70.
* [cmake] throw error for unsupported CC versions
* fix rebase issues
* add more docs, refactor code, fix bugs
* minor refactoring and fixes
* resolve warnings/errors from clang
* remove haveCUDA() checks from supportBackend()
* remove NVTX integration
* changes based on review comments
* avoid exception when no CUDA device is present
* add color code for CUDA in Net::dump
2019-10-21 19:28:00 +08:00
|
|
|
* equal to the number of elements in \p destTensor.
|
|
|
|
*
|
|
|
|
* Pre-conditions:
|
|
|
|
* - \p srcMat must contain elements of type CV_32F
|
|
|
|
* - the size of \p srcMat must be larger than or equal to the size of \p destTensor
|
|
|
|
*
|
|
|
|
* @note best performance when \p srcMat is continuous and page-locked
|
|
|
|
* @note blocks calling thread if \p srcMat is not page-locked
|
|
|
|
*/
|
|
|
|
template <class T>
|
|
|
|
void copyMatToTensor(const Mat& srcMat, const TensorSpan<T> destTensor, const Stream& stream);
|
|
|
|
|
|
|
|
template <> inline
|
|
|
|
void copyMatToTensor(const Mat& srcMat, const TensorSpan<half> destTensor, const Stream& stream) {
|
|
|
|
/* should perhaps convert cv::Mat of different type to the required type and copy */
|
|
|
|
CV_Assert(srcMat.type() == CV_32F);
|
|
|
|
CV_Assert(srcMat.total() >= destTensor.size());
|
|
|
|
|
|
|
|
Mat temp;
|
|
|
|
srcMat.convertTo(temp, CV_16F);
|
|
|
|
CV_Assert(temp.isContinuous());
|
|
|
|
|
|
|
|
memcpy<half>(destTensor.get(), reinterpret_cast<half*>(temp.data), destTensor.size(), stream);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <> inline
|
|
|
|
void copyMatToTensor(const Mat& srcMat, const TensorSpan<float> destTensor, const Stream& stream) {
|
|
|
|
/* should perhaps convert cv::Mat of different type to the required type and copy */
|
|
|
|
CV_Assert(srcMat.type() == CV_32F);
|
|
|
|
CV_Assert(srcMat.total() >= destTensor.size());
|
|
|
|
|
|
|
|
Mat temp = srcMat.isContinuous() ? srcMat : srcMat.clone();
|
|
|
|
CV_Assert(temp.isContinuous());
|
|
|
|
|
|
|
|
memcpy<float>(destTensor.get(), reinterpret_cast<float*>(temp.data), destTensor.size(), stream);
|
|
|
|
}
|
|
|
|
|
|
|
|
/** @brief copies data from a TensorType to a cv::Mat
|
|
|
|
*
|
|
|
|
* \tparam T the type of the elements contained in TensorType object
|
|
|
|
*
|
|
|
|
* \param[in] srcTensor source tensor
|
|
|
|
* \param[out] destMat destination matrix
|
|
|
|
* \param stream CUDA stream to use for the memory transfer
|
|
|
|
*
|
|
|
|
* The entire memory block held by the \p srcTensor is copied to \p destMat.
|
|
|
|
*
|
|
|
|
* Pre-conditions:
|
|
|
|
* - \p destMat must contain elements of type CV_32F
|
|
|
|
* - the size of \p destMat must be larger than or equal to the size of \p srcTensor
|
|
|
|
*
|
|
|
|
* @note best performance when \p destMat is continuous and page-locked
|
|
|
|
* @note blocks calling thread if \p destMat is not page-locked
|
|
|
|
*/
|
|
|
|
template <class T>
|
|
|
|
void copyTensorToMat(TensorView<T> srcTensor, Mat& destMat, const Stream& stream);
|
|
|
|
|
|
|
|
template <> inline
|
|
|
|
void copyTensorToMat(TensorView<half> srcTensor, Mat& destMat, const Stream& stream) {
|
|
|
|
CV_Assert(destMat.type() == CV_32F);
|
|
|
|
CV_Assert(destMat.total() >= srcTensor.size());
|
|
|
|
|
|
|
|
Mat temp(shape(destMat), CV_16F);
|
|
|
|
CV_Assert(temp.isContinuous());
|
|
|
|
|
|
|
|
memcpy<half>(reinterpret_cast<half*>(temp.data), srcTensor.get(), srcTensor.size(), stream);
|
|
|
|
|
|
|
|
temp.convertTo(destMat, CV_32F);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <> inline
|
|
|
|
void copyTensorToMat(TensorView<float> srcTensor, Mat& destMat, const Stream& stream) {
|
|
|
|
CV_Assert(destMat.type() == CV_32F);
|
|
|
|
CV_Assert(destMat.total() >= srcTensor.size());
|
|
|
|
|
|
|
|
Mat temp = destMat.isContinuous() ? destMat : destMat.clone();
|
|
|
|
CV_Assert(temp.isContinuous());
|
|
|
|
|
|
|
|
memcpy<float>(reinterpret_cast<float*>(temp.data), srcTensor.get(), srcTensor.size(), stream);
|
|
|
|
|
|
|
|
if (temp.data != destMat.data)
|
|
|
|
temp.copyTo(destMat);
|
|
|
|
}
|
|
|
|
|
|
|
|
}} /* namespace cuda4dnn::csl */
|
|
|
|
|
|
|
|
/** base class for CUDA operation nodes (for all supported targets) */
|
|
|
|
class CUDABackendNode : public BackendNode {
|
|
|
|
public:
|
|
|
|
CUDABackendNode() : BackendNode(DNN_BACKEND_CUDA) { }
|
|
|
|
virtual ~CUDABackendNode() { }
|
|
|
|
|
|
|
|
virtual void forward(
|
|
|
|
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
|
|
|
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
|
|
|
cuda4dnn::csl::Workspace& workspace) = 0;
|
|
|
|
|
|
|
|
virtual std::size_t get_workspace_memory_in_bytes() const noexcept { return 0; }
|
|
|
|
};
|
|
|
|
|
|
|
|
/** @brief utility function which creates CUDA node of correct type from `targetId`
|
|
|
|
*
|
|
|
|
* CUDA operation nodes take the type of data they operate on as a template parameter.
|
|
|
|
* For example, ConcatOp<float> is an operation node which concats tensors of `float` type
|
|
|
|
* into a tensor of `float` type.
|
|
|
|
*
|
|
|
|
* This utility function aids the creation of nodes of different types and eliminates the
|
|
|
|
* need for CUDA target constants (`DNN_TARGET_XXX`) to appear in the operation code which
|
|
|
|
* reduces coupling between modules.
|
|
|
|
*
|
|
|
|
* Example:
|
|
|
|
* template <class T>
|
|
|
|
* class ConcatOp : public CUDABackendNode;
|
|
|
|
*
|
|
|
|
* // returns a cv::Ptr to a ConcatOp<half> object
|
|
|
|
* auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA_FP16, axis);
|
|
|
|
*
|
|
|
|
* // returns a cv::Ptr to a ConcatOp<float> object
|
|
|
|
* auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA, axis);
|
|
|
|
*/
|
|
|
|
template <template <class> class NodeType, class ...Args>
|
|
|
|
cv::Ptr<BackendNode> make_cuda_node(int targetId, Args&& ...args) {
|
|
|
|
switch (targetId)
|
|
|
|
{
|
|
|
|
case DNN_TARGET_CUDA_FP16:
|
|
|
|
return Ptr<BackendNode>(new NodeType<half>(std::forward<Args>(args)...));
|
|
|
|
case DNN_TARGET_CUDA:
|
|
|
|
return Ptr<BackendNode>(new NodeType<float>(std::forward<Args>(args)...));
|
|
|
|
default:
|
|
|
|
CV_Assert(IS_DNN_CUDA_TARGET(targetId));
|
|
|
|
}
|
|
|
|
return Ptr<BackendNode>();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* base class for all CUDA backend/target wrappers */
|
|
|
|
class CUDABackendWrapper : public BackendWrapper {
|
|
|
|
public:
|
|
|
|
CUDABackendWrapper(int targetId) : BackendWrapper(DNN_BACKEND_CUDA, targetId) { }
|
|
|
|
virtual ~CUDABackendWrapper() { }
|
|
|
|
|
|
|
|
void copyToHost() override = 0;
|
|
|
|
void setHostDirty() override = 0;
|
|
|
|
|
|
|
|
virtual void copyToDevice() = 0;
|
|
|
|
virtual void setDeviceDirty() = 0;
|
|
|
|
|
|
|
|
virtual MatShape getShape() const noexcept = 0;
|
|
|
|
virtual std::size_t getRank() const noexcept = 0;
|
|
|
|
|
|
|
|
/** @note setting the stream updates the stream for all wrappers which use the same tensor */
|
|
|
|
virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
template <class T, int TargetID>
|
|
|
|
class GenericCUDABackendWrapper final : public CUDABackendWrapper {
|
|
|
|
public:
|
|
|
|
using value_type = T;
|
|
|
|
using tensor_span_type = cuda4dnn::csl::TensorSpan<value_type>;
|
|
|
|
using tensor_view_type = cuda4dnn::csl::TensorView<value_type>;
|
|
|
|
|
|
|
|
/* Pre-conditions:
|
|
|
|
* - there must be no other instance of `GenericCUDABackendWrapper` which wraps the host memory used by `m`
|
|
|
|
* - the host memory must remain allocated throughout the lifetime of this object
|
|
|
|
*
|
|
|
|
* Post-conditions:
|
|
|
|
* - the host memory used by \p m "may" be page-locked
|
|
|
|
*/
|
|
|
|
GenericCUDABackendWrapper(Mat& m)
|
|
|
|
: CUDABackendWrapper(TargetID)
|
|
|
|
{
|
|
|
|
shape = cv::dnn::shape(m);
|
|
|
|
|
|
|
|
shared_block = std::make_shared<shared_block_type>();
|
|
|
|
shared_block->host_dirty = true;
|
|
|
|
shared_block->device_dirty = false;
|
|
|
|
|
|
|
|
shared_block->host = m;
|
|
|
|
|
|
|
|
try {
|
|
|
|
shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard(m.data, m.total() * m.elemSize());
|
|
|
|
} catch (...) {
|
|
|
|
/* a common reason for failure is that the host system (for example, a Jetson device) does not support it */
|
|
|
|
/* we ignore the failure as this is just an optimization and not a requirement */
|
|
|
|
}
|
|
|
|
|
|
|
|
shared_block->device = cuda4dnn::csl::ManagedPtr<T>(m.total());
|
|
|
|
}
|
|
|
|
|
|
|
|
GenericCUDABackendWrapper(const Ptr<BackendWrapper>& base_, const MatShape& shape_)
|
|
|
|
: CUDABackendWrapper(TargetID)
|
|
|
|
{
|
|
|
|
const Ptr<GenericCUDABackendWrapper> base = base_.dynamicCast<GenericCUDABackendWrapper>();
|
|
|
|
CV_Assert(base);
|
|
|
|
|
|
|
|
shape = shape_;
|
|
|
|
shared_block = base->shared_block;
|
|
|
|
}
|
|
|
|
|
|
|
|
static Ptr<BackendWrapper> create(Mat& m) {
|
|
|
|
return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(m));
|
|
|
|
}
|
|
|
|
|
|
|
|
static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& base, const MatShape& shape) {
|
|
|
|
return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(base, shape));
|
|
|
|
}
|
|
|
|
|
|
|
|
void copyToHost() override {
|
|
|
|
if (shared_block->device_dirty) {
|
|
|
|
shared_block->host_dirty = false;
|
|
|
|
shared_block->device_dirty = false;
|
|
|
|
|
|
|
|
/* If the wrapper is being reused, the device tensor might be larger in size than the wrapper.
|
|
|
|
* Using the device tensor does not give incorrect code but leads to unused region of memory being copied.
|
|
|
|
*
|
|
|
|
* We use a view to ensure that only the required region of memory is copied.
|
|
|
|
*/
|
|
|
|
auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
|
|
|
|
cuda4dnn::csl::copyTensorToMat<T>(view, shared_block->host, shared_block->stream);
|
|
|
|
|
|
|
|
shared_block->stream.synchronize();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void setHostDirty() override {
|
|
|
|
shared_block->device_dirty = false;
|
|
|
|
shared_block->host_dirty = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void copyToDevice() override {
|
|
|
|
if (shared_block->host_dirty) {
|
|
|
|
shared_block->host_dirty = false;
|
|
|
|
shared_block->device_dirty = false;
|
|
|
|
|
|
|
|
auto span = tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
|
|
|
|
cuda4dnn::csl::copyMatToTensor<T>(shared_block->host, span, shared_block->stream);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void setDeviceDirty() override {
|
|
|
|
shared_block->device_dirty = true;
|
|
|
|
shared_block->host_dirty = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
MatShape getShape() const noexcept override { return shape; }
|
|
|
|
|
|
|
|
std::size_t getRank() const noexcept override { return shape.size(); }
|
|
|
|
|
|
|
|
void setStream(cuda4dnn::csl::Stream stream) noexcept override {
|
|
|
|
shared_block->stream = std::move(stream);
|
|
|
|
}
|
|
|
|
|
|
|
|
cv::Mat getMutableHostMat() noexcept {
|
|
|
|
copyToHost();
|
|
|
|
setHostDirty();
|
|
|
|
return shared_block->host;
|
|
|
|
}
|
|
|
|
|
|
|
|
const cv::Mat getImmutableHostMat() const noexcept {
|
|
|
|
copyToHost();
|
|
|
|
return shared_block->host;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Optimization Note: use getSpan() and getView() judiciously
|
|
|
|
*
|
|
|
|
* getSpan() is meant to be used when the memory is going to be modified
|
|
|
|
* getView() is meant to be used when the memory is only going to be read
|
|
|
|
*
|
|
|
|
* getSpan() marks the device memory as dirty but getView() does not
|
|
|
|
*
|
|
|
|
* getView() implicitly performs host to device memory transfer if required
|
|
|
|
* getSpan() does not perform any synchronization (use copyToDevice if sync. is required)
|
|
|
|
*/
|
|
|
|
tensor_span_type getSpan() noexcept {
|
|
|
|
setDeviceDirty();
|
|
|
|
return tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
|
|
|
|
}
|
|
|
|
|
|
|
|
tensor_view_type getView() noexcept {
|
|
|
|
copyToDevice();
|
|
|
|
return tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
/* The same tensor memory can be reused by different layers whenever possible.
|
2019-12-26 19:45:03 +08:00
|
|
|
* Hence, it is possible for different backend wrappers to point to the same memory.
|
Merge pull request #14827 from YashasSamaga:cuda4dnn-csl-low
CUDA backend for the DNN module
* stub cuda4dnn design
* minor fixes for tests and doxygen
* add csl public api directory to module headers
* add low-level CSL components
* add high-level CSL components
* integrate csl::Tensor into backbone code
* switch to CPU iff unsupported; otherwise, fail on error
* add fully connected layer
* add softmax layer
* add activation layers
* support arbitary rank TensorDescriptor
* pass input wrappers to `initCUDA()`
* add 1d/2d/3d-convolution
* add pooling layer
* reorganize and refactor code
* fixes for gcc, clang and doxygen; remove cxx14/17 code
* add blank_layer
* add LRN layer
* add rounding modes for pooling layer
* split tensor.hpp into tensor.hpp and tensor_ops.hpp
* add concat layer
* add scale layer
* add batch normalization layer
* split math.cu into activations.cu and math.hpp
* add eltwise layer
* add flatten layer
* add tensor transform api
* add asymmetric padding support for convolution layer
* add reshape layer
* fix rebase issues
* add permute layer
* add padding support for concat layer
* refactor and reorganize code
* add normalize layer
* optimize bias addition in scale layer
* add prior box layer
* fix and optimize normalize layer
* add asymmetric padding support for pooling layer
* add event API
* improve pooling performance for some padding scenarios
* avoid over-allocation of compute resources to kernels
* improve prior box performance
* enable layer fusion
* add const layer
* add resize layer
* add slice layer
* add padding layer
* add deconvolution layer
* fix channelwise ReLU initialization
* add vector traits
* add vectorized versions of relu, clipped_relu, power
* add vectorized concat kernels
* improve concat_with_offsets performance
* vectorize scale and bias kernels
* add support for multi-billion element tensors
* vectorize prior box kernels
* fix address alignment check
* improve bias addition performance of conv/deconv/fc layers
* restructure code for supporting multiple targets
* add DNN_TARGET_CUDA_FP64
* add DNN_TARGET_FP16
* improve vectorization
* add region layer
* improve tensor API, add dynamic ranks
1. use ManagedPtr instead of a Tensor in backend wrapper
2. add new methods to tensor classes
- size_range: computes the combined size of for a given axis range
- tensor span/view can be constructed from a raw pointer and shape
3. the tensor classes can change their rank at runtime (previously rank was fixed at compile-time)
4. remove device code from tensor classes (as they are unused)
5. enforce strict conditions on tensor class APIs to improve debugging ability
* fix parametric relu activation
* add squeeze/unsqueeze tensor API
* add reorg layer
* optimize permute and enable 2d permute
* enable 1d and 2d slice
* add split layer
* add shuffle channel layer
* allow tensors of different ranks in reshape primitive
* patch SliceOp to allow Crop Layer
* allow extra shape inputs in reshape layer
* use `std::move_backward` instead of `std::move` for insert in resizable_static_array
* improve workspace management
* add spatial LRN
* add nms (cpu) to region layer
* add max pooling with argmax ( and a fix to limits.hpp)
* add max unpooling layer
* rename DNN_TARGET_CUDA_FP32 to DNN_TARGET_CUDA
* update supportBackend to be more rigorous
* remove stray include from preventing non-cuda build
* include op_cuda.hpp outside condition #if
* refactoring, fixes and many optimizations
* drop DNN_TARGET_CUDA_FP64
* fix gcc errors
* increase max. tensor rank limit to six
* add Interp layer
* drop custom layers; use BackendNode
* vectorize activation kernels
* fixes for gcc
* remove wrong assertion
* fix broken assertion in unpooling primitive
* fix build errors in non-CUDA build
* completely remove workspace from public API
* fix permute layer
* enable accuracy and perf. tests for DNN_TARGET_CUDA
* add asynchronous forward
* vectorize eltwise ops
* vectorize fill kernel
* fixes for gcc
* remove CSL headers from public API
* remove csl header source group from cmake
* update min. cudnn version in cmake
* add numerically stable FP32 log1pexp
* refactor code
* add FP16 specialization to cudnn based tensor addition
* vectorize scale1 and bias1 + minor refactoring
* fix doxygen build
* fix invalid alignment assertion
* clear backend wrappers before allocateLayers
* ignore memory lock failures
* do not allocate internal blobs
* integrate NVTX
* add numerically stable half precision log1pexp
* fix indentation, following coding style, improve docs
* remove accidental modification of IE code
* Revert "add asynchronous forward"
This reverts commit 1154b9da9da07e9b52f8a81bdcea48cf31c56f70.
* [cmake] throw error for unsupported CC versions
* fix rebase issues
* add more docs, refactor code, fix bugs
* minor refactoring and fixes
* resolve warnings/errors from clang
* remove haveCUDA() checks from supportBackend()
* remove NVTX integration
* changes based on review comments
* avoid exception when no CUDA device is present
* add color code for CUDA in Net::dump
2019-10-21 19:28:00 +08:00
|
|
|
* However, it may use only a part of that memory and have a different shape.
|
|
|
|
*
|
|
|
|
* We store the common information such as device tensor and its corresponding host memory in
|
|
|
|
* a shared block. The shared block is shared by all backend wrappers which use the same memory.
|
|
|
|
* The shape, which can be different for different wrappers, is stored as a member object.
|
|
|
|
*/
|
|
|
|
|
|
|
|
MatShape shape;
|
|
|
|
|
|
|
|
struct shared_block_type {
|
|
|
|
bool host_dirty;
|
|
|
|
bool device_dirty;
|
|
|
|
|
|
|
|
cv::Mat host;
|
|
|
|
cuda4dnn::csl::MemoryLockGuard memGuard; /* keeps host memory page-locked if possible */
|
|
|
|
|
|
|
|
cuda4dnn::csl::ManagedPtr<T> device;
|
|
|
|
cuda4dnn::csl::Stream stream;
|
|
|
|
};
|
|
|
|
|
|
|
|
std::shared_ptr<shared_block_type> shared_block;
|
|
|
|
};
|
|
|
|
|
|
|
|
using CUDABackendWrapperFP16 = GenericCUDABackendWrapper<half, DNN_TARGET_CUDA_FP16>;
|
|
|
|
using CUDABackendWrapperFP32 = GenericCUDABackendWrapper<float, DNN_TARGET_CUDA>;
|
|
|
|
|
|
|
|
template <class T> struct GetCUDABackendWrapperType_ { };
|
|
|
|
template <> struct GetCUDABackendWrapperType_<half> { typedef CUDABackendWrapperFP16 type; };
|
|
|
|
template <> struct GetCUDABackendWrapperType_<float> { typedef CUDABackendWrapperFP32 type; };
|
|
|
|
|
|
|
|
template <class T>
|
|
|
|
using GetCUDABackendWrapperType = typename GetCUDABackendWrapperType_<T>::type;
|
|
|
|
|
|
|
|
#endif
|
|
|
|
}} /* namespace cv::dnn */
|
|
|
|
|
|
|
|
#endif /* OPENCV_DNN_SRC_OP_CUDA_HPP */
|