Merge pull request #17748 from YashasSamaga:cuda4dnn-data-parallel
commit 988bc804bf
--- a/modules/dnn/src/cuda4dnn/csl/event.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/event.hpp
@@ -33,7 +33,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
         /** if \p create is `true`, a new event will be created; otherwise, an empty event object is created */
         Event(bool create, bool timing_event = false) : event{nullptr} {
             if (create) {
-                unsigned int flags = cudaEventBlockingSync | (timing_event ? 0 : cudaEventDisableTiming);
+                unsigned int flags = (timing_event ? 0 : cudaEventDisableTiming);
                 CUDA4DNN_CHECK_CUDA(cudaEventCreateWithFlags(&event, flags));
             }
         }
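
The change above drops cudaEventBlockingSync from the default flags. A blocking-sync event puts a waiting host thread to sleep inside cudaEventSynchronize, which adds wake-up latency; without the flag the wait spins and returns sooner, at the cost of some CPU time. cudaEventDisableTiming is still applied to non-timing events since it makes recording and stream-waits cheaper. A minimal sketch of the two resulting configurations in plain CUDA runtime calls (illustrative names, not code from this PR):

    cudaEvent_t sync_only, timing;
    // synchronization-only event: cheapest to record and to wait on
    cudaEventCreateWithFlags(&sync_only, cudaEventDisableTiming);
    // timing event: created with no flags so cudaEventElapsedTime() may be used on it
    cudaEventCreateWithFlags(&timing, 0);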
@@ -60,6 +60,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
 
         /** mark a point in \p stream */
         void record(const Stream& stream) {
+            CV_Assert(stream);
             CUDA4DNN_CHECK_CUDA(cudaEventRecord(event, stream.get()));
         }
 
@@ -85,12 +86,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
     };
 
     /** makes a stream wait on an event */
-    void StreamWaitOnEvent(const Stream& stream, const Event& event) {
+    inline void StreamWaitOnEvent(const Stream& stream, const Event& event) {
+        CV_Assert(stream);
         CUDA4DNN_CHECK_CUDA(cudaStreamWaitEvent(stream.get(), event.get(), 0));
     }
 
     /** returns the time elapsed between two events in milliseconds */
-    float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
+    inline float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
         float temp;
         CUDA4DNN_CHECK_CUDA(cudaEventElapsedTime(&temp, start.get(), end.get()));
         return temp;
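
The inline added to StreamWaitOnEvent and TimeElapsedBetweenEvents is a correctness fix, not a hint: both are non-template free functions defined in a header, so without inline every translation unit including the header would emit its own definition and linking would fail with multiple-definition errors. A usage sketch for the timing helpers (hypothetical caller; enqueue_inference is a placeholder for queued GPU work):

    Event start(true, true), stop(true, true); // create = true, timing_event = true
    start.record(stream);
    enqueue_inference(stream);
    stop.record(stream);
    stream.synchronize(); // make sure both events have completed
    float ms = TimeElapsedBetweenEvents(start, stop);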
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -585,6 +585,13 @@ struct LayerData
     std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
     std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;
 
+#ifdef HAVE_CUDA
+    /* output ids which must be transferred to the host in the background
+     * after the completion of the forward pass of the layer
+     */
+    std::vector<int> cudaD2HBackgroundTransfers;
+#endif
+
     Ptr<Layer> layerInstance;
     std::vector<Mat> outputBlobs;
     std::vector<Mat*> inputBlobs;
@@ -1187,7 +1194,8 @@ struct Net::Impl : public detail::NetImplBase
             context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
             context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
 
-            cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context)));
+            auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
+            cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
         }
 #endif
     }
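
cuda4dnn::csl::Stream(true) constructs a new CUDA stream rather than wrapping the default one, giving device-to-host copies their own queue that can run concurrently with inference work on context.stream. In raw runtime terms the idea is roughly (a sketch, not the csl implementation):

    cudaStream_t d2h_stream;
    cudaStreamCreate(&d2h_stream); // separate queue: copies enqueued here can
                                   // overlap kernels on the inference stream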
@@ -1215,8 +1223,10 @@ struct Net::Impl : public detail::NetImplBase
 #ifdef HAVE_CUDA
     struct CudaInfo_t
     {
-        CudaInfo_t(cuda4dnn::csl::CSLContext ctxt) : context(std::move(ctxt)) { }
+        CudaInfo_t(cuda4dnn::csl::CSLContext ctxt, cuda4dnn::csl::Stream d2h_stream_)
+            : context(std::move(ctxt)), d2h_stream(std::move(d2h_stream_)) { }
         cuda4dnn::csl::CSLContext context;
+        cuda4dnn::csl::Stream d2h_stream;
         cuda4dnn::csl::Workspace workspace;
     };
 
@@ -1290,7 +1300,7 @@ struct Net::Impl : public detail::NetImplBase
         if (preferableBackend == DNN_BACKEND_CUDA)
         {
             auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
-            cudaWrapper->setStream(cudaInfo->context.stream);
+            cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
         }
 #endif
         backendWrappers[data] = wrapper;
@@ -1630,7 +1640,7 @@ struct Net::Impl : public detail::NetImplBase
         else if (preferableBackend == DNN_BACKEND_VKCOM)
             initVkComBackend();
         else if (preferableBackend == DNN_BACKEND_CUDA)
-            initCUDABackend();
+            initCUDABackend(blobsToKeep_);
         else
             CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
     }
@@ -2360,7 +2370,7 @@ struct Net::Impl : public detail::NetImplBase
 #endif
     }
 
-    void initCUDABackend() {
+    void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_) {
         CV_Assert(haveCUDA());
 
 #ifdef HAVE_CUDA
@@ -2386,6 +2396,15 @@ struct Net::Impl : public detail::NetImplBase
             auto cudaNode = node.dynamicCast<CUDABackendNode>();
             cudaInfo->workspace.require(cudaNode->get_workspace_memory_in_bytes());
         }
+
+        if (blobsToKeep_.size() > 1)
+        {
+            for (const auto& pin : blobsToKeep_)
+            {
+                LayerData& ld = layers[pin.lid];
+                ld.cudaD2HBackgroundTransfers.push_back(pin.oid);
+            }
+        }
 #endif
     }
 
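
For reference, blobsToKeep_ holds one LayerPin per output the caller wants back on the host, and a pin names a single output of a single layer (paraphrased from the struct defined earlier in dnn.cpp):

    struct LayerPin
    {
        int lid; // layer id: index into 'layers'
        int oid; // output id: index into the layer's output blobs
    };

The size() > 1 guard matches the intent of the PR: background transfers pay off when several outputs must reach the host, because copies of earlier outputs can overlap the forward passes of later layers; with a single final output there is little left to overlap with.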
@@ -3126,6 +3145,12 @@ struct Net::Impl : public detail::NetImplBase
             CV_Assert(!cudaNode.empty());
 
             cudaNode->forward(ld.inputBlobsWrappers, ld.outputBlobsWrappers, cudaInfo->workspace);
+
+            for (auto id : ld.cudaD2HBackgroundTransfers)
+            {
+                auto wrapper = ld.outputBlobsWrappers[id].dynamicCast<CUDABackendWrapper>();
+                wrapper->copyToHostInBackground();
+            }
 #endif
         }
         else if (preferableBackend == DNN_BACKEND_HALIDE)
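
Note that both calls in the new loop are asynchronous: cudaNode->forward only enqueues kernels on the inference stream, and copyToHostInBackground (added to the wrappers in op_cuda.hpp below) only enqueues an event and a copy on the D2H stream, so the host thread proceeds to the next layer immediately while the GPU drains both queues.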
--- a/modules/dnn/src/op_cuda.hpp
+++ b/modules/dnn/src/op_cuda.hpp
@@ -7,6 +7,7 @@
 
 #ifdef HAVE_CUDA
 #include "cuda4dnn/csl/stream.hpp"
+#include "cuda4dnn/csl/event.hpp"
 #include "cuda4dnn/csl/cublas.hpp"
 #include "cuda4dnn/csl/cudnn.hpp"
 #include "cuda4dnn/csl/tensor.hpp"
@@ -206,6 +207,7 @@ namespace cv { namespace dnn {
         virtual ~CUDABackendWrapper() { }
 
         void copyToHost() override = 0;
+        virtual void copyToHostInBackground() = 0;
         void setHostDirty() override = 0;
 
         virtual void copyToDevice() = 0;
@@ -215,7 +217,7 @@ namespace cv { namespace dnn {
         virtual std::size_t getRank() const noexcept = 0;
 
         /** @note setting the stream updates the stream for all wrappers which use the same tensor */
-        virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
+        virtual void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream h2d_stream) noexcept = 0;
 
         virtual void update(const MatShape& shape, std::size_t offset) = 0;
     };
@@ -240,6 +242,36 @@ namespace cv { namespace dnn {
         cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), stream);
     }
 
+    template <class U>
+    void convert_D2H_background(const cv::Mat& mat, cuda4dnn::csl::View<U> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event);
+
+    template <> inline
+    void convert_D2H_background<half>(const cv::Mat& mat, cuda4dnn::csl::View<half> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
+        if (device_temp.size() < view.size())
+            device_temp.reset(view.size());
+        auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), view.size());
+
+        /* The conversion kernel could be executed in the background stream for better
+         * performance. We do it in the inference stream to prevent an unexplained performance
+         * regression on RTX 2080 Ti. Executing the conversion kernel in the background stream causes
+         * everything to slow down (even operations that appear before the background transfer).
+         *
+         * TODO: identify the cause and move the conversion kernel to the background stream
+         */
+        cuda4dnn::kernels::fp16_to_fp32(stream, temp_span, view);
+
+        d2h_event.record(stream); // mark position in inference stream
+        cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event); // don't start transfer until data is available
+        cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), temp_span.data(), view.size(), d2h_stream);
+    }
+
+    template <> inline
+    void convert_D2H_background<float>(const cv::Mat& mat, cuda4dnn::csl::View<float> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
+        d2h_event.record(stream);
+        cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event);
+        cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), d2h_stream);
+    }
+
     template <class U>
     void convert_H2D(cuda4dnn::csl::Span<U> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream);
 
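
The record/wait/copy sequence in the specializations above is the core pattern of this PR. Reduced to raw CUDA runtime calls it would look like this (sketch only; src, dst and n are illustrative):

    cudaEventRecord(event, inference_stream);   // 1. mark the point where the output is ready
    cudaStreamWaitEvent(d2h_stream, event, 0);  // 2. copy stream waits for that mark, nothing else
    cudaMemcpyAsync(dst, src, n * sizeof(float),
                    cudaMemcpyDeviceToHost, d2h_stream); // 3. copy overlaps later inference work

A standard CUDA caveat applies: cudaMemcpyAsync overlaps kernel execution only when the host buffer is page-locked; with pageable memory the runtime may serialize the transfer.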
@@ -349,6 +381,28 @@ namespace cv { namespace dnn {
 
                 cuda4dnn::detail::convert_D2H<T>(mat, view, shared_block->device_temp, shared_block->stream);
                 shared_block->stream.synchronize();
+            } else if (shared_block->d2h_event && shared_block->d2h_event.busy()) {
+                /* wait for the background copy to finish */
+                shared_block->d2h_event.synchronize();
+            }
+        }
+
+        void copyToHostInBackground() override {
+            CV_Assert(shared_block->d2h_stream);
+            if (shared_block->device_dirty) {
+                shared_block->host_dirty = false;
+                shared_block->device_dirty = false;
+
+                auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+
+                auto& mat = shared_block->host;
+                CV_Assert(mat.isContinuous());
+                CV_Assert(mat.type() == CV_32F);
+
+                if (!shared_block->d2h_event)
+                    shared_block->d2h_event = cuda4dnn::csl::Event(true);
+                cuda4dnn::detail::convert_D2H_background<T>(mat, view, shared_block->device_temp, shared_block->stream, shared_block->d2h_stream, shared_block->d2h_event);
+                shared_block->d2h_event.record(shared_block->d2h_stream); // record position so that we can check status later
             }
         }
 
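
copyToHost now handles two cases: if the device copy is dirty it performs the blocking convert-and-copy as before, and otherwise, if a background transfer was started, it only waits on that transfer's event. The final d2h_event.record(d2h_stream) in copyToHostInBackground is what makes the later busy() check meaningful; presumably Event::busy() wraps cudaEventQuery, along these lines (an assumption, not verified against event.hpp):

    bool busy(cudaEvent_t e)
    {
        // cudaSuccess: all work recorded before 'e' has finished
        // cudaErrorNotReady: the background copy is still in flight
        return cudaEventQuery(e) == cudaErrorNotReady;
    }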
@@ -383,8 +437,9 @@ namespace cv { namespace dnn {
 
         std::size_t getRank() const noexcept override { return shape.size(); }
 
-        void setStream(cuda4dnn::csl::Stream stream) noexcept override {
+        void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream d2h_stream) noexcept override {
             shared_block->stream = std::move(stream);
+            shared_block->d2h_stream = std::move(d2h_stream);
         }
 
         void update(const MatShape& shape_, std::size_t offset_) override {
@@ -452,6 +507,9 @@ namespace cv { namespace dnn {
             cuda4dnn::csl::ManagedPtr<T> device;
             cuda4dnn::csl::ManagedPtr<float> device_temp; /* use for conversions */
             cuda4dnn::csl::Stream stream;
+
+            cuda4dnn::csl::Event d2h_event;
+            cuda4dnn::csl::Stream d2h_stream;
         };
 
         std::shared_ptr<shared_block_type> shared_block;