/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
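// This translation unit implements the CUDA stream and event wrappers
// (cv::cuda::Stream, cv::cuda::Event), the stack-based device memory pool
// (MemoryStack, MemoryPool, StackAllocator) and the cv::cuda::BufferPool
// entry points built on top of it.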
#include "precomp.hpp"

using namespace cv;
using namespace cv::cuda;

#if defined(_MSC_VER)
    #pragma warning(disable : 4702)  // unreachable code
#endif

/////////////////////////////////////////////////////////////
/// MemoryStack

#ifdef HAVE_CUDA

namespace
{
    class MemoryPool;

    class MemoryStack
    {
    public:
        uchar* requestMemory(size_t size);
        void returnMemory(uchar* ptr);

        uchar* datastart;
        uchar* dataend;
        uchar* tip;

        bool isFree;
        MemoryPool* pool;

    #if !defined(NDEBUG)
        std::vector<size_t> allocations;
    #endif
    };

    uchar* MemoryStack::requestMemory(size_t size)
    {
        const size_t freeMem = dataend - tip;

        if (size > freeMem)
            return 0;

        uchar* ptr = tip;

        tip += size;

    #if !defined(NDEBUG)
        allocations.push_back(size);
    #endif

        return ptr;
    }

    void MemoryStack::returnMemory(uchar* ptr)
    {
        CV_DbgAssert( ptr >= datastart && ptr < dataend );

    #if !defined(NDEBUG)
        const size_t allocSize = tip - ptr;
        CV_Assert( allocSize == allocations.back() );
        allocations.pop_back();
    #endif

        tip = ptr;
    }
}

#endif

/////////////////////////////////////////////////////////////
/// MemoryPool

#ifdef HAVE_CUDA

namespace
{
    class MemoryPool
    {
    public:
        MemoryPool();
        ~MemoryPool() { release(); }

        void initialize(size_t stackSize, int stackCount);
        void release();

        MemoryStack* getFreeMemStack();
        void returnMemStack(MemoryStack* memStack);

    private:
        void initilizeImpl();

        Mutex mtx_;

        bool initialized_;
        size_t stackSize_;
        int stackCount_;

        uchar* mem_;

        std::vector<MemoryStack> stacks_;

        MemoryPool(const MemoryPool&); //= delete;
    };

    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
    {
        // default : 10 Mb, 5 stacks
        stackSize_ = 10 * 1024 * 1024;
        stackCount_ = 5;
    }

    void MemoryPool::initialize(size_t stackSize, int stackCount)
    {
        AutoLock lock(mtx_);

        release();

        stackSize_ = stackSize;
        stackCount_ = stackCount;

        initilizeImpl();
    }

    void MemoryPool::initilizeImpl()
    {
        const size_t totalSize = stackSize_ * stackCount_;

        if (totalSize > 0)
        {
            cudaError_t err = cudaMalloc(&mem_, totalSize);
            if (err != cudaSuccess)
                return;

            stacks_.resize(stackCount_);

            uchar* ptr = mem_;

            for (int i = 0; i < stackCount_; ++i)
            {
                stacks_[i].datastart = ptr;
                stacks_[i].dataend = ptr + stackSize_;
                stacks_[i].tip = ptr;
                stacks_[i].isFree = true;
                stacks_[i].pool = this;

                ptr += stackSize_;
            }

            initialized_ = true;
        }
    }

    void MemoryPool::release()
    {
        if (mem_)
        {
        #if !defined(NDEBUG)
            for (int i = 0; i < stackCount_; ++i)
            {
                CV_DbgAssert( stacks_[i].isFree );
                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
            }
        #endif

            cudaFree(mem_);

            mem_ = 0;
            initialized_ = false;
        }
    }

    MemoryStack* MemoryPool::getFreeMemStack()
    {
        AutoLock lock(mtx_);

        if (!initialized_)
            initilizeImpl();

        if (!mem_)
            return 0;

        for (int i = 0; i < stackCount_; ++i)
        {
            if (stacks_[i].isFree)
            {
                stacks_[i].isFree = false;
                return &stacks_[i];
            }
        }

        return 0;
    }

    void MemoryPool::returnMemStack(MemoryStack* memStack)
    {
        AutoLock lock(mtx_);

        CV_DbgAssert( !memStack->isFree );

    #if !defined(NDEBUG)
        bool found = false;
        for (int i = 0; i < stackCount_; ++i)
        {
            if (memStack == &stacks_[i])
            {
                found = true;
                break;
            }
        }
        CV_DbgAssert( found );
    #endif

        CV_DbgAssert( memStack->tip == memStack->datastart );

        memStack->isFree = true;
    }
}

#endif
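// Illustrative sketch (commented out, caller code is hypothetical): a MemoryStack
// hands out sub-blocks of a pre-allocated device region strictly LIFO, so blocks
// must be returned in the reverse order of the requests; debug builds enforce this
// via the `allocations` bookkeeping above. Sizes below are arbitrary and `somePool`
// stands for a MemoryPool instance.
//
//     MemoryStack* stack = somePool.getFreeMemStack();
//     uchar* a = stack->requestMemory(256);
//     uchar* b = stack->requestMemory(512);
//     stack->returnMemory(b);              // most recent request first
//     stack->returnMemory(a);
//     somePool.returnMemStack(stack);      // tip is back at datastart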
////////////////////////////////////////////////////////////////
/// Stream::Impl

#ifndef HAVE_CUDA

class cv::cuda::Stream::Impl
{
public:
    Impl(void* ptr = 0)
    {
        CV_UNUSED(ptr);
        throw_no_cuda();
    }
};

#else

namespace
{
    class StackAllocator;
}

class cv::cuda::Stream::Impl
{
public:
    cudaStream_t stream;
    bool ownStream;

    Ptr<GpuMat::Allocator> allocator;

    Impl();
    Impl(const Ptr<GpuMat::Allocator>& allocator);
    Impl(const unsigned int cudaFlags);
    explicit Impl(cudaStream_t stream);

    ~Impl();
};

cv::cuda::Stream::Impl::Impl() : stream(0), ownStream(false)
{
    cudaSafeCall( cudaStreamCreate(&stream) );
    ownStream = true;

    allocator = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::Impl(const Ptr<GpuMat::Allocator>& allocator) : stream(0), ownStream(false), allocator(allocator)
{
    cudaSafeCall( cudaStreamCreate(&stream) );
    ownStream = true;
}

cv::cuda::Stream::Impl::Impl(const unsigned int cudaFlags) : stream(0), ownStream(false)
{
    cudaSafeCall( cudaStreamCreateWithFlags(&stream, cudaFlags) );
    ownStream = true;

    allocator = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_), ownStream(false)
{
    allocator = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::~Impl()
{
    allocator.release();

    if (stream && ownStream)
    {
        cudaStreamDestroy(stream);
    }
}

#endif

/////////////////////////////////////////////////////////////
/// DefaultDeviceInitializer

#ifdef HAVE_CUDA

namespace cv { namespace cuda
{
    class DefaultDeviceInitializer
    {
    public:
        DefaultDeviceInitializer();
        ~DefaultDeviceInitializer();

        Stream& getNullStream(int deviceId);
        MemoryPool& getMemoryPool(int deviceId);

    private:
        void initStreams();
        void initPools();

        std::vector<Ptr<Stream> > streams_;
        Mutex streams_mtx_;

        std::vector<Ptr<MemoryPool> > pools_;
        Mutex pools_mtx_;
    };

    DefaultDeviceInitializer::DefaultDeviceInitializer()
    {
    }

    DefaultDeviceInitializer::~DefaultDeviceInitializer()
    {
        streams_.clear();

        for (size_t i = 0; i < pools_.size(); ++i)
        {
            cudaSetDevice(static_cast<int>(i));
            pools_[i]->release();
        }

        pools_.clear();
    }

    Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
    {
        AutoLock lock(streams_mtx_);

        if (streams_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                streams_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );

        if (streams_[deviceId].empty())
        {
            cudaStream_t stream = NULL;
            Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
            streams_[deviceId] = Ptr<Stream>(new Stream(impl));
        }

        return *streams_[deviceId];
    }

    MemoryPool& DefaultDeviceInitializer::getMemoryPool(int deviceId)
    {
        AutoLock lock(pools_mtx_);

        if (pools_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
            {
                pools_.resize(deviceCount);

                for (size_t i = 0; i < pools_.size(); ++i)
                {
                    cudaSetDevice(static_cast<int>(i));
                    pools_[i] = makePtr<MemoryPool>();
                }
            }
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );

        MemoryPool* p = pools_[deviceId].get();
        CV_Assert( p );
        return *p;
    }

    DefaultDeviceInitializer initializer;
}}

#endif
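// Illustrative sketch (commented out, caller code is hypothetical): the global
// `initializer` above lazily creates one null Stream and one MemoryPool per CUDA
// device; Stream::Null() below simply forwards to it for the current device.
//
//     cv::cuda::setDevice(0);
//     cv::cuda::Stream& nullStream = cv::cuda::Stream::Null();  // wraps the legacy default stream (0)
//     nullStream.waitForCompletion();                           // cudaStreamSynchronize on stream 0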
/////////////////////////////////////////////////////////////
/// Stream

cv::cuda::Stream::Stream()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>();
#endif
}

cv::cuda::Stream::Stream(const Ptr<GpuMat::Allocator>& allocator)
{
#ifndef HAVE_CUDA
    CV_UNUSED(allocator);
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(allocator);
#endif
}

cv::cuda::Stream::Stream(const size_t cudaFlags)
{
#ifndef HAVE_CUDA
    CV_UNUSED(cudaFlags);
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(cudaFlags & UINT_MAX);
#endif
}

bool cv::cuda::Stream::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaError_t err = cudaStreamQuery(impl_->stream);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Stream::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
#endif
}

void cv::cuda::Stream::waitEvent(const Event& event)
{
#ifndef HAVE_CUDA
    CV_UNUSED(event);
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
#endif
}

#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)

namespace
{
    struct CallbackData
    {
        Stream::StreamCallback callback;
        void* userData;

        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
    };

    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
    {
        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
        data->callback(static_cast<int>(status), data->userData);
        delete data;
    }
}

#endif

void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
{
#ifndef HAVE_CUDA
    CV_UNUSED(callback);
    CV_UNUSED(userData);
    throw_no_cuda();
#else
    #if CUDART_VERSION < 5000
        CV_UNUSED(callback);
        CV_UNUSED(userData);
        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
    #else
        CallbackData* data = new CallbackData(callback, userData);

        cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
    #endif
#endif
}

Stream& cv::cuda::Stream::Null()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
#endif
}

void* cv::cuda::Stream::cudaPtr() const
{
#ifndef HAVE_CUDA
    return nullptr;
#else
    return impl_->stream;
#endif
}

cv::cuda::Stream::operator bool_type() const
{
#ifndef HAVE_CUDA
    return 0;
#else
    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
#endif
}

#ifdef HAVE_CUDA

cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}

Stream cv::cuda::StreamAccessor::wrapStream(cudaStream_t stream)
{
    return Stream(makePtr<Stream::Impl>(stream));
}

#endif

Stream cv::cuda::wrapStream(size_t cudaStreamMemoryAddress)
{
#ifndef HAVE_CUDA
    CV_UNUSED(cudaStreamMemoryAddress);
    throw_no_cuda();
#else
    return cv::cuda::StreamAccessor::wrapStream(reinterpret_cast<cudaStream_t>(cudaStreamMemoryAddress));
#endif
}
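// Illustrative sketch (commented out, caller code is hypothetical): wrapping an
// externally created cudaStream_t and queueing a host callback on it. The wrapper
// does not take ownership (ownStream stays false in Impl), so the caller still
// destroys the raw stream.
//
//     cudaStream_t raw = 0;
//     cudaStreamCreate(&raw);
//     cv::cuda::Stream s = cv::cuda::wrapStream(reinterpret_cast<size_t>(raw));
//     s.enqueueHostCallback(
//         [](int /*status*/, void* /*userData*/)
//         {
//             // runs on a driver thread once preceding work on `raw` has completed
//         },
//         NULL);
//     s.waitForCompletion();
//     cudaStreamDestroy(raw);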
/////////////////////////////////////////////////////////////
/// StackAllocator

#ifdef HAVE_CUDA

namespace
{
    bool enableMemoryPool = false;

    class StackAllocator : public GpuMat::Allocator
    {
    public:
        explicit StackAllocator(cudaStream_t stream);
        ~StackAllocator();

        bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE;
        void free(GpuMat* mat) CV_OVERRIDE;

    private:
        StackAllocator(const StackAllocator&);
        StackAllocator& operator =(const StackAllocator&);

        cudaStream_t stream_;
        MemoryStack* memStack_;
        size_t alignment_;
    };

    StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
    {
        if (enableMemoryPool)
        {
            const int deviceId = getDevice();
            memStack_ = initializer.getMemoryPool(deviceId).getFreeMemStack();

            DeviceInfo devInfo(deviceId);
            alignment_ = devInfo.textureAlignment();
        }
    }

    StackAllocator::~StackAllocator()
    {
        if (memStack_ != 0)
        {
            cudaStreamSynchronize(stream_);
            memStack_->pool->returnMemStack(memStack_);
        }
    }

    size_t alignUp(size_t what, size_t alignment)
    {
        size_t alignMask = alignment - 1;
        size_t inverseAlignMask = ~alignMask;
        size_t res = (what + alignMask) & inverseAlignMask;
        return res;
    }

    bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
    {
        if (memStack_ == 0)
            return false;

        size_t pitch, memSize;

        if (rows > 1 && cols > 1)
        {
            pitch = alignUp(cols * elemSize, alignment_);
            memSize = pitch * rows;
        }
        else
        {
            // Single row or single column must be continuous
            pitch = elemSize * cols;
            memSize = alignUp(elemSize * cols * rows, 64);
        }

        uchar* ptr = memStack_->requestMemory(memSize);

        if (ptr == 0)
            return false;

        mat->data = ptr;
        mat->step = pitch;
        mat->refcount = (int*) fastMalloc(sizeof(int));

        return true;
    }

    void StackAllocator::free(GpuMat* mat)
    {
        if (memStack_ == 0)
            return;

        memStack_->returnMemory(mat->datastart);
        fastFree(mat->refcount);
    }
}

#endif

/////////////////////////////////////////////////////////////
/// BufferPool

void cv::cuda::setBufferPoolUsage(bool on)
{
#ifndef HAVE_CUDA
    CV_UNUSED(on);
    throw_no_cuda();
#else
    enableMemoryPool = on;
#endif
}

void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
{
#ifndef HAVE_CUDA
    CV_UNUSED(deviceId);
    CV_UNUSED(stackSize);
    CV_UNUSED(stackCount);
    throw_no_cuda();
#else
    const int currentDevice = getDevice();

    if (deviceId >= 0)
    {
        setDevice(deviceId);
        initializer.getMemoryPool(deviceId).initialize(stackSize, stackCount);
    }
    else
    {
        const int deviceCount = getCudaEnabledDeviceCount();

        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
        {
            setDevice(deviceId);
            initializer.getMemoryPool(deviceId).initialize(stackSize, stackCount);
        }
    }

    setDevice(currentDevice);
#endif
}

#ifndef HAVE_CUDA
cv::cuda::BufferPool::BufferPool(Stream& stream)
{
    CV_UNUSED(stream);
    throw_no_cuda();
}
#else
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->allocator)
{
}
#endif

GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
#ifndef HAVE_CUDA
    CV_UNUSED(rows);
    CV_UNUSED(cols);
    CV_UNUSED(type);
    throw_no_cuda();
#else
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
#endif
}

////////////////////////////////////////////////////////////////
// Event

#ifndef HAVE_CUDA

class cv::cuda::Event::Impl
{
public:
    Impl(unsigned int)
    {
        throw_no_cuda();
    }
};

#else

class cv::cuda::Event::Impl
{
public:
    cudaEvent_t event;
    bool ownEvent;

    explicit Impl(unsigned int flags);
    explicit Impl(cudaEvent_t event);
    ~Impl();
};

cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0), ownEvent(false)
{
    cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
    ownEvent = true;
}

cv::cuda::Event::Impl::Impl(cudaEvent_t e) : event(e), ownEvent(false)
{
}

cv::cuda::Event::Impl::~Impl()
{
    if (event && ownEvent)
    {
        cudaEventDestroy(event);
    }
}

cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
{
    return event.impl_->event;
}

Event cv::cuda::EventAccessor::wrapEvent(cudaEvent_t event)
{
    return Event(makePtr<Event::Impl>(event));
}

#endif

cv::cuda::Event::Event(const Event::CreateFlags flags)
{
#ifndef HAVE_CUDA
    CV_UNUSED(flags);
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(flags);
#endif
}

void cv::cuda::Event::record(Stream& stream)
{
#ifndef HAVE_CUDA
    CV_UNUSED(stream);
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
#endif
}

bool cv::cuda::Event::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaError_t err = cudaEventQuery(impl_->event);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Event::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventSynchronize(impl_->event) );
#endif
}

float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
{
#ifndef HAVE_CUDA
    CV_UNUSED(start);
    CV_UNUSED(end);
    throw_no_cuda();
#else
    float ms;
    cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
    return ms;
#endif
}
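// Illustrative sketch (commented out, caller code is hypothetical): enabling the
// buffer pool before constructing a Stream, drawing a buffer from it, and timing
// work with events. Pool sizes and the matrix type below are arbitrary.
//
//     cv::cuda::setBufferPoolUsage(true);                       // must precede Stream creation
//     cv::cuda::setBufferPoolConfig(-1, 64 * 1024 * 1024, 2);   // all devices: 2 stacks of 64 MB
//     cv::cuda::Stream stream;                                  // its StackAllocator grabs a free stack
//     cv::cuda::BufferPool pool(stream);
//     cv::cuda::GpuMat buf = pool.getBuffer(1024, 1024, CV_32FC1);
//
//     cv::cuda::Event begin, end;
//     begin.record(stream);
//     // ... asynchronous work on `stream` using `buf` ...
//     end.record(stream);
//     end.waitForCompletion();
//     float ms = cv::cuda::Event::elapsedTime(begin, end);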