fix cuda::BufferPool deinitialization

The deinitialization of BufferPool internal objects is controled by global
object, but it depends on other global objects, which leads to errors
caused by undefined deinitialization order of global objects.

I merge global objects initialization into single class, which performs
initialization and deinitialization in correct order.
This commit is contained in:
Vladislav Vinogradov 2014-12-17 18:51:15 +03:00
parent fd6ef87c32
commit 7ed38b97c3
3 changed files with 466 additions and 458 deletions

View File

@ -479,6 +479,7 @@ private:
friend struct StreamAccessor;
friend class BufferPool;
friend class DefaultDeviceInitializer;
};
class CV_EXPORTS Event

View File

@ -1,435 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#ifdef HAVE_CUDA
#include "opencv2/cudev/common.hpp"
/////////////////////////////////////////////////////////////
/// MemoryStack
namespace
{
class MemoryPool;
}
class cv::cuda::MemoryStack
{
public:
uchar* requestMemory(size_t size);
void returnMemory(uchar* ptr);
uchar* datastart;
uchar* dataend;
uchar* tip;
bool isFree;
MemoryPool* pool;
#if defined(DEBUG) || defined(_DEBUG)
std::vector<size_t> allocations;
#endif
};
uchar* cv::cuda::MemoryStack::requestMemory(size_t size)
{
const size_t freeMem = dataend - tip;
if (size > freeMem)
return 0;
uchar* ptr = tip;
tip += size;
#if defined(DEBUG) || defined(_DEBUG)
allocations.push_back(size);
#endif
return ptr;
}
void cv::cuda::MemoryStack::returnMemory(uchar* ptr)
{
CV_DbgAssert( ptr >= datastart && ptr < dataend );
#if defined(DEBUG) || defined(_DEBUG)
const size_t allocSize = tip - ptr;
CV_Assert( allocSize == allocations.back() );
allocations.pop_back();
#endif
tip = ptr;
}
/////////////////////////////////////////////////////////////
/// MemoryPool
namespace
{
class MemoryPool
{
public:
MemoryPool();
void initialize(size_t stackSize, int stackCount);
void release();
MemoryStack* getFreeMemStack();
void returnMemStack(MemoryStack* memStack);
private:
void initilizeImpl();
Mutex mtx_;
bool initialized_;
size_t stackSize_;
int stackCount_;
uchar* mem_;
std::vector<MemoryStack> stacks_;
};
MemoryPool::MemoryPool() : initialized_(false), mem_(0)
{
// default : 10 Mb, 5 stacks
stackSize_ = 10 * 1024 * 1024;
stackCount_ = 5;
}
void MemoryPool::initialize(size_t stackSize, int stackCount)
{
AutoLock lock(mtx_);
release();
stackSize_ = stackSize;
stackCount_ = stackCount;
initilizeImpl();
}
void MemoryPool::initilizeImpl()
{
const size_t totalSize = stackSize_ * stackCount_;
if (totalSize > 0)
{
cudaError_t err = cudaMalloc(&mem_, totalSize);
if (err != cudaSuccess)
return;
stacks_.resize(stackCount_);
uchar* ptr = mem_;
for (int i = 0; i < stackCount_; ++i)
{
stacks_[i].datastart = ptr;
stacks_[i].dataend = ptr + stackSize_;
stacks_[i].tip = ptr;
stacks_[i].isFree = true;
stacks_[i].pool = this;
ptr += stackSize_;
}
initialized_ = true;
}
}
void MemoryPool::release()
{
if (mem_)
{
#if defined(DEBUG) || defined(_DEBUG)
for (int i = 0; i < stackCount_; ++i)
{
CV_DbgAssert( stacks_[i].isFree );
CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
}
#endif
cudaFree( mem_ );
mem_ = 0;
initialized_ = false;
}
}
MemoryStack* MemoryPool::getFreeMemStack()
{
AutoLock lock(mtx_);
if (!initialized_)
initilizeImpl();
if (!mem_)
return 0;
for (int i = 0; i < stackCount_; ++i)
{
if (stacks_[i].isFree)
{
stacks_[i].isFree = false;
return &stacks_[i];
}
}
return 0;
}
void MemoryPool::returnMemStack(MemoryStack* memStack)
{
AutoLock lock(mtx_);
CV_DbgAssert( !memStack->isFree );
#if defined(DEBUG) || defined(_DEBUG)
bool found = false;
for (int i = 0; i < stackCount_; ++i)
{
if (memStack == &stacks_[i])
{
found = true;
break;
}
}
CV_DbgAssert( found );
#endif
CV_DbgAssert( memStack->tip == memStack->datastart );
memStack->isFree = true;
}
}
/////////////////////////////////////////////////////////////
/// MemoryPoolManager
namespace
{
Mutex mtx_;
bool memory_pool_manager_initialized;
class MemoryPoolManager
{
public:
MemoryPoolManager();
~MemoryPoolManager();
void Init();
MemoryPool* getPool(int deviceId);
private:
std::vector<MemoryPool> pools_;
} manager;
//MemoryPoolManager ;
MemoryPoolManager::MemoryPoolManager()
{
}
void MemoryPoolManager::Init()
{
int deviceCount = getCudaEnabledDeviceCount();
if (deviceCount > 0)
pools_.resize(deviceCount);
}
MemoryPoolManager::~MemoryPoolManager()
{
for (size_t i = 0; i < pools_.size(); ++i)
{
cudaSetDevice(static_cast<int>(i));
pools_[i].release();
}
}
MemoryPool* MemoryPoolManager::getPool(int deviceId)
{
CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
return &pools_[deviceId];
}
MemoryPool* memPool(int deviceId)
{
{
AutoLock lock(mtx_);
if (!memory_pool_manager_initialized)
{
memory_pool_manager_initialized = true;
manager.Init();
}
}
return manager.getPool(deviceId);
}
}
/////////////////////////////////////////////////////////////
/// StackAllocator
namespace
{
bool enableMemoryPool = true;
}
cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
{
if (enableMemoryPool)
{
const int deviceId = getDevice();
{
AutoLock lock(mtx_);
memStack_ = memPool(deviceId)->getFreeMemStack();
}
DeviceInfo devInfo(deviceId);
alignment_ = devInfo.textureAlignment();
}
}
cv::cuda::StackAllocator::~StackAllocator()
{
cudaStreamSynchronize(stream_);
if (memStack_ != 0)
memStack_->pool->returnMemStack(memStack_);
}
namespace
{
size_t alignUp(size_t what, size_t alignment)
{
size_t alignMask = alignment-1;
size_t inverseAlignMask = ~alignMask;
size_t res = (what + alignMask) & inverseAlignMask;
return res;
}
}
bool cv::cuda::StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
{
if (memStack_ == 0)
return false;
size_t pitch, memSize;
if (rows > 1 && cols > 1)
{
pitch = alignUp(cols * elemSize, alignment_);
memSize = pitch * rows;
}
else
{
// Single row or single column must be continuous
pitch = elemSize * cols;
memSize = alignUp(elemSize * cols * rows, 64);
}
uchar* ptr = memStack_->requestMemory(memSize);
if (ptr == 0)
return false;
mat->data = ptr;
mat->step = pitch;
mat->refcount = (int*) fastMalloc(sizeof(int));
return true;
}
void cv::cuda::StackAllocator::free(GpuMat* mat)
{
if (memStack_ == 0)
return;
memStack_->returnMemory(mat->datastart);
fastFree(mat->refcount);
}
void cv::cuda::setBufferPoolUsage(bool on)
{
enableMemoryPool = on;
}
void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
{
const int currentDevice = getDevice();
if (deviceId >= 0)
{
setDevice(deviceId);
memPool(deviceId)->initialize(stackSize, stackCount);
}
else
{
const int deviceCount = getCudaEnabledDeviceCount();
for (deviceId = 0; deviceId < deviceCount; ++deviceId)
{
setDevice(deviceId);
memPool(deviceId)->initialize(stackSize, stackCount);
}
}
setDevice(currentDevice);
}
/////////////////////////////////////////////////////////////
/// BufferPool
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
GpuMat buf(allocator_);
buf.create(rows, cols, type);
return buf;
}
#endif

View File

@ -45,8 +45,217 @@
using namespace cv;
using namespace cv::cuda;
/////////////////////////////////////////////////////////////
/// MemoryStack
#ifdef HAVE_CUDA
namespace
{
class MemoryPool;
}
class cv::cuda::MemoryStack
{
public:
uchar* requestMemory(size_t size);
void returnMemory(uchar* ptr);
uchar* datastart;
uchar* dataend;
uchar* tip;
bool isFree;
MemoryPool* pool;
#if !defined(NDEBUG)
std::vector<size_t> allocations;
#endif
};
uchar* cv::cuda::MemoryStack::requestMemory(size_t size)
{
const size_t freeMem = dataend - tip;
if (size > freeMem)
return 0;
uchar* ptr = tip;
tip += size;
#if !defined(NDEBUG)
allocations.push_back(size);
#endif
return ptr;
}
void cv::cuda::MemoryStack::returnMemory(uchar* ptr)
{
CV_DbgAssert( ptr >= datastart && ptr < dataend );
#if !defined(NDEBUG)
const size_t allocSize = tip - ptr;
CV_Assert( allocSize == allocations.back() );
allocations.pop_back();
#endif
tip = ptr;
}
#endif
/////////////////////////////////////////////////////////////
/// MemoryPool
#ifdef HAVE_CUDA
namespace
{
class MemoryPool
{
public:
MemoryPool();
void initialize(size_t stackSize, int stackCount);
void release();
MemoryStack* getFreeMemStack();
void returnMemStack(MemoryStack* memStack);
private:
void initilizeImpl();
Mutex mtx_;
bool initialized_;
size_t stackSize_;
int stackCount_;
uchar* mem_;
std::vector<MemoryStack> stacks_;
};
MemoryPool::MemoryPool() : initialized_(false), mem_(0)
{
// default : 10 Mb, 5 stacks
stackSize_ = 10 * 1024 * 1024;
stackCount_ = 5;
}
void MemoryPool::initialize(size_t stackSize, int stackCount)
{
AutoLock lock(mtx_);
release();
stackSize_ = stackSize;
stackCount_ = stackCount;
initilizeImpl();
}
void MemoryPool::initilizeImpl()
{
const size_t totalSize = stackSize_ * stackCount_;
if (totalSize > 0)
{
cudaError_t err = cudaMalloc(&mem_, totalSize);
if (err != cudaSuccess)
return;
stacks_.resize(stackCount_);
uchar* ptr = mem_;
for (int i = 0; i < stackCount_; ++i)
{
stacks_[i].datastart = ptr;
stacks_[i].dataend = ptr + stackSize_;
stacks_[i].tip = ptr;
stacks_[i].isFree = true;
stacks_[i].pool = this;
ptr += stackSize_;
}
initialized_ = true;
}
}
void MemoryPool::release()
{
if (mem_)
{
#if !defined(NDEBUG)
for (int i = 0; i < stackCount_; ++i)
{
CV_DbgAssert( stacks_[i].isFree );
CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
}
#endif
cudaFree(mem_);
mem_ = 0;
initialized_ = false;
}
}
MemoryStack* MemoryPool::getFreeMemStack()
{
AutoLock lock(mtx_);
if (!initialized_)
initilizeImpl();
if (!mem_)
return 0;
for (int i = 0; i < stackCount_; ++i)
{
if (stacks_[i].isFree)
{
stacks_[i].isFree = false;
return &stacks_[i];
}
}
return 0;
}
void MemoryPool::returnMemStack(MemoryStack* memStack)
{
AutoLock lock(mtx_);
CV_DbgAssert( !memStack->isFree );
#if !defined(NDEBUG)
bool found = false;
for (int i = 0; i < stackCount_; ++i)
{
if (memStack == &stacks_[i])
{
found = true;
break;
}
}
CV_DbgAssert( found );
#endif
CV_DbgAssert( memStack->tip == memStack->datastart );
memStack->isFree = true;
}
}
#endif
////////////////////////////////////////////////////////////////
// Stream
/// Stream::Impl
#ifndef HAVE_CUDA
@ -74,10 +283,6 @@ public:
~Impl();
};
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
{
}
cv::cuda::Stream::Impl::Impl() : stream(0)
{
cudaSafeCall( cudaStreamCreate(&stream) );
@ -98,13 +303,120 @@ cv::cuda::Stream::Impl::~Impl()
cudaStreamDestroy(stream);
}
cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
#endif
/////////////////////////////////////////////////////////////
/// DefaultDeviceInitializer
#ifdef HAVE_CUDA
namespace cv { namespace cuda
{
return stream.impl_->stream;
}
class DefaultDeviceInitializer
{
public:
DefaultDeviceInitializer();
~DefaultDeviceInitializer();
Stream& getNullStream(int deviceId);
MemoryPool* getMemoryPool(int deviceId);
private:
void initStreams();
void initPools();
Mutex streams_mtx_;
volatile bool streams_initialized_;
Mutex pools_mtx_;
volatile bool pools_initialized_;
std::vector<Ptr<Stream> > streams_;
std::vector<MemoryPool> pools_;
};
DefaultDeviceInitializer::DefaultDeviceInitializer()
{
}
DefaultDeviceInitializer::~DefaultDeviceInitializer()
{
streams_.clear();
for (size_t i = 0; i < pools_.size(); ++i)
{
cudaSetDevice(static_cast<int>(i));
pools_[i].release();
}
pools_.clear();
}
Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
{
initStreams();
CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );
return *streams_[deviceId];
}
MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
{
initPools();
CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
return &pools_[deviceId];
}
void DefaultDeviceInitializer::initStreams()
{
AutoLock lock(streams_mtx_);
if (!streams_initialized_)
{
int deviceCount = getCudaEnabledDeviceCount();
if (deviceCount > 0)
{
streams_.resize(deviceCount);
for (int i = 0; i < deviceCount; ++i)
{
cudaStream_t stream = NULL;
Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
streams_[i] = Ptr<Stream>(new Stream(impl));
}
}
streams_initialized_ = true;
}
}
void DefaultDeviceInitializer::initPools()
{
AutoLock lock(pools_mtx_);
if (!pools_initialized_)
{
int deviceCount = getCudaEnabledDeviceCount();
if (deviceCount > 0)
pools_.resize(deviceCount);
pools_initialized_ = true;
}
}
DefaultDeviceInitializer initializer;
}}
#endif
/////////////////////////////////////////////////////////////
/// Stream
cv::cuda::Stream::Stream()
{
#ifndef HAVE_CUDA
@ -181,7 +493,7 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
#if CUDART_VERSION < 5000
(void) callback;
(void) userData;
CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA 5.0");
CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
#else
CallbackData* data = new CallbackData(callback, userData);
@ -190,22 +502,16 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
#endif
}
namespace
{
bool default_stream_is_initialized;
Mutex mtx;
Ptr<Stream> default_stream;
}
Stream& cv::cuda::Stream::Null()
{
AutoLock lock(mtx);
if (!default_stream_is_initialized)
{
default_stream = Ptr<Stream>(new Stream(Ptr<Impl>(new Impl(0))));
default_stream_is_initialized = true;
}
return *default_stream;
#ifndef HAVE_CUDA
throw_no_cuda();
static Stream stream;
return stream;
#else
const int deviceId = getDevice();
return initializer.getNullStream(deviceId);
#endif
}
cv::cuda::Stream::operator bool_type() const
@ -217,6 +523,142 @@ cv::cuda::Stream::operator bool_type() const
#endif
}
#ifdef HAVE_CUDA
cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
return stream.impl_->stream;
}
#endif
/////////////////////////////////////////////////////////////
/// StackAllocator
#ifdef HAVE_CUDA
namespace
{
bool enableMemoryPool = true;
}
cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
{
if (enableMemoryPool)
{
const int deviceId = getDevice();
memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
DeviceInfo devInfo(deviceId);
alignment_ = devInfo.textureAlignment();
}
}
cv::cuda::StackAllocator::~StackAllocator()
{
cudaStreamSynchronize(stream_);
if (memStack_ != 0)
memStack_->pool->returnMemStack(memStack_);
}
namespace
{
size_t alignUp(size_t what, size_t alignment)
{
size_t alignMask = alignment-1;
size_t inverseAlignMask = ~alignMask;
size_t res = (what + alignMask) & inverseAlignMask;
return res;
}
}
bool cv::cuda::StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
{
if (memStack_ == 0)
return false;
size_t pitch, memSize;
if (rows > 1 && cols > 1)
{
pitch = alignUp(cols * elemSize, alignment_);
memSize = pitch * rows;
}
else
{
// Single row or single column must be continuous
pitch = elemSize * cols;
memSize = alignUp(elemSize * cols * rows, 64);
}
uchar* ptr = memStack_->requestMemory(memSize);
if (ptr == 0)
return false;
mat->data = ptr;
mat->step = pitch;
mat->refcount = (int*) fastMalloc(sizeof(int));
return true;
}
void cv::cuda::StackAllocator::free(GpuMat* mat)
{
if (memStack_ == 0)
return;
memStack_->returnMemory(mat->datastart);
fastFree(mat->refcount);
}
void cv::cuda::setBufferPoolUsage(bool on)
{
enableMemoryPool = on;
}
void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
{
const int currentDevice = getDevice();
if (deviceId >= 0)
{
setDevice(deviceId);
initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
}
else
{
const int deviceCount = getCudaEnabledDeviceCount();
for (deviceId = 0; deviceId < deviceCount; ++deviceId)
{
setDevice(deviceId);
initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
}
}
setDevice(currentDevice);
}
#endif
/////////////////////////////////////////////////////////////
/// BufferPool
#ifdef HAVE_CUDA
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
{
}
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
GpuMat buf(allocator_);
buf.create(rows, cols, type);
return buf;
}
#endif
////////////////////////////////////////////////////////////////
// Event