diff --git a/modules/core/include/opencv2/core/gpu.hpp b/modules/core/include/opencv2/core/gpu.hpp index f60d246bd7..f0050f208b 100644 --- a/modules/core/include/opencv2/core/gpu.hpp +++ b/modules/core/include/opencv2/core/gpu.hpp @@ -53,6 +53,181 @@ namespace cv { namespace gpu { + +//////////////////////////////// GpuMat /////////////////////////////// + +// Smart pointer for GPU memory with reference counting. +// Its interface is mostly similar with cv::Mat. + +class CV_EXPORTS GpuMat +{ +public: + //! default constructor + GpuMat(); + + //! constructs GpuMat of the specified size and type + GpuMat(int rows, int cols, int type); + GpuMat(Size size, int type); + + //! constucts GpuMat and fills it with the specified value _s + GpuMat(int rows, int cols, int type, Scalar s); + GpuMat(Size size, int type, Scalar s); + + //! copy constructor + GpuMat(const GpuMat& m); + + //! constructor for GpuMat headers pointing to user-allocated data + GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP); + GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP); + + //! creates a GpuMat header for a part of the bigger matrix + GpuMat(const GpuMat& m, Range rowRange, Range colRange); + GpuMat(const GpuMat& m, Rect roi); + + //! builds GpuMat from Mat. Perfom blocking upload to device + explicit GpuMat(const Mat& m); + + //! destructor - calls release() + ~GpuMat(); + + //! assignment operators + GpuMat& operator =(const GpuMat& m); + + //! allocates new GpuMat data unless the GpuMat already has specified size and type + void create(int rows, int cols, int type); + void create(Size size, int type); + + //! decreases reference counter, deallocate the data when reference counter reaches 0 + void release(); + + //! swaps with other smart pointer + void swap(GpuMat& mat); + + //! pefroms blocking upload data to GpuMat + void upload(const Mat& m); + + //! downloads data from device to host memory (Blocking calls) + void download(Mat& m) const; + + //! returns deep copy of the GpuMat, i.e. the data is copied + GpuMat clone() const; + + //! copies the GpuMat content to "m" + void copyTo(GpuMat& m) const; + + //! copies those GpuMat elements to "m" that are marked with non-zero mask elements + void copyTo(GpuMat& m, const GpuMat& mask) const; + + //! sets some of the GpuMat elements to s, according to the mask + GpuMat& setTo(Scalar s, const GpuMat& mask = GpuMat()); + + //! converts GpuMat to another datatype with optional scaling + void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const; + + void assignTo(GpuMat& m, int type=-1) const; + + //! returns pointer to y-th row + uchar* ptr(int y = 0); + const uchar* ptr(int y = 0) const; + + //! template version of the above method + template _Tp* ptr(int y = 0); + template const _Tp* ptr(int y = 0) const; + + template operator PtrStepSz<_Tp>() const; + template operator PtrStep<_Tp>() const; + + //! returns a new GpuMat header for the specified row + GpuMat row(int y) const; + + //! returns a new GpuMat header for the specified column + GpuMat col(int x) const; + + //! ... for the specified row span + GpuMat rowRange(int startrow, int endrow) const; + GpuMat rowRange(Range r) const; + + //! ... for the specified column span + GpuMat colRange(int startcol, int endcol) const; + GpuMat colRange(Range r) const; + + //! extracts a rectangular sub-GpuMat (this is a generalized form of row, rowRange etc.) + GpuMat operator ()(Range rowRange, Range colRange) const; + GpuMat operator ()(Rect roi) const; + + //! 
creates alternative GpuMat header for the same data, with different + //! number of channels and/or different number of rows + GpuMat reshape(int cn, int rows = 0) const; + + //! locates GpuMat header within a parent GpuMat + void locateROI(Size& wholeSize, Point& ofs) const; + + //! moves/resizes the current GpuMat ROI inside the parent GpuMat + GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright); + + //! returns true iff the GpuMat data is continuous + //! (i.e. when there are no gaps between successive rows) + bool isContinuous() const; + + //! returns element size in bytes + size_t elemSize() const; + + //! returns the size of element channel in bytes + size_t elemSize1() const; + + //! returns element type + int type() const; + + //! returns element type + int depth() const; + + //! returns number of channels + int channels() const; + + //! returns step/elemSize1() + size_t step1() const; + + //! returns GpuMat size : width == number of columns, height == number of rows + Size size() const; + + //! returns true if GpuMat data is NULL + bool empty() const; + + /*! includes several bit-fields: + - the magic signature + - continuity flag + - depth + - number of channels + */ + int flags; + + //! the number of rows and columns + int rows, cols; + + //! a distance between successive rows in bytes; includes the gap if any + size_t step; + + //! pointer to the data + uchar* data; + + //! pointer to the reference counter; + //! when GpuMat points to user-allocated data, the pointer is NULL + int* refcount; + + //! helper fields used in locateROI and adjustROI + uchar* datastart; + uchar* dataend; +}; + +//! Creates continuous GPU matrix +CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m); + +//! Ensures that size of the given matrix is not less than (rows, cols) size +//! and matrix type is match specified one too +CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m); + +CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat); + //////////////////////////////// CudaMem //////////////////////////////// // CudaMem is limited cv::Mat with page locked memory allocation. // Page locked memory is only needed for async and faster coping to GPU. @@ -289,169 +464,6 @@ CV_EXPORTS void printCudaDeviceInfo(int device); CV_EXPORTS void printShortCudaDeviceInfo(int device); -//////////////////////////////// GpuMat /////////////////////////////// - -//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat. -class CV_EXPORTS GpuMat -{ -public: - //! default constructor - GpuMat(); - - //! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.) - GpuMat(int rows, int cols, int type); - GpuMat(Size size, int type); - - //! constucts GpuMatrix and fills it with the specified value _s. - GpuMat(int rows, int cols, int type, Scalar s); - GpuMat(Size size, int type, Scalar s); - - //! copy constructor - GpuMat(const GpuMat& m); - - //! constructor for GpuMatrix headers pointing to user-allocated data - GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP); - GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP); - - //! creates a matrix header for a part of the bigger matrix - GpuMat(const GpuMat& m, Range rowRange, Range colRange); - GpuMat(const GpuMat& m, Rect roi); - - //! builds GpuMat from Mat. Perfom blocking upload to device. - explicit GpuMat(const Mat& m); - - //! 
destructor - calls release() - ~GpuMat(); - - //! assignment operators - GpuMat& operator = (const GpuMat& m); - - //! pefroms blocking upload data to GpuMat. - void upload(const Mat& m); - - //! downloads data from device to host memory. Blocking calls. - void download(Mat& m) const; - - //! returns a new GpuMatrix header for the specified row - GpuMat row(int y) const; - //! returns a new GpuMatrix header for the specified column - GpuMat col(int x) const; - //! ... for the specified row span - GpuMat rowRange(int startrow, int endrow) const; - GpuMat rowRange(Range r) const; - //! ... for the specified column span - GpuMat colRange(int startcol, int endcol) const; - GpuMat colRange(Range r) const; - - //! returns deep copy of the GpuMatrix, i.e. the data is copied - GpuMat clone() const; - //! copies the GpuMatrix content to "m". - // It calls m.create(this->size(), this->type()). - void copyTo(GpuMat& m) const; - //! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements. - void copyTo(GpuMat& m, const GpuMat& mask) const; - //! converts GpuMatrix to another datatype with optional scalng. See cvConvertScale. - void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const; - - void assignTo(GpuMat& m, int type=-1) const; - - //! sets every GpuMatrix element to s - GpuMat& operator = (Scalar s); - //! sets some of the GpuMatrix elements to s, according to the mask - GpuMat& setTo(Scalar s, const GpuMat& mask = GpuMat()); - //! creates alternative GpuMatrix header for the same data, with different - // number of channels and/or different number of rows. see cvReshape. - GpuMat reshape(int cn, int rows = 0) const; - - //! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type. - // previous data is unreferenced if needed. - void create(int rows, int cols, int type); - void create(Size size, int type); - //! decreases reference counter; - // deallocate the data when reference counter reaches 0. - void release(); - - //! swaps with other smart pointer - void swap(GpuMat& mat); - - //! locates GpuMatrix header within a parent GpuMatrix. See below - void locateROI(Size& wholeSize, Point& ofs) const; - //! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix. - GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright); - //! extracts a rectangular sub-GpuMatrix - // (this is a generalized form of row, rowRange etc.) - GpuMat operator()(Range rowRange, Range colRange) const; - GpuMat operator()(Rect roi) const; - - //! returns true iff the GpuMatrix data is continuous - // (i.e. when there are no gaps between successive rows). - // similar to CV_IS_GpuMat_CONT(cvGpuMat->type) - bool isContinuous() const; - //! returns element size in bytes, - // similar to CV_ELEM_SIZE(cvMat->type) - size_t elemSize() const; - //! returns the size of element channel in bytes. - size_t elemSize1() const; - //! returns element type, similar to CV_MAT_TYPE(cvMat->type) - int type() const; - //! returns element type, similar to CV_MAT_DEPTH(cvMat->type) - int depth() const; - //! returns element type, similar to CV_MAT_CN(cvMat->type) - int channels() const; - //! returns step/elemSize1() - size_t step1() const; - //! returns GpuMatrix size: - // width == number of columns, height == number of rows - Size size() const; - //! returns true if GpuMatrix data is NULL - bool empty() const; - - //! returns pointer to y-th row - uchar* ptr(int y = 0); - const uchar* ptr(int y = 0) const; - - //! 
template version of the above method - template _Tp* ptr(int y = 0); - template const _Tp* ptr(int y = 0) const; - - template operator PtrStepSz<_Tp>() const; - template operator PtrStep<_Tp>() const; - - /*! includes several bit-fields: - - the magic signature - - continuity flag - - depth - - number of channels - */ - int flags; - - //! the number of rows and columns - int rows, cols; - - //! a distance between successive rows in bytes; includes the gap if any - size_t step; - - //! pointer to the data - uchar* data; - - //! pointer to the reference counter; - // when GpuMatrix points to user-allocated data, the pointer is NULL - int* refcount; - - //! helper fields used in locateROI and adjustROI - uchar* datastart; - uchar* dataend; -}; - -//! Creates continuous GPU matrix -CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m); - -//! Ensures that size of the given matrix is not less than (rows, cols) size -//! and matrix type is match specified one too -CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m); - -CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat); - }} // cv::gpu #include "opencv2/core/gpu.inl.hpp" diff --git a/modules/core/include/opencv2/core/gpu.inl.hpp b/modules/core/include/opencv2/core/gpu.inl.hpp index 460dc0a15c..cf295a0d73 100644 --- a/modules/core/include/opencv2/core/gpu.inl.hpp +++ b/modules/core/include/opencv2/core/gpu.inl.hpp @@ -94,12 +94,58 @@ GpuMat::GpuMat(Size size_, int type_, Scalar s_) } } +inline +GpuMat::GpuMat(const GpuMat& m) + : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend) +{ + if (refcount) + CV_XADD(refcount, 1); +} + +inline +GpuMat::GpuMat(const Mat& m) : + flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) +{ + upload(m); +} + inline GpuMat::~GpuMat() { release(); } +inline +GpuMat& GpuMat::operator =(const GpuMat& m) +{ + if (this != &m) + { + GpuMat temp(m); + swap(temp); + } + + return *this; +} + +inline +void GpuMat::create(Size size_, int type_) +{ + create(size_.height, size_.width, type_); +} + +inline +void GpuMat::swap(GpuMat& b) +{ + std::swap(flags, b.flags); + std::swap(rows, b.rows); + std::swap(cols, b.cols); + std::swap(step, b.step); + std::swap(data, b.data); + std::swap(datastart, b.datastart); + std::swap(dataend, b.dataend); + std::swap(refcount, b.refcount); +} + inline GpuMat GpuMat::clone() const { @@ -118,15 +164,17 @@ void GpuMat::assignTo(GpuMat& m, int _type) const } inline -size_t GpuMat::step1() const +uchar* GpuMat::ptr(int y) { - return step / elemSize1(); + CV_DbgAssert( (unsigned)y < (unsigned)rows ); + return data + step * y; } inline -bool GpuMat::empty() const +const uchar* GpuMat::ptr(int y) const { - return data == 0; + CV_DbgAssert( (unsigned)y < (unsigned)rows ); + return data + step * y; } template inline @@ -141,6 +189,18 @@ const _Tp* GpuMat::ptr(int y) const return (const _Tp*)ptr(y); } +template inline +GpuMat::operator PtrStepSz() const +{ + return PtrStepSz(rows, cols, (T*)data, step); +} + +template inline +GpuMat::operator PtrStep() const +{ + return PtrStep((T*)data, step); +} + inline GpuMat GpuMat::row(int y) const { @@ -178,19 +238,13 @@ GpuMat GpuMat::colRange(Range r) const } inline -void GpuMat::create(Size size_, int type_) +GpuMat GpuMat::operator ()(Range rowRange_, Range colRange_) const { - create(size_.height, size_.width, type_); + return GpuMat(*this, rowRange_, colRange_); } inline 
-GpuMat GpuMat::operator()(Range _rowRange, Range _colRange) const -{ - return GpuMat(*this, _rowRange, _colRange); -} - -inline -GpuMat GpuMat::operator()(Rect roi) const +GpuMat GpuMat::operator ()(Rect roi) const { return GpuMat(*this, roi); } @@ -231,6 +285,12 @@ int GpuMat::channels() const return CV_MAT_CN(flags); } +inline +size_t GpuMat::step1() const +{ + return step / elemSize1(); +} + inline Size GpuMat::size() const { @@ -238,42 +298,9 @@ Size GpuMat::size() const } inline -uchar* GpuMat::ptr(int y) +bool GpuMat::empty() const { - CV_DbgAssert((unsigned)y < (unsigned)rows); - return data + step * y; -} - -inline -const uchar* GpuMat::ptr(int y) const -{ - CV_DbgAssert((unsigned)y < (unsigned)rows); - return data + step * y; -} - -inline -GpuMat& GpuMat::operator = (Scalar s) -{ - setTo(s); - return *this; -} - -template inline -GpuMat::operator PtrStepSz() const -{ - return PtrStepSz(rows, cols, (T*)data, step); -} - -template inline -GpuMat::operator PtrStep() const -{ - return PtrStep((T*)data, step); -} - -static inline -void swap(GpuMat& a, GpuMat& b) -{ - a.swap(b); + return data == 0; } static inline @@ -304,6 +331,23 @@ void ensureSizeIsEnough(Size size, int type, GpuMat& m) ensureSizeIsEnough(size.height, size.width, type, m); } +static inline +void swap(GpuMat& a, GpuMat& b) +{ + a.swap(b); +} + }} // namespace cv { namespace gpu +namespace cv { + +inline +Mat::Mat(const gpu::GpuMat& m) + : flags(0), dims(0), rows(0), cols(0), data(0), refcount(0), datastart(0), dataend(0), datalimit(0), allocator(0), size(&rows) +{ + m.download(*this); +} + +} + #endif // __OPENCV_CORE_GPUINL_HPP__ diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/core/src/cuda/matrix_operations.cu index 521ee1a2ca..d16a88df1f 100644 --- a/modules/core/src/cuda/matrix_operations.cu +++ b/modules/core/src/cuda/matrix_operations.cu @@ -45,18 +45,7 @@ #include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/type_traits.hpp" -namespace cv { namespace gpu { namespace cudev -{ - void writeScalar(const uchar*); - void writeScalar(const schar*); - void writeScalar(const ushort*); - void writeScalar(const short int*); - void writeScalar(const int*); - void writeScalar(const float*); - void writeScalar(const double*); - void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); - void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t); -}}} +#include "matrix_operations.hpp" namespace cv { namespace gpu { namespace cudev { @@ -73,32 +62,33 @@ namespace cv { namespace gpu { namespace cudev ////////////////////////////////// CopyTo ///////////////////////////////// /////////////////////////////////////////////////////////////////////////// - template void copyToWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream) + template + void copyWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream) { - if (colorMask) + if (multiChannelMask) cv::gpu::cudev::transform((PtrStepSz)src, (PtrStepSz)dst, identity(), SingleMask(mask), stream); else cv::gpu::cudev::transform((PtrStepSz)src, (PtrStepSz)dst, identity(), SingleMaskChannels(mask, cn), stream); } - void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream) + void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool 
multiChannelMask, cudaStream_t stream) { - typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream); static func_t tab[] = { 0, - copyToWithMask, - copyToWithMask, + copyWithMask, + copyWithMask, 0, - copyToWithMask, + copyWithMask, 0, 0, 0, - copyToWithMask + copyWithMask }; - tab[elemSize1](src, dst, cn, mask, colorMask, stream); + tab[elemSize1](src, dst, cn, mask, multiChannelMask, stream); } /////////////////////////////////////////////////////////////////////////// @@ -122,37 +112,37 @@ namespace cv { namespace gpu { namespace cudev template <> __device__ __forceinline__ float readScalar(int i) {return scalar_32f[i];} template <> __device__ __forceinline__ double readScalar(int i) {return scalar_64f[i];} - void writeScalar(const uchar* vals) + static inline void writeScalar(const uchar* vals) { cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) ); } - void writeScalar(const schar* vals) + static inline void writeScalar(const schar* vals) { cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) ); } - void writeScalar(const ushort* vals) + static inline void writeScalar(const ushort* vals) { cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) ); } - void writeScalar(const short* vals) + static inline void writeScalar(const short* vals) { cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) ); } - void writeScalar(const int* vals) + static inline void writeScalar(const int* vals) { cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) ); } - void writeScalar(const float* vals) + static inline void writeScalar(const float* vals) { cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) ); } - void writeScalar(const double* vals) + static inline void writeScalar(const double* vals) { cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) ); } template - __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels) + __global__ void set(T* mat, int cols, int rows, size_t step, int channels) { size_t x = blockIdx.x * blockDim.x + threadIdx.x; size_t y = blockIdx.y * blockDim.y + threadIdx.y; @@ -164,8 +154,31 @@ namespace cv { namespace gpu { namespace cudev } } + template + void set(PtrStepSz mat, const T* scalar, int channels, cudaStream_t stream) + { + writeScalar(scalar); + + dim3 threadsPerBlock(32, 8, 1); + dim3 numBlocks(mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); + + set<<>>(mat.data, mat.cols, mat.rows, mat.step, channels); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall ( cudaDeviceSynchronize() ); + } + + template void set(PtrStepSz mat, const uchar* scalar, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const schar* scalar, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const ushort* scalar, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const short* scalar, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const int* scalar, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const float* scalar, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const double* scalar, int channels, cudaStream_t stream); + template - __global__ void set_to_with_mask(T* mat, const 
uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask) + __global__ void set(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask) { size_t x = blockIdx.x * blockDim.x + threadIdx.x; size_t y = blockIdx.y * blockDim.y + threadIdx.y; @@ -177,51 +190,29 @@ namespace cv { namespace gpu { namespace cudev mat[idx] = readScalar(x % channels); } } + template - void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream) + void set(PtrStepSz mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream) { writeScalar(scalar); dim3 threadsPerBlock(32, 8, 1); - dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); + dim3 numBlocks(mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); - set_to_with_mask<<>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step); + set<<>>(mat.data, mask.data, mat.cols, mat.rows, mat.step, channels, mask.step); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall ( cudaDeviceSynchronize() ); } - template void set_to_gpu(PtrStepSzb mat, const uchar* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const schar* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const short* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const int* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const float* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream) - { - writeScalar(scalar); - - dim3 threadsPerBlock(32, 8, 1); - dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); - - set_to_without_mask<<>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall ( cudaDeviceSynchronize() ); - } - - template void set_to_gpu(PtrStepSzb mat, const uchar* scalar, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const schar* scalar, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const ushort* scalar, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const short* scalar, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const int* scalar, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const float* scalar, int channels, cudaStream_t stream); - template void set_to_gpu(PtrStepSzb mat, const double* scalar, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const uchar* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const schar* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const short* scalar, PtrStepSzb mask, int channels, 
cudaStream_t stream); + template void set(PtrStepSz mat, const int* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const float* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + template void set(PtrStepSz mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); /////////////////////////////////////////////////////////////////////////// //////////////////////////////// ConvertTo //////////////////////////////// @@ -296,12 +287,7 @@ namespace cv { namespace gpu { namespace cudev cv::gpu::cudev::transform((PtrStepSz)src, (PtrStepSz)dst, op, WithOutMask(), stream); } -#if defined __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wmissing-declarations" -#endif - - void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream) + void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream) { typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream); @@ -372,11 +358,7 @@ namespace cv { namespace gpu { namespace cudev } }; - caller_t func = tab[sdepth][ddepth]; + const caller_t func = tab[sdepth][ddepth]; func(src, dst, alpha, beta, stream); } - -#if defined __clang__ -# pragma clang diagnostic pop -#endif }}} // namespace cv { namespace gpu { namespace cudev diff --git a/modules/core/src/cuda/matrix_operations.hpp b/modules/core/src/cuda/matrix_operations.hpp new file mode 100644 index 0000000000..4e451061b8 --- /dev/null +++ b/modules/core/src/cuda/matrix_operations.hpp @@ -0,0 +1,57 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
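As a point of reference, the launch pattern used by the set kernels above (scalar broadcast through __constant__ memory, a 2-D grid over cols*channels by rows, and a blocking synchronize on the default stream) can be sketched in isolation roughly as follows. This is a simplified, float-only sketch against the plain CUDA runtime; the names demoSet, demoSetCaller and c_scalar are illustrative and are not part of the patch.

// Minimal standalone sketch (assumption: plain CUDA runtime, float data, no mask),
// mirroring the launch geometry of cudev::set above. Illustrative names only.
#include <cuda_runtime.h>

__constant__ float c_scalar[4];                      // per-channel value, like scalar_32f

__global__ void demoSet(float* mat, int cols, int rows, size_t step, int channels)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x; // column index in scalar units
    size_t y = blockIdx.y * blockDim.y + threadIdx.y; // row index

    if (x < (size_t)(cols * channels) && y < (size_t)rows)
        mat[y * (step / sizeof(float)) + x] = c_scalar[x % channels];
}

void demoSetCaller(float* d_mat, int cols, int rows, size_t stepBytes, int channels,
                   const float* scalar, cudaStream_t stream)
{
    cudaMemcpyToSymbol(c_scalar, scalar, sizeof(float) * 4);

    dim3 block(32, 8);
    dim3 grid(cols * channels / block.x + 1, rows / block.y + 1);

    demoSet<<<grid, block, 0, stream>>>(d_mat, cols, rows, stepBytes, channels);
    cudaGetLastError();

    if (stream == 0)
        cudaDeviceSynchronize();                     // blocking behaviour on the default stream
}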
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/core/cuda/common.hpp" + +namespace cv { namespace gpu { namespace cudev +{ + void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream); + + template + void set(PtrStepSz mat, const T* scalar, int channels, cudaStream_t stream); + + template + void set(PtrStepSz mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + + void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); +}}} diff --git a/modules/core/src/gpu.cpp b/modules/core/src/gpu.cpp index 9637f86489..ce38088056 100644 --- a/modules/core/src/gpu.cpp +++ b/modules/core/src/gpu.cpp @@ -509,1020 +509,6 @@ void cv::gpu::printShortCudaDeviceInfo(int device) #endif // HAVE_CUDA -//////////////////////////////// GpuMat /////////////////////////////// - -cv::gpu::GpuMat::GpuMat(const GpuMat& m) - : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend) -{ - if (refcount) - CV_XADD(refcount, 1); -} - -cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) : - flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_), - step(step_), data((uchar*)data_), refcount(0), - datastart((uchar*)data_), dataend((uchar*)data_) -{ - size_t minstep = cols * elemSize(); - - if (step == Mat::AUTO_STEP) - { - step = minstep; - flags |= Mat::CONTINUOUS_FLAG; - } - else - { - if (rows == 1) - step = minstep; - - CV_DbgAssert(step >= minstep); - - flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0; - } - dataend += step * (rows - 1) + minstep; -} - -cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) : - flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(size_.height), cols(size_.width), - step(step_), data((uchar*)data_), refcount(0), - datastart((uchar*)data_), dataend((uchar*)data_) -{ - size_t minstep = cols * elemSize(); - - if (step == Mat::AUTO_STEP) - { - step = minstep; - flags |= Mat::CONTINUOUS_FLAG; - } - else - { - if (rows == 1) - step = minstep; - - CV_DbgAssert(step >= minstep); - - flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0; - } - dataend += step * (rows - 1) + minstep; -} - -cv::gpu::GpuMat::GpuMat(const GpuMat& m, Range _rowRange, Range _colRange) -{ - flags = m.flags; - step = m.step; refcount = m.refcount; - data = m.data; datastart = m.datastart; dataend = m.dataend; - - if (_rowRange == Range::all()) - rows = m.rows; - else - { - CV_Assert(0 <= _rowRange.start && _rowRange.start <= _rowRange.end && _rowRange.end <= m.rows); - - rows = _rowRange.size(); - data += step*_rowRange.start; - } - - if (_colRange == Range::all()) - cols = m.cols; - else - { - CV_Assert(0 <= _colRange.start && _colRange.start <= _colRange.end && _colRange.end <= m.cols); - - cols = _colRange.size(); - data += _colRange.start*elemSize(); - flags &= cols < m.cols ? 
~Mat::CONTINUOUS_FLAG : -1; - } - - if (rows == 1) - flags |= Mat::CONTINUOUS_FLAG; - - if (refcount) - CV_XADD(refcount, 1); - - if (rows <= 0 || cols <= 0) - rows = cols = 0; -} - -cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) : - flags(m.flags), rows(roi.height), cols(roi.width), - step(m.step), data(m.data + roi.y*step), refcount(m.refcount), - datastart(m.datastart), dataend(m.dataend) -{ - flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1; - data += roi.x * elemSize(); - - CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows); - - if (refcount) - CV_XADD(refcount, 1); - - if (rows <= 0 || cols <= 0) - rows = cols = 0; -} - -cv::gpu::GpuMat::GpuMat(const Mat& m) : - flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) -{ - upload(m); -} - -GpuMat& cv::gpu::GpuMat::operator = (const GpuMat& m) -{ - if (this != &m) - { - GpuMat temp(m); - swap(temp); - } - - return *this; -} - -void cv::gpu::GpuMat::swap(GpuMat& b) -{ - std::swap(flags, b.flags); - std::swap(rows, b.rows); - std::swap(cols, b.cols); - std::swap(step, b.step); - std::swap(data, b.data); - std::swap(datastart, b.datastart); - std::swap(dataend, b.dataend); - std::swap(refcount, b.refcount); -} - -void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const -{ - size_t esz = elemSize(); - ptrdiff_t delta1 = data - datastart; - ptrdiff_t delta2 = dataend - datastart; - - CV_DbgAssert(step > 0); - - if (delta1 == 0) - ofs.x = ofs.y = 0; - else - { - ofs.y = static_cast(delta1 / step); - ofs.x = static_cast((delta1 - step * ofs.y) / esz); - - CV_DbgAssert(data == datastart + ofs.y * step + ofs.x * esz); - } - - size_t minstep = (ofs.x + cols) * esz; - - wholeSize.height = std::max(static_cast((delta2 - minstep) / step + 1), ofs.y + rows); - wholeSize.width = std::max(static_cast((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols); -} - -GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright) -{ - Size wholeSize; - Point ofs; - locateROI(wholeSize, ofs); - - size_t esz = elemSize(); - - int row1 = std::max(ofs.y - dtop, 0); - int row2 = std::min(ofs.y + rows + dbottom, wholeSize.height); - - int col1 = std::max(ofs.x - dleft, 0); - int col2 = std::min(ofs.x + cols + dright, wholeSize.width); - - data += (row1 - ofs.y) * step + (col1 - ofs.x) * esz; - rows = row2 - row1; - cols = col2 - col1; - - if (esz * cols == step || rows == 1) - flags |= Mat::CONTINUOUS_FLAG; - else - flags &= ~Mat::CONTINUOUS_FLAG; - - return *this; -} - -GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const -{ - GpuMat hdr = *this; - - int cn = channels(); - if (new_cn == 0) - new_cn = cn; - - int total_width = cols * cn; - - if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0) - new_rows = rows * total_width / new_cn; - - if (new_rows != 0 && new_rows != rows) - { - int total_size = total_width * rows; - - if (!isContinuous()) - CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed"); - - if ((unsigned)new_rows > (unsigned)total_size) - CV_Error(CV_StsOutOfRange, "Bad new number of rows"); - - total_width = total_size / new_rows; - - if (total_width * new_rows != total_size) - CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows"); - - hdr.rows = new_rows; - hdr.step = total_width * elemSize1(); - } - - int new_width = total_width / new_cn; - - if (new_width * new_cn 
!= total_width) - CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels"); - - hdr.cols = new_width; - hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT); - - return hdr; -} - -cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), refcount(0), datastart(0), dataend(0), datalimit(0), allocator(0), size(&rows) -{ - m.download(*this); -} - -void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m) -{ - int area = rows * cols; - if (m.empty() || m.type() != type || !m.isContinuous() || m.size().area() < area) - m.create(1, area, type); - - m.cols = cols; - m.rows = rows; - m.step = m.elemSize() * cols; - m.flags |= Mat::CONTINUOUS_FLAG; -} - -void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m) -{ - if (m.empty() || m.type() != type || m.data != m.datastart) - m.create(rows, cols, type); - else - { - const size_t esz = m.elemSize(); - const ptrdiff_t delta2 = m.dataend - m.datastart; - - const size_t minstep = m.cols * esz; - - Size wholeSize; - wholeSize.height = std::max(static_cast((delta2 - minstep) / m.step + 1), m.rows); - wholeSize.width = std::max(static_cast((delta2 - m.step * (wholeSize.height - 1)) / esz), m.cols); - - if (wholeSize.height < rows || wholeSize.width < cols) - m.create(rows, cols, type); - else - { - m.cols = cols; - m.rows = rows; - } - } -} - -GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) -{ - if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols) - return mat(Rect(0, 0, cols, rows)); - return mat = GpuMat(rows, cols, type); -} - -namespace -{ - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} - - virtual void copy(const Mat& src, GpuMat& dst) const = 0; - virtual void copy(const GpuMat& src, Mat& dst) const = 0; - virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; - - virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; - - virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; - virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0; - - virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0; - - virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; - virtual void free(void* devPtr) const = 0; - }; -} - -#ifndef HAVE_CUDA - -namespace -{ - class EmptyFuncTable : public GpuFuncTable - { - public: - void copy(const Mat&, GpuMat&) const { throw_no_cuda(); } - void copy(const GpuMat&, Mat&) const { throw_no_cuda(); } - void copy(const GpuMat&, GpuMat&) const { throw_no_cuda(); } - - void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_no_cuda(); } - - void convert(const GpuMat&, GpuMat&) const { throw_no_cuda(); } - void convert(const GpuMat&, GpuMat&, double, double) const { throw_no_cuda(); } - - void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_no_cuda(); } - - void mallocPitch(void**, size_t*, size_t, size_t) const { throw_no_cuda(); } - void free(void*) const {} - }; - - const GpuFuncTable* gpuFuncTable() - { - static EmptyFuncTable empty; - return ∅ - } -} - -#else // HAVE_CUDA - -namespace cv { namespace gpu { namespace cudev -{ - void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, 
const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - - void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); -}}} - -namespace -{ - template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::cudev::set_to_gpu(src, sf.val, src.channels(), stream); - } - - template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::cudev::set_to_gpu(src, sf.val, mask, src.channels(), stream); - } -} - - -namespace cv { namespace gpu -{ - CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); -}} - - -namespace cv { namespace gpu -{ - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) - { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - cv::gpu::cudev::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } - - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::cudev::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::cudev::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask) - { - setTo(src, s, mask, 0); - } -}} - -namespace -{ - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - 
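The setTo/convertTo wrappers above all rely on the same depth-indexed function-pointer table: one template instantiation per CV_8U..CV_64F depth, selected at run time via src.depth(). A minimal standalone sketch of that dispatch pattern follows (plain C++, no CUDA; setImpl and setByDepth are illustrative names, not part of the patch).

// Sketch of the depth-indexed dispatch used by the callers[] tables above.
#include <cstdio>

enum Depth { DEPTH_8U, DEPTH_8S, DEPTH_16U, DEPTH_16S, DEPTH_32S, DEPTH_32F, DEPTH_64F };

template <typename T>
void setImpl(void* data, size_t count, double value)
{
    T* p = static_cast<T*>(data);
    for (size_t i = 0; i < count; ++i)
        p[i] = static_cast<T>(value);
}

typedef void (*set_func_t)(void* data, size_t count, double value);

void setByDepth(void* data, size_t count, double value, int depth)
{
    static const set_func_t table[] =
    {
        setImpl<unsigned char>,   // CV_8U
        setImpl<signed char>,     // CV_8S
        setImpl<unsigned short>,  // CV_16U
        setImpl<short>,           // CV_16S
        setImpl<int>,             // CV_32S
        setImpl<float>,           // CV_32F
        setImpl<double>           // CV_64F
    };

    table[depth](data, count, value);   // same shape as callers[src.depth()](src, s, stream)
}

int main()
{
    float buf[4] = {0};
    setByDepth(buf, 4, 2.5, DEPTH_32F);
    std::printf("%f\n", buf[0]);        // prints 2.500000
    return 0;
}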
////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), 
static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; - } - - ////////////////////////////////////////////////////////////////////////// - // CudaFuncTable - - class CudaFuncTable : public GpuFuncTable - { - public: - void copy(const Mat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); - } - - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - if (src.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - /* 8U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , 
cv::gpu::copyWithMask } - }; - - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::copyWithMask; - - func(src, dst, mask, 0); - } - - void convert(const GpuMat& src, GpuMat& dst) const - { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst); - static const func_t funcs[7][7][4] = - { - { - /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 8S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 16U -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 16S -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16S -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16U */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 32S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 32F -> 8U */ {NppCvt::call, cv::gpu::convertTo, 
cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 64F -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 64F */ {0,0,0,0} - } - }; - - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - if (!aligned) - { - cv::gpu::convertTo(src, dst); - return; - } - - const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; - CV_DbgAssert(func != 0); - - func(src, dst); - } - - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - cv::gpu::convertTo(src, dst, alpha, beta); - } - - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const - { - if (mask.empty()) - { - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) - { - cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); - return; - } - - if (m.depth() == CV_8U) - { - int cn = m.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); - return; - } - } - - typedef void (*func_t)(GpuMat& src, Scalar s); - static const func_t funcs[7][4] = - { - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {NppSet::call, NppSet::call, NppSet::call, NppSet::call}, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if 
(m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s); - } - else - { - typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); - static const func_t funcs[7][4] = - { - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo }, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s, mask); - } - } - - void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const - { - cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); - } - - void free(void* devPtr) const - { - cudaFree(devPtr); - } - }; - - const GpuFuncTable* gpuFuncTable() - { - static CudaFuncTable funcTable; - return &funcTable; - } -} - -#endif // HAVE_CUDA - -void cv::gpu::GpuMat::upload(const Mat& m) -{ - CV_DbgAssert(!m.empty()); - - create(m.size(), m.type()); - - gpuFuncTable()->copy(m, *this); -} - -void cv::gpu::GpuMat::download(Mat& m) const -{ - CV_DbgAssert(!empty()); - - m.create(size(), type()); - - gpuFuncTable()->copy(*this, m); -} - -void cv::gpu::GpuMat::copyTo(GpuMat& m) const -{ - CV_DbgAssert(!empty()); - - m.create(size(), type()); - - gpuFuncTable()->copy(*this, m); -} - -void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const -{ - if (mask.empty()) - copyTo(mat); - else - { - mat.create(size(), type()); - - gpuFuncTable()->copyWithMask(*this, mat, mask); - } -} - -void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double beta) const -{ - bool noScale = fabs(alpha - 1) < std::numeric_limits::epsilon() && fabs(beta) < std::numeric_limits::epsilon(); - - if (rtype < 0) - rtype = type(); - else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); - - int sdepth = depth(); - int ddepth = CV_MAT_DEPTH(rtype); - if (sdepth == ddepth && noScale) - { - copyTo(dst); - return; - } - - GpuMat temp; - const GpuMat* psrc = this; - if (sdepth != ddepth && psrc == &dst) - { - temp = *this; - psrc = &temp; - } - - dst.create(size(), rtype); - - if (noScale) - gpuFuncTable()->convert(*psrc, dst); - else - gpuFuncTable()->convert(*psrc, dst, alpha, beta); -} - -GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) -{ - CV_Assert(mask.empty() || mask.type() == CV_8UC1); - CV_DbgAssert(!empty()); - - gpuFuncTable()->setTo(*this, s, mask); - - return *this; -} - -void cv::gpu::GpuMat::create(int _rows, int _cols, int _type) -{ - _type &= Mat::TYPE_MASK; - - if (rows == _rows && cols == _cols && type() == _type && data) - return; - - if (data) - release(); - - CV_DbgAssert(_rows >= 0 && _cols >= 0); - - if (_rows > 0 && _cols > 0) - { - flags = Mat::MAGIC_VAL + _type; - rows = _rows; - cols = _cols; - - size_t esz = elemSize(); - - void* devPtr; - gpuFuncTable()->mallocPitch(&devPtr, &step, esz * 
cols, rows); - - // Single row must be continuous - if (rows == 1) - step = esz * cols; - - if (esz * cols == step) - flags |= Mat::CONTINUOUS_FLAG; - - int64 _nettosize = static_cast(step) * rows; - size_t nettosize = static_cast(_nettosize); - - datastart = data = static_cast(devPtr); - dataend = data + nettosize; - - refcount = static_cast(fastMalloc(sizeof(*refcount))); - *refcount = 1; - } -} - -void cv::gpu::GpuMat::release() -{ - if (refcount && CV_XADD(refcount, -1) == 1) - { - fastFree(refcount); - - gpuFuncTable()->free(datastart); - } - - data = datastart = dataend = 0; - step = rows = cols = 0; - refcount = 0; -} - //////////////////////////////////////////////////////////////////////// // Error handling diff --git a/modules/core/src/gpu_mat.cpp b/modules/core/src/gpu_mat.cpp new file mode 100644 index 0000000000..144828b640 --- /dev/null +++ b/modules/core/src/gpu_mat.cpp @@ -0,0 +1,993 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::gpu; + +/////////////////////////// matrix operations ///////////////////////// + +#ifdef HAVE_CUDA + +// CUDA implementation + +#include "cuda/matrix_operations.hpp" + +namespace +{ + template void cudaSet_(GpuMat& src, Scalar s, cudaStream_t stream) + { + Scalar_ sf = s; + cudev::set(PtrStepSz(src), sf.val, src.channels(), stream); + } + + void cudaSet(GpuMat& src, Scalar s, cudaStream_t stream) + { + typedef void (*func_t)(GpuMat& src, Scalar s, cudaStream_t stream); + static const func_t funcs[] = + { + cudaSet_, + cudaSet_, + cudaSet_, + cudaSet_, + cudaSet_, + cudaSet_, + cudaSet_ + }; + + funcs[src.depth()](src, s, stream); + } + + template void cudaSet_(GpuMat& src, Scalar s, PtrStepSzb mask, cudaStream_t stream) + { + Scalar_ sf = s; + cudev::set(PtrStepSz(src), sf.val, mask, src.channels(), stream); + } + + void cudaSet(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + typedef void (*func_t)(GpuMat& src, Scalar s, PtrStepSzb mask, cudaStream_t stream); + static const func_t funcs[] = + { + cudaSet_, + cudaSet_, + cudaSet_, + cudaSet_, + cudaSet_, + cudaSet_, + cudaSet_ + }; + + funcs[src.depth()](src, s, mask, stream); + } + + void cudaCopyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) + { + cudev::copyWithMask(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); + } + + void cudaConvert(const GpuMat& src, GpuMat& dst, cudaStream_t stream) + { + cudev::convert(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, stream); + } + + void cudaConvert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) + { + cudev::convert(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); + } +} + +// NPP implementation + +namespace +{ + ////////////////////////////////////////////////////////////////////////// + // Convert + + template struct NppConvertFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppConvertFunc + { + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); + }; + + template::func_ptr func> struct NppCvt + { + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + NppStreamHandler h(stream); + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppCvt + { + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + NppStreamHandler h(stream); + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + ////////////////////////////////////////////////////////////////////////// + // Set + + template struct 
NppSetFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template struct NppSetFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template<> struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + + template::func_ptr func> struct NppSet + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, cudaStream_t stream) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + NppStreamHandler h(stream); + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppSet + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, cudaStream_t stream) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + NppStreamHandler h(stream); + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + template struct NppSetMaskFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + template struct NppSetMaskFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + + template::func_ptr func> struct NppSetMask + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + NppStreamHandler h(stream); + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppSetMask + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + NppStreamHandler h(stream); + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + ////////////////////////////////////////////////////////////////////////// + // CopyMasked + + template struct NppCopyWithMaskFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + + template::func_ptr func> struct NppCopyWithMask + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + NppStreamHandler h(stream); + + nppSafeCall( 
func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+// Dispatcher
+
+namespace cv { namespace gpu
+{
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0);
+    void convert(const GpuMat& src, GpuMat& dst, cudaStream_t stream = 0);
+    void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0);
+    void set(GpuMat& m, Scalar s, cudaStream_t stream = 0);
+    void set(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream = 0);
+}}
+
+namespace cv { namespace gpu
+{
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+    {
+        CV_DbgAssert( src.size() == dst.size() && src.type() == dst.type() );
+
+        CV_Assert( src.depth() <= CV_64F && src.channels() <= 4 );
+        CV_Assert( src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()) );
+
+        if (src.depth() == CV_64F)
+        {
+            CV_Assert( deviceSupports(NATIVE_DOUBLE) );
+        }
+
+        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
+        static const func_t funcs[7][4] =
+        {
+            /* 8U */ {NppCopyWithMask<CV_8U , nppiCopy_8u_C1MR >::call, cudaCopyWithMask, NppCopyWithMask<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyWithMask<CV_8U , nppiCopy_8u_C4MR >::call},
+            /* 8S */ {cudaCopyWithMask                                , cudaCopyWithMask, cudaCopyWithMask                                , cudaCopyWithMask                                },
+            /* 16U */ {NppCopyWithMask<CV_16U, nppiCopy_16u_C1MR>::call, cudaCopyWithMask, NppCopyWithMask<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyWithMask<CV_16U, nppiCopy_16u_C4MR>::call},
+            /* 16S */ {NppCopyWithMask<CV_16S, nppiCopy_16s_C1MR>::call, cudaCopyWithMask, NppCopyWithMask<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyWithMask<CV_16S, nppiCopy_16s_C4MR>::call},
+            /* 32S */ {NppCopyWithMask<CV_32S, nppiCopy_32s_C1MR>::call, cudaCopyWithMask, NppCopyWithMask<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyWithMask<CV_32S, nppiCopy_32s_C4MR>::call},
+            /* 32F */ {NppCopyWithMask<CV_32F, nppiCopy_32f_C1MR>::call, cudaCopyWithMask, NppCopyWithMask<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyWithMask<CV_32F, nppiCopy_32f_C4MR>::call},
+            /* 64F */ {cudaCopyWithMask                                , cudaCopyWithMask, cudaCopyWithMask                                , cudaCopyWithMask                                }
+        };
+
+        const func_t func = mask.channels() == src.channels() ?
funcs[src.depth()][src.channels() - 1] : cudaCopyWithMask; + + func(src, dst, mask, stream); + } + + void convert(const GpuMat& src, GpuMat& dst, cudaStream_t stream) + { + CV_DbgAssert( src.size() == dst.size() && src.channels() == dst.channels() ); + + CV_Assert( src.depth() <= CV_64F && src.channels() <= 4 ); + CV_Assert( dst.depth() <= CV_64F ); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + CV_Assert( deviceSupports(NATIVE_DOUBLE) ); + } + + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[7][7][4] = + { + { + /* 8U -> 8U */ {0, 0, 0, 0}, + /* 8U -> 8S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, + /* 8U -> 16U */ {NppCvt::call, cudaConvert, cudaConvert, NppCvt::call}, + /* 8U -> 16S */ {NppCvt::call, cudaConvert, cudaConvert, NppCvt::call}, + /* 8U -> 32S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, + /* 8U -> 32F */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, + /* 8U -> 64F */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert } + }, + { + /* 8S -> 8U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 8S -> 8S */ {0,0,0,0}, + /* 8S -> 16U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 8S -> 16S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 8S -> 32S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 8S -> 32F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 8S -> 64F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert} + }, + { + /* 16U -> 8U */ {NppCvt::call, cudaConvert, cudaConvert, NppCvt::call}, + /* 16U -> 8S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, + /* 16U -> 16U */ {0,0,0,0}, + /* 16U -> 16S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, + /* 16U -> 32S */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, + /* 16U -> 32F */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, + /* 16U -> 64F */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert } + }, + { + /* 16S -> 8U */ {NppCvt::call, cudaConvert, cudaConvert, NppCvt::call}, + /* 16S -> 8S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, + /* 16S -> 16U */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, + /* 16S -> 16S */ {0,0,0,0}, + /* 16S -> 32S */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, + /* 16S -> 32F */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, + /* 16S -> 64F */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert } + }, + { + /* 32S -> 8U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 32S -> 8S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 32S -> 16U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 32S -> 16S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 32S -> 32S */ {0,0,0,0}, + /* 32S -> 32F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 32S -> 64F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert} + }, + { + /* 32F -> 8U */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert}, + /* 32F -> 8S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert}, + /* 32F -> 16U */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert}, + /* 32F -> 16S */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert}, + /* 32F -> 32S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert}, + /* 32F -> 32F */ {0,0,0,0}, + /* 32F -> 64F */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert} + }, + { + /* 64F -> 8U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 64F -> 8S */ 
{cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 64F -> 16U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 64F -> 16S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 64F -> 32S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 64F -> 32F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, + /* 64F -> 64F */ {0,0,0,0} + } + }; + + const bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + if (!aligned) + { + cudaConvert(src, dst, stream); + return; + } + + const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; + CV_DbgAssert( func != 0 ); + + func(src, dst, stream); + } + + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) + { + CV_DbgAssert( src.size() == dst.size() && src.channels() == dst.channels() ); + + CV_Assert( src.depth() <= CV_64F && src.channels() <= 4 ); + CV_Assert( dst.depth() <= CV_64F ); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + CV_Assert( deviceSupports(NATIVE_DOUBLE) ); + } + + cudaConvert(src, dst, alpha, beta, stream); + } + + void set(GpuMat& m, Scalar s, cudaStream_t stream) + { + if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) + { + if (stream) + cudaSafeCall( cudaMemset2DAsync(m.data, m.step, 0, m.cols * m.elemSize(), m.rows, stream) ); + else + cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); + return; + } + + if (m.depth() == CV_8U) + { + int cn = m.channels(); + + if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) + { + int val = saturate_cast(s[0]); + if (stream) + cudaSafeCall( cudaMemset2DAsync(m.data, m.step, val, m.cols * m.elemSize(), m.rows, stream) ); + else + cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); + return; + } + } + + typedef void (*func_t)(GpuMat& src, Scalar s, cudaStream_t stream); + static const func_t funcs[7][4] = + { + {NppSet::call, cudaSet , cudaSet , NppSet::call}, + {NppSet::call, NppSet::call, NppSet::call, NppSet::call}, + {NppSet::call, NppSet::call, cudaSet , NppSet::call}, + {NppSet::call, NppSet::call, cudaSet , NppSet::call}, + {NppSet::call, cudaSet , cudaSet , NppSet::call}, + {NppSet::call, cudaSet , cudaSet , NppSet::call}, + {cudaSet , cudaSet , cudaSet , cudaSet } + }; + + CV_Assert( m.depth() <= CV_64F && m.channels() <= 4 ); + + if (m.depth() == CV_64F) + { + CV_Assert( deviceSupports(NATIVE_DOUBLE) ); + } + + funcs[m.depth()][m.channels() - 1](m, s, stream); + } + + void set(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + CV_DbgAssert( !mask.empty() ); + + CV_Assert( m.depth() <= CV_64F && m.channels() <= 4 ); + + if (m.depth() == CV_64F) + { + CV_Assert( deviceSupports(NATIVE_DOUBLE) ); + } + + typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + static const func_t funcs[7][4] = + { + {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, + {cudaSet , cudaSet, cudaSet, cudaSet }, + {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, + {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, + {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, + {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, + {cudaSet , cudaSet, cudaSet, cudaSet } + }; + + funcs[m.depth()][m.channels() - 1](m, s, mask, stream); + } +}} + +#endif // HAVE_CUDA + +cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) : + 
flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_), + step(step_), data((uchar*)data_), refcount(0), + datastart((uchar*)data_), dataend((uchar*)data_) +{ + size_t minstep = cols * elemSize(); + + if (step == Mat::AUTO_STEP) + { + step = minstep; + flags |= Mat::CONTINUOUS_FLAG; + } + else + { + if (rows == 1) + step = minstep; + + CV_DbgAssert( step >= minstep ); + + flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0; + } + + dataend += step * (rows - 1) + minstep; +} + +cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) : + flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(size_.height), cols(size_.width), + step(step_), data((uchar*)data_), refcount(0), + datastart((uchar*)data_), dataend((uchar*)data_) +{ + size_t minstep = cols * elemSize(); + + if (step == Mat::AUTO_STEP) + { + step = minstep; + flags |= Mat::CONTINUOUS_FLAG; + } + else + { + if (rows == 1) + step = minstep; + + CV_DbgAssert( step >= minstep ); + + flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0; + } + dataend += step * (rows - 1) + minstep; +} + +cv::gpu::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_) +{ + flags = m.flags; + step = m.step; refcount = m.refcount; + data = m.data; datastart = m.datastart; dataend = m.dataend; + + if (rowRange_ == Range::all()) + { + rows = m.rows; + } + else + { + CV_Assert( 0 <= rowRange_.start && rowRange_.start <= rowRange_.end && rowRange_.end <= m.rows ); + + rows = rowRange_.size(); + data += step*rowRange_.start; + } + + if (colRange_ == Range::all()) + { + cols = m.cols; + } + else + { + CV_Assert( 0 <= colRange_.start && colRange_.start <= colRange_.end && colRange_.end <= m.cols ); + + cols = colRange_.size(); + data += colRange_.start*elemSize(); + flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1; + } + + if (rows == 1) + flags |= Mat::CONTINUOUS_FLAG; + + if (refcount) + CV_XADD(refcount, 1); + + if (rows <= 0 || cols <= 0) + rows = cols = 0; +} + +cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) : + flags(m.flags), rows(roi.height), cols(roi.width), + step(m.step), data(m.data + roi.y*step), refcount(m.refcount), + datastart(m.datastart), dataend(m.dataend) +{ + flags &= roi.width < m.cols ? 
~Mat::CONTINUOUS_FLAG : -1;
+    data += roi.x * elemSize();
+
+    CV_Assert( 0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows );
+
+    if (refcount)
+        CV_XADD(refcount, 1);
+
+    if (rows <= 0 || cols <= 0)
+        rows = cols = 0;
+}
+
+void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
+{
+#ifndef HAVE_CUDA
+    (void) _rows;
+    (void) _cols;
+    (void) _type;
+    throw_no_cuda();
+#else
+    _type &= Mat::TYPE_MASK;
+
+    if (rows == _rows && cols == _cols && type() == _type && data)
+        return;
+
+    if (data)
+        release();
+
+    CV_DbgAssert( _rows >= 0 && _cols >= 0 );
+
+    if (_rows > 0 && _cols > 0)
+    {
+        flags = Mat::MAGIC_VAL + _type;
+        rows = _rows;
+        cols = _cols;
+
+        size_t esz = elemSize();
+
+        void* devPtr;
+        cudaSafeCall( cudaMallocPitch(&devPtr, &step, esz * cols, rows) );
+
+        // Single row must be continuous
+        if (rows == 1)
+            step = esz * cols;
+
+        if (esz * cols == step)
+            flags |= Mat::CONTINUOUS_FLAG;
+
+        int64 _nettosize = static_cast<int64>(step) * rows;
+        size_t nettosize = static_cast<size_t>(_nettosize);
+
+        datastart = data = static_cast<uchar*>(devPtr);
+        dataend = data + nettosize;
+
+        refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));
+        *refcount = 1;
+    }
+#endif
+}
+
+void cv::gpu::GpuMat::release()
+{
+#ifdef HAVE_CUDA
+    if (refcount && CV_XADD(refcount, -1) == 1)
+    {
+        cudaFree(datastart);
+        fastFree(refcount);
+    }
+
+    data = datastart = dataend = 0;
+    step = rows = cols = 0;
+    refcount = 0;
+#endif
+}
+
+void cv::gpu::GpuMat::upload(const Mat& m)
+{
+#ifndef HAVE_CUDA
+    (void) m;
+    throw_no_cuda();
+#else
+    CV_DbgAssert( !m.empty() );
+
+    create(m.size(), m.type());
+
+    cudaSafeCall( cudaMemcpy2D(data, step, m.data, m.step, cols * elemSize(), rows, cudaMemcpyHostToDevice) );
+#endif
+}
+
+void cv::gpu::GpuMat::download(Mat& m) const
+{
+#ifndef HAVE_CUDA
+    (void) m;
+    throw_no_cuda();
+#else
+    CV_DbgAssert( !empty() );
+
+    m.create(size(), type());
+
+    cudaSafeCall( cudaMemcpy2D(m.data, m.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost) );
+#endif
+}
+
+void cv::gpu::GpuMat::copyTo(GpuMat& m) const
+{
+#ifndef HAVE_CUDA
+    (void) m;
+    throw_no_cuda();
+#else
+    CV_DbgAssert( !empty() );
+
+    m.create(size(), type());
+
+    cudaSafeCall( cudaMemcpy2D(m.data, m.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) );
+#endif
+}
+
+void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
+{
+#ifndef HAVE_CUDA
+    (void) mat;
+    (void) mask;
+    throw_no_cuda();
+#else
+    CV_DbgAssert( !empty() );
+
+    if (mask.empty())
+    {
+        copyTo(mat);
+    }
+    else
+    {
+        mat.create(size(), type());
+
+        copyWithMask(*this, mat, mask);
+    }
+#endif
+}
+
+GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask)
+{
+#ifndef HAVE_CUDA
+    (void) s;
+    (void) mask;
+    throw_no_cuda();
+    return *this;
+#else
+    CV_DbgAssert( !empty() );
+
+    if (mask.empty())
+        set(*this, s);
+    else
+        set(*this, s, mask);
+
+    return *this;
+#endif
+}
+
+void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double beta) const
+{
+#ifndef HAVE_CUDA
+    (void) dst;
+    (void) rtype;
+    (void) alpha;
+    (void) beta;
+    throw_no_cuda();
+#else
+    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
+
+    if (rtype < 0)
+        rtype = type();
+    else
+        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
+
+    int sdepth = depth();
+    int ddepth = CV_MAT_DEPTH(rtype);
+    if (sdepth == ddepth && noScale)
+    {
+        copyTo(dst);
+        return;
+    }
+
+    GpuMat temp;
+    const GpuMat* psrc = this;
+    if (sdepth != ddepth && psrc == &dst)
+    {
+        temp = *this;
+        psrc = &temp;
+    }
+
+    dst.create(size(), rtype);
+
+    if (noScale)
+        convert(*psrc, dst);
+    else
+        convert(*psrc, dst, alpha, beta);
+#endif
+}
+
+GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const
+{
+    GpuMat hdr = *this;
+
+    int cn = channels();
+    if (new_cn == 0)
+        new_cn = cn;
+
+    int total_width = cols * cn;
+
+    if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
+        new_rows = rows * total_width / new_cn;
+
+    if (new_rows != 0 && new_rows != rows)
+    {
+        int total_size = total_width * rows;
+
+        if (!isContinuous())
+            CV_Error(cv::Error::BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
+
+        if ((unsigned)new_rows > (unsigned)total_size)
+            CV_Error(cv::Error::StsOutOfRange, "Bad new number of rows");
+
+        total_width = total_size / new_rows;
+
+        if (total_width * new_rows != total_size)
+            CV_Error(cv::Error::StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
+
+        hdr.rows = new_rows;
+        hdr.step = total_width * elemSize1();
+    }
+
+    int new_width = total_width / new_cn;
+
+    if (new_width * new_cn != total_width)
+        CV_Error(cv::Error::BadNumChannels, "The total width is not divisible by the new number of channels");
+
+    hdr.cols = new_width;
+    hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
+
+    return hdr;
+}
+
+void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const
+{
+    CV_DbgAssert( step > 0 );
+
+    size_t esz = elemSize();
+    ptrdiff_t delta1 = data - datastart;
+    ptrdiff_t delta2 = dataend - datastart;
+
+    if (delta1 == 0)
+    {
+        ofs.x = ofs.y = 0;
+    }
+    else
+    {
+        ofs.y = static_cast<int>(delta1 / step);
+        ofs.x = static_cast<int>((delta1 - step * ofs.y) / esz);
+
+        CV_DbgAssert( data == datastart + ofs.y * step + ofs.x * esz );
+    }
+
+    size_t minstep = (ofs.x + cols) * esz;
+
+    wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / step + 1), ofs.y + rows);
+    wholeSize.width = std::max(static_cast<int>((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols);
+}
+
+GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright)
+{
+    Size wholeSize;
+    Point ofs;
+    locateROI(wholeSize, ofs);
+
+    size_t esz = elemSize();
+
+    int row1 = std::max(ofs.y - dtop, 0);
+    int row2 = std::min(ofs.y + rows + dbottom, wholeSize.height);
+
+    int col1 = std::max(ofs.x - dleft, 0);
+    int col2 = std::min(ofs.x + cols + dright, wholeSize.width);
+
+    data += (row1 - ofs.y) * step + (col1 - ofs.x) * esz;
+    rows = row2 - row1;
+    cols = col2 - col1;
+
+    if (esz * cols == step || rows == 1)
+        flags |= Mat::CONTINUOUS_FLAG;
+    else
+        flags &= ~Mat::CONTINUOUS_FLAG;
+
+    return *this;
+}
+
+void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m)
+{
+    const int area = rows * cols;
+
+    if (m.empty() || m.type() != type || !m.isContinuous() || m.size().area() < area)
+        m.create(1, area, type);
+
+    m.cols = cols;
+    m.rows = rows;
+    m.step = m.elemSize() * cols;
+    m.flags |= Mat::CONTINUOUS_FLAG;
+}
+
+void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
+{
+    if (m.empty() || m.type() != type || m.data != m.datastart)
+    {
+        m.create(rows, cols, type);
+    }
+    else
+    {
+        const size_t esz = m.elemSize();
+        const ptrdiff_t delta2 = m.dataend - m.datastart;
+
+        const size_t minstep = m.cols * esz;
+
+        Size wholeSize;
+        wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / m.step + 1), m.rows);
+        wholeSize.width = std::max(static_cast<int>((delta2 - m.step *
(wholeSize.height - 1)) / esz), m.cols); + + if (wholeSize.height < rows || wholeSize.width < cols) + { + m.create(rows, cols, type); + } + else + { + m.cols = cols; + m.rows = rows; + } + } +} + +GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat& mat) +{ + if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols) + return mat(Rect(0, 0, cols, rows)); + + return mat = GpuMat(rows, cols, type); +} diff --git a/modules/core/src/gpu_stream.cpp b/modules/core/src/gpu_stream.cpp index 346204dd5e..cebaaa3650 100644 --- a/modules/core/src/gpu_stream.cpp +++ b/modules/core/src/gpu_stream.cpp @@ -72,10 +72,10 @@ void cv::gpu::Stream::release() { throw_no_cuda(); } namespace cv { namespace gpu { - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream); - void setTo(GpuMat& src, Scalar s, cudaStream_t stream); - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0); + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0); + void set(GpuMat& m, Scalar s, cudaStream_t stream = 0); + void set(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream = 0); }} struct Stream::Impl @@ -217,7 +217,7 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val) } } - setTo(src, val, stream); + set(src, val, stream); } void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask) @@ -234,7 +234,7 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask) cudaStream_t stream = Impl::getStream(impl); - setTo(src, val, mask, stream); + set(src, val, mask, stream); } void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double alpha, double beta) @@ -265,7 +265,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, dst.create(src.size(), dtype); cudaStream_t stream = Impl::getStream(impl); - convertTo(src, dst, alpha, beta, stream); + convert(src, dst, alpha, beta, stream); } #if CUDART_VERSION >= 5000
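Usage note (illustrative only, not part of the patch): the sketch below shows how the refactored GpuMat API touched by this change is typically driven from host code. It assumes OpenCV is built with HAVE_CUDA and a CUDA-capable device is present; the variable names (host, d_src, d_dst, d_buf) are placeholders.

    #include <opencv2/core/gpu.hpp>

    int main()
    {
        // Host-side image (placeholder data).
        cv::Mat host = cv::Mat::ones(480, 640, CV_8UC1);

        // Blocking host -> device upload (allocates via GpuMat::create).
        cv::gpu::GpuMat d_src;
        d_src.upload(host);

        // Depth conversion with scaling; goes through the convert() dispatcher.
        cv::gpu::GpuMat d_dst;
        d_src.convertTo(d_dst, CV_32F, 1.0 / 255.0);

        // Fill with a constant; goes through the set() dispatcher.
        d_dst.setTo(cv::Scalar::all(0.5));

        // Reuse an existing allocation when it is already large enough.
        cv::gpu::GpuMat d_buf;
        cv::gpu::ensureSizeIsEnough(480, 640, CV_32FC1, d_buf);

        // Blocking device -> host download.
        cv::Mat result;
        d_dst.download(result);

        return 0;
    }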