diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp
index d0fba59641..469314d170 100644
--- a/modules/gpu/src/cuda/cuda_shared.hpp
+++ b/modules/gpu/src/cuda/cuda_shared.hpp
@@ -51,6 +51,7 @@ namespace cv
     namespace gpu
     {
         typedef unsigned char uchar;
+        typedef signed char schar;
         typedef unsigned short ushort;
        typedef unsigned int uint;
@@ -62,6 +63,8 @@ namespace cv
             extern "C" void set_to_without_mask (const DevMem2D& mat, const double * scalar, int depth, int channels);
             extern "C" void set_to_with_mask    (const DevMem2D& mat, const double * scalar, const DevMem2D& mask, int depth, int channels);
+
+            extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta);
         }
     }
 }
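The calcGrid helpers introduced in the next file lean on a divUp rounding helper that already lives in cuda_shared.hpp and is not shown in this patch. A minimal sketch of the assumed definition:

    // Assumed shape of the divUp helper used by the calcGrid methods below;
    // the real definition lives in cuda_shared.hpp, outside this patch.
    static inline size_t divUp(size_t total, size_t grain)
    {
        return (total + grain - 1) / grain; // ceiling division: divUp(67, 32) == 3
    }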
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index ce5c6cd2f2..6f9d0c49ef 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -46,10 +46,17 @@
 #include "cuda_shared.hpp"
 #include "cuda_runtime.h"
 
+using namespace cv::gpu;
+using namespace cv::gpu::impl;
+
 __constant__ __align__(16) float scalar_d[4];
 
 namespace mat_operators
 {
+    //////////////////////////////////////////////////////////
+    // SetTo
+    //////////////////////////////////////////////////////////
+
     template <typename T>
     __global__ void kernel_set_to_without_mask(T * mat, int cols, int rows, int step)
     {
@@ -76,7 +83,245 @@ namespace mat_operators
             mat[idx] = scalar_d[ x % channels ];
         }
     }
-}
+
+    //////////////////////////////////////////////////////////
+    // ConvertTo
+    //////////////////////////////////////////////////////////
+
+    // Generic case: one element per thread, rounded to the nearest integer.
+    template <typename T, typename DT, size_t src_elem_size, size_t dst_elem_size>
+    struct Converter
+    {
+        __device__ static void convert(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
+        {
+            size_t x = threadIdx.x + blockIdx.x * blockDim.x;
+            size_t y = threadIdx.y + blockIdx.y * blockDim.y;
+            if (x < width && y < height)
+            {
+                const T* src = (const T*)(srcmat + src_step * y);
+                DT* dst = (DT*)(dstmat + dst_step * y);
+
+                dst[x] = (DT)__double2int_rn(alpha * src[x] + beta);
+            }
+        }
+        __host__ static inline dim3 calcGrid(size_t width, size_t height, dim3 block)
+        {
+            return dim3(divUp(width, block.x), divUp(height, block.y));
+        }
+    };
+
+    // 1-byte source, 1-byte destination: four elements per thread via uchar4.
+    template <typename T, typename DT>
+    struct Converter<T, DT, 1, 1>
+    {
+        __device__ static void convert(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
+        {
+            size_t x = threadIdx.x + blockIdx.x * blockDim.x;
+            size_t y = threadIdx.y + blockIdx.y * blockDim.y;
+            if (y < height)
+            {
+                const T* src = (const T*)(srcmat + src_step * y);
+                DT* dst = (DT*)(dstmat + dst_step * y);
+                if ((x << 2) + 3 < width)
+                {
+                    uchar4 src4b = ((const uchar4*)src)[x];
+                    uchar4 dst4b;
+
+                    const T* src1b = (const T*) &src4b.x;
+                    DT* dst1b = (DT*) &dst4b.x;
+
+                    dst1b[0] = (DT)__double2int_rn(alpha * src1b[0] + beta);
+                    dst1b[1] = (DT)__double2int_rn(alpha * src1b[1] + beta);
+                    dst1b[2] = (DT)__double2int_rn(alpha * src1b[2] + beta);
+                    dst1b[3] = (DT)__double2int_rn(alpha * src1b[3] + beta);
+
+                    ((uchar4*)dst)[x] = dst4b;
+                }
+                else
+                {
+                    if ((x << 2) + 0 < width)
+                        dst[(x << 2) + 0] = (DT)__double2int_rn(alpha * src[(x << 2) + 0] + beta);
+
+                    if ((x << 2) + 1 < width)
+                        dst[(x << 2) + 1] = (DT)__double2int_rn(alpha * src[(x << 2) + 1] + beta);
+
+                    if ((x << 2) + 2 < width)
+                        dst[(x << 2) + 2] = (DT)__double2int_rn(alpha * src[(x << 2) + 2] + beta);
+                }
+            }
+        }
+        __host__ static inline dim3 calcGrid(size_t width, size_t height, dim3 block)
+        {
+            return dim3(divUp(width, block.x << 2), divUp(height, block.y));
+        }
+    };
+
+    // 1-byte source, 2-byte destination: two elements per thread.
+    template <typename T, typename DT>
+    struct Converter<T, DT, 1, 2>
+    {
+        __device__ static void convert(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
+        {
+            size_t x = threadIdx.x + blockIdx.x * blockDim.x;
+            size_t y = threadIdx.y + blockIdx.y * blockDim.y;
+            if (y < height)
+            {
+                const T* src = (const T*)(srcmat + src_step * y);
+                DT* dst = (DT*)(dstmat + dst_step * y);
+                if ((x << 1) + 1 < width)
+                {
+                    uchar2 src2b = ((const uchar2*)src)[x];
+                    ushort2 dst2s;
+
+                    const T* src1b = (const T*) &src2b;
+                    DT* dst1s = (DT*) &dst2s;
+                    dst1s[0] = (DT)__double2int_rn(alpha * src1b[0] + beta);
+                    dst1s[1] = (DT)__double2int_rn(alpha * src1b[1] + beta);
+
+                    ((ushort2*)dst)[x] = dst2s;
+                }
+                else
+                {
+                    if ((x << 1) < width)
+                        dst[(x << 1)] = (DT)__double2int_rn(alpha * src[(x << 1)] + beta);
+                }
+            }
+        }
+        __host__ static inline dim3 calcGrid(size_t width, size_t height, dim3 block)
+        {
+            return dim3(divUp(width, block.x << 1), divUp(height, block.y));
+        }
+    };
+
+    // 2-byte source, 1-byte destination: four elements per thread.
+    template <typename T, typename DT>
+    struct Converter<T, DT, 2, 1>
+    {
+        __device__ static void convert(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
+        {
+            size_t x = threadIdx.x + blockIdx.x * blockDim.x;
+            size_t y = threadIdx.y + blockIdx.y * blockDim.y;
+            if (y < height)
+            {
+                const T* src = (const T*)(srcmat + src_step * y);
+                DT* dst = (DT*)(dstmat + dst_step * y);
+                if ((x << 2) + 3 < width)
+                {
+                    ushort4 src4s = ((const ushort4*)src)[x];
+                    uchar4 dst4b;
+
+                    const T* src1s = (const T*) &src4s.x;
+                    DT* dst1b = (DT*) &dst4b.x;
+                    dst1b[0] = (DT)__double2int_rn(alpha * src1s[0] + beta);
+                    dst1b[1] = (DT)__double2int_rn(alpha * src1s[1] + beta);
+                    dst1b[2] = (DT)__double2int_rn(alpha * src1s[2] + beta);
+                    dst1b[3] = (DT)__double2int_rn(alpha * src1s[3] + beta);
+
+                    ((uchar4*)dst)[x] = dst4b;
+                }
+                else
+                {
+                    if ((x << 2) + 0 < width)
+                        dst[(x << 2) + 0] = (DT)__double2int_rn(alpha * src[(x << 2) + 0] + beta);
+                    if ((x << 2) + 1 < width)
+                        dst[(x << 2) + 1] = (DT)__double2int_rn(alpha * src[(x << 2) + 1] + beta);
+                    if ((x << 2) + 2 < width)
+                        dst[(x << 2) + 2] = (DT)__double2int_rn(alpha * src[(x << 2) + 2] + beta);
+                }
+            }
+        }
+        __host__ static inline dim3 calcGrid(size_t width, size_t height, dim3 block)
+        {
+            return dim3(divUp(width, block.x << 2), divUp(height, block.y));
+        }
+    };
+
+    // 2-byte source, 2-byte destination: two elements per thread.
+    template <typename T, typename DT>
+    struct Converter<T, DT, 2, 2>
+    {
+        __device__ static void convert(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
+        {
+            size_t x = threadIdx.x + blockIdx.x * blockDim.x;
+            size_t y = threadIdx.y + blockIdx.y * blockDim.y;
+            if (y < height)
+            {
+                const T* src = (const T*)(srcmat + src_step * y);
+                DT* dst = (DT*)(dstmat + dst_step * y);
+                if ((x << 1) + 1 < width)
+                {
+                    ushort2 src2s = ((const ushort2*)src)[x];
+                    ushort2 dst2s;
+
+                    const T* src1s = (const T*) &src2s.x;
+                    DT* dst1s = (DT*) &dst2s.x;
+                    dst1s[0] = (DT)__double2int_rn(alpha * src1s[0] + beta);
+                    dst1s[1] = (DT)__double2int_rn(alpha * src1s[1] + beta);
+
+                    ((ushort2*)dst)[x] = dst2s;
+                }
+                else
+                {
+                    if ((x << 1) < width)
+                        dst[(x << 1)] = (DT)__double2int_rn(alpha * src[(x << 1)] + beta);
+                }
+            }
+        }
+        __host__ static inline dim3 calcGrid(size_t width, size_t height, dim3 block)
+        {
+            return dim3(divUp(width, block.x << 1), divUp(height, block.y));
+        }
+    };
+
+    // float destination: store the scaled value directly, no rounding.
+    template <typename T, size_t src_elem_size>
+    struct Converter<T, float, src_elem_size, 4>
+    {
+        __device__ static void convert(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
+        {
+            size_t x = threadIdx.x + blockIdx.x * blockDim.x;
+            size_t y = threadIdx.y + blockIdx.y * blockDim.y;
+            if (x < width && y < height)
+            {
+                const T* src = (const T*)(srcmat + src_step * y);
+                float* dst = (float*)(dstmat + dst_step * y);
+
+                dst[x] = (float)(alpha * src[x] + beta);
+            }
+        }
+        __host__ static inline dim3 calcGrid(size_t width, size_t height, dim3 block)
+        {
+            return dim3(divUp(width, block.x), divUp(height, block.y));
+        }
+    };
+
+    // double destination: store the scaled value directly, no rounding.
+    template <typename T, size_t src_elem_size>
+    struct Converter<T, double, src_elem_size, 8>
+    {
+        __device__ static void convert(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
+        {
+            size_t x = threadIdx.x + blockIdx.x * blockDim.x;
+            size_t y = threadIdx.y + blockIdx.y * blockDim.y;
+            if (x < width && y < height)
+            {
+                const T* src = (const T*)(srcmat + src_step * y);
+                double* dst = (double*)(dstmat + dst_step * y);
+
+                dst[x] = (double)(alpha * src[x] + beta);
+            }
+        }
+        __host__ static inline dim3 calcGrid(size_t width, size_t height, dim3 block)
+        {
+            return dim3(divUp(width, block.x), divUp(height, block.y));
+        }
+    };
+
+    // Dispatch on element sizes so the vectorized specializations are picked up.
+    template <typename T, typename DT>
+    __global__ static void kernel_convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
+    {
+        Converter<T, DT, sizeof(T), sizeof(DT)>::convert(srcmat, src_step, dstmat, dst_step, width, height, alpha, beta);
+    }
+
+} // namespace mat_operators
+
+//////////////////////////////////////////////////////////////
+// SetTo
+//////////////////////////////////////////////////////////////
 
 extern "C" void cv::gpu::impl::set_to_without_mask(const DevMem2D& mat, const double * scalar, int elemSize1, int channels)
 {
@@ -158,3 +403,66 @@ extern "C" void cv::gpu::impl::set_to_with_mask(const DevMem2D& mat, const doubl
 
     cudaSafeCall ( cudaThreadSynchronize() );
 }
+
+//////////////////////////////////////////////////////////////
+// ConvertTo
+//////////////////////////////////////////////////////////////
+
+namespace cv
+{
+    namespace gpu
+    {
+        namespace impl
+        {
+            typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta);
+
+            //#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 130)
+            template <typename T, typename DT>
+            void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta)
+            {
+                dim3 block(32, 8);
+                dim3 grid = ::mat_operators::Converter<T, DT, sizeof(T), sizeof(DT)>::calcGrid(width, height, block);
+                ::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
+                cudaSafeCall( cudaThreadSynchronize() );
+            }
+            //#endif
+
+            extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta)
+            {
+                // Indexed by [source depth][destination depth]; CV_USRTYPE1 slots are 0.
+                static CvtFunc tab[8][8] =
+                {
+                    {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
+                    cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
+
+                    {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
+                    cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
+
+                    {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
+                    cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
+
+                    {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
+                    cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
+
+                    {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
+                    cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
+
+                    {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
+                    cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
+
+                    {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
+                    cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
+
+                    {0, 0, 0, 0, 0, 0, 0, 0}
+                };
+
+                CvtFunc func = tab[sdepth][ddepth];
+                if (func == 0)
+                    error("Operation 'ConvertTo' is not supported on your GPU model", __FILE__, __LINE__);
+                func(src, dst, width, height, alpha, beta);
+            }
+        }
+    }
+}
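The integer-destination kernels above round with __double2int_rn and apply plain C casts, with no saturate_cast, while the float and double specializations store the scaled value directly. The per-pixel arithmetic can be mirrored on the host; this is a hypothetical helper for illustration only, not part of the patch:

    #include <cmath>
    #include <limits>

    // Approximates the kernels' behavior on the host: integer destinations are
    // rounded to nearest (__double2int_rn rounds halves to even), floating-point
    // destinations are not rounded at all.
    template <typename T, typename DT>
    DT convert_pixel(T src, double alpha, double beta)
    {
        double v = alpha * src + beta;
        if (std::numeric_limits<DT>::is_integer)
            return (DT)std::floor(v + 0.5); // nearest, ties away from zero
        return (DT)v;
    }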
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index fe78eda9c6..867efdacd3 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -104,9 +104,31 @@ void cv::gpu::GpuMat::copyTo( GpuMat& /*m*/, const GpuMat&/* mask */) const
     CV_Assert(!"Not implemented");
 }
 
-void cv::gpu::GpuMat::convertTo( GpuMat& /*m*/, int /*rtype*/, double /*alpha*/, double /*beta*/ ) const
+void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double beta ) const
 {
-    CV_Assert(!"Not implemented");
+    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
+                && fabs(beta) < std::numeric_limits<double>::epsilon();
+
+    if( rtype < 0 )
+        rtype = type();
+    else
+        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
+
+    int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
+    /*if( sdepth == ddepth && noScale )
+    {
+        copyTo(dst);
+        return;
+    }*/
+
+    // Convert through a temporary when converting in place across depths.
+    GpuMat temp;
+    const GpuMat* psrc = this;
+    if( sdepth != ddepth && psrc == &dst )
+        psrc = &(temp = *this);
+
+    dst.create( size(), rtype );
+    impl::convert_to(*psrc, sdepth, dst, ddepth, psrc->cols * psrc->channels(), psrc->rows, alpha, beta);
 }
 
 GpuMat& GpuMat::operator = (const Scalar& s)
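Since convertTo passes cols * channels() as the width, channels are folded into the row length and the dispatch table only needs source and destination depths, not full types. A minimal usage sketch of the new entry point, assuming a CUDA-capable device and the usual GpuMat::download for readback:

    #include "opencv2/gpu/gpu.hpp"

    int main()
    {
        cv::Mat host(480, 640, CV_8UC1, cv::Scalar(127));

        cv::gpu::GpuMat dsrc(host), ddst;
        dsrc.convertTo(ddst, CV_32F, 1.0 / 255.0); // scale 8-bit values into [0, 1]

        cv::Mat result;
        ddst.download(result); // copy the converted matrix back to the host
        return 0;
    }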
diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp
index a632af0acf..1caa9dbcb6 100644
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -51,6 +51,7 @@
 #endif
 
 #include <vector>
+#include <limits>
 
 #include "opencv2/gpu/gpu.hpp"
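The <limits> include supports the epsilon-based noScale test in convertTo, which avoids exact floating-point comparison of alpha and beta. The check is equivalent to this standalone sketch:

    #include <cmath>
    #include <limits>

    // Mirror of the noScale test in GpuMat::convertTo: treat the transform
    // as an identity when alpha is (almost) 1 and beta is (almost) 0.
    bool isIdentityTransform(double alpha, double beta)
    {
        const double eps = std::numeric_limits<double>::epsilon();
        return std::fabs(alpha - 1.0) < eps && std::fabs(beta) < eps;
    }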
diff --git a/tests/gpu/src/convert_to.cpp b/tests/gpu/src/convert_to.cpp
new file mode 100644
index 0000000000..c53fbc455a
--- /dev/null
+++ b/tests/gpu/src/convert_to.cpp
@@ -0,0 +1,89 @@
+#include <iostream>
+#include <cstdlib>
+
+#include "gputest.hpp"
+
+using namespace cv;
+using namespace std;
+using namespace gpu;
+
+class CV_GpuMatOpConvertTo : public CvTest
+{
+public:
+    CV_GpuMatOpConvertTo();
+    ~CV_GpuMatOpConvertTo();
+protected:
+    void run(int);
+};
+
+CV_GpuMatOpConvertTo::CV_GpuMatOpConvertTo() : CvTest( "GpuMatOperatorConvertTo", "convertTo" ) {}
+CV_GpuMatOpConvertTo::~CV_GpuMatOpConvertTo() {}
+
+void CV_GpuMatOpConvertTo::run( int /* start_from */)
+{
+    const Size img_size(67, 35);
+
+    const int types[] = {CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F};
+    const int types_num = sizeof(types) / sizeof(int);
+
+    const char* types_str[] = {"CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F"};
+
+    bool passed = true;
+
+    // Try every (source depth, destination depth) pair with random scale factors.
+    for (int i = 0; i < types_num && passed; ++i)
+    {
+        for (int j = 0; j < types_num && passed; ++j)
+        {
+            for (int c = 1; c < 2 && passed; ++c)
+            {
+                const int src_type = CV_MAKETYPE(types[i], c);
+                const int dst_type = types[j];
+                const double alpha = (double)rand() / RAND_MAX * 10.0;
+                const double beta  = (double)rand() / RAND_MAX * 10.0;
+
+                Mat cpumatsrc(img_size, src_type);
+                randu(cpumatsrc, Scalar::all(0), Scalar::all(10));
+                GpuMat gpumatsrc(cpumatsrc);
+                Mat cpumatdst;
+                GpuMat gpumatdst;
+
+                cpumatsrc.convertTo(cpumatdst, dst_type, alpha, beta);
+                gpumatsrc.convertTo(gpumatdst, dst_type, alpha, beta);
+
+                // Compare CPU and GPU results, allowing tiny rounding differences.
+                double r = norm(cpumatdst, gpumatdst, NORM_L1);
+                if (r > 1)
+                {
+                    cout << "Failed:" << endl;
+                    cout << "\tr = " << r << endl;
+                    cout << "\tSRC_TYPE=" << types_str[i] << "C" << c << " DST_TYPE=" << types_str[j] << endl;
+
+                    passed = false;
+                }
+            }
+        }
+    }
+    ts->set_failed_test_info(passed ? CvTS::OK : CvTS::FAIL_GENERIC);
+}
+
+CV_GpuMatOpConvertTo CV_GpuMatOpConvertTo_test;