diff --git a/modules/gpu/include/opencv2/gpu/devmem2d.hpp b/modules/gpu/include/opencv2/gpu/devmem2d.hpp
index 443330eed4..80290ed2e9 100644
--- a/modules/gpu/include/opencv2/gpu/devmem2d.hpp
+++ b/modules/gpu/include/opencv2/gpu/devmem2d.hpp
@@ -50,7 +50,7 @@ namespace cv
         // Simple lightweight structure that encapsulates image ptr on device, its pitch and its sizes.
         // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
-        template <typename T = unsigned char>
+        template <typename T>
         struct DevMem2D_
         {
             typedef T elem_t;
@@ -60,16 +60,21 @@ namespace cv
             int rows;
             T* ptr;
             size_t step;
+            size_t elem_step;
 
-            DevMem2D_() : cols(0), rows(0), ptr(0), step(0) {}
+            DevMem2D_() : cols(0), rows(0), ptr(0), step(0), elem_step(0) {}
 
             DevMem2D_(int rows_, int cols_, T *ptr_, size_t step_)
-                : cols(cols_), rows(rows_), ptr(ptr_), step(step_) {}
+                : cols(cols_), rows(rows_), ptr(ptr_), step(step_), elem_step(step_ / sizeof(T)) {}
+
+            template <typename U>
+            explicit DevMem2D_(const DevMem2D_<U>& d)
+                : cols(d.cols), rows(d.rows), ptr((T*)d.ptr), step(d.step), elem_step(d.step / sizeof(T)) {}
 
             size_t elemSize() const { return elem_size; }
         };
 
-        typedef DevMem2D_<> DevMem2D;
+        typedef DevMem2D_<unsigned char> DevMem2D;
         typedef DevMem2D_<float> DevMem2Df;
         typedef DevMem2D_<int> DevMem2Di;
     }
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 93a7304fd9..1146de400b 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -636,7 +636,7 @@ namespace cv
         //! returns the separable filter engine with the specified filters
         CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-            const Ptr<BaseColumnFilter_GPU>& columnFilter, bool rowFilterFirst = true);
+            const Ptr<BaseColumnFilter_GPU>& columnFilter);
 
         //! returns horizontal 1D box filter
         //! supports only CV_8UC1 source type and CV_32FC1 sum type
@@ -658,7 +658,7 @@ namespace cv
         //! only MORPH_ERODE and MORPH_DILATE are supported
         //! supports CV_8UC1 and CV_8UC4 types
         //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const GpuMat& kernel, const Size& ksize,
+        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
             Point anchor=Point(-1,-1));
 
         //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
@@ -667,25 +667,24 @@ namespace cv
         //! returns 2D filter with the specified kernel
         //! supports CV_8UC1 and CV_8UC4 types
-        //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const GpuMat& kernel, const Size& ksize,
-            Point anchor = Point(-1, -1), int nDivisor = 1);
+        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,
+            Point anchor = Point(-1, -1));
 
         //! returns the non-separable linear filter engine
         CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor = Point(-1,-1));
 
         //! returns the primitive row filter with the specified kernel
-        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const GpuMat& rowKernel,
-            int anchor = -1, int nDivisor = 1);
+        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
+            int anchor = -1);
 
         //! returns the primitive column filter with the specified kernel
-        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const GpuMat& columnKernel,
-            int anchor = -1, int nDivisor = 1);
+        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
+            int anchor = -1);
 
         //! returns the separable linear filter engine
         CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-            const Mat& columnKernel, const Point& anchor = Point(-1,-1), bool rowFilterFirst = true);
+            const Mat& columnKernel, const Point& anchor = Point(-1,-1));
 
         //! returns filter engine for the generalized Sobel operator
         CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize);
@@ -720,7 +719,7 @@ namespace cv
         //! applies separable 2D linear filter to the image
         CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-            Point anchor = Point(-1,-1), bool rowFilterFirst = true);
+            Point anchor = Point(-1,-1));
 
         //! applies generalized Sobel operator to the image
         CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1);
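Two notes on the hunks above. DevMem2D_ now carries elem_step, the row pitch in elements, alongside the byte pitch step, and the new explicit converting constructor lets host code reinterpret an untyped DevMem2D as a typed DevMem2D_<T>. In gpu.hpp, the filter getters now take plain CPU-side cv::Mat kernels with float taps, so the fixed-point nDivisor parameter and the rowFilterFirst flag are gone. A hedged usage sketch of the reworked sepFilter2D (image size and kernel values are illustrative; assumes opencv2/gpu/gpu.hpp is included, a CUDA device is present, and CV_32FC1 is among the supported source types in this revision):

    // Illustrative usage of the revised separable-filter API.
    cv::Mat kx = cv::getGaussianKernel(7, -1.0, CV_32F);   // float taps; no nDivisor needed
    cv::Mat ky = cv::getGaussianKernel(7, -1.0, CV_32F);

    cv::gpu::GpuMat src, dst;
    src.upload(cv::Mat(480, 640, CV_32FC1, cv::Scalar(0.5f)));
    cv::gpu::sepFilter2D(src, dst, CV_32F, kx, ky);        // row pass, then column pass
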
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 2ed3f43289..b507cc660c 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -316,9 +316,9 @@ void cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst)
 ////////////////////////////////////////////////////////////////////////
 // compare
 
-namespace cv { namespace gpu { namespace matrix_operations
+namespace cv { namespace gpu { namespace mathfunc
 {
-    void compare_ne_8u(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
     void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
 }}}
 
@@ -346,7 +346,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         }
         else
         {
-            matrix_operations::compare_ne_8u(src1, src2, dst);
+            mathfunc::compare_ne_8uc4(src1, src2, dst);
         }
     }
     else
@@ -359,7 +359,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         }
         else
         {
-            matrix_operations::compare_ne_32f(src1, src2, dst);
+            mathfunc::compare_ne_32f(src1, src2, dst);
         }
     }
 }
diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu
deleted file mode 100644
index 132ca84ceb..0000000000
--- a/modules/gpu/src/cuda/bilateral_filter.cu
+++ /dev/null
@@ -1,233 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "opencv2/gpu/devmem2d.hpp" -#include "saturate_cast.hpp" -#include "safe_call.hpp" - -using namespace cv::gpu; - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+30F -#endif - -namespace bf_krnls -{ - __constant__ float* ctable_color; - __constant__ float* ctable_space; - __constant__ size_t ctable_space_step; - - __constant__ int cndisp; - __constant__ int cradius; - - __constant__ short cedge_disc; - __constant__ short cmax_disc; -} - -namespace cv { namespace gpu { namespace bf -{ - void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) - { - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.ptr, sizeof(table_space.ptr)) ); - size_t table_space_step = table_space.step / sizeof(float); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); - - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) ); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) ); - - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) ); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) ); - } -}}} - -namespace bf_krnls -{ - template - struct DistRgbMax - { - static __device__ uchar calc(const uchar* a, const uchar* b) - { - uchar x = abs(a[0] - b[0]); - uchar y = abs(a[1] - b[1]); - uchar z = abs(a[2] - b[2]); - return (max(max(x, y), z)); - } - }; - - template <> - struct DistRgbMax<1> - { - static __device__ uchar calc(const uchar* a, const uchar* b) - { - return abs(a[0] - b[0]); - } - }; - - template - __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) - { - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); - - T dp[5]; - - if (y > 0 && y < h - 1 && x > 0 && x < w - 1) - { - dp[0] = *(disp + (y ) * disp_step + x + 0); - dp[1] = *(disp + (y-1) * disp_step + x + 0); - dp[2] = *(disp + (y ) * disp_step + x - 1); - dp[3] = *(disp + (y+1) * disp_step + x + 0); - dp[4] = *(disp + (y ) * disp_step + x + 1); - - if(abs(dp[1] - dp[0]) >= cedge_disc || 
abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc) - { - const int ymin = max(0, y - cradius); - const int xmin = max(0, x - cradius); - const int ymax = min(h - 1, y + cradius); - const int xmax = min(w - 1, x + cradius); - - float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - - const uchar* ic = img + y * img_step + channels * x; - - for(int yi = ymin; yi <= ymax; yi++) - { - const T* disp_y = disp + yi * disp_step; - - for(int xi = xmin; xi <= xmax; xi++) - { - const uchar* in = img + yi * img_step + channels * xi; - - uchar dist_rgb = DistRgbMax::calc(in, ic); - - const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)]; - - const T disp_reg = disp_y[xi]; - - cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight; - cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight; - cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight; - cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight; - cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight; - } - } - - float minimum = FLT_MAX; - int id = 0; - - if (cost[0] < minimum) - { - minimum = cost[0]; - id = 0; - } - if (cost[1] < minimum) - { - minimum = cost[1]; - id = 1; - } - if (cost[2] < minimum) - { - minimum = cost[2]; - id = 2; - } - if (cost[3] < minimum) - { - minimum = cost[3]; - id = 3; - } - if (cost[4] < minimum) - { - minimum = cost[4]; - id = 4; - } - - *(disp + y * disp_step + x) = dp[id]; - } - } - } -} - -namespace cv { namespace gpu { namespace bf -{ - template - void bilateral_filter_caller(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) - { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - grid.x = divUp(disp.cols, threads.x << 1); - grid.y = divUp(disp.rows, threads.y); - - switch (channels) - { - case 1: - for (int i = 0; i < iters; ++i) - { - bf_krnls::bilateral_filter<1><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - bf_krnls::bilateral_filter<1><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - } - break; - case 3: - for (int i = 0; i < iters; ++i) - { - bf_krnls::bilateral_filter<3><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - bf_krnls::bilateral_filter<3><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - } - break; - default: - cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); - } - - if (stream != 0) - cudaSafeCall( cudaThreadSynchronize() ); - } - - void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) - { - bilateral_filter_caller(disp, img, channels, iters, stream); - } - - void bilateral_filter_gpu(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) - { - bilateral_filter_caller(disp, img, channels, iters, stream); - } -}}} diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 699e285a70..e956ff8dd7 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -42,6 +42,7 @@ #include "cuda_shared.hpp" #include "saturate_cast.hpp" +#include "vecmath.hpp" using namespace cv::gpu; @@ -53,16 +54,8 @@ using namespace cv::gpu; #define FLT_EPSILON 1.192092896e-07F #endif -namespace imgproc +namespace imgproc_krnls { - template struct TypeVec {}; - template<> struct TypeVec { typedef uchar3 vec_t; }; - template<> struct TypeVec { typedef uchar4 vec_t; }; - template<> struct 
TypeVec { typedef ushort3 vec_t; }; - template<> struct TypeVec { typedef ushort4 vec_t; }; - template<> struct TypeVec { typedef float3 vec_t; }; - template<> struct TypeVec { typedef float4 vec_t; }; - template struct ColorChannel {}; template<> struct ColorChannel { @@ -106,7 +99,7 @@ namespace imgproc ////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// -namespace imgproc +namespace imgproc_krnls { template __global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) @@ -132,7 +125,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) @@ -143,7 +136,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2RGB<<>>(src.ptr, src.step, + imgproc_krnls::RGB2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -189,7 +182,7 @@ namespace cv { namespace gpu { namespace improc /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// -namespace imgproc +namespace imgproc_krnls { template struct RGB5x52RGBConverter {}; template struct RGB5x52RGBConverter<5, DSTCN> @@ -281,7 +274,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) @@ -292,7 +285,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB5x52RGB<<>>(src.ptr, src.step, + imgproc_krnls::RGB5x52RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -320,7 +313,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2RGB5x5<<>>(src.ptr, src.step, + imgproc_krnls::RGB2RGB5x5<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -342,7 +335,7 @@ namespace cv { namespace gpu { namespace improc ///////////////////////////////// Grayscale to Color //////////////////////////////// -namespace imgproc +namespace imgproc_krnls { template __global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) @@ -396,7 +389,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) @@ -407,7 +400,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::Gray2RGB<<>>(src.ptr, src.step, + imgproc_krnls::Gray2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -447,7 +440,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::Gray2RGB5x5<<>>(src.ptr, src.step, + imgproc_krnls::Gray2RGB5x5<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -468,7 +461,7 @@ namespace cv { namespace gpu { namespace improc ///////////////////////////////// Color to Grayscale //////////////////////////////// 
-namespace imgproc +namespace imgproc_krnls { #undef R2Y #undef G2Y @@ -550,7 +543,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) @@ -561,7 +554,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2Gray<<>>(src.ptr, src.step, + imgproc_krnls::RGB2Gray<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -601,7 +594,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB5x52Gray<<>>(src.ptr, src.step, + imgproc_krnls::RGB5x52Gray<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -622,7 +615,7 @@ namespace cv { namespace gpu { namespace improc ///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { __constant__ float cYCrCbCoeffs_f[5]; __constant__ int cYCrCbCoeffs_i[5]; @@ -721,7 +714,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) @@ -732,7 +725,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2YCrCb<<>>(src.ptr, src.step, + imgproc_krnls::RGB2YCrCb<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -748,7 +741,7 @@ namespace cv { namespace gpu { namespace improc {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -762,7 +755,7 @@ namespace cv { namespace gpu { namespace improc {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -776,7 +769,7 @@ namespace cv { namespace gpu { namespace improc {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -790,7 +783,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::YCrCb2RGB<<>>(src.ptr, src.step, + imgproc_krnls::YCrCb2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -806,7 +799,7 @@ namespace cv { namespace gpu { namespace improc {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -820,7 +813,7 @@ namespace cv { namespace gpu { namespace improc {YCrCb2RGB_caller, YCrCb2RGB_caller} 
}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -834,7 +827,7 @@ namespace cv { namespace gpu { namespace improc {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -842,7 +835,7 @@ namespace cv { namespace gpu { namespace improc ////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { __constant__ float cXYZ_D65f[9]; __constant__ int cXYZ_D65i[9]; @@ -931,7 +924,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) @@ -942,7 +935,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2XYZ<<>>(src.ptr, src.step, + imgproc_krnls::RGB2XYZ<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -958,7 +951,7 @@ namespace cv { namespace gpu { namespace improc {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -972,7 +965,7 @@ namespace cv { namespace gpu { namespace improc {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -986,7 +979,7 @@ namespace cv { namespace gpu { namespace improc {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1000,7 +993,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::XYZ2RGB<<>>(src.ptr, src.step, + imgproc_krnls::XYZ2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -1016,7 +1009,7 @@ namespace cv { namespace gpu { namespace improc {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1030,7 +1023,7 @@ namespace cv { namespace gpu { namespace improc {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1044,7 +1037,7 @@ namespace cv { namespace gpu { namespace improc {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); + 
cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1052,7 +1045,7 @@ namespace cv { namespace gpu { namespace improc ////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { __constant__ int cHsvDivTable[256]; @@ -1229,7 +1222,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) @@ -1241,10 +1234,10 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); if (hrange == 180) - imgproc::RGB2HSV<<>>(src.ptr, src.step, + imgproc_krnls::RGB2HSV<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); else - imgproc::RGB2HSV<<>>(src.ptr, src.step, + imgproc_krnls::RGB2HSV<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -1295,7 +1288,7 @@ namespace cv { namespace gpu { namespace improc 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096 }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvDivTable, div_table, sizeof(div_table)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHsvDivTable, div_table, sizeof(div_table)) ); RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1323,10 +1316,10 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); if (hrange == 180) - imgproc::HSV2RGB<<>>(src.ptr, src.step, + imgproc_krnls::HSV2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); else - imgproc::HSV2RGB<<>>(src.ptr, src.step, + imgproc_krnls::HSV2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -1345,7 +1338,7 @@ namespace cv { namespace gpu { namespace improc static const int sector_data[][3] = {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) ); HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1362,7 +1355,7 @@ namespace cv { namespace gpu { namespace improc static const int sector_data[][3] = {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) ); HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1370,7 +1363,7 @@ namespace cv { namespace gpu { namespace improc /////////////////////////////////////// RGB <-> HLS //////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { template struct RGB2HLSConvertor; template struct RGB2HLSConvertor @@ -1541,7 +1534,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) @@ -1553,10 +1546,10 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); if (hrange == 180) - imgproc::RGB2HLS<<>>(src.ptr, src.step, + imgproc_krnls::RGB2HLS<<>>(src.ptr, src.step, dst.ptr, dst.step, 
src.rows, src.cols, bidx); else - imgproc::RGB2HLS<<>>(src.ptr, src.step, + imgproc_krnls::RGB2HLS<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -1598,10 +1591,10 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); if (hrange == 180) - imgproc::HLS2RGB<<>>(src.ptr, src.step, + imgproc_krnls::HLS2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); else - imgproc::HLS2RGB<<>>(src.ptr, src.step, + imgproc_krnls::HLS2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -1620,7 +1613,7 @@ namespace cv { namespace gpu { namespace improc static const int sector_data[][3]= {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHlsSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) ); HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1637,7 +1630,7 @@ namespace cv { namespace gpu { namespace improc static const int sector_data[][3]= {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHlsSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) ); HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } diff --git a/modules/gpu/src/cuda/constantspacebp.cu b/modules/gpu/src/cuda/constantspacebp.cu index 7be11791e4..0602834d2b 100644 --- a/modules/gpu/src/cuda/constantspacebp.cu +++ b/modules/gpu/src/cuda/constantspacebp.cu @@ -54,20 +54,18 @@ using namespace cv::gpu; #define SHRT_MAX 32767 #endif -template -struct TypeLimits {}; - -template <> -struct TypeLimits +namespace csbp_krnls { - static __device__ short max() {return SHRT_MAX;} -}; - -template <> -struct TypeLimits -{ - static __device__ float max() {return FLT_MAX;} -}; + template struct TypeLimits; + template <> struct TypeLimits + { + static __device__ short max() {return SHRT_MAX;} + }; + template <> struct TypeLimits + { + static __device__ float max() {return FLT_MAX;} + }; +} /////////////////////////////////////////////////////////////// /////////////////////// load constants //////////////////////// diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp index 449fcb0e1d..a3c5657278 100644 --- a/modules/gpu/src/cuda/cuda_shared.hpp +++ b/modules/gpu/src/cuda/cuda_shared.hpp @@ -58,19 +58,8 @@ namespace cv static inline int divUp(int a, int b) { return (a % b == 0) ? 
a/b : a/b + 1; } - namespace matrix_operations - { - extern "C" void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0); - - extern "C" void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0); - extern "C" void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0); - - extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0); - } - template - inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); } - + static inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); } } } diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu new file mode 100644 index 0000000000..185cd633a7 --- /dev/null +++ b/modules/gpu/src/cuda/filters.cu @@ -0,0 +1,455 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#include "opencv2/gpu/devmem2d.hpp"
+#include "saturate_cast.hpp"
+#include "safe_call.hpp"
+#include "cuda_shared.hpp"
+
+using namespace cv::gpu;
+
+#ifndef FLT_MAX
+#define FLT_MAX 3.402823466e+30F
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear filters
+
+#define MAX_KERNEL_SIZE 16
+
+namespace filter_krnls
+{
+    __constant__ float cLinearKernel[MAX_KERNEL_SIZE];
+}
+
+namespace cv { namespace gpu { namespace filters
+{
+    void loadLinearKernel(const float kernel[], int ksize)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(filter_krnls::cLinearKernel, kernel, ksize * sizeof(float)) );
+    }
+}}}
+
+namespace filter_krnls
+{
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
+    __global__ void linearRowFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
+    {
+        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
+
+        const int blockStartX = blockDim.x * blockIdx.x;
+        const int blockStartY = blockDim.y * blockIdx.y;
+
+        const int threadX = blockStartX + threadIdx.x;
+        const int prevThreadX = threadX - blockDim.x;
+        const int nextThreadX = threadX + blockDim.x;
+
+        const int threadY = blockStartY + threadIdx.y;
+
+        T* sDataRow = smem + threadIdx.y * blockDim.x * 3;
+
+        if (threadY < height)
+        {
+            const T* rowSrc = src + threadY * src_step;
+
+            sDataRow[threadIdx.x + blockDim.x] = threadX < width ? rowSrc[threadX] : 0;
+
+            sDataRow[threadIdx.x] = prevThreadX >= 0 ? rowSrc[prevThreadX] : 0;
+
+            sDataRow[(blockDim.x << 1) + threadIdx.x] = nextThreadX < width ? rowSrc[nextThreadX] : 0;
+
+            __syncthreads();
+
+            if (threadX < width)
+            {
+                float sum = 0;
+
+                sDataRow += threadIdx.x + blockDim.x - anchor;
+
+                #pragma unroll
+                for(int i = 0; i < KERNEL_SIZE; ++i)
+                    sum += cLinearKernel[i] * sDataRow[i];
+
+                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
+            }
+        }
+    }
+}
+
+namespace cv { namespace gpu { namespace filters
+{
+    template <int KERNEL_SIZE, typename T, typename D>
+    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
+    {
+        const int BLOCK_DIM_X = 16;
+        const int BLOCK_DIM_Y = 16;
+
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+        filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.ptr, src.elem_step,
+            dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    template <typename T, typename D>
+    inline void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
+        static const caller_t callers[] =
+        {linearRowFilter_caller<0 , T, D>, linearRowFilter_caller<1 , T, D>,
+         linearRowFilter_caller<2 , T, D>, linearRowFilter_caller<3 , T, D>,
+         linearRowFilter_caller<4 , T, D>, linearRowFilter_caller<5 , T, D>,
+         linearRowFilter_caller<6 , T, D>, linearRowFilter_caller<7 , T, D>,
+         linearRowFilter_caller<8 , T, D>, linearRowFilter_caller<9 , T, D>,
+         linearRowFilter_caller<10, T, D>, linearRowFilter_caller<11, T, D>,
+         linearRowFilter_caller<12, T, D>, linearRowFilter_caller<13, T, D>,
+         linearRowFilter_caller<14, T, D>, linearRowFilter_caller<15, T, D>};
+
+        loadLinearKernel(kernel, ksize);
+        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+    }
+
+    void linearRowFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<int, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearRowFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<int, float>(src, dst, kernel, ksize, anchor);
+    }
+    void linearRowFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<float, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearRowFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<float, float>(src, dst, kernel, ksize, anchor);
+    }
+}}}
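The callers[] table above is how a runtime ksize becomes the compile-time KERNEL_SIZE template argument, which in turn lets the #pragma unroll loop in the kernel run with a constant trip count. A reduced, standalone sketch of the same table-of-instantiations pattern (names here are illustrative, not from this patch):

    // Illustrative only: select a compile-time bound from a runtime index.
    template <int N> void body() { /* loop bounded by the constant N, fully unrollable */ }

    typedef void (*body_t)();
    static const body_t table[] = { body<0>, body<1>, body<2>, body<3> };

    void run(int n) { table[n](); }   // valid for 0 <= n < 4, just as callers[ksize] assumes ksize < MAX_KERNEL_SIZE
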
+
+namespace filter_krnls
+{
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
+    __global__ void linearColumnFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
+    {
+        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
+
+        const int blockStartX = blockDim.x * blockIdx.x;
+        const int blockStartY = blockDim.y * blockIdx.y;
+
+        const int threadX = blockStartX + threadIdx.x;
+
+        const int threadY = blockStartY + threadIdx.y;
+        const int prevThreadY = threadY - blockDim.y;
+        const int nextThreadY = threadY + blockDim.y;
+
+        const int smem_step = blockDim.x;
+
+        T* sDataColumn = smem + threadIdx.x;
+
+        if (threadX < width)
+        {
+            const T* colSrc = src + threadX;
+
+            sDataColumn[(threadIdx.y + blockDim.y) * smem_step] = threadY < height ? colSrc[threadY * src_step] : 0;
+
+            sDataColumn[threadIdx.y * smem_step] = prevThreadY >= 0 ? colSrc[prevThreadY * src_step] : 0;
+
+            sDataColumn[(threadIdx.y + (blockDim.y << 1)) * smem_step] = nextThreadY < height ? colSrc[nextThreadY * src_step] : 0;
+
+            __syncthreads();
+
+            if (threadY < height)
+            {
+                float sum = 0;
+
+                sDataColumn += (threadIdx.y + blockDim.y - anchor) * smem_step;
+
+                #pragma unroll
+                for(int i = 0; i < KERNEL_SIZE; ++i)
+                    sum += cLinearKernel[i] * sDataColumn[i * smem_step];
+
+                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
+            }
+        }
+    }
+}
+
+namespace cv { namespace gpu { namespace filters
+{
+    template <int KERNEL_SIZE, typename T, typename D>
+    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
+    {
+        const int BLOCK_DIM_X = 16;
+        const int BLOCK_DIM_Y = 16;
+
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+        filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.ptr, src.elem_step,
+            dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    template <typename T, typename D>
+    inline void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
+        static const caller_t callers[] =
+        {linearColumnFilter_caller<0 , T, D>, linearColumnFilter_caller<1 , T, D>,
+         linearColumnFilter_caller<2 , T, D>, linearColumnFilter_caller<3 , T, D>,
+         linearColumnFilter_caller<4 , T, D>, linearColumnFilter_caller<5 , T, D>,
+         linearColumnFilter_caller<6 , T, D>, linearColumnFilter_caller<7 , T, D>,
+         linearColumnFilter_caller<8 , T, D>, linearColumnFilter_caller<9 , T, D>,
+         linearColumnFilter_caller<10, T, D>, linearColumnFilter_caller<11, T, D>,
+         linearColumnFilter_caller<12, T, D>, linearColumnFilter_caller<13, T, D>,
+         linearColumnFilter_caller<14, T, D>, linearColumnFilter_caller<15, T, D>};
+
+        loadLinearKernel(kernel, ksize);
+        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+    }
+
+    void linearColumnFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearColumnFilter_gpu<int, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearColumnFilter_gpu_32s32f(const DevMem2D& src, const
DevMem2D& dst, const float kernel[], int ksize, int anchor) + { + linearColumnFilter_gpu(src, dst, kernel, ksize, anchor); + } + void linearColumnFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) + { + linearColumnFilter_gpu(src, dst, kernel, ksize, anchor); + } + void linearColumnFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) + { + linearColumnFilter_gpu(src, dst, kernel, ksize, anchor); + } +}}} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Bilateral filters + +namespace bf_krnls +{ + __constant__ float* ctable_color; + __constant__ float* ctable_space; + __constant__ size_t ctable_space_step; + + __constant__ int cndisp; + __constant__ int cradius; + + __constant__ short cedge_disc; + __constant__ short cmax_disc; +} + +namespace cv { namespace gpu { namespace bf +{ + void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) + { + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.ptr, sizeof(table_space.ptr)) ); + size_t table_space_step = table_space.step / sizeof(float); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); + + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) ); + + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) ); + } +}}} + +namespace bf_krnls +{ + template + struct DistRgbMax + { + static __device__ uchar calc(const uchar* a, const uchar* b) + { + uchar x = abs(a[0] - b[0]); + uchar y = abs(a[1] - b[1]); + uchar z = abs(a[2] - b[2]); + return (max(max(x, y), z)); + } + }; + + template <> + struct DistRgbMax<1> + { + static __device__ uchar calc(const uchar* a, const uchar* b) + { + return abs(a[0] - b[0]); + } + }; + + template + __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); + + T dp[5]; + + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + dp[0] = *(disp + (y ) * disp_step + x + 0); + dp[1] = *(disp + (y-1) * disp_step + x + 0); + dp[2] = *(disp + (y ) * disp_step + x - 1); + dp[3] = *(disp + (y+1) * disp_step + x + 0); + dp[4] = *(disp + (y ) * disp_step + x + 1); + + if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc) + { + const int ymin = max(0, y - cradius); + const int xmin = max(0, x - cradius); + const int ymax = min(h - 1, y + cradius); + const int xmax = min(w - 1, x + cradius); + + float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + const uchar* ic = img + y * img_step + channels * x; + + for(int yi = ymin; yi <= ymax; yi++) + { + const T* disp_y = disp + yi * disp_step; + + for(int xi = xmin; xi <= xmax; xi++) + { + const uchar* in = img + yi * img_step + channels * xi; + + uchar dist_rgb = DistRgbMax::calc(in, ic); + + const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)]; + + const T 
disp_reg = disp_y[xi]; + + cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight; + cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight; + cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight; + cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight; + cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight; + } + } + + float minimum = FLT_MAX; + int id = 0; + + if (cost[0] < minimum) + { + minimum = cost[0]; + id = 0; + } + if (cost[1] < minimum) + { + minimum = cost[1]; + id = 1; + } + if (cost[2] < minimum) + { + minimum = cost[2]; + id = 2; + } + if (cost[3] < minimum) + { + minimum = cost[3]; + id = 3; + } + if (cost[4] < minimum) + { + minimum = cost[4]; + id = 4; + } + + *(disp + y * disp_step + x) = dp[id]; + } + } + } +} + +namespace cv { namespace gpu { namespace bf +{ + template + void bilateral_filter_caller(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + dim3 threads(32, 8, 1); + dim3 grid(1, 1, 1); + grid.x = divUp(disp.cols, threads.x << 1); + grid.y = divUp(disp.rows, threads.y); + + switch (channels) + { + case 1: + for (int i = 0; i < iters; ++i) + { + bf_krnls::bilateral_filter<1><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + bf_krnls::bilateral_filter<1><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + } + break; + case 3: + for (int i = 0; i < iters; ++i) + { + bf_krnls::bilateral_filter<3><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + bf_krnls::bilateral_filter<3><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + } + break; + default: + cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); + } + + if (stream != 0) + cudaSafeCall( cudaThreadSynchronize() ); + } + + void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); + } + + void bilateral_filter_gpu(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); + } +}}} diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index 938e2d1185..e36a9428ac 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -45,7 +45,7 @@ using namespace cv::gpu; /////////////////////////////////// Remap /////////////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { texture tex_remap; @@ -123,7 +123,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst) { @@ -132,15 +132,15 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(dst.cols, threads.x); grid.y = divUp(dst.rows, threads.y); - imgproc::tex_remap.filterMode = cudaFilterModeLinear; - imgproc::tex_remap.addressMode[0] = imgproc::tex_remap.addressMode[1] = cudaAddressModeWrap; + imgproc_krnls::tex_remap.filterMode = cudaFilterModeLinear; + imgproc_krnls::tex_remap.addressMode[0] = imgproc_krnls::tex_remap.addressMode[1] = cudaAddressModeWrap; cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, imgproc::tex_remap, src.ptr, desc, src.cols, src.rows, src.step) ); + cudaSafeCall( cudaBindTexture2D(0, imgproc_krnls::tex_remap, src.ptr, desc, 
src.cols, src.rows, src.step) ); - imgproc::remap_1c<<>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); + imgproc_krnls::remap_1c<<>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture(imgproc::tex_remap) ); + cudaSafeCall( cudaUnbindTexture(imgproc_krnls::tex_remap) ); } void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst) @@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(dst.cols, threads.x); grid.y = divUp(dst.rows, threads.y); - imgproc::remap_3c<<>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); + imgproc_krnls::remap_3c<<>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); cudaSafeCall( cudaThreadSynchronize() ); } @@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace improc /////////////////////////////////// MeanShiftfiltering /////////////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { texture tex_meanshift; @@ -254,7 +254,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps) { @@ -264,11 +264,11 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D( 0, imgproc::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); + cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); - imgproc::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); + imgproc_krnls::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture( imgproc::tex_meanshift ) ); + cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) ); } extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps) { @@ -278,17 +278,17 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D( 0, imgproc::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); + cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); - imgproc::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); + imgproc_krnls::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture( imgproc::tex_meanshift ) ); + cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) ); } }}} /////////////////////////////////// drawColorDisp /////////////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { template __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1) @@ -391,7 +391,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { 
namespace imgproc { void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream) { @@ -400,7 +400,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x << 2); grid.y = divUp(src.rows, threads.y); - imgproc::drawColorDisp<<>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp); + imgproc_krnls::drawColorDisp<<>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -413,7 +413,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x << 1); grid.y = divUp(src.rows, threads.y); - imgproc::drawColorDisp<<>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp); + imgproc_krnls::drawColorDisp<<>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -422,7 +422,7 @@ namespace cv { namespace gpu { namespace improc /////////////////////////////////// reprojectImageTo3D /////////////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { __constant__ float cq[16]; @@ -457,7 +457,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template inline void reprojectImageTo3D_caller(const DevMem2D_& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream) @@ -467,9 +467,9 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(disp.cols, threads.x); grid.y = divUp(disp.rows, threads.y); - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cq, q, 16 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cq, q, 16 * sizeof(float)) ); - imgproc::reprojectImageTo3D<<>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols); + imgproc_krnls::reprojectImageTo3D<<>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index fef25ac1fb..aca1c574e6 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -41,6 +41,9 @@ //M*/ #include "cuda_shared.hpp" +#include "saturate_cast.hpp" +#include "transform.hpp" +#include "vecmath.hpp" using namespace cv::gpu; @@ -48,6 +51,9 @@ using namespace cv::gpu; #define CV_PI 3.1415926535897932384626433832795f #endif +////////////////////////////////////////////////////////////////////////////////////// +// Cart <-> Polar + namespace mathfunc_krnls { struct Nothing @@ -143,8 +149,8 @@ namespace cv { namespace gpu { namespace mathfunc const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f; mathfunc_krnls::cartToPolar<<>>( - x.ptr, x.step / sizeof(float), y.ptr, y.step / sizeof(float), - mag.ptr, mag.step / sizeof(float), angle.ptr, angle.step / sizeof(float), scale, x.cols, x.rows); + x.ptr, x.elem_step, y.ptr, y.elem_step, + mag.ptr, mag.elem_step, angle.ptr, angle.elem_step, scale, x.cols, x.rows); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -191,8 +197,8 @@ namespace cv { namespace gpu { namespace mathfunc const float scale = angleInDegrees ? 
(float)(CV_PI / 180.0f) : 1.0f;
 
-        mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.step / sizeof(float),
-            angle.ptr, angle.step / sizeof(float), scale, x.ptr, x.step / sizeof(float), y.ptr, y.step / sizeof(float), mag.cols, mag.rows);
+        mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.elem_step,
+            angle.ptr, angle.elem_step, scale, x.ptr, x.elem_step, y.ptr, y.elem_step, mag.cols, mag.rows);
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -210,3 +216,37 @@ namespace cv { namespace gpu { namespace mathfunc
         callers[mag.ptr == 0](mag, angle, x, y, angleInDegrees, stream);
     }
 }}}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Compare
+
+namespace mathfunc_krnls
+{
+    template <typename T1, typename T2>
+    struct NotEqual
+    {
+        __device__ uchar operator()(const T1& src1, const T2& src2, int, int)
+        {
+            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
+        }
+    };
+}
+
+namespace cv { namespace gpu { namespace mathfunc
+{
+    template <typename T1, typename T2>
+    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    {
+        mathfunc_krnls::NotEqual<T1, T2> op;
+        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
+    }
+
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    {
+        compare_ne<uint, uint>(src1, src2, dst);
+    }
+    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    {
+        compare_ne<float, float>(src1, src2, dst);
+    }
+}}}
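The compare_ne implementation above delegates the kernel launch to the transform helper from the newly included transform.hpp, which this patch does not show. A minimal sketch of what such an elementwise binary transform could look like (an illustrative assumption, not the actual helper; note how the functor receives the pixel coordinates, matching the trailing (int, int) parameters of NotEqual::operator()):

    // Illustrative only: elementwise binary transform over pitched 2D data.
    // The step arguments are element steps (DevMem2D_::elem_step), not byte steps.
    template <typename T1, typename T2, typename D, typename BinOp>
    __global__ void transformSimple(const T1* src1, size_t step1, const T2* src2, size_t step2,
                                    D* dst, size_t dst_step, int cols, int rows, BinOp op)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < cols && y < rows)
            dst[y * dst_step + x] = op(src1[y * step1 + x], src2[y * step2 + x], x, y);
    }
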
channels) - { - size_t x = blockIdx.x * blockDim.x + threadIdx.x; - size_t y = blockIdx.y * blockDim.y + threadIdx.y; - - if ((x < cols * channels ) && (y < rows)) - { - size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; - mat[idx] = scalar_d[ x % channels ]; - } - } - - template - __global__ void kernel_set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask) - { - size_t x = blockIdx.x * blockDim.x + threadIdx.x; - size_t y = blockIdx.y * blockDim.y + threadIdx.y; - - if ((x < cols * channels ) && (y < rows)) - if (mask[y * step_mask + x / channels] != 0) - { - size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; - mat[idx] = scalar_d[ x % channels ]; - } - } - - - /////////////////////////////////////////////////////////////////////////// - //////////////////////////////// ConvertTo //////////////////////////////// - /////////////////////////////////////////////////////////////////////////// - + template struct shift_and_sizeof; + template <> struct shift_and_sizeof { enum { shift = 0 }; }; + template <> struct shift_and_sizeof { enum { shift = 0 }; }; + template <> struct shift_and_sizeof { enum { shift = 1 }; }; + template <> struct shift_and_sizeof { enum { shift = 1 }; }; + template <> struct shift_and_sizeof { enum { shift = 2 }; }; + template <> struct shift_and_sizeof { enum { shift = 2 }; }; + template <> struct shift_and_sizeof { enum { shift = 3 }; }; + template struct ReadWriteTraits { @@ -218,9 +115,206 @@ namespace mat_operators typedef int2 read_type; typedef short2 write_type; }; +} +/////////////////////////////////////////////////////////////////////////// +////////////////////////////////// CopyTo ///////////////////////////////// +/////////////////////////////////////////////////////////////////////////// + +namespace matop_krnls +{ + template + __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels) + { + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((x < cols * channels ) && (y < rows)) + if (mask[y * step_mask + x / channels] != 0) + { + size_t idx = y * ( step_mat >> shift_and_sizeof::shift ) + x; + mat_dst[idx] = mat_src[idx]; + } + } +} + +namespace cv { namespace gpu { namespace matrix_operations +{ + typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream); + + template + void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream) + { + dim3 threadsPerBlock(16,16, 1); + dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1); + if (stream == 0) + { + ::matop_krnls::copy_to_with_mask<<>> + ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels); + cudaSafeCall ( cudaThreadSynchronize() ); + } + else + { + ::matop_krnls::copy_to_with_mask<<>> + ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels); + } + } + + void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream) + { + static CopyToFunc tab[8] = + { + copy_to_with_mask_run, + copy_to_with_mask_run, + copy_to_with_mask_run, + copy_to_with_mask_run, 
+ copy_to_with_mask_run, + copy_to_with_mask_run, + copy_to_with_mask_run, + 0 + }; + + CopyToFunc func = tab[depth]; + + if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__); + + func(mat_src, mat_dst, mask, channels, stream); + } +}}} + +/////////////////////////////////////////////////////////////////////////// +////////////////////////////////// SetTo ////////////////////////////////// +/////////////////////////////////////////////////////////////////////////// + +namespace matop_krnls +{ + __constant__ double scalar_d[4]; + + template + __global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels) + { + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((x < cols * channels ) && (y < rows)) + { + size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; + mat[idx] = scalar_d[ x % channels ]; + } + } + + template + __global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask) + { + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((x < cols * channels ) && (y < rows)) + if (mask[y * step_mask + x / channels] != 0) + { + size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; + mat[idx] = scalar_d[ x % channels ]; + } + } +} + +namespace cv { namespace gpu { namespace matrix_operations +{ + typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream); + typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream); + + template + void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream) + { + dim3 threadsPerBlock(32, 8, 1); + dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); + + if (stream == 0) + { + ::matop_krnls::set_to_with_mask<<>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step); + cudaSafeCall ( cudaThreadSynchronize() ); + } + else + { + ::matop_krnls::set_to_with_mask<<>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step); + } + + } + + template + void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream) + { + dim3 threadsPerBlock(32, 8, 1); + dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); + + if (stream == 0) + { + matop_krnls::set_to_without_mask<<>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels); + cudaSafeCall ( cudaThreadSynchronize() ); + } + else + { + matop_krnls::set_to_without_mask<<>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels); + } + } + + void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream) + { + cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4)); + + static SetToFunc_without_mask tab[8] = + { + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + 0 + }; + + SetToFunc_without_mask func = tab[depth]; + + if (func == 0) + cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__); + + func(mat, channels, stream); + } + + void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, 
int channels, const cudaStream_t & stream) + { + cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4)); + + static SetToFunc_with_mask tab[8] = + { + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + 0 + }; + + SetToFunc_with_mask func = tab[depth]; + + if (func == 0) + cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__); + + func(mat, mask, channels, stream); + } +}}} + +/////////////////////////////////////////////////////////////////////////// +//////////////////////////////// ConvertTo //////////////////////////////// +/////////////////////////////////////////////////////////////////////////// + +namespace matop_krnls +{ template - __global__ static void kernel_convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta) + __global__ static void convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta) { typedef typename ReadWriteTraits::read_type read_type; typedef typename ReadWriteTraits::write_type write_type; @@ -253,253 +347,63 @@ namespace mat_operators dst[(x * shift) + i] = saturate_cast
(alpha * src[(x * shift) + i] + beta); } } - } + } +} - /////////////////////////////////////////////////////////////////////////// - /////////////////////////////// compare_ne //////////////////////////////// - /////////////////////////////////////////////////////////////////////////// +namespace cv { namespace gpu { namespace matrix_operations +{ + typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream); - template - __global__ void kernel_compare_ne(uchar* src1, size_t src1_step, uchar* src2, size_t src2_step, uchar* dst, size_t dst_step, int cols, int rows) + template + void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream) { - const size_t x = threadIdx.x + blockIdx.x * blockDim.x; - const size_t y = threadIdx.y + blockIdx.y * blockDim.y; + const int shift = ::matop_krnls::ReadWriteTraits::shift; - if (x < cols && y < rows) + dim3 block(32, 8); + dim3 grid(divUp(width, block.x * shift), divUp(height, block.y)); + + if (stream == 0) { - T src1_pix = ((T*)(src1 + y * src1_step))[x]; - T src2_pix = ((T*)(src2 + y * src2_step))[x]; - uchar res = (uchar)(src1_pix != src2_pix) * 255; - ((dst + y * dst_step))[x] = res; + matop_krnls::convert_to<<>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta); + cudaSafeCall( cudaThreadSynchronize() ); + } + else + { + matop_krnls::convert_to<<>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta); } } -} // namespace mat_operators -namespace cv -{ - namespace gpu + void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream) { - namespace matrix_operations + static CvtFunc tab[8][8] = { + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, - /////////////////////////////////////////////////////////////////////////// - ////////////////////////////////// CopyTo ///////////////////////////////// - /////////////////////////////////////////////////////////////////////////// + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, - typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream); + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, - template - void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream) - { - dim3 threadsPerBlock(16,16, 1); - dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1); - if (stream == 0) - { - ::mat_operators::kernel_copy_to_with_mask<<>> - ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels); - cudaSafeCall ( cudaThreadSynchronize() ); - } - else - { - ::mat_operators::kernel_copy_to_with_mask<<>> - ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels); - } - } + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, - extern "C" void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream) - { - static CopyToFunc tab[8] = - { - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - 0 - }; + {cvt_, cvt_, 
cvt_, + cvt_, cvt_, cvt_, cvt_, 0}, - CopyToFunc func = tab[depth]; + {cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, cvt_, 0}, - if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__); + {cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, cvt_, 0}, - func(mat_src, mat_dst, mask, channels, stream); - } + {0,0,0,0,0,0,0,0} + }; - - /////////////////////////////////////////////////////////////////////////// - ////////////////////////////////// SetTo ////////////////////////////////// - /////////////////////////////////////////////////////////////////////////// - - typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream); - typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream); - - template - void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream) - { - dim3 threadsPerBlock(32, 8, 1); - dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); - - if (stream == 0) - { - ::mat_operators::kernel_set_to_with_mask<<>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step); - cudaSafeCall ( cudaThreadSynchronize() ); - } - else - { - ::mat_operators::kernel_set_to_with_mask<<>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step); - } - - } - - template - void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream) - { - dim3 threadsPerBlock(32, 8, 1); - dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); - - if (stream == 0) - { - mat_operators::kernel_set_to_without_mask<<>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels); - cudaSafeCall ( cudaThreadSynchronize() ); - } - else - { - mat_operators::kernel_set_to_without_mask<<>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels); - } - } - - extern "C" void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream) - { - cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar, sizeof(double) * 4)); - - static SetToFunc_without_mask tab[8] = - { - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - 0 - }; - - SetToFunc_without_mask func = tab[depth]; - - if (func == 0) - cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__); - - func(mat, channels, stream); - } - - - extern "C" void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream) - { - cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar, sizeof(double) * 4)); - - static SetToFunc_with_mask tab[8] = - { - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - 0 - }; - - SetToFunc_with_mask func = tab[depth]; - - if (func == 0) - cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__); - - func(mat, mask, channels, stream); - } - - - /////////////////////////////////////////////////////////////////////////// - //////////////////////////////// ConvertTo //////////////////////////////// - /////////////////////////////////////////////////////////////////////////// - - typedef void (*CvtFunc)(const DevMem2D& src, 
DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream); - - template - void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream) - { - const int shift = ::mat_operators::ReadWriteTraits::shift; - - dim3 block(32, 8); - dim3 grid(divUp(width, block.x * shift), divUp(height, block.y)); - - if (stream == 0) - { - mat_operators::kernel_convert_to<<>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta); - cudaSafeCall( cudaThreadSynchronize() ); - } - else - { - mat_operators::kernel_convert_to<<>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta); - } - } - - extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream) - { - static CvtFunc tab[8][8] = - { - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {0,0,0,0,0,0,0,0} - }; - - CvtFunc func = tab[sdepth][ddepth]; - if (func == 0) - cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); - func(src, dst, src.cols * channels, src.rows, alpha, beta, stream); - } - - /////////////////////////////////////////////////////////////////////////// - /////////////////////////////// compare_ne //////////////////////////////// - /////////////////////////////////////////////////////////////////////////// - - void compare_ne_8u(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) - { - dim3 block(32, 8); - dim3 grid(divUp(src1.cols, block.x), divUp(src1.rows, block.y)); - - mat_operators::kernel_compare_ne<<>>(src1.ptr, src1.step, src2.ptr, src2.step, dst.ptr, dst.step, src1.cols, src1.rows); - cudaSafeCall( cudaThreadSynchronize() ); - } - - void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) - { - dim3 block(32, 8); - dim3 grid(divUp(src1.cols, block.x), divUp(src1.rows, block.y)); - - mat_operators::kernel_compare_ne<<>>(src1.ptr, src1.step, src2.ptr, src2.step, dst.ptr, dst.step, src1.cols, src1.rows); - cudaSafeCall( cudaThreadSynchronize() ); - } - } // namespace matrix_operations - } // namespace gpu -} // namespace cv + CvtFunc func = tab[sdepth][ddepth]; + if (func == 0) + cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); + func(src, dst, src.cols * channels, src.rows, alpha, beta, stream); + } +}}} diff --git a/modules/gpu/src/cuda/saturate_cast.hpp b/modules/gpu/src/cuda/saturate_cast.hpp index 2b58eb1190..e5a5a83600 100644 --- a/modules/gpu/src/cuda/saturate_cast.hpp +++ b/modules/gpu/src/cuda/saturate_cast.hpp @@ -49,124 +49,206 @@ namespace cv { namespace gpu { - // To fix link error: this func already defined in other obj file - namespace + template static __device__ _Tp saturate_cast(uchar v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(schar v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(ushort v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(short v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(uint v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(int v) { return _Tp(v); } + template static __device__ _Tp 
saturate_cast(float v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(double v) { return _Tp(v); } + + template<> static __device__ uchar saturate_cast(schar v) + { return (uchar)max((int)v, 0); } + template<> static __device__ uchar saturate_cast(ushort v) + { return (uchar)min((uint)v, (uint)UCHAR_MAX); } + template<> static __device__ uchar saturate_cast(int v) + { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } + template<> static __device__ uchar saturate_cast(uint v) + { return (uchar)min(v, (uint)UCHAR_MAX); } + template<> static __device__ uchar saturate_cast(short v) + { return saturate_cast((uint)v); } + + template<> static __device__ uchar saturate_cast(float v) + { int iv = __float2int_rn(v); return saturate_cast(iv); } + template<> static __device__ uchar saturate_cast(double v) { - template __device__ _Tp saturate_cast(uchar v) { return _Tp(v); } - template __device__ _Tp saturate_cast(schar v) { return _Tp(v); } - template __device__ _Tp saturate_cast(ushort v) { return _Tp(v); } - template __device__ _Tp saturate_cast(short v) { return _Tp(v); } - template __device__ _Tp saturate_cast(uint v) { return _Tp(v); } - template __device__ _Tp saturate_cast(int v) { return _Tp(v); } - template __device__ _Tp saturate_cast(float v) { return _Tp(v); } - template __device__ _Tp saturate_cast(double v) { return _Tp(v); } - - template<> __device__ uchar saturate_cast(schar v) - { return (uchar)max((int)v, 0); } - template<> __device__ uchar saturate_cast(ushort v) - { return (uchar)min((uint)v, (uint)UCHAR_MAX); } - template<> __device__ uchar saturate_cast(int v) - { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } - template<> __device__ uchar saturate_cast(uint v) - { return (uchar)min(v, (uint)UCHAR_MAX); } - template<> __device__ uchar saturate_cast(short v) - { return saturate_cast((uint)v); } - - template<> __device__ uchar saturate_cast(float v) - { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> __device__ uchar saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); return saturate_cast(iv); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ schar saturate_cast(uchar v) - { return (schar)min((int)v, SCHAR_MAX); } - template<> __device__ schar saturate_cast(ushort v) - { return (schar)min((uint)v, (uint)SCHAR_MAX); } - template<> __device__ schar saturate_cast(int v) - { - return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? - v : v > 0 ? SCHAR_MAX : SCHAR_MIN); - } - template<> __device__ schar saturate_cast(short v) - { return saturate_cast((int)v); } - template<> __device__ schar saturate_cast(uint v) - { return (schar)min(v, (uint)SCHAR_MAX); } - - template<> __device__ schar saturate_cast(float v) - { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> __device__ schar saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); return saturate_cast(iv); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ ushort saturate_cast(schar v) - { return (ushort)max((int)v, 0); } - template<> __device__ ushort saturate_cast(short v) - { return (ushort)max((int)v, 0); } - template<> __device__ ushort saturate_cast(int v) - { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } - template<> __device__ ushort saturate_cast(uint v) - { return (ushort)min(v, (uint)USHRT_MAX); } - template<> __device__ ushort saturate_cast(float v) - { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> __device__ ushort saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); return saturate_cast(iv); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ short saturate_cast(ushort v) - { return (short)min((int)v, SHRT_MAX); } - template<> __device__ short saturate_cast(int v) - { - return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? - v : v > 0 ? SHRT_MAX : SHRT_MIN); - } - template<> __device__ short saturate_cast(uint v) - { return (short)min(v, (uint)SHRT_MAX); } - template<> __device__ short saturate_cast(float v) - { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> __device__ short saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); return saturate_cast(iv); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ int saturate_cast(float v) { return __float2int_rn(v); } - template<> __device__ int saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - return __double2int_rn(v); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ uint saturate_cast(float v){ return __float2uint_rn(v); } - template<> __device__ uint saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - return __double2uint_rn(v); - #else - return saturate_cast((float)v); - #endif - } + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + int iv = __double2int_rn(v); return saturate_cast(iv); + #else + return saturate_cast((float)v); + #endif } + + template<> static __device__ schar saturate_cast(uchar v) + { return (schar)min((int)v, SCHAR_MAX); } + template<> static __device__ schar saturate_cast(ushort v) + { return (schar)min((uint)v, (uint)SCHAR_MAX); } + template<> static __device__ schar saturate_cast(int v) + { + return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? + v : v > 0 ? SCHAR_MAX : SCHAR_MIN); + } + template<> static __device__ schar saturate_cast(short v) + { return saturate_cast((int)v); } + template<> static __device__ schar saturate_cast(uint v) + { return (schar)min(v, (uint)SCHAR_MAX); } + + template<> static __device__ schar saturate_cast(float v) + { int iv = __float2int_rn(v); return saturate_cast(iv); } + template<> static __device__ schar saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + int iv = __double2int_rn(v); return saturate_cast(iv); + #else + return saturate_cast((float)v); + #endif + } + + template<> static __device__ ushort saturate_cast(schar v) + { return (ushort)max((int)v, 0); } + template<> static __device__ ushort saturate_cast(short v) + { return (ushort)max((int)v, 0); } + template<> static __device__ ushort saturate_cast(int v) + { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } + template<> static __device__ ushort saturate_cast(uint v) + { return (ushort)min(v, (uint)USHRT_MAX); } + template<> static __device__ ushort saturate_cast(float v) + { int iv = __float2int_rn(v); return saturate_cast(iv); } + template<> static __device__ ushort saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + int iv = __double2int_rn(v); return saturate_cast(iv); + #else + return saturate_cast((float)v); + #endif + } + + template<> static __device__ short saturate_cast(ushort v) + { return (short)min((int)v, SHRT_MAX); } + template<> static __device__ short saturate_cast(int v) + { + return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? + v : v > 0 ? SHRT_MAX : SHRT_MIN); + } + template<> static __device__ short saturate_cast(uint v) + { return (short)min(v, (uint)SHRT_MAX); } + template<> static __device__ short saturate_cast(float v) + { int iv = __float2int_rn(v); return saturate_cast(iv); } + template<> static __device__ short saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + int iv = __double2int_rn(v); return saturate_cast(iv); + #else + return saturate_cast((float)v); + #endif + } + + template<> static __device__ int saturate_cast(float v) { return __float2int_rn(v); } + template<> static __device__ int saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + return __double2int_rn(v); + #else + return saturate_cast((float)v); + #endif + } + + template<> static __device__ uint saturate_cast(float v){ return __float2uint_rn(v); } + template<> static __device__ uint saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + return __double2uint_rn(v); + #else + return saturate_cast((float)v); + #endif + } + + template static __device__ _Tp saturate_cast(uchar4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(char4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(ushort4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(short4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(uint4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(int4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(float4 v) { return _Tp(v); } + + template<> static __device__ uchar4 saturate_cast(char4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(ushort4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(short4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(uint4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(int4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(float4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ char4 saturate_cast(uchar4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(ushort4 v) + { 
return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(short4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(uint4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(int4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(float4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ ushort4 saturate_cast(uchar4 v) + { return make_ushort4(v.x, v.y, v.z, v.w); } + template<> static __device__ ushort4 saturate_cast(char4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ ushort4 saturate_cast(short4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ ushort4 saturate_cast(uint4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ ushort4 saturate_cast(int4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ ushort4 saturate_cast(float4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ short4 saturate_cast(uchar4 v) + { return make_short4(v.x, v.y, v.z, v.w); } + template<> static __device__ short4 saturate_cast(char4 v) + { return make_short4(v.x, v.y, v.z, v.w); } + template<> static __device__ short4 saturate_cast(ushort4 v) + { return make_short4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ short4 saturate_cast(uint4 v) + { return make_short4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ short4 saturate_cast(int4 v) + { return make_short4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ short4 saturate_cast(float4 v) + { return make_short4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ uint4 saturate_cast(uchar4 v) + { return make_uint4(v.x, v.y, v.z, v.w); } + template<> static __device__ uint4 saturate_cast(char4 v) + { return make_uint4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uint4 saturate_cast(ushort4 v) + { return make_uint4(v.x, v.y, v.z, v.w); } + template<> static __device__ uint4 saturate_cast(short4 v) + { return make_uint4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uint4 saturate_cast(int4 v) + { return make_uint4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uint4 saturate_cast(float4 v) + { return make_uint4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ int4 saturate_cast(uchar4 v) + { 
return make_int4(v.x, v.y, v.z, v.w); } + template<> static __device__ int4 saturate_cast(char4 v) + { return make_int4(v.x, v.y, v.z, v.w); } + template<> static __device__ int4 saturate_cast(ushort4 v) + { return make_int4(v.x, v.y, v.z, v.w); } + template<> static __device__ int4 saturate_cast(short4 v) + { return make_int4(v.x, v.y, v.z, v.w); } + template<> static __device__ int4 saturate_cast(uint4 v) + { return make_int4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ int4 saturate_cast(float4 v) + { return make_int4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } } } diff --git a/modules/gpu/src/cuda/transform.hpp b/modules/gpu/src/cuda/transform.hpp new file mode 100644 index 0000000000..43ed19e952 --- /dev/null +++ b/modules/gpu/src/cuda/transform.hpp @@ -0,0 +1,118 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#ifndef __OPENCV_GPU_TRANSFORM_HPP__
+#define __OPENCV_GPU_TRANSFORM_HPP__
+
+#include "cuda_shared.hpp"
+#include "saturate_cast.hpp"
+#include "vecmath.hpp"
+
+namespace cv { namespace gpu { namespace algo_krnls
+{
+    template <typename T, typename D, typename UnOp>
+    static __global__ void transform(const T* src, size_t src_step,
+        D* dst, size_t dst_step, int width, int height, UnOp op)
+    {
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x < width && y < height)
+        {
+            T src_data = src[y * src_step + x];
+            dst[y * dst_step + x] = op(src_data, x, y);
+        }
+    }
+    template <typename T1, typename T2, typename D, typename BinOp>
+    static __global__ void transform(const T1* src1, size_t src1_step, const T2* src2, size_t src2_step,
+        D* dst, size_t dst_step, int width, int height, BinOp op)
+    {
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x < width && y < height)
+        {
+            T1 src1_data = src1[y * src1_step + x];
+            T2 src2_data = src2[y * src2_step + x];
+            dst[y * dst_step + x] = op(src1_data, src2_data, x, y);
+        }
+    }
+}}}
+
+namespace cv
+{
+    namespace gpu
+    {
+        template <typename T, typename D, typename UnOp>
+        static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
+        {
+            dim3 threads(16, 16, 1);
+            dim3 grid(1, 1, 1);
+
+            grid.x = divUp(src.cols, threads.x);
+            grid.y = divUp(src.rows, threads.y);
+
+            algo_krnls::transform<<<grid, threads, 0, stream>>>(src.ptr, src.elem_step,
+                dst.ptr, dst.elem_step, src.cols, src.rows, op);
+
+            if (stream == 0)
+                cudaSafeCall( cudaThreadSynchronize() );
+        }
+        template <typename T1, typename T2, typename D, typename BinOp>
+        static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, cudaStream_t stream)
+        {
+            dim3 threads(16, 16, 1);
+            dim3 grid(1, 1, 1);
+
+            grid.x = divUp(src1.cols, threads.x);
+            grid.y = divUp(src1.rows, threads.y);
+
+            algo_krnls::transform<<<grid, threads, 0, stream>>>(src1.ptr, src1.elem_step,
+                src2.ptr, src2.elem_step, dst.ptr, dst.elem_step, src1.cols, src1.rows, op);
+
+            if (stream == 0)
+                cudaSafeCall( cudaThreadSynchronize() );
+        }
+    }
+}
+
+#endif // __OPENCV_GPU_TRANSFORM_HPP__
diff --git a/modules/gpu/src/cuda/vecmath.hpp b/modules/gpu/src/cuda/vecmath.hpp
new file mode 100644
index 0000000000..225e9584bd
--- /dev/null
+++ b/modules/gpu/src/cuda/vecmath.hpp
@@ -0,0 +1,126 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_VECMATH_HPP__ +#define __OPENCV_GPU_VECMATH_HPP__ + +#include "cuda_shared.hpp" + +namespace cv +{ + namespace gpu + { + template struct TypeVec; + template struct TypeVec { typedef T vec_t; }; + template<> struct TypeVec { typedef uchar2 vec_t; }; + template<> struct TypeVec { typedef uchar2 vec_t; }; + template<> struct TypeVec { typedef uchar3 vec_t; };; + template<> struct TypeVec { typedef uchar3 vec_t; }; + template<> struct TypeVec { typedef uchar4 vec_t; };; + template<> struct TypeVec { typedef uchar4 vec_t; }; + template<> struct TypeVec { typedef char2 vec_t; }; + template<> struct TypeVec { typedef char2 vec_t; }; + template<> struct TypeVec { typedef char3 vec_t; }; + template<> struct TypeVec { typedef char3 vec_t; }; + template<> struct TypeVec { typedef char4 vec_t; }; + template<> struct TypeVec { typedef char4 vec_t; }; + template<> struct TypeVec { typedef ushort2 vec_t; }; + template<> struct TypeVec { typedef ushort2 vec_t; }; + template<> struct TypeVec { typedef ushort3 vec_t; }; + template<> struct TypeVec { typedef ushort3 vec_t; }; + template<> struct TypeVec { typedef ushort4 vec_t; }; + template<> struct TypeVec { typedef ushort4 vec_t; }; + template<> struct TypeVec { typedef short2 vec_t; }; + template<> struct TypeVec { typedef short2 vec_t; }; + template<> struct TypeVec { typedef short3 vec_t; }; + template<> struct TypeVec { typedef short3 vec_t; }; + template<> struct TypeVec { typedef short4 vec_t; }; + template<> struct TypeVec { typedef short4 vec_t; }; + template<> struct TypeVec { typedef uint2 vec_t; }; + template<> struct TypeVec { typedef uint2 vec_t; }; + template<> struct TypeVec { typedef uint3 vec_t; }; + template<> struct TypeVec { typedef uint3 vec_t; }; + template<> struct TypeVec { typedef uint4 vec_t; }; + template<> struct TypeVec { typedef uint4 vec_t; }; + template<> struct TypeVec { typedef int2 vec_t; }; + template<> struct TypeVec { typedef int2 vec_t; }; + template<> struct TypeVec { typedef int3 vec_t; }; + template<> struct TypeVec { typedef int3 vec_t; }; + template<> struct TypeVec { typedef int4 vec_t; }; + template<> struct TypeVec { typedef int4 vec_t; }; + template<> struct TypeVec { typedef float2 vec_t; }; + template<> struct TypeVec { typedef float2 vec_t; }; + template<> struct TypeVec { typedef float3 vec_t; }; + template<> struct TypeVec { typedef float3 vec_t; }; + template<> struct TypeVec { typedef float4 vec_t; }; + template<> struct TypeVec { typedef float4 vec_t; }; + + static __device__ uchar4 operator+(const uchar4& a, const 
uchar4& b) + { + return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); + } + static __device__ uchar4 operator-(const uchar4& a, const uchar4& b) + { + return make_uchar4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); + } + static __device__ uchar4 operator*(const uchar4& a, const uchar4& b) + { + return make_uchar4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); + } + static __device__ uchar4 operator/(const uchar4& a, const uchar4& b) + { + return make_uchar4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); + } + template + static __device__ uchar4 operator*(const uchar4& a, T s) + { + return make_uchar4(a.x * s, a.y * s, a.z * s, a.w * s); + } + template + static __device__ uchar4 operator*(T s, const uchar4& a) + { + return a * s; + } + } +} + +#endif // __OPENCV_GPU_VECMATH_HPP__ \ No newline at end of file diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp index f933453e83..7f93c12fc4 100644 --- a/modules/gpu/src/cudastream.cpp +++ b/modules/gpu/src/cudastream.cpp @@ -69,6 +69,22 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int #include "opencv2/gpu/stream_accessor.hpp" +namespace cv +{ + namespace gpu + { + namespace matrix_operations + { + void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0); + + void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0); + void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0); + + void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0); + } + } +} + struct Stream::Impl { cudaStream_t stream; diff --git a/modules/gpu/src/filtering_npp.cpp b/modules/gpu/src/filtering.cpp similarity index 79% rename from modules/gpu/src/filtering_npp.cpp rename to modules/gpu/src/filtering.cpp index a9aceb50ef..a87d194274 100644 --- a/modules/gpu/src/filtering_npp.cpp +++ b/modules/gpu/src/filtering.cpp @@ -49,18 +49,18 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) Ptr cv::gpu::createFilter2D_GPU(const Ptr) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&, bool) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::getMorphologyFilter_GPU(int, int, const GpuMat&, const Size&, Point) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::getLinearFilter_GPU(int, int, const GpuMat&, const Size&, Point, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr(0); } -Ptr 
cv::gpu::getLinearRowFilter_GPU(int, int, const GpuMat&, int, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::getLinearColumnFilter_GPU(int, int, const GpuMat&, int, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, bool) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createDerivFilter_GPU(int, int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createGaussianFilter_GPU(int, Size, double, double) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } @@ -71,7 +71,7 @@ void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nog void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); } void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); } void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); } -void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, bool) { throw_nogpu(); } +void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point) { throw_nogpu(); } void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double) { throw_nogpu(); } void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double) { throw_nogpu(); } void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double) { throw_nogpu(); } @@ -164,28 +164,10 @@ Ptr cv::gpu::createFilter2D_GPU(const Ptr filt namespace { - struct RowColumnFilterApply - { - void operator()(Ptr& rowFilter, Ptr& columnFilter, - GpuMat& srcROI, GpuMat& dstROI, GpuMat& dstBufROI) - { - (*rowFilter)(srcROI, dstBufROI); - (*columnFilter)(dstBufROI, dstROI); - } - }; - struct ColumnRowFilterApply - { - void operator()(Ptr& rowFilter, Ptr& columnFilter, - GpuMat& srcROI, GpuMat& dstROI, GpuMat& dstBufROI) - { - (*columnFilter)(srcROI, dstBufROI); - (*rowFilter)(dstBufROI, dstROI); - } - }; - class SeparableFilterEngine_GPU_base : public FilterEngine_GPU + class SeparableFilterEngine_GPU : public FilterEngine_GPU { public: - SeparableFilterEngine_GPU_base(const Ptr& rowFilter_, + SeparableFilterEngine_GPU(const Ptr& rowFilter_, const Ptr& columnFilter_) : rowFilter(rowFilter_), columnFilter(columnFilter_) { @@ -208,6 +190,9 @@ namespace srcROI = src(roi); dstROI = dst(roi); dstBufROI = dstBuf(roi); + + (*rowFilter)(srcROI, dstBufROI); + (*columnFilter)(dstBufROI, dstROI); } Ptr rowFilter; @@ -219,32 +204,12 @@ namespace GpuMat dstROI; GpuMat dstBufROI; }; - template - class SeparableFilterEngine_GPU : public SeparableFilterEngine_GPU_base - { - public: - SeparableFilterEngine_GPU(const Ptr& rowFilter_, - const Ptr& columnFilter_, FA fa_) : - SeparableFilterEngine_GPU_base(rowFilter_, columnFilter_), fa(fa_) - { - } - - virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) - { - SeparableFilterEngine_GPU_base::apply(src, dst, roi); - fa(rowFilter, columnFilter, srcROI, dstROI, dstBufROI); - } - - FA fa; - }; } Ptr cv::gpu::createSeparableFilter_GPU(const Ptr& rowFilter, - const Ptr& columnFilter, bool rowFilterFirst) + const Ptr& 
columnFilter) { - if (rowFilterFirst) - return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter, RowColumnFilterApply())); - return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter, ColumnRowFilterApply())); + return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter)); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -398,7 +363,7 @@ namespace }; } -Ptr cv::gpu::getMorphologyFilter_GPU(int op, int type, const GpuMat& kernel, const Size& ksize, Point anchor) +Ptr cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor) { static const nppMorfFilter_t nppMorfFilter_callers[2][5] = { @@ -408,11 +373,12 @@ Ptr cv::gpu::getMorphologyFilter_GPU(int op, int type, const Gpu CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE); CV_Assert(type == CV_8UC1 || type == CV_8UC4); - CV_Assert(kernel.type() == CV_8UC1 && kernel.rows == 1 && kernel.cols == ksize.area()); - + + GpuMat gpu_krnl; + normalizeKernel(kernel, gpu_krnl); normalizeAnchor(anchor, ksize); - return Ptr(new NPPMorphFilter(ksize, anchor, kernel, nppMorfFilter_callers[op][CV_MAT_CN(type)])); + return Ptr(new NPPMorphFilter(ksize, anchor, gpu_krnl, nppMorfFilter_callers[op][CV_MAT_CN(type)])); } namespace @@ -447,10 +413,7 @@ Ptr cv::gpu::createMorphologyFilter_GPU(int op, int type, cons Size ksize = kernel.size(); - GpuMat gpu_krnl; - normalizeKernel(kernel, gpu_krnl); - - Ptr filter2D = getMorphologyFilter_GPU(op, type, gpu_krnl, ksize, anchor); + Ptr filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor); return Ptr(new MorphologyFilterEngine_GPU(filter2D, iterations)); } @@ -575,27 +538,25 @@ namespace }; } -Ptr cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const GpuMat& kernel, const Size& ksize, Point anchor, int nDivisor) +Ptr cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, Point anchor) { static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R}; - CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); - CV_Assert(kernel.type() == CV_32SC1 && kernel.rows == 1 && kernel.cols == ksize.area()); - + CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); + + GpuMat gpu_krnl; + int nDivisor; + normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true); normalizeAnchor(anchor, ksize); - return Ptr(new NPPLinearFilter(ksize, anchor, kernel, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)])); + return Ptr(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)])); } Ptr cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor) { Size ksize = kernel.size(); - GpuMat gpu_krnl; - int nDivisor; - normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true); - - Ptr linearFilter = getLinearFilter_GPU(srcType, dstType, gpu_krnl, ksize, anchor, nDivisor); + Ptr linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor); return createFilter2D_GPU(linearFilter); } @@ -614,11 +575,26 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke //////////////////////////////////////////////////////////////////////////////////////////////////// // Separable Linear Filter +namespace cv { namespace gpu { namespace filters +{ + void linearRowFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void 
linearRowFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearRowFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearRowFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + + void linearColumnFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearColumnFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearColumnFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearColumnFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); +}}} + namespace { typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor); + typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + class NppLinearRowFilter : public BaseRowFilter_GPU { public: @@ -638,20 +614,64 @@ namespace Npp32s nDivisor; nppFilter1D_t func; }; + + class GpuLinearRowFilter : public BaseRowFilter_GPU + { + public: + GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) : + BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {} + + virtual void operator()(const GpuMat& src, GpuMat& dst) + { + func(src, dst, kernel.ptr(), ksize, anchor); + } + + Mat kernel; + gpuFilter1D_t func; + }; } -Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const GpuMat& rowKernel, int anchor, int nDivisor) +Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor) { + using namespace cv::gpu::filters; static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R}; + static const gpuFilter1D_t gpuFilter1D_callers[6][6] = + { + {0,0,0,0,0,0}, + {0,0,0,0,0,0}, + {0,0,0,0,0,0}, + {0,0,0,0,0,0}, + {0,0,0,0,linearRowFilter_gpu_32s32s, linearRowFilter_gpu_32s32f}, + {0,0,0,0,linearRowFilter_gpu_32f32s, linearRowFilter_gpu_32f32f} + }; + + if ((srcType == CV_8UC1 || srcType == CV_8UC4) && bufType == srcType) + { + GpuMat gpu_row_krnl; + int nDivisor; + normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true); - CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && bufType == srcType); - CV_Assert(rowKernel.type() == CV_32SC1 && rowKernel.rows == 1); + int ksize = gpu_row_krnl.cols; + normalizeAnchor(anchor, ksize); - int ksize = rowKernel.cols; + return Ptr(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor, + nppFilter1D_callers[CV_MAT_CN(srcType)])); + } + else if ((srcType == CV_32SC1 || srcType == CV_32FC1) && (bufType == CV_32SC1 || bufType == CV_32FC1)) + { + Mat temp(rowKernel.size(), CV_32FC1); + rowKernel.convertTo(temp, CV_32FC1); + Mat cont_krnl = temp.reshape(1, 1); - normalizeAnchor(anchor, ksize); + int ksize = cont_krnl.cols; + normalizeAnchor(anchor, ksize); - return Ptr(new NppLinearRowFilter(ksize, anchor, rowKernel, nDivisor, nppFilter1D_callers[CV_MAT_CN(srcType)])); + return Ptr(new GpuLinearRowFilter(ksize, anchor, cont_krnl, + gpuFilter1D_callers[CV_MAT_DEPTH(srcType)][CV_MAT_DEPTH(bufType)])); + } + + CV_Assert(!"Unsupported types"); + return Ptr(0); } namespace @@ -675,49 
@@ -675,49 +695,88 @@ namespace
         Npp32s nDivisor;
         nppFilter1D_t func;
     };
+
+    class GpuLinearColumnFilter : public BaseColumnFilter_GPU
+    {
+    public:
+        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) :
+            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
+
+        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        {
+            func(src, dst, kernel.ptr<float>(), ksize, anchor);
+        }
+
+        Mat kernel;
+        gpuFilter1D_t func;
+    };
 }
 
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const GpuMat& columnKernel, int anchor, int nDivisor)
+Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor)
 {
+    using namespace cv::gpu::filters;
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};
+    static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
+    {
+        {0,0,0,0,0,0},
+        {0,0,0,0,0,0},
+        {0,0,0,0,0,0},
+        {0,0,0,0,0,0},
+        {0,0,0,0,linearColumnFilter_gpu_32s32s, linearColumnFilter_gpu_32s32f},
+        {0,0,0,0,linearColumnFilter_gpu_32f32s, linearColumnFilter_gpu_32f32f}
+    };
+
+    if ((bufType == CV_8UC1 || bufType == CV_8UC4) && dstType == bufType)
+    {
+        GpuMat gpu_col_krnl;
+        int nDivisor;
+        normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
 
-    CV_Assert((bufType == CV_8UC1 || bufType == CV_8UC4) && dstType == bufType);
-    CV_Assert(columnKernel.type() == CV_32SC1 && columnKernel.rows == 1);
+        int ksize = gpu_col_krnl.cols;
+        normalizeAnchor(anchor, ksize);
 
-    int ksize = columnKernel.cols;
+        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor,
+            nppFilter1D_callers[CV_MAT_CN(bufType)]));
+    }
+    else if ((bufType == CV_32SC1 || bufType == CV_32FC1) && (dstType == CV_32SC1 || dstType == CV_32FC1))
+    {
+        Mat temp(columnKernel.size(), CV_32FC1);
+        columnKernel.convertTo(temp, CV_32FC1);
+        Mat cont_krnl = temp.reshape(1, 1);
 
-    normalizeAnchor(anchor, ksize);
+        int ksize = cont_krnl.cols;
+        normalizeAnchor(anchor, ksize);
 
-    return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, columnKernel, nDivisor, nppFilter1D_callers[CV_MAT_CN(bufType)]));
+        return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl,
+            gpuFilter1D_callers[CV_MAT_DEPTH(bufType)][CV_MAT_DEPTH(dstType)]));
+    }
+
+    CV_Assert(!"Unsupported types");
+    return Ptr<BaseColumnFilter_GPU>(0);
 }
 
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
-    const Point& anchor, bool rowFilterFirst)
+    const Point& anchor)
 {
     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
     int cn = CV_MAT_CN(srcType);
     int bdepth = std::max(sdepth, ddepth);
     int bufType = CV_MAKETYPE(bdepth, cn);
 
-    GpuMat gpu_row_krnl, gpu_col_krnl;
-    int nRowDivisor, nColDivisor;
-    normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nRowDivisor, true);
-    normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nColDivisor, true);
+    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x);
+    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y);
 
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, gpu_row_krnl, anchor.x, nRowDivisor);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, gpu_col_krnl, anchor.y, nColDivisor);
-
-    return createSeparableFilter_GPU(rowFilter, columnFilter, rowFilterFirst);
+    return createSeparableFilter_GPU(rowFilter, columnFilter);
 }
 
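createSeparableLinearFilter_GPU picks the intermediate buffer depth as the deeper of source and destination, so precision is not lost between the two passes. A minimal sketch of that derivation, using only the CV_* macros shown above (the helper name is hypothetical):

    #include <algorithm>
    #include <opencv2/core/core.hpp>

    int bufTypeFor(int srcType, int dstType)
    {
        // e.g. src CV_32SC1 (depth 4), dst CV_32FC1 (depth 5):
        // bdepth = max(4, 5) = 5, so rows are filtered into a CV_32FC1
        // buffer before the column pass reads it back.
        int bdepth = std::max(CV_MAT_DEPTH(srcType), CV_MAT_DEPTH(dstType));
        return CV_MAKETYPE(bdepth, CV_MAT_CN(srcType));
    }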
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, bool rowFilterFirst)
+void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor)
 {
     if( ddepth < 0 )
         ddepth = src.depth();
 
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
 
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowFilterFirst);
+    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor);
     f->apply(src, dst);
 }
 
@@ -728,7 +787,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, i
 {
     Mat kx, ky;
     getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), dx >= dy);
+    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1));
 }
 
 void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale)
@@ -746,7 +805,7 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
         ky *= scale;
     }
 
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), dx >= dy);
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
 }
 
 void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale)
@@ -764,7 +823,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
         ky *= scale;
     }
 
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), dx >= dy);
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
 }
 
 void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale)
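With the rowFilterFirst flag gone, Sobel and Scharr no longer reorder the passes on dx >= dy; the engine always applies the row kernel first, then the column kernel. Callers simply drop the last argument; a minimal sketch, with a hypothetical image:

    #include <opencv2/gpu/gpu.hpp>

    void sobelExample()
    {
        cv::gpu::GpuMat src(cv::Mat::zeros(480, 640, CV_8UC1)), dx;
        // First-order x-derivative with a 3x3 aperture; the kernels come
        // from getDerivKernels inside createDerivFilter_GPU.
        cv::gpu::Sobel(src, dx, CV_8U, 1, 0, 3);
    }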
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 73b44498e0..c60e605991 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -75,7 +75,7 @@ void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*) { throw_nogpu();
 
 namespace cv { namespace gpu
 {
-    namespace improc
+    namespace imgproc
     {
         void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
         void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
@@ -142,7 +142,7 @@ namespace cv { namespace gpu
 void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap)
 {
     typedef void (*remap_gpu_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
-    static const remap_gpu_t callers[] = {improc::remap_gpu_1c, 0, improc::remap_gpu_3c};
+    static const remap_gpu_t callers[] = {imgproc::remap_gpu_1c, 0, imgproc::remap_gpu_3c};
 
     CV_Assert((src.type() == CV_8U || src.type() == CV_8UC3) && xmap.type() == CV_32F && ymap.type() == CV_32F);
 
@@ -180,7 +180,7 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
         eps = 1.f;
     eps = (float)std::max(criteria.epsilon, 0.0);
 
-    improc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
+    imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -207,7 +207,7 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
         eps = 1.f;
     eps = (float)std::max(criteria.epsilon, 0.0);
 
-    improc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
+    imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -223,7 +223,7 @@ namespace
             out = dst;
         out.create(src.size(), CV_8UC4);
 
-        improc::drawColorDisp_gpu((DevMem2D_<T>)src, out, ndisp, stream);
+        imgproc::drawColorDisp_gpu((DevMem2D_<T>)src, out, ndisp, stream);
 
         dst = out;
     }
 
@@ -256,7 +256,7 @@ namespace
     void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)
     {
         xyzw.create(disp.rows, disp.cols, CV_32FC4);
-        improc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
+        imgproc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
     }
 
     typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream);
@@ -313,7 +313,7 @@ namespace
             case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);
-                static const func_t funcs[] = {improc::RGB2RGB_gpu_8u, 0, improc::RGB2RGB_gpu_16u, 0, 0, improc::RGB2RGB_gpu_32f};
+                static const func_t funcs[] = {imgproc::RGB2RGB_gpu_8u, 0, imgproc::RGB2RGB_gpu_16u, 0, 0, imgproc::RGB2RGB_gpu_32f};
 
                 CV_Assert(scn == 3 || scn == 4);
 
@@ -338,7 +338,7 @@ namespace
 
                 dst.create(sz, CV_8UC2);
 
-                improc::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream);
+                imgproc::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream);
                 break;
             }
 
@@ -356,14 +356,14 @@ namespace
 
                 dst.create(sz, CV_MAKETYPE(depth, dcn));
 
-                improc::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream);
+                imgproc::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream);
                 break;
             }
 
             case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);
-                static const func_t funcs[] = {improc::RGB2Gray_gpu_8u, 0, improc::RGB2Gray_gpu_16u, 0, 0, improc::RGB2Gray_gpu_32f};
+                static const func_t funcs[] = {imgproc::RGB2Gray_gpu_8u, 0, imgproc::RGB2Gray_gpu_16u, 0, 0, imgproc::RGB2Gray_gpu_32f};
 
                 CV_Assert(scn == 3 || scn == 4);
 
@@ -383,14 +383,14 @@ namespace
 
                 dst.create(sz, CV_8UC1);
 
-                improc::RGB5x52Gray_gpu(src, green_bits, dst, stream);
+                imgproc::RGB5x52Gray_gpu(src, green_bits, dst, stream);
                 break;
             }
 
             case CV_GRAY2BGR: case CV_GRAY2BGRA:
             {
                 typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream);
-                static const func_t funcs[] = {improc::Gray2RGB_gpu_8u, 0, improc::Gray2RGB_gpu_16u, 0, 0, improc::Gray2RGB_gpu_32f};
+                static const func_t funcs[] = {imgproc::Gray2RGB_gpu_8u, 0, imgproc::Gray2RGB_gpu_16u, 0, 0, imgproc::Gray2RGB_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -410,7 +410,7 @@ namespace
 
                 dst.create(sz, CV_8UC2);
 
-                improc::Gray2RGB5x5_gpu(src, dst, green_bits, stream);
+                imgproc::Gray2RGB5x5_gpu(src, dst, green_bits, stream);
                 break;
             }
 
@@ -419,7 +419,7 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
-                static const func_t funcs[] = {improc::RGB2YCrCb_gpu_8u, 0, improc::RGB2YCrCb_gpu_16u, 0, 0, improc::RGB2YCrCb_gpu_32f};
+                static const func_t funcs[] = {imgproc::RGB2YCrCb_gpu_8u, 0, imgproc::RGB2YCrCb_gpu_16u, 0, 0, imgproc::RGB2YCrCb_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
                 CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
 
@@ -456,7 +456,7 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
-                static const func_t funcs[] = {improc::YCrCb2RGB_gpu_8u, 0, improc::YCrCb2RGB_gpu_16u, 0, 0, improc::YCrCb2RGB_gpu_32f};
+                static const func_t funcs[] = {imgproc::YCrCb2RGB_gpu_8u, 0, imgproc::YCrCb2RGB_gpu_16u, 0, 0, imgproc::YCrCb2RGB_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -485,7 +485,7 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
-                static const func_t funcs[] = {improc::RGB2XYZ_gpu_8u, 0, improc::RGB2XYZ_gpu_16u, 0, 0, improc::RGB2XYZ_gpu_32f};
+                static const func_t funcs[] = {imgproc::RGB2XYZ_gpu_8u, 0, imgproc::RGB2XYZ_gpu_16u, 0, 0, imgproc::RGB2XYZ_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -534,7 +534,7 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
-                static const func_t funcs[] = {improc::XYZ2RGB_gpu_8u, 0, improc::XYZ2RGB_gpu_16u, 0, 0, improc::XYZ2RGB_gpu_32f};
+                static const func_t funcs[] = {imgproc::XYZ2RGB_gpu_8u, 0, imgproc::XYZ2RGB_gpu_16u, 0, 0, imgproc::XYZ2RGB_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -584,8 +584,8 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
-                static const func_t funcs_hsv[] = {improc::RGB2HSV_gpu_8u, 0, 0, 0, 0, improc::RGB2HSV_gpu_32f};
-                static const func_t funcs_hls[] = {improc::RGB2HLS_gpu_8u, 0, 0, 0, 0, improc::RGB2HLS_gpu_32f};
+                static const func_t funcs_hsv[] = {imgproc::RGB2HSV_gpu_8u, 0, 0, 0, 0, imgproc::RGB2HSV_gpu_32f};
+                static const func_t funcs_hls[] = {imgproc::RGB2HLS_gpu_8u, 0, 0, 0, 0, imgproc::RGB2HLS_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -610,8 +610,8 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
-                static const func_t funcs_hsv[] = {improc::HSV2RGB_gpu_8u, 0, 0, 0, 0, improc::HSV2RGB_gpu_32f};
-                static const func_t funcs_hls[] = {improc::HLS2RGB_gpu_8u, 0, 0, 0, 0, improc::HLS2RGB_gpu_32f};
+                static const func_t funcs_hsv[] = {imgproc::HSV2RGB_gpu_8u, 0, 0, 0, 0, imgproc::HSV2RGB_gpu_32f};
+                static const func_t funcs_hls[] = {imgproc::HLS2RGB_gpu_8u, 0, 0, 0, 0, imgproc::HLS2RGB_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index 7d58619b28..814c79c6b4 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -77,6 +77,22 @@ namespace cv
 
 #else /* !defined (HAVE_CUDA) */
 
+namespace cv
+{
+    namespace gpu
+    {
+        namespace matrix_operations
+        {
+            void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
+
+            void set_to_without_mask(DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
+            void set_to_with_mask(DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
+
+            void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
+        }
+    }
+}
+
 void cv::gpu::GpuMat::upload(const Mat& m)
 {
     CV_DbgAssert(!m.empty());
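The block above forward-declares the CUDA-side entry points that GpuMat's methods call. For illustration only, a hedged sketch of how a host wrapper might forward to the declared convert_to; the wrapper itself is hypothetical and assumes GpuMat's implicit conversion to DevMem2D:

    #include <opencv2/gpu/gpu.hpp>

    // Hypothetical helper, not part of this patch: convert any GpuMat to
    // CV_32F per channel, forwarding to the declared launcher with the
    // default (synchronous) stream.
    void convertToFloat(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst,
                        double alpha = 1.0, double beta = 0.0)
    {
        dst.create(src.size(), CV_MAKETYPE(CV_32F, src.channels()));
        cv::gpu::matrix_operations::convert_to(src, src.depth(), dst, CV_32F,
                                               src.channels(), alpha, beta);
    }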
diff --git a/tests/gpu/src/gputest_main.cpp b/tests/gpu/src/gputest_main.cpp
index ebcbf2c65e..12d62a8c15 100644
--- a/tests/gpu/src/gputest_main.cpp
+++ b/tests/gpu/src/gputest_main.cpp
@@ -53,7 +53,6 @@ const char* blacklist[] =
     //"GPU-NppImageMeanStdDev", // different precision
     //"GPU-NppImageExp", // different precision
     //"GPU-NppImageLog", // different precision
-    //"GPU-NppImageMagnitude", // different precision
 
     "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
     //"GPU-NppImageResize", // different precision
@@ -61,8 +60,8 @@ const char* blacklist[] =
     //"GPU-NppImageWarpPerspective", // different precision
     //"GPU-NppImageIntegral", // different precision
 
-    //"GPU-NppImageSobel", // ???
-    //"GPU-NppImageScharr", // ???
+    //"GPU-NppImageSobel", // sign error
+    //"GPU-NppImageScharr", // sign error
 
     //"GPU-NppImageGaussianBlur", // different precision
     0
 };