fixed bug in gpu filter engine (incorrect buffer type) and in vector's saturate_cast.

changed buffer type in linear filters to float.
added support of 1 channel image to linear filters.
added support of BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border type to gpu linear filters.
minor fix in tests.
update comments in gpu.hpp.
This commit is contained in:
Vladislav Vinogradov 2010-12-13 08:43:04 +00:00
parent 108ab94023
commit 49ec8ba742
9 changed files with 677 additions and 310 deletions

View File

@ -388,7 +388,7 @@ namespace cv
CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c);
//! transposes the matrix
//! supports only CV_8UC1 type
//! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 type
CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst);
//! computes element-wise absolute difference of two arrays (c = abs(a - b))
@ -725,11 +725,11 @@ namespace cv
};
//! returns the non-separable filter engine with the specified filter
CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D);
CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D, int srcType, int dstType);
//! returns the separable filter engine with the specified filters
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
const Ptr<BaseColumnFilter_GPU>& columnFilter);
const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
//! returns horizontal 1D box filter
//! supports only CV_8UC1 source type and CV_32FC1 sum type
@ -767,23 +767,40 @@ namespace cv
CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
const Point& anchor = Point(-1,-1));
//! returns the primitive row filter with the specified kernel
//! returns the primitive row filter with the specified kernel.
//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
//! there are two version of algorithm: NPP and OpenCV.
//! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
//! otherwise calls OpenCV version.
//! NPP supports only BORDER_CONSTANT border type.
//! OpenCV version supports only CV_32F as buffer depth and
//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
int anchor = -1);
int anchor = -1, int borderType = BORDER_CONSTANT);
//! returns the primitive column filter with the specified kernel
//! returns the primitive column filter with the specified kernel.
//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
//! there are two version of algorithm: NPP and OpenCV.
//! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
//! otherwise calls OpenCV version.
//! NPP supports only BORDER_CONSTANT border type.
//! OpenCV version supports only CV_32F as buffer depth and
//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
int anchor = -1);
int anchor = -1, int borderType = BORDER_CONSTANT);
//! returns the separable linear filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
const Mat& columnKernel, const Point& anchor = Point(-1,-1));
const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
int columnBorderType = -1);
//! returns filter engine for the generalized Sobel operator
CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize);
CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
//! returns the Gaussian filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0);
CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
//! returns maximum filter
CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
@ -812,16 +829,19 @@ namespace cv
//! applies separable 2D linear filter to the image
CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
Point anchor = Point(-1,-1));
Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
//! applies generalized Sobel operator to the image
CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1);
CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
//! applies the vertical or horizontal Scharr operator to the image
CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1);
CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
//! smooths the image using Gaussian filter.
CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0);
CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
//! applies Laplacian operator to the image
//! supports only ksize = 1 and ksize = 3

View File

@ -277,12 +277,12 @@ namespace cv { namespace gpu { namespace mathfunc
void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
{
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8SC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4
|| src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1);
dst.create( src.cols, src.rows, src.type() );
if (src.type() == CV_8UC1)
if (src.type() == CV_8UC1 || src.type() == CV_8SC1)
{
NppiSize sz;
sz.width = src.cols;

View File

@ -43,6 +43,7 @@
#include "opencv2/gpu/devmem2d.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "safe_call.hpp"
#include "internal_shared.hpp"
@ -50,14 +51,198 @@
using namespace cv::gpu;
using namespace cv::gpu::device;
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+30F
#endif
namespace cv
{
namespace gpu
{
namespace device
{
struct BrdReflect101
{
explicit BrdReflect101(int len): last(len - 1) {}
__device__ int idx_low(int i) const
{
return abs(i);
}
__device__ int idx_high(int i) const
{
return last - abs(last - i);
}
__device__ int idx(int i) const
{
return abs(idx_high(i));
}
bool is_range_safe(int mini, int maxi) const
{
return -last <= mini && maxi <= 2 * last;
}
int last;
};
template <typename D>
struct BrdRowReflect101: BrdReflect101
{
explicit BrdRowReflect101(int len): BrdReflect101(len) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i)]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename D>
struct BrdColReflect101: BrdReflect101
{
BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i) * step]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i) * step]);
}
int step;
};
struct BrdReplicate
{
explicit BrdReplicate(int len): last(len - 1) {}
__device__ int idx_low(int i) const
{
return max(i, 0);
}
__device__ int idx_high(int i) const
{
return min(i, last);
}
__device__ int idx(int i) const
{
return max(min(i, last), 0);
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
int last;
};
template <typename D>
struct BrdRowReplicate: BrdReplicate
{
explicit BrdRowReplicate(int len): BrdReplicate(len) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i)]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename D>
struct BrdColReplicate: BrdReplicate
{
BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i) * step]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i) * step]);
}
int step;
};
template <typename D>
struct BrdRowConstant
{
explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(data[i]) : val;
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(data[i]) : val;
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
int len;
D val;
};
template <typename D>
struct BrdColConstant
{
BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(data[i * step]) : val;
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
int len;
int step;
D val;
};
}
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Linear filters
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
namespace filter_krnls
{
@ -74,46 +259,53 @@ namespace cv { namespace gpu { namespace filters
namespace filter_krnls
{
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
__global__ void linearRowFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
template <typename T, size_t size> struct SmemType_
{
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t smem_t;
};
template <typename T> struct SmemType_<T, 4>
{
typedef T smem_t;
};
template <typename T> struct SmemType
{
typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t;
};
template <int ksize, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
{
typedef typename SmemType<T>::smem_t smem_t;
__shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
const int blockStartX = blockDim.x * blockIdx.x;
const int blockStartY = blockDim.y * blockIdx.y;
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
const int threadX = blockStartX + threadIdx.x;
const int prevThreadX = threadX - blockDim.x;
const int nextThreadX = threadX + blockDim.x;
smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;
const int threadY = blockStartY + threadIdx.y;
T* sDataRow = smem + threadIdx.y * blockDim.x * 3;
if (threadY < height)
if (y < src.rows)
{
const T* rowSrc = src + threadY * src_step;
const T* rowSrc = src.ptr(y);
sDataRow[threadIdx.x + blockDim.x] = threadX < width ? rowSrc[threadX] : VecTraits<T>::all(0);
sDataRow[threadIdx.x] = prevThreadX >= 0 ? rowSrc[prevThreadX] : VecTraits<T>::all(0);
sDataRow[(blockDim.x << 1) + threadIdx.x] = nextThreadX < width ? rowSrc[nextThreadX] : VecTraits<T>::all(0);
sDataRow[threadIdx.x ] = b.at_low(x - BLOCK_DIM_X, rowSrc);
sDataRow[threadIdx.x + BLOCK_DIM_X ] = b.at_high(x, rowSrc);
sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc);
__syncthreads();
if (threadX < width)
if (x < src.cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
sum_t sum = VecTraits<sum_t>::all(0);
sDataRow += threadIdx.x + blockDim.x - anchor;
sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;
#pragma unroll
for(int i = 0; i < KERNEL_SIZE; ++i)
for(int i = 0; i < ksize; ++i)
sum = sum + sDataRow[i] * cLinearKernel[i];
dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
dst.ptr(y)[x] = saturate_cast<D>(sum);
}
}
}
@ -121,100 +313,138 @@ namespace filter_krnls
namespace cv { namespace gpu { namespace filters
{
template <int KERNEL_SIZE, typename T, typename D>
template <int ksize, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
{
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 16;
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.data, src.step/src.elemSize(),
dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);
typedef typename filter_krnls::SmemType<T>::smem_t smem_t;
B<smem_t> b(src.cols);
if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
{
cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, "
"try bigger image or another border extrapolation mode", __FILE__, __LINE__);
}
filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
cudaSafeCall( cudaThreadSynchronize() );
}
template <typename T, typename D>
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
static const caller_t callers[] =
{linearRowFilter_caller<0 , T, D>, linearRowFilter_caller<1 , T, D>,
linearRowFilter_caller<2 , T, D>, linearRowFilter_caller<3 , T, D>,
linearRowFilter_caller<4 , T, D>, linearRowFilter_caller<5 , T, D>,
linearRowFilter_caller<6 , T, D>, linearRowFilter_caller<7 , T, D>,
linearRowFilter_caller<8 , T, D>, linearRowFilter_caller<9 , T, D>,
linearRowFilter_caller<10, T, D>, linearRowFilter_caller<11, T, D>,
linearRowFilter_caller<12, T, D>, linearRowFilter_caller<13, T, D>,
linearRowFilter_caller<14, T, D>, linearRowFilter_caller<15, T, D>};
static const caller_t callers[3][17] =
{
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
linearRowFilter_caller<10, T, D, BrdRowReflect101>,
linearRowFilter_caller<11, T, D, BrdRowReflect101>,
linearRowFilter_caller<12, T, D, BrdRowReflect101>,
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
linearRowFilter_caller<16, T, D, BrdRowReflect101>,
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
linearRowFilter_caller<10, T, D, BrdRowReplicate>,
linearRowFilter_caller<11, T, D, BrdRowReplicate>,
linearRowFilter_caller<12, T, D, BrdRowReplicate>,
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
linearRowFilter_caller<16, T, D, BrdRowReplicate>,
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowConstant>,
linearRowFilter_caller<2 , T, D, BrdRowConstant>,
linearRowFilter_caller<3 , T, D, BrdRowConstant>,
linearRowFilter_caller<4 , T, D, BrdRowConstant>,
linearRowFilter_caller<5 , T, D, BrdRowConstant>,
linearRowFilter_caller<6 , T, D, BrdRowConstant>,
linearRowFilter_caller<7 , T, D, BrdRowConstant>,
linearRowFilter_caller<8 , T, D, BrdRowConstant>,
linearRowFilter_caller<9 , T, D, BrdRowConstant>,
linearRowFilter_caller<10, T, D, BrdRowConstant>,
linearRowFilter_caller<11, T, D, BrdRowConstant>,
linearRowFilter_caller<12, T, D, BrdRowConstant>,
linearRowFilter_caller<13, T, D, BrdRowConstant>,
linearRowFilter_caller<14, T, D, BrdRowConstant>,
linearRowFilter_caller<15, T, D, BrdRowConstant>,
linearRowFilter_caller<16, T, D, BrdRowConstant>,
}
};
loadLinearKernel(kernel, ksize);
callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
}
template void linearRowFilter_gpu<uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<uchar4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<char4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<char4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<ushort2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<ushort2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<short2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<short2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<int, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<int, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<float, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<float, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);;
template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearRowFilter_gpu<int , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
}}}
namespace filter_krnls
{
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
__global__ void linearColumnFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
template <int ksize, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
{
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
const int blockStartX = blockDim.x * blockIdx.x;
const int blockStartY = blockDim.y * blockIdx.y;
const int threadX = blockStartX + threadIdx.x;
const int threadY = blockStartY + threadIdx.y;
const int prevThreadY = threadY - blockDim.y;
const int nextThreadY = threadY + blockDim.y;
const int smem_step = blockDim.x;
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
T* sDataColumn = smem + threadIdx.x;
if (threadX < width)
if (x < src.cols)
{
const T* colSrc = src + threadX;
const T* srcCol = src.ptr() + x;
sDataColumn[(threadIdx.y + blockDim.y) * smem_step] = threadY < height ? colSrc[threadY * src_step] : VecTraits<T>::all(0);
sDataColumn[threadIdx.y * smem_step] = prevThreadY >= 0 ? colSrc[prevThreadY * src_step] : VecTraits<T>::all(0);
sDataColumn[(threadIdx.y + (blockDim.y << 1)) * smem_step] = nextThreadY < height ? colSrc[nextThreadY * src_step] : VecTraits<T>::all(0);
sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol);
__syncthreads();
if (threadY < height)
if (y < src.rows)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
sum_t sum = VecTraits<sum_t>::all(0);
sDataColumn += (threadIdx.y + blockDim.y - anchor)* smem_step;
sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
#pragma unroll
for(int i = 0; i < KERNEL_SIZE; ++i)
sum = sum + sDataColumn[i * smem_step] * cLinearKernel[i];
for(int i = 0; i < ksize; ++i)
sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];
dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
dst.ptr(y)[x] = saturate_cast<D>(sum);
}
}
}
@ -222,54 +452,101 @@ namespace filter_krnls
namespace cv { namespace gpu { namespace filters
{
template <int KERNEL_SIZE, typename T, typename D>
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
{
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 16;
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.data, src.step/src.elemSize(),
dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);
B<T> b(src.rows, src.step / src.elemSize());
if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
{
cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
"try bigger image or another border extrapolation mode", __FILE__, __LINE__);
}
filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
cudaSafeCall( cudaThreadSynchronize() );
}
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
static const caller_t callers[] =
{linearColumnFilter_caller<0 , T, D>, linearColumnFilter_caller<1 , T, D>,
linearColumnFilter_caller<2 , T, D>, linearColumnFilter_caller<3 , T, D>,
linearColumnFilter_caller<4 , T, D>, linearColumnFilter_caller<5 , T, D>,
linearColumnFilter_caller<6 , T, D>, linearColumnFilter_caller<7 , T, D>,
linearColumnFilter_caller<8 , T, D>, linearColumnFilter_caller<9 , T, D>,
linearColumnFilter_caller<10, T, D>, linearColumnFilter_caller<11, T, D>,
linearColumnFilter_caller<12, T, D>, linearColumnFilter_caller<13, T, D>,
linearColumnFilter_caller<14, T, D>, linearColumnFilter_caller<15, T, D>};
static const caller_t callers[3][17] =
{
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>,
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>,
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColConstant>,
linearColumnFilter_caller<2 , T, D, BrdColConstant>,
linearColumnFilter_caller<3 , T, D, BrdColConstant>,
linearColumnFilter_caller<4 , T, D, BrdColConstant>,
linearColumnFilter_caller<5 , T, D, BrdColConstant>,
linearColumnFilter_caller<6 , T, D, BrdColConstant>,
linearColumnFilter_caller<7 , T, D, BrdColConstant>,
linearColumnFilter_caller<8 , T, D, BrdColConstant>,
linearColumnFilter_caller<9 , T, D, BrdColConstant>,
linearColumnFilter_caller<10, T, D, BrdColConstant>,
linearColumnFilter_caller<11, T, D, BrdColConstant>,
linearColumnFilter_caller<12, T, D, BrdColConstant>,
linearColumnFilter_caller<13, T, D, BrdColConstant>,
linearColumnFilter_caller<14, T, D, BrdColConstant>,
linearColumnFilter_caller<15, T, D, BrdColConstant>,
linearColumnFilter_caller<16, T, D, BrdColConstant>,
}
};
loadLinearKernel(kernel, ksize);
callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
}
template void linearColumnFilter_gpu<uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<uchar4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<char4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<char4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<ushort2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<ushort2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<short2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<short2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<int, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<int, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<float, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<float, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);
template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float , int >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
}}}
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -377,7 +654,7 @@ namespace bf_krnls
}
}
float minimum = FLT_MAX;
float minimum = numeric_limits_gpu<float>::max();
int id = 0;
if (cost[0] < minimum)

View File

@ -59,7 +59,8 @@ namespace cv
enum
{
BORDER_REFLECT101_GPU = 0,
BORDER_REPLICATE_GPU
BORDER_REPLICATE_GPU,
BORDER_CONSTANT_GPU
};
// Converts CPU border extrapolation mode into GPU internal analogue.

View File

@ -48,8 +48,8 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)
Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
@ -58,11 +58,11 @@ Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const
Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
@ -71,10 +71,10 @@ void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nog
void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); }
void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point) { throw_nogpu(); }
void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double) { throw_nogpu(); }
void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double) { throw_nogpu(); }
void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double) { throw_nogpu(); }
void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); }
void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); }
void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_nogpu(); }
void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_nogpu(); }
void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double) { throw_nogpu(); }
#else
@ -133,13 +133,17 @@ namespace
class Filter2DEngine_GPU : public FilterEngine_GPU
{
public:
Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_) : filter2D(filter2D_) {}
Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) :
filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
{}
virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
{
CV_Assert(src.type() == srcType);
Size src_size = src.size();
dst.create(src_size, src.type());
dst.create(src_size, dstType);
dst = Scalar(0.0);
normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
@ -151,12 +155,13 @@ namespace
}
Ptr<BaseFilter_GPU> filter2D;
int srcType, dstType;
};
}
Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D)
Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D, int srcType, int dstType)
{
return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D));
return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D, srcType, dstType));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -168,8 +173,9 @@ namespace
{
public:
SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_,
const Ptr<BaseColumnFilter_GPU>& columnFilter_) :
rowFilter(rowFilter_), columnFilter(columnFilter_)
const Ptr<BaseColumnFilter_GPU>& columnFilter_, int srcType_, int bufType_, int dstType_) :
rowFilter(rowFilter_), columnFilter(columnFilter_),
srcType(srcType_), bufType(bufType_), dstType(dstType_)
{
ksize = Size(rowFilter->ksize, columnFilter->ksize);
anchor = Point(rowFilter->anchor, columnFilter->anchor);
@ -177,19 +183,20 @@ namespace
virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
{
Size src_size = src.size();
int src_type = src.type();
CV_Assert(src.type() == srcType);
dst.create(src_size, src_type);
Size src_size = src.size();
dst.create(src_size, dstType);
dst = Scalar(0.0);
dstBuf.create(src_size, src_type);
dstBuf.create(src_size, bufType);
dstBuf = Scalar(0.0);
normalizeROI(roi, ksize, anchor, src_size);
srcROI = src(roi);
dstROI = dst(roi);
dstBufROI = dstBuf(roi);
GpuMat srcROI = src(roi);
GpuMat dstROI = dst(roi);
GpuMat dstBufROI = dstBuf(roi);
(*rowFilter)(srcROI, dstBufROI);
(*columnFilter)(dstBufROI, dstROI);
@ -197,19 +204,19 @@ namespace
Ptr<BaseRowFilter_GPU> rowFilter;
Ptr<BaseColumnFilter_GPU> columnFilter;
int srcType, bufType, dstType;
Size ksize;
Point anchor;
GpuMat dstBuf;
GpuMat srcROI;
GpuMat dstROI;
GpuMat dstBufROI;
GpuMat dstBuf;
};
}
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
const Ptr<BaseColumnFilter_GPU>& columnFilter)
const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
{
return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter));
return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -315,7 +322,7 @@ Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Si
Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor)
{
Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor);
return createFilter2D_GPU(boxFilter);
return createFilter2D_GPU(boxFilter, srcType, dstType);
}
void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor)
@ -386,8 +393,8 @@ namespace
class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU
{
public:
MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int iters_) :
Filter2DEngine_GPU(filter2D_), iters(iters_) {}
MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type, int iters_) :
Filter2DEngine_GPU(filter2D_, type, type), iters(iters_) {}
virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
{
@ -415,7 +422,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, cons
Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, iterations));
return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations));
}
namespace
@ -558,7 +565,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType,
Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor);
return createFilter2D_GPU(linearFilter);
return createFilter2D_GPU(linearFilter, srcType, dstType);
}
void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor)
@ -578,10 +585,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
namespace cv { namespace gpu { namespace filters
{
template <typename T, typename D>
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
}}}
namespace
@ -589,7 +596,7 @@ namespace
typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
class NppLinearRowFilter : public BaseRowFilter_GPU
{
@ -614,35 +621,28 @@ namespace
class GpuLinearRowFilter : public BaseRowFilter_GPU
{
public:
GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) :
BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
virtual void operator()(const GpuMat& src, GpuMat& dst)
{
func(src, dst, kernel.ptr<float>(), ksize, anchor);
func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type);
}
Mat kernel;
gpuFilter1D_t func;
int brd_type;
};
}
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor)
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
{
using namespace cv::gpu::filters;
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};
static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
{
{linearRowFilter_gpu<uchar4, uchar4>,linearRowFilter_gpu<uchar4, char4>,0,0,0,0},
{linearRowFilter_gpu<char4, uchar4>,linearRowFilter_gpu<char4, char4>,0,0,0,0},
{0,0,linearRowFilter_gpu<ushort2, ushort2>,linearRowFilter_gpu<ushort2, short2>,0,0},
{0,0,linearRowFilter_gpu<short2, ushort2>,linearRowFilter_gpu<short2, short2>,0,0},
{0,0,0,0,linearRowFilter_gpu<int, int>, linearRowFilter_gpu<int, float>},
{0,0,0,0,linearRowFilter_gpu<float, int>, linearRowFilter_gpu<float, float>}
};
if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
{
CV_Assert(borderType == BORDER_CONSTANT);
GpuMat gpu_row_krnl;
int nDivisor;
normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);
@ -653,9 +653,15 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor,
nppFilter1D_callers[CV_MAT_CN(srcType)]));
}
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(srcType == CV_8UC4 || srcType == CV_8SC4 || srcType == CV_16UC2 || srcType == CV_16SC2 || srcType == CV_32SC1 || srcType == CV_32FC1);
CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 || bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);
CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC1 || srcType == CV_16SC2
|| srcType == CV_32SC1 || srcType == CV_32FC1);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));
Mat temp(rowKernel.size(), CV_32FC1);
rowKernel.convertTo(temp, CV_32FC1);
@ -663,12 +669,35 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
int ksize = cont_krnl.cols;
CV_Assert(ksize < 16);
CV_Assert(ksize > 0 && ksize <= 16);
normalizeAnchor(anchor, ksize);
return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, cont_krnl,
gpuFilter1D_callers[CV_MAT_DEPTH(srcType)][CV_MAT_DEPTH(bufType)]));
gpuFilter1D_t func = 0;
switch (srcType)
{
case CV_8UC1:
func = filters::linearRowFilter_gpu<uchar, float>;
break;
case CV_8UC4:
func = filters::linearRowFilter_gpu<uchar4, float4>;
break;
case CV_16SC1:
func = filters::linearRowFilter_gpu<short, float>;
break;
case CV_16SC2:
func = filters::linearRowFilter_gpu<short2, float2>;
break;
case CV_32SC1:
func = filters::linearRowFilter_gpu<int, float>;
break;
case CV_32FC1:
func = filters::linearRowFilter_gpu<float, float>;
break;
}
return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, cont_krnl, func, gpuBorderType));
}
namespace
@ -696,35 +725,28 @@ namespace
class GpuLinearColumnFilter : public BaseColumnFilter_GPU
{
public:
GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) :
BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
virtual void operator()(const GpuMat& src, GpuMat& dst)
{
func(src, dst, kernel.ptr<float>(), ksize, anchor);
func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type);
}
Mat kernel;
gpuFilter1D_t func;
int brd_type;
};
}
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor)
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
{
using namespace cv::gpu::filters;
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};
static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
{
{linearColumnFilter_gpu<uchar4, uchar4>,linearColumnFilter_gpu<uchar4, char4>,0,0,0,0},
{linearColumnFilter_gpu<char4, uchar4>,linearColumnFilter_gpu<char4, char4>,0,0,0,0},
{0,0,linearColumnFilter_gpu<ushort2, ushort2>,linearColumnFilter_gpu<ushort2, short2>,0,0},
{0,0,linearColumnFilter_gpu<short2, ushort2>,linearColumnFilter_gpu<short2, short2>,0,0},
{0,0,0,0,linearColumnFilter_gpu<int, int>, linearColumnFilter_gpu<int, float>},
{0,0,0,0,linearColumnFilter_gpu<float, int>, linearColumnFilter_gpu<float, float>}
};
if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
{
CV_Assert(borderType == BORDER_CONSTANT);
GpuMat gpu_col_krnl;
int nDivisor;
normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
@ -735,9 +757,15 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor,
nppFilter1D_callers[CV_MAT_CN(bufType)]));
}
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC1 || dstType == CV_16SC2
|| dstType == CV_32SC1 || dstType == CV_32FC1);
CV_Assert(dstType == CV_8UC4 || dstType == CV_8SC4 || dstType == CV_16UC2 || dstType == CV_16SC2 || dstType == CV_32SC1 || dstType == CV_32FC1);
CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 || bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));
Mat temp(columnKernel.size(), CV_32FC1);
columnKernel.convertTo(temp, CV_32FC1);
@ -745,50 +773,76 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
int ksize = cont_krnl.cols;
CV_Assert(ksize < 16);
CV_Assert(ksize > 0 && ksize <= 16);
normalizeAnchor(anchor, ksize);
return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl,
gpuFilter1D_callers[CV_MAT_DEPTH(bufType)][CV_MAT_DEPTH(dstType)]));
gpuFilter1D_t func = 0;
switch (dstType)
{
case CV_8UC1:
func = filters::linearColumnFilter_gpu<float, uchar>;
break;
case CV_8UC4:
func = filters::linearColumnFilter_gpu<float4, uchar4>;
break;
case CV_16SC1:
func = filters::linearColumnFilter_gpu<float, short>;
break;
case CV_16SC2:
func = filters::linearColumnFilter_gpu<float2, short2>;
break;
case CV_32SC1:
func = filters::linearColumnFilter_gpu<float, int>;
break;
case CV_32FC1:
func = filters::linearColumnFilter_gpu<float, float>;
break;
}
return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, func, gpuBorderType));
}
Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
const Point& anchor)
const Point& anchor, int rowBorderType, int columnBorderType)
{
if (columnBorderType < 0)
columnBorderType = rowBorderType;
int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
int cn = CV_MAT_CN(srcType);
int bdepth = std::max(sdepth, ddepth);
int bdepth = CV_32F;
int bufType = CV_MAKETYPE(bdepth, cn);
Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x);
Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y);
Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
return createSeparableFilter_GPU(rowFilter, columnFilter);
return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
}
void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor)
void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType)
{
if( ddepth < 0 )
ddepth = src.depth();
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor);
f->apply(src, dst);
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deriv Filter
Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize)
Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType)
{
Mat kx, ky;
getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1));
return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
}
void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale)
void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
{
Mat kx, ky;
getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
@ -803,10 +857,10 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
ky *= scale;
}
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
}
void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale)
void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
{
Mat kx, ky;
getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
@ -821,7 +875,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
ky *= scale;
}
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
}
void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale)
@ -843,7 +897,7 @@ void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, d
////////////////////////////////////////////////////////////////////////////////////////////////////
// Gaussian Filter
Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2)
Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
{
int depth = CV_MAT_DEPTH(type);
@ -868,10 +922,10 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do
else
ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
return createSeparableLinearFilter_GPU(type, type, kx, ky);
return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
}
void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2)
void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
{
if (ksize.width == 1 && ksize.height == 1)
{
@ -881,8 +935,8 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si
dst.create(src.size(), src.type());
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2);
f->apply(src, dst);
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
}
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -972,6 +972,12 @@ bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU;
return true;
}
if (cpuBorderType == cv::BORDER_CONSTANT)
{
gpuBorderType = cv::gpu::BORDER_CONSTANT_GPU;
return true;
}
return false;
}

View File

@ -123,278 +123,295 @@ namespace cv
{
typedef uchar elem_t;
enum {cn=1};
static __device__ uchar all(uchar v) {return v;}
static __device__ __host__ uchar all(uchar v) {return v;}
static __device__ __host__ uchar make(uchar x) {return x;}
};
template<> struct VecTraits<uchar1>
{
typedef uchar elem_t;
enum {cn=1};
static __device__ uchar1 all(uchar v) {return make_uchar1(v);}
static __device__ __host__ uchar1 all(uchar v) {return make_uchar1(v);}
static __device__ __host__ uchar1 make(uchar x) {return make_uchar1(x);}
};
template<> struct VecTraits<uchar2>
{
typedef uchar elem_t;
enum {cn=2};
static __device__ uchar2 all(uchar v) {return make_uchar2(v, v);}
static __device__ __host__ uchar2 all(uchar v) {return make_uchar2(v, v);}
static __device__ __host__ uchar2 make(uchar x, uchar y) {return make_uchar2(x, y);}
};
template<> struct VecTraits<uchar3>
{
typedef uchar elem_t;
enum {cn=3};
static __device__ uchar3 all(uchar v) {return make_uchar3(v, v, v);}
static __device__ __host__ uchar3 all(uchar v) {return make_uchar3(v, v, v);}
static __device__ __host__ uchar3 make(uchar x, uchar y, uchar z) {return make_uchar3(x, y, z);}
};
template<> struct VecTraits<uchar4>
{
typedef uchar elem_t;
enum {cn=4};
static __device__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);}
static __device__ __host__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);}
static __device__ __host__ uchar4 make(uchar x, uchar y, uchar z, uchar w) {return make_uchar4(x, y, z, w);}
};
template<> struct VecTraits<char>
{
typedef char elem_t;
enum {cn=1};
static __device__ char all(char v) {return v;}
static __device__ __host__ char all(char v) {return v;}
static __device__ __host__ char make(char x) {return x;}
};
template<> struct VecTraits<char1>
{
typedef char elem_t;
enum {cn=1};
static __device__ char1 all(char v) {return make_char1(v);}
static __device__ __host__ char1 all(char v) {return make_char1(v);}
static __device__ __host__ char1 make(char x) {return make_char1(x);}
};
template<> struct VecTraits<char2>
{
typedef char elem_t;
enum {cn=2};
static __device__ char2 all(char v) {return make_char2(v, v);}
static __device__ __host__ char2 all(char v) {return make_char2(v, v);}
static __device__ __host__ char2 make(char x, char y) {return make_char2(x, y);}
};
template<> struct VecTraits<char3>
{
typedef char elem_t;
enum {cn=3};
static __device__ char3 all(char v) {return make_char3(v, v, v);}
static __device__ __host__ char3 all(char v) {return make_char3(v, v, v);}
static __device__ __host__ char3 make(char x, char y, char z) {return make_char3(x, y, z);}
};
template<> struct VecTraits<char4>
{
typedef char elem_t;
enum {cn=4};
static __device__ char4 all(char v) {return make_char4(v, v, v, v);}
static __device__ __host__ char4 all(char v) {return make_char4(v, v, v, v);}
static __device__ __host__ char4 make(char x, char y, char z, char w) {return make_char4(x, y, z, w);}
};
template<> struct VecTraits<ushort>
{
typedef ushort elem_t;
enum {cn=1};
static __device__ ushort all(ushort v) {return v;}
static __device__ __host__ ushort all(ushort v) {return v;}
static __device__ __host__ ushort make(ushort x) {return x;}
};
template<> struct VecTraits<ushort1>
{
typedef ushort elem_t;
enum {cn=1};
static __device__ ushort1 all(ushort v) {return make_ushort1(v);}
static __device__ __host__ ushort1 all(ushort v) {return make_ushort1(v);}
static __device__ __host__ ushort1 make(ushort x) {return make_ushort1(x);}
};
template<> struct VecTraits<ushort2>
{
typedef ushort elem_t;
enum {cn=2};
static __device__ ushort2 all(ushort v) {return make_ushort2(v, v);}
static __device__ __host__ ushort2 all(ushort v) {return make_ushort2(v, v);}
static __device__ __host__ ushort2 make(ushort x, ushort y) {return make_ushort2(x, y);}
};
template<> struct VecTraits<ushort3>
{
typedef ushort elem_t;
enum {cn=3};
static __device__ ushort3 all(ushort v) {return make_ushort3(v, v, v);}
static __device__ __host__ ushort3 all(ushort v) {return make_ushort3(v, v, v);}
static __device__ __host__ ushort3 make(ushort x, ushort y, ushort z) {return make_ushort3(x, y, z);}
};
template<> struct VecTraits<ushort4>
{
typedef ushort elem_t;
enum {cn=4};
static __device__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);}
static __device__ __host__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);}
static __device__ __host__ ushort4 make(ushort x, ushort y, ushort z, ushort w) {return make_ushort4(x, y, z, w);}
};
template<> struct VecTraits<short>
{
typedef short elem_t;
enum {cn=1};
static __device__ short all(short v) {return v;}
static __device__ __host__ short all(short v) {return v;}
static __device__ __host__ short make(short x) {return x;}
};
template<> struct VecTraits<short1>
{
typedef short elem_t;
enum {cn=1};
static __device__ short1 all(short v) {return make_short1(v);}
static __device__ __host__ short1 all(short v) {return make_short1(v);}
static __device__ __host__ short1 make(short x) {return make_short1(x);}
};
template<> struct VecTraits<short2>
{
typedef short elem_t;
enum {cn=2};
static __device__ short2 all(short v) {return make_short2(v, v);}
static __device__ __host__ short2 all(short v) {return make_short2(v, v);}
static __device__ __host__ short2 make(short x, short y) {return make_short2(x, y);}
};
template<> struct VecTraits<short3>
{
typedef short elem_t;
enum {cn=3};
static __device__ short3 all(short v) {return make_short3(v, v, v);}
static __device__ __host__ short3 all(short v) {return make_short3(v, v, v);}
static __device__ __host__ short3 make(short x, short y, short z) {return make_short3(x, y, z);}
};
template<> struct VecTraits<short4>
{
typedef short elem_t;
enum {cn=4};
static __device__ short4 all(short v) {return make_short4(v, v, v, v);}
static __device__ __host__ short4 all(short v) {return make_short4(v, v, v, v);}
static __device__ __host__ short4 make(short x, short y, short z, short w) {return make_short4(x, y, z, w);}
};
template<> struct VecTraits<uint>
{
typedef uint elem_t;
enum {cn=1};
static __device__ uint all(uint v) {return v;}
static __device__ __host__ uint all(uint v) {return v;}
static __device__ __host__ uint make(uint x) {return x;}
};
template<> struct VecTraits<uint1>
{
typedef uint elem_t;
enum {cn=1};
static __device__ uint1 all(uint v) {return make_uint1(v);}
static __device__ __host__ uint1 all(uint v) {return make_uint1(v);}
static __device__ __host__ uint1 make(uint x) {return make_uint1(x);}
};
template<> struct VecTraits<uint2>
{
typedef uint elem_t;
enum {cn=2};
static __device__ uint2 all(uint v) {return make_uint2(v, v);}
static __device__ __host__ uint2 all(uint v) {return make_uint2(v, v);}
static __device__ __host__ uint2 make(uint x, uint y) {return make_uint2(x, y);}
};
template<> struct VecTraits<uint3>
{
typedef uint elem_t;
enum {cn=3};
static __device__ uint3 all(uint v) {return make_uint3(v, v, v);}
static __device__ __host__ uint3 all(uint v) {return make_uint3(v, v, v);}
static __device__ __host__ uint3 make(uint x, uint y, uint z) {return make_uint3(x, y, z);}
};
template<> struct VecTraits<uint4>
{
typedef uint elem_t;
enum {cn=4};
static __device__ uint4 all(uint v) {return make_uint4(v, v, v, v);}
static __device__ __host__ uint4 all(uint v) {return make_uint4(v, v, v, v);}
static __device__ __host__ uint4 make(uint x, uint y, uint z, uint w) {return make_uint4(x, y, z, w);}
};
template<> struct VecTraits<int>
{
typedef int elem_t;
enum {cn=1};
static __device__ int all(int v) {return v;}
static __device__ __host__ int all(int v) {return v;}
static __device__ __host__ int make(int x) {return x;}
};
template<> struct VecTraits<int1>
{
typedef int elem_t;
enum {cn=1};
static __device__ int1 all(int v) {return make_int1(v);}
static __device__ __host__ int1 all(int v) {return make_int1(v);}
static __device__ __host__ int1 make(int x) {return make_int1(x);}
};
template<> struct VecTraits<int2>
{
typedef int elem_t;
enum {cn=2};
static __device__ int2 all(int v) {return make_int2(v, v);}
static __device__ __host__ int2 all(int v) {return make_int2(v, v);}
static __device__ __host__ int2 make(int x, int y) {return make_int2(x, y);}
};
template<> struct VecTraits<int3>
{
typedef int elem_t;
enum {cn=3};
static __device__ int3 all(int v) {return make_int3(v, v, v);}
static __device__ __host__ int3 all(int v) {return make_int3(v, v, v);}
static __device__ __host__ int3 make(int x, int y, int z) {return make_int3(x, y, z);}
};
template<> struct VecTraits<int4>
{
typedef int elem_t;
enum {cn=4};
static __device__ int4 all(int v) {return make_int4(v, v, v, v);}
static __device__ __host__ int4 all(int v) {return make_int4(v, v, v, v);}
static __device__ __host__ int4 make(int x, int y, int z, int w) {return make_int4(x, y, z, w);}
};
template<> struct VecTraits<float>
{
typedef float elem_t;
enum {cn=1};
static __device__ float all(float v) {return v;}
static __device__ __host__ float all(float v) {return v;}
static __device__ __host__ float make(float x) {return x;}
};
template<> struct VecTraits<float1>
{
typedef float elem_t;
enum {cn=1};
static __device__ float1 all(float v) {return make_float1(v);}
static __device__ __host__ float1 all(float v) {return make_float1(v);}
static __device__ __host__ float1 make(float x) {return make_float1(x);}
};
template<> struct VecTraits<float2>
{
typedef float elem_t;
enum {cn=2};
static __device__ float2 all(float v) {return make_float2(v, v);}
static __device__ __host__ float2 all(float v) {return make_float2(v, v);}
static __device__ __host__ float2 make(float x, float y) {return make_float2(x, y);}
};
template<> struct VecTraits<float3>
{
typedef float elem_t;
enum {cn=3};
static __device__ float3 all(float v) {return make_float3(v, v, v);}
static __device__ __host__ float3 all(float v) {return make_float3(v, v, v);}
static __device__ __host__ float3 make(float x, float y, float z) {return make_float3(x, y, z);}
};
template<> struct VecTraits<float4>
{
typedef float elem_t;
enum {cn=4};
static __device__ float4 all(float v) {return make_float4(v, v, v, v);}
static __device__ __host__ float4 all(float v) {return make_float4(v, v, v, v);}
static __device__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);}
};
template <int cn, typename VecD> struct SatCast;
template <typename VecD> struct SatCast<1, VecD>
{
template <typename VecS>
__device__ VecD operator()(const VecS& v)
static __device__ VecD cast(const VecS& v)
{
VecD res;
res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);
return res;
typedef typename VecTraits<VecD>::elem_t D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x));
}
};
template <typename VecD> struct SatCast<2, VecD>
{
template <typename VecS>
__device__ VecD operator()(const VecS& v)
static __device__ VecD cast(const VecS& v)
{
VecD res;
res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);
res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.y);
return res;
typedef typename VecTraits<VecD>::elem_t D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
}
};
template <typename VecD> struct SatCast<3, VecD>
{
template <typename VecS>
__device__ VecD operator()(const VecS& v)
static __device__ VecD cast(const VecS& v)
{
VecD res;
res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);
res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.y);
res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.z);
return res;
typedef typename VecTraits<VecD>::elem_t D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
}
};
template <typename VecD> struct SatCast<4, VecD>
{
template <typename VecS>
__device__ VecD operator()(const VecS& v)
static __device__ VecD cast(const VecS& v)
{
VecD res;
res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);
res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.y);
res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.z);
res.w = saturate_cast< VecTraits<VecD>::elem_t >(v.w);
return res;
typedef typename VecTraits<VecD>::elem_t D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
}
};
template <typename VecD, typename VecS> static __device__ VecD saturate_cast_caller(const VecS& v)
{
SatCast<
VecTraits<VecD>::cn,
VecD
>
cast;
return cast(v);
return SatCast<VecTraits<VecD>::cn, VecD>::cast(v);
}
template<typename _Tp> static __device__ _Tp saturate_cast(const uchar1& v) {return saturate_cast_caller<_Tp>(v);}

View File

@ -107,7 +107,7 @@ protected:
if (!compareMatches(matchesCPU, matchesGPU))
{
ts->printf(CvTS::LOG, "Match FAIL");
ts->printf(CvTS::LOG, "Match FAIL\n");
ts->set_failed_test_info(CvTS::FAIL_MISMATCH);
return;
}
@ -119,7 +119,7 @@ protected:
if (!compareMatches(knnMatchesCPU, knnMatchesGPU))
{
ts->printf(CvTS::LOG, "KNN Match FAIL");
ts->printf(CvTS::LOG, "KNN Match FAIL\n");
ts->set_failed_test_info(CvTS::FAIL_MISMATCH);
return;
}
@ -131,7 +131,7 @@ protected:
if (!compareMatches(radiusMatchesCPU, radiusMatchesGPU))
{
ts->printf(CvTS::LOG, "Radius Match FAIL");
ts->printf(CvTS::LOG, "Radius Match FAIL\n");
ts->set_failed_test_info(CvTS::FAIL_MISMATCH);
return;
}

View File

@ -80,7 +80,8 @@ protected:
double res = norm(m1ROI, m2ROI, NORM_INF);
if (res <= 1)
// Max difference (2.0) in GaussianBlur
if (res <= 2)
return CvTS::OK;
ts->printf(CvTS::LOG, "Norm: %f\n", res);
@ -166,8 +167,6 @@ struct CV_GpuNppImageSobelTest : public CV_GpuNppFilterTest
int test(const Mat& img)
{
if (img.type() != CV_8UC1)
return CvTS::OK;
int ksizes[] = {3, 5, 7};
int ksizes_num = sizeof(ksizes) / sizeof(int);
@ -183,10 +182,8 @@ struct CV_GpuNppImageSobelTest : public CV_GpuNppFilterTest
cv::Sobel(img, cpudst, -1, dx, dy, ksizes[i]);
GpuMat gpu1(img);
gpu1.convertTo(gpu1, CV_32S);
GpuMat gpudst;
cv::gpu::Sobel(gpu1, gpudst, -1, dx, dy, ksizes[i]);
gpudst.convertTo(gpudst, CV_8U);
if (CheckNorm(cpudst, gpudst, Size(ksizes[i], ksizes[i])) != CvTS::OK)
test_res = CvTS::FAIL_GENERIC;
@ -204,20 +201,15 @@ struct CV_GpuNppImageScharrTest : public CV_GpuNppFilterTest
int test(const Mat& img)
{
if (img.type() != CV_8UC1)
return CvTS::OK;
int dx = 1, dy = 0;
Mat cpudst;
cv::Scharr(img, cpudst, -1, dx, dy);
GpuMat gpu1(img);
gpu1.convertTo(gpu1, CV_32S);
GpuMat gpudst;
cv::gpu::Scharr(gpu1, gpudst, -1, dx, dy);
gpudst.convertTo(gpudst, CV_8U);
return CheckNorm(cpudst, gpudst, Size(3, 3));
}
};
@ -244,7 +236,7 @@ struct CV_GpuNppImageGaussianBlurTest : public CV_GpuNppFilterTest
{
cv::Size ksize(ksizes[i], ksizes[j]);
ts->printf(CvTS::LOG, "ksize = (%dx%d)\t", ksizes[i], ksizes[j]);
ts->printf(CvTS::LOG, "ksize = (%dx%d)\t\n", ksizes[i], ksizes[j]);
Mat cpudst;
cv::GaussianBlur(img, cpudst, ksize, sigma1);