/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "internal_shared.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace mathfunc
{
    //////////////////////////////////////////////////////////////////////////////////////
    // Compare

    template <typename T1, typename T2>
    struct NotEqual
    {
        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2)
        {
            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
        }
    };

    template <typename T1, typename T2>
    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
        NotEqual<T1, T2> op;
        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
    }

    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
        compare_ne<uint, uint>(src1, src2, dst, stream);
    }

    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
        compare_ne<float, float>(src1, src2, dst, stream);
    }

    //////////////////////////////////////////////////////////////////////////
    // Unary bitwise logical matrix operations

    enum { UN_OP_NOT };

    // Primary template; specialized per operation below.
    template <typename T, int opid>
    struct UnOp;

    template <typename T>
    struct UnOp<T, UN_OP_NOT>
    {
        static __device__ __forceinline__ T call(T v) { return ~v; }
    };

    // Unmasked kernel: each thread covers 4 consecutive bytes, processing them as one
    // 32-bit word when the whole word fits inside the row, and byte by byte otherwise.
    template <int opid>
    __global__ void bitwiseUnOpKernel(int rows, int width, const PtrStep src, PtrStep dst)
    {
        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows)
        {
            uchar* dst_ptr = dst.ptr(y) + x;
            const uchar* src_ptr = src.ptr(y) + x;

            if (x + sizeof(uint) - 1 < width)
            {
                *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);
            }
            else
            {
                const uchar* src_end = src.ptr(y) + width;
                while (src_ptr < src_end)
                {
                    *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);
                }
            }
        }
    }

    template <int opid>
    void bitwiseUnOp(int rows, int width, const PtrStep src, PtrStep dst, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));

        bitwiseUnOpKernel<opid><<<grid, threads, 0, stream>>>(rows, width, src, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    // Masked kernel: operates on whole elements of type T; the mask is indexed per pixel,
    // hence the division by the channel count.
    template <typename T, int opid>
    __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < cols && y < rows && mask.ptr(y)[x / cn])
        {
            T* dst_row = (T*)dst.ptr(y);
            const T* src_row = (const T*)src.ptr(y);
            dst_row[x] = UnOp<T, opid>::call(src_row[x]);
        }
    }

    template <typename T, int opid>
    void bitwiseUnOp(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

        bitwiseUnOpKernel<T, opid><<<grid, threads, 0, stream>>>(rows, cols, cn, src, mask, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void bitwiseNotCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream)
    {
        bitwiseUnOp<UN_OP_NOT>(rows, cols * elem_size1 * cn, src, dst, stream);
    }

    template <typename T>
    void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream)
    {
        bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);
    }

    // Explicit instantiations for the per-channel element sizes dispatched from the host
    // wrappers (1, 2 and 4 bytes).
    template void bitwiseMaskNotCaller<uchar >(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskNotCaller<ushort>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskNotCaller<uint  >(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
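
    // Worked example for the byte-flattened width used by the unmasked path above
    // (illustrative only): for an 8UC3 matrix with 10 columns, bitwiseNotCaller passes
    // width = 10 * 1 * 3 = 30 bytes per row to bitwiseUnOp. The thread at x = 24 sees
    // x + sizeof(uint) - 1 = 27 < 30 and flips bytes 24..27 as a single 32-bit word,
    // while the thread at x = 28 takes the byte-wise tail loop for the remaining bytes 28..29.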

    //////////////////////////////////////////////////////////////////////////
    // Binary bitwise logical matrix operations

    enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };

    // Primary template; specialized per operation below.
    template <typename T, int opid>
    struct BinOp;

    template <typename T>
    struct BinOp<T, BIN_OP_OR>
    {
        static __device__ __forceinline__ T call(T a, T b) { return a | b; }
    };

    template <typename T>
    struct BinOp<T, BIN_OP_AND>
    {
        static __device__ __forceinline__ T call(T a, T b) { return a & b; }
    };

    template <typename T>
    struct BinOp<T, BIN_OP_XOR>
    {
        static __device__ __forceinline__ T call(T a, T b) { return a ^ b; }
    };

    // Unmasked kernel: same 4-bytes-per-thread scheme as the unary version above.
    template <int opid>
    __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst)
    {
        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows)
        {
            uchar* dst_ptr = dst.ptr(y) + x;
            const uchar* src1_ptr = src1.ptr(y) + x;
            const uchar* src2_ptr = src2.ptr(y) + x;

            if (x + sizeof(uint) - 1 < width)
            {
                *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);
            }
            else
            {
                const uchar* src1_end = src1.ptr(y) + width;
                while (src1_ptr < src1_end)
                {
                    *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);
                }
            }
        }
    }

    template <int opid>
    void bitwiseBinOp(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));

        bitwiseBinOpKernel<opid><<<grid, threads, 0, stream>>>(rows, width, src1, src2, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    // Masked kernel: operates on whole elements of type T; the mask is indexed per pixel.
    template <typename T, int opid>
    __global__ void bitwiseBinOpKernel(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2,
                                       const PtrStep mask, PtrStep dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < cols && y < rows && mask.ptr(y)[x / cn])
        {
            T* dst_row = (T*)dst.ptr(y);
            const T* src1_row = (const T*)src1.ptr(y);
            const T* src2_row = (const T*)src2.ptr(y);
            dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);
        }
    }

    template <typename T, int opid>
    void bitwiseBinOp(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask,
                      PtrStep dst, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

        bitwiseBinOpKernel<T, opid><<<grid, threads, 0, stream>>>(rows, cols, cn, src1, src2, mask, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void bitwiseOrCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<BIN_OP_OR>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
    }

    template <typename T>
    void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask,
                             PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
    }

    template void bitwiseMaskOrCaller<uchar >(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskOrCaller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskOrCaller<uint  >(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);

    void bitwiseAndCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<BIN_OP_AND>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
    }

    template <typename T>
    void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask,
                              PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
    }

    template void bitwiseMaskAndCaller<uchar >(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskAndCaller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskAndCaller<uint  >(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);

    void bitwiseXorCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<BIN_OP_XOR>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
    }

    template <typename T>
    void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask,
                              PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
    }

    template void bitwiseMaskXorCaller<uchar >(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskXorCaller<uint  >(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
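
    // Usage sketch for the mask callers above (illustrative only; the host-side wrappers
    // live outside this file and pick the uchar/ushort/uint instantiation from the
    // per-channel element size of 1, 2 or 4 bytes). For a CV_16UC3 image, elemSize1 == 2,
    // so a wrapper would issue something like:
    //
    //   // given PtrStep views src1, src2, mask, dst of a CV_16UC3 image:
    //   // bitwiseMaskXorCaller<ushort>(rows, cols, 3, src1, src2, mask, dst, stream);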

    //////////////////////////////////////////////////////////////////////////
    // min/max

    struct MinOp
    {
        template <typename T>
        __device__ __forceinline__ T operator()(T a, T b)
        {
            return min(a, b);
        }
        __device__ __forceinline__ float operator()(float a, float b)
        {
            return fmin(a, b);
        }
        __device__ __forceinline__ double operator()(double a, double b)
        {
            return fmin(a, b);
        }
    };

    struct MaxOp
    {
        template <typename T>
        __device__ __forceinline__ T operator()(T a, T b)
        {
            return max(a, b);
        }
        __device__ __forceinline__ float operator()(float a, float b)
        {
            return fmax(a, b);
        }
        __device__ __forceinline__ double operator()(double a, double b)
        {
            return fmax(a, b);
        }
    };

    template <typename T> struct ScalarMinOp
    {
        T s;

        explicit ScalarMinOp(T s_) : s(s_) {}

        __device__ __forceinline__ T operator()(T a)
        {
            return min(a, s);
        }
    };

    template <> struct ScalarMinOp<float>
    {
        float s;

        explicit ScalarMinOp(float s_) : s(s_) {}

        __device__ __forceinline__ float operator()(float a)
        {
            return fmin(a, s);
        }
    };

    template <> struct ScalarMinOp<double>
    {
        double s;

        explicit ScalarMinOp(double s_) : s(s_) {}

        __device__ __forceinline__ double operator()(double a)
        {
            return fmin(a, s);
        }
    };

    template <typename T> struct ScalarMaxOp
    {
        T s;

        explicit ScalarMaxOp(T s_) : s(s_) {}

        __device__ __forceinline__ T operator()(T a)
        {
            return max(a, s);
        }
    };

    template <> struct ScalarMaxOp<float>
    {
        float s;

        explicit ScalarMaxOp(float s_) : s(s_) {}

        __device__ __forceinline__ float operator()(float a)
        {
            return fmax(a, s);
        }
    };

    template <> struct ScalarMaxOp<double>
    {
        double s;

        explicit ScalarMaxOp(double s_) : s(s_) {}

        __device__ __forceinline__ double operator()(double a)
        {
            return fmax(a, s);
        }
    };

    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        MinOp op;
        transform(src1, src2, dst, op, stream);
    }

    template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
    template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
    template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
    template void min_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);

    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        MaxOp op;
        transform(src1, src2, dst, op, stream);
    }

    template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
    template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
    template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
    template void max_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);

    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        ScalarMinOp<T> op(src2);
        transform(src1, dst, op, stream);
    }

    template void min_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
    template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
    template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
    template void min_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);

    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        ScalarMaxOp<T> op(src2);
        transform(src1, dst, op, stream);
    }

    template void max_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
    template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
    template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
    template void max_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
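
    // The float/double overloads and specializations above route floating-point types
    // through fmin/fmax rather than the integer min/max overloads. Usage sketch for the
    // scalar form (illustrative only; the host wrappers that build the DevMem2D_ views
    // live outside this file):
    //
    //   // given DevMem2D_<float> views src and dst of the same size:
    //   // min_gpu<float>(src, 0.5f, dst, 0);   // dst(y, x) = fmin(src(y, x), 0.5f) on the default stream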

    //////////////////////////////////////////////////////////////////////////
    // threshold

    template <typename T> struct ThreshBinary
    {
        ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

        __device__ __forceinline__ T operator()(const T& src) const
        {
            return src > thresh ? maxVal : 0;
        }

    private:
        T thresh;
        T maxVal;
    };

    template <typename T> struct ThreshBinaryInv
    {
        ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

        __device__ __forceinline__ T operator()(const T& src) const
        {
            return src > thresh ? 0 : maxVal;
        }

    private:
        T thresh;
        T maxVal;
    };
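
    // This functor family mirrors the cv::threshold modes: ThreshBinary / ThreshBinaryInv
    // above correspond to THRESH_BINARY / THRESH_BINARY_INV, while the truncate and
    // to-zero variants below correspond to THRESH_TRUNC, THRESH_TOZERO and THRESH_TOZERO_INV.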

    template <typename T> struct ThreshTrunc
    {
        ThreshTrunc(T thresh_, T) : thresh(thresh_) {}

        __device__ __forceinline__ T operator()(const T& src) const
        {
            return min(src, thresh);
        }

    private:
        T thresh;
    };

    template <> struct ThreshTrunc<float>
    {
        ThreshTrunc(float thresh_, float) : thresh(thresh_) {}

        __device__ __forceinline__ float operator()(const float& src) const
        {
            return fmin(src, thresh);
        }

    private:
        float thresh;
    };

    template <> struct ThreshTrunc<double>
    {
        ThreshTrunc(double thresh_, double) : thresh(thresh_) {}

        __device__ __forceinline__ double operator()(const double& src) const
        {
            return fmin(src, thresh);
        }

    private:
        double thresh;
    };

    template <typename T> struct ThreshToZero
    {
    public:
        ThreshToZero(T thresh_, T) : thresh(thresh_) {}

        __device__ __forceinline__ T operator()(const T& src) const
        {
            return src > thresh ? src : 0;
        }

    private:
        T thresh;
    };

    template <typename T> struct ThreshToZeroInv
    {
    public:
        ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {}

        __device__ __forceinline__ T operator()(const T& src) const
        {
            return src > thresh ? 0 : src;
        }

    private:
        T thresh;
    };

    template