/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "internal_shared.hpp"

namespace cv { namespace gpu { namespace device
{
    //////////////////////////////////////////////////////////////////////////////////////
    // Compare

    template <typename T> struct NotEqual : binary_function<T, T, uchar>
    {
        __device__ __forceinline__ uchar operator()(T src1, T src2) const
        {
            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
        }
    };

    template <typename T>
    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
        NotEqual<T> op;
        transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);
    }

    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
        compare_ne<uint>(src1, src2, dst, stream);
    }

    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
        compare_ne<float>(src1, src2, dst, stream);
    }

    //////////////////////////////////////////////////////////////////////////
    // Unary bitwise logical matrix operations

    enum { UN_OP_NOT };

    template <typename T, int opid> struct UnOp;

    template <typename T> struct UnOp<T, UN_OP_NOT>
    {
        static __device__ __forceinline__ T call(T v) { return ~v; }
    };

    template <int opid>
    __global__ void bitwiseUnOpKernel(int rows, int width, const PtrStep src, PtrStep dst)
    {
        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows)
        {
            uchar* dst_ptr = dst.ptr(y) + x;
            const uchar* src_ptr = src.ptr(y) + x;

            if (x + sizeof(uint) - 1 < width)
            {
                *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);
            }
            else
            {
                const uchar* src_end = src.ptr(y) + width;
                while (src_ptr < src_end)
                {
                    *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);
                }
            }
        }
    }

    template <int opid>
    void bitwiseUnOp(int rows, int width, const PtrStep src, PtrStep dst, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));

        bitwiseUnOpKernel<opid><<<grid, threads, 0, stream>>>(rows, width, src, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, int opid>
    __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < cols && y < rows && mask.ptr(y)[x / cn])
        {
            T* dst_row = (T*)dst.ptr(y);
            const T* src_row = (const T*)src.ptr(y);
            dst_row[x] = UnOp<T, opid>::call(src_row[x]);
        }
    }

    template <typename T, int opid>
    void bitwiseUnOp(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

        bitwiseUnOpKernel<T, opid><<<grid, threads, 0, stream>>>(rows, cols, cn, src, mask, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream)
    {
        bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream);
    }

    template <typename T>
    void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream)
    {
        bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);
    }

    template void bitwiseMaskNotCaller<uchar>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskNotCaller<ushort>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskNotCaller<uint>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
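    // Illustrative launch arithmetic for the unmasked kernels above (the sizes here are
    // arbitrary, chosen only as an example). The row is treated as a plain byte buffer and
    // each thread handles four consecutive bytes:
    //
    //     rows = 480, cols = 640, elem_size1 = 1, cn = 3
    //     width  = cols * elem_size1 * cn = 1920 bytes per row
    //     threads = dim3(16, 16)
    //     grid.x  = divUp(1920, 16 * sizeof(uint)) = divUp(1920, 64) = 30
    //     grid.y  = divUp(480, 16)                 = 30
    //
    // A thread at byte offset x negates the uint covering bytes [x, x + 3] when the whole
    // word lies inside the row, and falls back to the byte-by-byte loop for the row tail.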
    //////////////////////////////////////////////////////////////////////////
    // Binary bitwise logical matrix operations

    enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };

    template <typename T, int opid> struct BinOp;

    template <typename T> struct BinOp<T, BIN_OP_OR>
    {
        static __device__ __forceinline__ T call(T a, T b) { return a | b; }
    };

    template <typename T> struct BinOp<T, BIN_OP_AND>
    {
        static __device__ __forceinline__ T call(T a, T b) { return a & b; }
    };

    template <typename T> struct BinOp<T, BIN_OP_XOR>
    {
        static __device__ __forceinline__ T call(T a, T b) { return a ^ b; }
    };

    template <int opid>
    __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst)
    {
        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows)
        {
            uchar* dst_ptr = dst.ptr(y) + x;
            const uchar* src1_ptr = src1.ptr(y) + x;
            const uchar* src2_ptr = src2.ptr(y) + x;

            if (x + sizeof(uint) - 1 < width)
            {
                *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);
            }
            else
            {
                const uchar* src1_end = src1.ptr(y) + width;
                while (src1_ptr < src1_end)
                {
                    *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);
                }
            }
        }
    }

    template <int opid>
    void bitwiseBinOp(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));

        bitwiseBinOpKernel<opid><<<grid, threads, 0, stream>>>(rows, width, src1, src2, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, int opid>
    __global__ void bitwiseBinOpKernel(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2,
                                       const PtrStep mask, PtrStep dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < cols && y < rows && mask.ptr(y)[x / cn])
        {
            T* dst_row = (T*)dst.ptr(y);
            const T* src1_row = (const T*)src1.ptr(y);
            const T* src2_row = (const T*)src2.ptr(y);
            dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);
        }
    }

    template <typename T, int opid>
    void bitwiseBinOp(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2,
                      const PtrStep mask, PtrStep dst, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

        bitwiseBinOpKernel<T, opid><<<grid, threads, 0, stream>>>(rows, cols, cn, src1, src2, mask, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2,
                         PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
    }

    template <typename T>
    void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2,
                             const PtrStep mask, PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
    }

    template void bitwiseMaskOrCaller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskOrCaller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);

    void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2,
                          PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
    }
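    // Note on the masked kernels: x indexes individual channel elements (the callers pass
    // cols * cn as the column count), while the mask holds one byte per pixel, hence the
    // mask.ptr(y)[x / cn] lookup. For example, with cn = 3 the elements x = 0, 1, 2 all
    // consult mask(y, 0), elements x = 3, 4, 5 consult mask(y, 1), and so on.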
    template <typename T>
    void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2,
                              const PtrStep mask, PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
    }

    template void bitwiseMaskAndCaller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskAndCaller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);

    void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2,
                          PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
    }

    template <typename T>
    void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2,
                              const PtrStep mask, PtrStep dst, cudaStream_t stream)
    {
        bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
    }

    template void bitwiseMaskXorCaller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
    template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);

    //////////////////////////////////////////////////////////////////////////
    // min/max

    namespace detail
    {
        template <size_t src_size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>
        {
        };
        template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_shift = 4 };
        };
        template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_block_dim_y = 4 };
            enum { smart_shift = 4 };
        };
    }

    template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> > { };
    template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> > { };
    template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > > { };
    template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > > { };

    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, minimum<T>(), stream);
    }

    template void min_gpu<uchar>(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
    template void min_gpu<schar>(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
    template void min_gpu<short>(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
    template void min_gpu<int>(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void min_gpu<float>(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);

    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, maximum<T>(), stream);
    }

    template void max_gpu<uchar>(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
    template void max_gpu<schar>(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
    template void max_gpu<short>(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
    template void max_gpu<int>(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void max_gpu<float>(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
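    // min_gpu / max_gpu are thin wrappers over transform() with the minimum<T> / maximum<T>
    // functors, i.e. dst(y, x) = min(src1(y, x), src2(y, x)) element by element. A rough
    // host-side sketch of a call (the device pointers and pitch below are hypothetical and
    // not part of this file):
    //
    //     // float *d_a, *d_b, *d_dst already allocated on the device with pitch `step`
    //     DevMem2D_<float> a(rows, cols, d_a, step);
    //     DevMem2D_<float> b(rows, cols, d_b, step);
    //     DevMem2D_<float> c(rows, cols, d_dst, step);
    //     min_gpu<float>(a, b, c, 0);   // 0 = null stream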
    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        transform(src1, dst, device::bind2nd(minimum<T>(), src2), stream);
    }

    template void min_gpu<uchar>(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
    template void min_gpu<schar>(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
    template void min_gpu<short>(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
    template void min_gpu<int>(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void min_gpu<float>(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);

    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        transform(src1, dst, device::bind2nd(maximum<T>(), src2), stream);
    }

    template void max_gpu<uchar>(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
    template void max_gpu<schar>(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
    template void max_gpu<short>(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
    template void max_gpu<int>(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void max_gpu<float>(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);

    //////////////////////////////////////////////////////////////////////////
    // threshold

    namespace detail
    {
        template <size_t src_size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>
        {
        };
        template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_shift = 4 };
        };
        template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_block_dim_y = 4 };
            enum { smart_shift = 4 };
        };
    }

    template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> > { };
    template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> > { };
    template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> > { };
    template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> > { };
    template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> > { };

    template